其他
肿瘤外显子数据处理系列教程(三)比对
因为周一专栏处于6月交接期间,所以大家仍然是可以在头条教程看到新的学徒的数据挖掘教程,然后我这个肿瘤外显子数据处理系列会一直在次页慢慢更新,直到离开为止,非常谢谢大家关注了我一年!
接下来走的是bwa,gatk的流程。
GATK
## GATK需要JDK8
sudo apt remove openjdk*
sudo apt update
sudo apt install openjdk-8-jre
java -version
## openjdk version "1.8.0_212"
## https://software.broadinstitute.org/gatk/download/
BWA
## github上是最新版的
git clone https://github.com/lh3/bwa.git
cd bwa
make
ln -s ~/tools/bwa/bwa ../bin/bwa
SAMTOOLS
wget https://jaist.dl.sourceforge.net/project/samtools/samtools/1.9/samtools-1.9.tar.bz2
tar -jxvf samtools-1.9.tar.bz2
mkdir samtools
cd samtools-1.9
./configure --prefix=/home/llwu/tools/samtools
ls /home/llwu/tools/samtools/bin/ | while read id
do
ln -s /home/llwu/tools/samtools/bin/${id} $id
done
## 下载人类参考基因组
## 从gatk上下载
ftp ftp.broadinstitute.org
gsapubftp-anonymous
cd bundle
ls
broad 890202736 Jan 5 2016 Homo_sapiens_assembly38.fasta.gz
broad 20685880 Jan 5 2016 Mills_and_1000G_gold_standard.indels.hg38.vcf.gz
broad 1500013 Jan 5 2016 Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi
broad 3411143311 Dec 1 2016 dbsnp_146.hg38.vcf.gz
broad 2466606 Dec 1 2016 dbsnp_146.hg38.vcf.gz.tbi
## 下载这五个文件
samtools faidx GRCh38.fa
## index
bwa index -a bwtsw -p GRCh38 /home/llwu/reference/fasta/human/GRCh38.fa
## dic
gatk CreateSequenceDictionary
-R /home/llwu/reference/fasta/human/GRCh38.fa
-O /home/llwu/reference/fasta/human/GRCh38.dict
## bwa.sh
cat ../case |while read id
do
fq1=${id}_1_val_1.fq.gz
fq2=${id}_1_val_1.fq.gz
echo $fq1
echo $fq2
bwa mem -M -t 11 -R "@RG ID:$id SM:$id LB:WXS PL:Illumina" /home/llwu/reference/fasta/human/GRCh38 $fq1 $fq2 | samtools sort -@ 11 -m 1G --reference /home/llwu/reference/fasta/human/GRCh38.fa -o ../4.align/${id}.bam -
done
nohup bash bwa.sh &
6.9G 5月 25 17:47 case1_biorep_A_techrep.bam
5.9G 5月 25 18:09 case1_biorep_B.bam
7.1G 5月 25 18:33 case1_biorep_C.bam
2.9G 5月 25 14:29 case1_germline.bam
5.9G 5月 25 21:38 case1_techrep_2.bam
5.3G 5月 25 18:53 case2_biorep_A.bam
5.8G 5月 25 19:14 case2_biorep_B_techrep.bam
5.3G 5月 25 19:33 case2_biorep_C.bam
1.9G 5月 25 13:46 case2_germline.bam
5.2G 5月 25 21:56 case2_techrep_2.bam
7.1G 5月 25 16:08 case3_biorep_A.bam
6.8G 5月 25 16:33 case3_biorep_B.bam
3.8G 5月 25 16:46 case3_biorep_C_techrep.bam
2.9G 5月 25 14:07 case3_germline.bam
5.2G 5月 25 20:54 case3_techrep_2.bam
6.9G 5月 25 14:54 case4_biorep_A.bam
7.1G 5月 25 15:18 case4_biorep_B_techrep.bam
6.8G 5月 25 15:42 case4_biorep_C.bam
2.9G 5月 25 13:57 case4_germline.bam
6.8G 5月 25 20:36 case4_techrep_2.bam
5.5G 5月 25 19:53 case5_biorep_A.bam
5.4G 5月 25 20:12 case5_biorep_B_techrep.bam
2.4G 5月 25 22:27 case5_biorep_C.bam
4.3G 5月 25 13:39 case5_germline.bam
6.3G 5月 25 22:19 case5_techrep_2.bam
3.0G 5月 25 16:56 case6_biorep_A_techrep.bam
3.7G 5月 25 17:09 case6_biorep_B.bam
3.8G 5月 25 17:22 case6_biorep_C.bam
2.9G 5月 25 14:18 case6_germline.bam
5.9G 5月 25 21:16 case6_techrep_2.bam
wget https://jaist.dl.sourceforge.net/project/gnuplot/gnuplot/5.2.6/gnuplot-5.2.6.tar.gz
tar -zxvf gnuplot-5.2.6.tar.gz
./configure --prefix=/home/llwu/tools/gnuplot
make
make install
## stat.sh
ls *bam | while read id
do
samtools stats -@ 16 --reference /home/llwu/reference/index/human/bwa/hg38.fa ${id} > ./stat/$(basename ${id} .bam).stat
done
ls ../*stat | while read id
do
plot-bamstats -p $(basename ${id} .stat) ${id}
done
wget ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/current_human/CCDS.current.txt
cat CCDS.current.txt |grep "Public" | perl -alne '{/[(.*?)]/;next unless $1;$gene=$F[2];$exons=$1;$exons=~s/s//g;$exons=~s/-/ /g;print "$F[0] $_ $gene" foreach split/,/,$exons;}' | sort -u | bedtools sort -i |awk '{print "chr"$0" 0 +"}' > hg38.exon.bed
## qualimap.sh
ls *bam | while read id
do
file=$(basename ${id} .bam)
qualimap bamqc --java-mem-size=40G -gff /home/llwu/reference/index/human/CCDS/hg38.exon.bed -nr 100000 -nw 500 -nt 16 -bam $id -outdir ./qualimap/${file}
done
nohup bash qualimap.sh &
■ ■ ■