-
Notifications
You must be signed in to change notification settings - Fork 0
/
marcie.sh
193 lines (151 loc) · 7.61 KB
/
marcie.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#!/bin/bash
#exit immediately if a command returns non-zero status
set -eo pipefail
#Check that required software is in path
for software in bwa gatk sambamba picard samtools; do
if ! command -v $software &> /dev/null; then
echo -e "### $software could not be found ###\n"
return 127
else
echo -e "\n### $software is on path ###\n"
fi
done
#Parse input variables from command line
#genome
genome=$1
echo -e "\n### Genome file provided: $genome"
#reads
READS_DIR=$2
echo -e "\n### Directory with NGS reads (Illumina) for the different samples: $READS_DIR"
#species_name
species_name=$3
echo -e "### Species name for RG string: $species_name"
#date
current_date=$(date +%F)
echo -e "### Date for RG string: $current_date"
#Threads for mapping and processing of BAM files
threads=$4
echo -e "### Number of threads to use: $threads"
#Minimum mapping quality for the reads to keep (lower quality mappings are filtered out)
MQ=$5
echo -e "### Minimum mapping quality to filter BAM (MQ): $MQ"
echo -e "\n### Sample IDs (will be used for RG string in the BAM):"
ls -1 $READS_DIR/*fastq.gz | xargs -n1 basename | cut -f1 -d"." | sort -k1 -n | uniq > list_of_SeqIDs.txt
cat list_of_SeqIDs.txt
############################################################################
#Step 1: Read mapping with BWA and processing of BAM files for variant calling
echo -e "\n### Step 1: Read mapping with BWA and processing of BAM files for variant calling... ##\n"
###################################################################################################
#perform mapping of reads and filtering of bam files
cat list_of_SeqIDs.txt| while read SeqID;do
mappingID=$(sed -e "s/.fasta//" <<< $genome).$SeqID
#check if mapping files with marked duplicates already exist
if [[ -f $mappingID.sorted.MQ$MQ.mdup.bam ]];then
echo -e "### Mapping files for $SeqID exist... ###\n"
continue
else
###Perform mapping and processing of BAM files for variant calling)
#checks for reads files
echo -e "\n### Working on sample $SeqID###\n"
FILE1="$SeqID"".R1.fastq.gz"
echo -e "\n###fastq R1: $FILE1###"
#check if file1 exists
[[ -f "$READS_DIR/$FILE1" ]] && echo -e "### $FILE1 exists in reads directory."
FILE2="$SeqID"".R2.fastq.gz"
echo -e "\n###fastq R2: $FILE2###"
#check if file 2 exists
[[ -f "$READS_DIR/$FILE2" ]] && echo -e "### $FILE2 exists in reads directory.\n"
#Create RGstring
RGstring="@RG\tID:$SeqID-illumina_reads\tDT:$current_date\tLB:$SeqID-illumina_reads\tPL:ILLUMINA\tSM:$species_name""_ref_$SeqID-illumina_reads"
echo -e "\n###RGstring:$RGstring"
#indexing genome for BWA
echo -e "\n### Indexing genome reference: ###\n\t$genome\n"
bwa index $genome
#Mapping reads and create mapping files (option -M for picard compatibility)
echo -e "\n### Mapping Reads onto genome reference\ngenome:\t$genome\nfq1:\t$READS_DIR/$FILE1\nfq2:\t$READS_DIR/$FILE2\n"
bwa mem -t $threads -M -R $RGstring $genome $READS_DIR/$FILE1 $READS_DIR/$FILE2 -o $mappingID.sam
#Post-treating mapping files (SAM->BAM, sort, filter, mark duplicates)
echo -e "\n### Post-treating mapping files\nmappingID:\t$mappingID\nSAM->BAM and sorting...\n"
sambamba view -t $threads -f bam -h -S -o $mappingID.bam $mappingID.sam
echo -e "\n### Deleting SAM file ...###\n"
rm -f $mappingID.sam
echo -e "\n### Sorting BAM file ...###\n"
sambamba sort -t $threads --tmpdir=./ $mappingID.bam -o $mappingID.sorted.bam
echo -e "\n### Deleting unsorted BAM file ...###\n"
rm -f $mappingID.bam
#filter reads with mapping quality <10
echo -e "\n### Filtering reads with mapping quality < 10\nmappingID:\t$mappingID\nMinimal Mapping Quality:\t$MQ"
sambamba view -t $threads -f bam -h -F "mapping_quality >= $MQ" $mappingID.sorted.bam -o $mappingID.sorted.MQ$MQ.bam
#mark duplicates (necessary for variant calling)
echo -e "\n### Mark duplicates with Picard\nmappingID:\t$mappingID\n"
time picard MarkDuplicates -I $mappingID.sorted.MQ$MQ.bam -O $mappingID.sorted.MQ$MQ.mdup.bam -M marked_dup_metrics.$mappingID.sorted.MQ$MQ.txt
#remove temporary files
echo -e "\n### Remove temporary files (Deleting sorted filtered BAM file before quality-based filtering step) ...###\n"
rm -f $mappingID.sorted.MQ$MQ.bam
rm -f $mappingID.sorted.MQ$MQ.bam.bai
rm -f $mappingID.sorted.bam
rm -f $mappingID.sorted.bam.bai
#index final bam file
echo -e "\n### Index mapping file (BAM file) \nmappingID:\t$mappingID:\n"
sambamba index -t $threads $mappingID.sorted.MQ$MQ.mdup.bam
fi
done
###############################
#Variant calling (step 2)
echo -e "\n### Step 2: Short germline variant calling with HaplotypeCaller... ##\n"
#####################################################################
cat list_of_SeqIDs.txt| while read SeqID;do
mappingID=$(basename $genome .fasta).$SeqID
#input bam for variant calling
input=$mappingID.sorted.MQ$MQ.mdup.bam
if [[ -f $input.g.vcf ]];then
echo -e "### GVCF of sample $SeqID ($input) exists... ###\n"
continue
else
echo -e "\n### Input BAM $mappingID.sorted.MQ$MQ.mdup.bam...\n"
#Genome dictionary and indexing
echo -e "\n### Create genome dictionary with picard...\ngenome:\t$genome\n"
ref_genome_dict=$(basename $genome .fasta).dict
if [ ! -f "$ref_genome_dict" ];then
time picard CreateSequenceDictionary -R $genome -O $ref_genome_dict
else
echo -e "### Genome dictionary found in current directory... ###\n"
fi
#Genome indexing with samtools
echo -e "\n### Index genome with SAMTOOLS...\ngenome:\t$genome\nbam file:\t$input"
samtools faidx $genome
#Call short germline variants (SNPs and indels) using HaplotypeCaller
echo -e "\n### SNP-Calling with GATK \ngenome:\t$genome\nbam file:\t$input"
time gatk HaplotypeCaller -R $genome -I $input -ERC GVCF -O $input.g.vcf
fi
done
############################################################
#STEP 3: COMBINE GVCFS
echo -e "\n### Step 3: Combine GVCFs with \"gatk CombineGVCFs\"... ###\n"
#############################################################################
list_GVCFs=$species_name".MQ"$MQ".mdup.bam.g.vcf.list"
combined_gvcfs=$species_name".combined.g.vcf"
if [[ -f $combined_gvcfs ]]; then
echo -e "\n### File with combined GVCFs already exists... ###\n"
else
echo -e "\n### List of GVCFs: $list_GVCFs ###\n"
#WARNING: IF the file has a different ending than *list the CombineGVCFs command gives an error
ls -1 *g.vcf > $list_GVCFs
fi
#Combine gvcfs
time gatk CombineGVCFs -R $genome -V $list_GVCFs -O $combined_gvcfs
###############################################################################
## STEP 4: GENOTYPING COMBINED GVCFS - GATK ##################
echo -e "\n### Step 4: Genotyping combined GVCFs with \"gatk GenotypeGVCFs\"... ###\n"
###########################################################################################
#make output directory for placing the final combined file with all genotypes from diffrent samples
outdir=joint_vcf_raw.$genome
raw_vcf="$species_name.raw.vcf"
mkdir -p $outdir
if [[ -f "$outdir/$raw_vcf" ]]; then
echo -e "\n### Combined file with genotyped GVCFs already exists... ###\n"
else
time gatk GenotypeGVCFs -R $genome -V $species_name.combined.g.vcf -O $outdir/$raw_vcf
fi
#Pipeline finished
echo -e "\n### Variant calling pipeline finished succesfully... ###\n"