Skip to content

Commit

Permalink
Merge pull request #51 from dhslab/main
Browse files Browse the repository at this point in the history
trashed freebayes and updated variant filtering
  • Loading branch information
dufeiyu authored Jun 16, 2022
2 parents 76d0a4d + af358c0 commit 969a877
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 62 deletions.
53 changes: 2 additions & 51 deletions MyeloseqHDAnalysis.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -66,24 +66,6 @@ workflow MyeloseqHDAnalysis {
jobGroup=JobGroup
}

call run_freebayes {
input: Cram=convert_bam.cram,
CramIndex=convert_bam.crai,
CoverageBed=CoverageBed,
refFasta=refFasta,
Name=Name,
queue=Queue,
jobGroup=JobGroup
}

call clean_variants as clean_freebayes {
input: Vcf=run_freebayes.vcf,
Name=Name,
refFasta=refFasta,
queue=Queue,
jobGroup=JobGroup
}

call run_pindel_region as run_pindel_flt3itd {
input: Cram=convert_bam.cram,
CramIndex=convert_bam.crai,
Expand All @@ -110,7 +92,7 @@ workflow MyeloseqHDAnalysis {
}

call combine_variants {
input: Vcfs=select_all([DragenVcf,clean_freebayes.cleaned_vcf_file,clean_pindel_itd.cleaned_vcf_file,clean_queryDB_vcf.cleaned_vcf_file,GenotypeVcf]),
input: Vcfs=select_all([DragenVcf,clean_pindel_itd.cleaned_vcf_file,clean_queryDB_vcf.cleaned_vcf_file,GenotypeVcf]),
Cram=convert_bam.cram,
CramIndex=convert_bam.crai,
refFasta=refFasta,
Expand Down Expand Up @@ -231,37 +213,6 @@ task convert_bam {
}
}

task run_freebayes {
File Cram
File CramIndex
String CoverageBed
String refFasta
String Name
Float? MinFreq
Int? MinReads
Int? MinMapQual
Int? MaxMismatch

String jobGroup
String queue

command {
/usr/local/bin/freebayes -C ${default=3 MinReads} -q ${default=13 MinMapQual} -F ${default="0.0008" MinFreq} -$ ${default=4 MaxMismatch} \
-f ${refFasta} -t ${CoverageBed} -K ${Cram} > "${Name}.freebayes.vcf"
}

runtime {
docker_image: "registry.gsc.wustl.edu/mgi-cle/myeloseqhd:v1"
cpu: "1"
memory: "16 G"
queue: queue
job_group: jobGroup
}
output {
File vcf = "${Name}.freebayes.vcf"
}
}

task run_pindel_region {
File Cram
File CramIndex
Expand Down Expand Up @@ -359,7 +310,7 @@ task combine_variants {
command {
/usr/local/bin/bcftools merge --force-samples -Oz ${sep=" " Vcfs} | /usr/local/bin/bcftools sort -Oz -o combined.vcf.gz && \
/usr/bin/tabix -p vcf combined.vcf.gz && \
/usr/bin/python3 /usr/local/bin/filterHaloplex.py -r ${refFasta} --minreadsperfamily ${default='3' MinReadsPerFamily} -m ${default='3' Reads} -d ${default='0.02' Vaf} combined.vcf.gz ${Cram} ${Name} > ${Name}.combined_and_tagged.vcf
/usr/bin/python3 /usr/local/bin/filterHaloplex.py -r ${refFasta} --minreadsperfamily ${default='3' MinReadsPerFamily} -m ${default='5' Reads} -d ${default='0.02' Vaf} combined.vcf.gz ${Cram} ${Name} > ${Name}.combined_and_tagged.vcf
}

runtime {
Expand Down
27 changes: 16 additions & 11 deletions dockerfiles/docker-myeloseq/filterHaloplex.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def revcomp(seq):
parser.add_argument('-q',"--minmapqual",type=int,default=1,
help='Minium read mapping quality to contribute to variant calling')

parser.add_argument('-m',"--minreads",type=int,default=3,
parser.add_argument('-m',"--minreads",type=int,default=5,
help='minimum reads supporting a variant to be reported')

parser.add_argument('-v',"--minreads4vaf",type=int,default=3,
Expand Down Expand Up @@ -207,8 +207,8 @@ def revcomp(seq):

numvars = 0

for vline in vcffile.fetch(reopen=True):

for vline in vcffile.fetch(reopen=True):
numvars += 1

# first determine the callers that identified the variant
Expand All @@ -222,7 +222,10 @@ def revcomp(seq):
if 'HOMLEN' in vline.info.keys():
callers.append('pindel')


# must be a passing variant, a genotyping variant, or in the database to proceed
if 'MyeloSeqHDForceGT' not in vline.info.keys() and 'MyeloSeqHDDB' not in vline.info.keys() and 'PASS' not in vline.filter.keys():
continue

# handle multiallelic entries in the VCF (and split them)
for alt in vline.alts:

Expand Down Expand Up @@ -449,7 +452,7 @@ def revcomp(seq):
failingreadvaf = readqual2x2[1][1]/(readqual2x2[1][1]+readqual2x2[0][1]) if (readqual2x2[1][1]+readqual2x2[0][1]) > 0 else 0
failedreadbias = 0
failedreadbiasstr = str(readqual2x2[0][0]) + "," + str(readqual2x2[0][1]) + ',' + str(readqual2x2[1][0]) + "," + str(readqual2x2[1][1]) + ","
if 'GT' in rec.format.keys() and rec.samples[0]['GT'] != (1,1) and readdat[(readdat["calls"]=='ref')].shape[0] > 0 and readdat[(readdat["calls"]=='alt')].shape[0] > 0 and readdat[(readdat["readquals"]=='pass')].shape[0] > 0 and readdat[(readdat["readquals"]=='fail')].shape[0] > 0 and failingreadvaf > passingreadvaf:
if 'GT' in rec.format.keys() and rec.samples[0]['GT'] != (1,1) and readdat[(readdat["calls"]=='ref')].shape[0] > 0 and readdat[(readdat["calls"]=='alt')].shape[0] > 0 and readdat[(readdat["readquals"]=='pass')].shape[0] > 0 and readdat[(readdat["readquals"]=='fail')].shape[0] > 0 and failingreadvaf > passingreadvaf and readqual2x2[1][1]/(readqual2x2[1][1]+readqual2x2[1][0]) > 0.2:
failedreadbias = min(99,int(-10 * math.log(fisher_exact(readqual2x2)[1] or 1.258925e-10,10)))

failedreadbiasstr = failedreadbiasstr + str(failedreadbias)
Expand Down Expand Up @@ -479,9 +482,11 @@ def revcomp(seq):
if 'GT' in rec.format.keys() and rec.samples[0]['GT'] != (1,1) and rawvaf < .9 and altstrandskewing * refstrandskewing < 0 and readdat[(readdat["calls"]=='ref')].shape[0] > 0 and readdat[(readdat["calls"]=='alt')].shape[0] > 0 and readdat[(readdat["strands"]=='+')].shape[0] > 0 and readdat[(readdat["strands"]=='-')].shape[0] > 0:
sb = min(min(99,int(-10 * math.log(binom.pmf(plusaltreads, plusaltreads+minusaltreads,plusrefreads/(plusrefreads+minusrefreads)) or 1.258925e-10,10))),min(99,int(-10 * math.log(binom.pmf(plusaltreads, plusaltreads+minusaltreads,minusrefreads/(plusrefreads+minusrefreads)) or 1.258925e-10,10))))

# do filtering
nrec = vcffile.new_record()
if len(supportingamplicons) < minampnumber and ao > 0:

# do filtering of all variants NOT called by DRAGEN--those we just accept 'PASS' as good enough

if 'dragen' not in callers and len(supportingamplicons) < minampnumber and ao > 0:
nrec.filter.add("AMPSupport")

# min vaf filter
Expand All @@ -490,16 +495,16 @@ def revcomp(seq):

# if there are < minstrandreads alt-supporting reads, then calculate the binomial p-value
# for that observation given the overall VAF and the strand-specific read depth
if ao > 0 and ((plusaltreads+plusrefreads > 0 and plusaltreads < minstrandreads and binom.cdf(minstrandreads, plusaltreads+plusrefreads,rawvaf, loc=0) < strandpvalue) or (minusaltreads+minusrefreads > 0 and minusaltreads < minstrandreads and binom.cdf(minstrandreads, minusaltreads+minusrefreads,rawvaf, loc=0) < strandpvalue)):
if 'dragen' not in callers and ao > 0 and ((plusaltreads+plusrefreads > 0 and plusaltreads < minstrandreads and binom.cdf(minstrandreads, plusaltreads+plusrefreads,rawvaf, loc=0) < strandpvalue) or (minusaltreads+minusrefreads > 0 and minusaltreads < minstrandreads and binom.cdf(minstrandreads, minusaltreads+minusrefreads,rawvaf, loc=0) < strandpvalue)):
nrec.filter.add("StrandSupport")

if ao > 0 and sb > sb_pvalue:
if 'dragen' not in callers and ao > 0 and sb > sb_pvalue:
nrec.filter.add("StrandBias")

if ao > 0 and failedreadbias > lqrb_pvalue:
if 'dragen' not in callers and ao > 0 and failedreadbias > lqrb_pvalue:
nrec.filter.add("LowQualReadBias")

if ao > 0 and readdat.shape[0] > 0:
if 'dragen' not in callers and ao > 0 and readdat.shape[0] > 0:
if readdat[(readdat["readquals"]=='pass')].shape[0] / readdat.shape[0] < minhqreads:
nrec.filter.add("LowQualReads")

Expand Down

0 comments on commit 969a877

Please sign in to comment.