Skip to content

Commit

Permalink
New output columns from TBProfiler Task (#217)
Browse files Browse the repository at this point in the history
* TBProfiler Task Update to include 5 new optinal columns: tbprofiler_additional_outputs_csv, tbprofiler_gene_name, tbprofiler_locus_tag, tbprofiler_variant_substitutions, tbprofiler_output_seq_method_type. All dependent on the conditional boolean input tbprofiler_additional_outputs

* add new tbprofiler outputs to merlin_magic, broad_terra_tools table, theiaprok_illumina_pe and theiaprok_illumina_se

* fix bug when annotation field was not present

* fix bug on additional csv file where column header was duplicated; restored number of tries to original value

* add same changes as tbprofiler task to tbprofiler_ont task

* update docker image

* fix issue when setting tbprofiler_additional_outputs=false and wdl failing due to missing files

* make cpu variable optional

* set tbprofiler_additional_outputs variable to false as default

* fix optional tbprofiler inputs not showing up on terra table

* propagate changes from tbprofiler to tbprofiler_ont

* expand list of genes of interest for internal report

* add confidence to internal table

* add rew csv report file for laboratorian

* add new tbprofiler tbprofiler_laboratorian_report_csv to theiaprok_illumina_pe, theiaprok_illumina_se and broad_terra_tools table

* add fix for when who_confidence field is missing an annotation, but exists

* add exceptions for when annotation field exists but it's empty, add exception for when depth is null instead of 0

* fix number of retries

* update low coverage warning to match min_depth defined in the tbprofiler run; update tbprofiler_ont to match tbprofiler task

* Add new warnings for: deletion; low depth of coverage and low frequency
  • Loading branch information
cimendes authored Mar 16, 2023
1 parent b964b01 commit 01a6eb4
Show file tree
Hide file tree
Showing 5 changed files with 275 additions and 19 deletions.
258 changes: 240 additions & 18 deletions tasks/species_typing/task_tbprofiler.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,17 @@ task tbprofiler {
File read1
File? read2
String samplename
String tbprofiler_docker_image = "staphb/tbprofiler:4.3.0"
Boolean tbprofiler_additional_outputs
String output_seq_method_type
String tbprofiler_docker_image = "staphb/tbprofiler:4.4.2"
Int disk_size = 100
String? mapper = "bwa"
String? caller = "bcftools"
Int? min_depth = 10
Float? min_af = 0.1
Float? min_af_pred = 0.1
Int? cov_frac_threshold = 1
String mapper = "bwa"
String caller = "bcftools"
Int min_depth = 10
Float min_af = 0.1
Float min_af_pred = 0.1
Int cov_frac_threshold = 1
Int cpu = 8
}
command <<<
# update TBDB
Expand All @@ -30,7 +33,7 @@ task tbprofiler {
INPUT_READS="-1 ~{read1} -2 ~{read2}"
fi

# Run Kleborate on the input assembly with the --all flag and output with samplename prefix
# Run tb-profiler on the input reads with samplename prefix
tb-profiler profile \
${INPUT_READS} \
--prefix ~{samplename} \
Expand All @@ -46,6 +49,16 @@ task tbprofiler {
#Collate results
tb-profiler collate --prefix ~{samplename}

# touch optional output files because wdl
touch GENE_NAME LOCUS_TAG VARIANT_SUBSTITUTIONS OUTPUT_SEQ_METHOD_TYPE

# transform boolean tbprofiler_additional_outputs into string for python comparison
if ~{tbprofiler_additional_outputs}; then
export tbprofiler_additional_outputs="true"
else
export tbprofiler_additional_outputs="false"
fi

python3 <<CODE
import csv
with open("./~{samplename}.txt",'r') as tsv_file:
Expand Down Expand Up @@ -75,6 +88,92 @@ task tbprofiler {
res_genes.append(tsv_dict[i])
res_genes_string=';'.join(res_genes)
Resistance_Genes.write(res_genes_string)
import json
import os
if (os.environ["tbprofiler_additional_outputs"] == "true"):
gene_name = []
locus_tag = []
variant_substitutions = []
confidence = []
depth = []
frequency = []
with open("./results/~{samplename}.results.json") as results_json_fh:
results_json = json.load(results_json_fh)
for dr_variant in results_json["dr_variants"]: # reported mutation by tb-profiler, all confering resistance
gene_name.append(dr_variant["gene"])
locus_tag.append(dr_variant["locus_tag"])
variant_substitutions.append(dr_variant["type"] + ":" + dr_variant["nucleotide_change"] + "(" + dr_variant["protein_change"] + ")") # mutation_type:nt_sub(aa_sub)
depth.append(dr_variant["depth"])
frequency.append(dr_variant["freq"])
if "annotation" in dr_variant:
try: # sometimes annotation is an empty list
if dr_variant["annotation"][0]["who_confidence"] == "":
confidence.append("No WHO annotation")
else:
confidence.append(dr_variant["annotation"][0]["who_confidence"])
except:
confidence.append("No WHO annotation")
else:
confidence.append("No WHO annotation")
for other_variant in results_json["other_variants"]: # mutations not reported by tb-profiler
if other_variant["type"] != "synonymous_variant":
if other_variant["gene"] == "katG" or other_variant["gene"] == "pncA" or other_variant["gene"] == "rpoB" or other_variant["gene"] == "ethA" or other_variant["gene"] == "gid": # hardcoded for genes of interest that are reported to always confer resistance when mutated
gene_name.append(other_variant["gene"])
locus_tag.append(other_variant["locus_tag"])
variant_substitutions.append(other_variant["type"] + ":" + other_variant["nucleotide_change"] + "(" + other_variant["protein_change"] + ")") # mutation_type:nt_sub(aa_sub)
depth.append(other_variant["depth"])
frequency.append(other_variant["freq"])
if "annotation" in other_variant:
try: # sometimes annotation is an empty list
if other_variant["annotation"][0]["who_confidence"] == "":
confidence.append("No WHO annotation")
else:
confidence.append(other_variant["annotation"][0]["who_confidence"])
except:
confidence.append("No WHO annotation")
else:
confidence.append("No WHO annotation")
else:
if "annotation" in other_variant: # check if who annotation field is present
for annotation in other_variant["annotation"]:
if annotation["who_confidence"] != "Not assoc w R" or annotation["who_confidence"] != "":
gene_name.append(other_variant["gene"])
locus_tag.append(other_variant["locus_tag"])
variant_substitutions.append(other_variant["type"] + ":" + other_variant["nucleotide_change"] + "(" + other_variant["protein_change"] + ")") # mutation_type:nt_sub(aa_sub)
depth.append(other_variant["depth"])
frequency.append(other_variant["freq"])
confidence.append(annotation["who_confidence"])
with open("GENE_NAME", "wt") as gene_name_fh:
gene_name_fh.write(','.join(gene_name))
with open("LOCUS_TAG", "wt") as locus_tag_fh:
locus_tag_fh.write(','.join(locus_tag))
with open("VARIANT_SUBSTITUTIONS", "wt") as variant_substitutions_fh:
variant_substitutions_fh.write(','.join(variant_substitutions))
with open("OUTPUT_SEQ_METHOD_TYPE", "wt") as output_seq_method_type_fh:
output_seq_method_type_fh.write("~{output_seq_method_type}")
# file to be ingested into CDPH LIMS system
with open("tbprofiler_additional_outputs.csv", "wt") as additional_outputs_csv:
additional_outputs_csv.write("tbprofiler_gene_name,tbprofiler_locus_tag,tbprofiler_variant_substitutions,confidence,tbprofiler_output_seq_method_type\n")
additional_outputs_csv.write(";".join(gene_name) + "," + ";".join(locus_tag) + "," + ";".join(variant_substitutions) + ',' + ";".join(confidence) + ',' + "~{output_seq_method_type}")
# laboratorian report
with open("tbprofiler_laboratorian_report.csv", "wt") as report_fh:
report_fh.write("tbprofiler_gene_name,tbprofiler_locus_tag,tbprofiler_variant_substitutions,confidence,depth,frequency,warning\n")
for i in range(0, len(gene_name)):
if not depth[i]: # for cases when depth is null, it gets converted to 0
depth[i] = 0
warning = "Low depth coverage" if depth[i] < int('~{min_depth}') else "" # warning when coverage is lower than the defined 'min_depth' times
report_fh.write(gene_name[i] + ',' + locus_tag[i] + ',' + variant_substitutions[i] + ',' + confidence[i] + ',' + str(depth[i]) + ',' + str(frequency[i]) + ',' + warning + '\n')
CODE
>>>
output {
Expand All @@ -89,14 +188,20 @@ task tbprofiler {
String tbprofiler_num_dr_variants = read_string("NUM_DR_VARIANTS")
String tbprofiler_num_other_variants = read_string("NUM_OTHER_VARIANTS")
String tbprofiler_resistance_genes = read_string("RESISTANCE_GENES")
File? tbprofiler_additional_outputs_csv = "tbprofiler_additional_outputs.csv"
File? tbprofiler_laboratorian_report_csv = "tbprofiler_laboratorian_report.csv"
String tbprofiler_gene_name = read_string("GENE_NAME")
String tbprofiler_locus_tag = read_string("LOCUS_TAG")
String tbprofiler_variant_substitutions = read_string("VARIANT_SUBSTITUTIONS")
String tbprofiler_output_seq_method_type = read_string("OUTPUT_SEQ_METHOD_TYPE")
}
runtime {
docker: "~{tbprofiler_docker_image}"
memory: "16 GB"
cpu: 8
cpu: cpu
disks: "local-disk " + disk_size + " SSD"
disk: disk_size + " GB"
maxRetries: 3
maxRetries: 3
}
}
Expand All @@ -105,14 +210,17 @@ task tbprofiler_ont {
input {
File reads
String samplename
String tbprofiler_docker_image = "staphb/tbprofiler:4.3.0"
Boolean tbprofiler_additional_outputs
String output_seq_method_type
String tbprofiler_docker_image = "staphb/tbprofiler:4.4.2"
Int disk_size = 100
String? mapper = "bwa"
String? caller = "bcftools"
Int? min_depth = 10
Float? min_af = 0.1
Float? min_af_pred = 0.1
Int? cov_frac_threshold = 1
String mapper = "bwa"
String caller = "bcftools"
Int min_depth = 10
Float min_af = 0.1
Float min_af_pred = 0.1
Int cov_frac_threshold = 1
Int cpu = 8
}
command <<<
# update TBDB
Expand All @@ -129,6 +237,16 @@ task tbprofiler_ont {
#Collate results
tb-profiler collate --prefix ~{samplename}
# touch optional output files because wdl
touch GENE_NAME LOCUS_TAG VARIANT_SUBSTITUTIONS OUTPUT_SEQ_METHOD_TYPE
# transform boolean tbprofiler_additional_outputs into string for python comparison
if ~{tbprofiler_additional_outputs}; then
export tbprofiler_additional_outputs="true"
else
export tbprofiler_additional_outputs="false"
fi
python3 <<CODE
import csv
with open("./~{samplename}.txt",'r') as tsv_file:
Expand Down Expand Up @@ -158,6 +276,104 @@ task tbprofiler_ont {
res_genes.append(tsv_dict[i])
res_genes_string=';'.join(res_genes)
Resistance_Genes.write(res_genes_string)
import json
import os
if (os.environ["tbprofiler_additional_outputs"] == "true"):
gene_name = []
locus_tag = []
variant_substitutions = []
confidence = []
depth = []
frequency = []
with open("./results/~{samplename}.results.json") as results_json_fh:
results_json = json.load(results_json_fh)
for dr_variant in results_json["dr_variants"]: # reported mutation by tb-profiler, all confering resistance
gene_name.append(dr_variant["gene"])
locus_tag.append(dr_variant["locus_tag"])
variant_substitutions.append(dr_variant["type"] + ":" + dr_variant["nucleotide_change"] + "(" + dr_variant["protein_change"] + ")") # mutation_type:nt_sub(aa_sub)
depth.append(dr_variant["depth"])
frequency.append(dr_variant["freq"])
if "annotation" in dr_variant:
try: # sometimes annotation is an empty list
if dr_variant["annotation"][0]["who_confidence"] == "":
confidence.append("No WHO annotation")
else:
confidence.append(dr_variant["annotation"][0]["who_confidence"])
except:
confidence.append("No WHO annotation")
else:
confidence.append("No WHO annotation")
for other_variant in results_json["other_variants"]: # mutations not reported by tb-profiler
if other_variant["type"] != "synonymous_variant":
if other_variant["gene"] == "katG" or other_variant["gene"] == "pncA" or other_variant["gene"] == "rpoB" or other_variant["gene"] == "ethA" or other_variant["gene"] == "gid": # hardcoded for genes of interest that are reported to always confer resistance when mutated
gene_name.append(other_variant["gene"])
locus_tag.append(other_variant["locus_tag"])
variant_substitutions.append(other_variant["type"] + ":" + other_variant["nucleotide_change"] + "(" + other_variant["protein_change"] + ")") # mutation_type:nt_sub(aa_sub)
depth.append(other_variant["depth"])
frequency.append(other_variant["freq"])
if "annotation" in other_variant:
try: # sometimes annotation is an empty list
if other_variant["annotation"][0]["who_confidence"] == "":
confidence.append("No WHO annotation")
else:
confidence.append(other_variant["annotation"][0]["who_confidence"])
except:
confidence.append("No WHO annotation")
else:
confidence.append("No WHO annotation")
else:
if "annotation" in other_variant: # check if who annotation field is present
for annotation in other_variant["annotation"]:
if annotation["who_confidence"] != "Not assoc w R" or annotation["who_confidence"] != "":
gene_name.append(other_variant["gene"])
locus_tag.append(other_variant["locus_tag"])
variant_substitutions.append(other_variant["type"] + ":" + other_variant["nucleotide_change"] + "(" + other_variant["protein_change"] + ")") # mutation_type:nt_sub(aa_sub)
depth.append(other_variant["depth"])
frequency.append(other_variant["freq"])
confidence.append(annotation["who_confidence"])
with open("GENE_NAME", "wt") as gene_name_fh:
gene_name_fh.write(','.join(gene_name))
with open("LOCUS_TAG", "wt") as locus_tag_fh:
locus_tag_fh.write(','.join(locus_tag))
with open("VARIANT_SUBSTITUTIONS", "wt") as variant_substitutions_fh:
variant_substitutions_fh.write(','.join(variant_substitutions))
with open("OUTPUT_SEQ_METHOD_TYPE", "wt") as output_seq_method_type_fh:
output_seq_method_type_fh.write("~{output_seq_method_type}")
# file to be ingested into CDPH LIMS system
with open("tbprofiler_additional_outputs.csv", "wt") as additional_outputs_csv:
additional_outputs_csv.write("tbprofiler_gene_name,tbprofiler_locus_tag,tbprofiler_variant_substitutions,confidence,tbprofiler_output_seq_method_type\n")
additional_outputs_csv.write(";".join(gene_name) + "," + ";".join(locus_tag) + "," + ";".join(variant_substitutions) + ',' + ";".join(confidence) + ',' + "~{output_seq_method_type}")
# laboratorian report
with open("tbprofiler_laboratorian_report.csv", "wt") as report_fh:
report_fh.write("tbprofiler_gene_name,tbprofiler_locus_tag,tbprofiler_variant_substitutions,confidence,depth,frequency,warning\n")
for i in range(0, len(gene_name)):
warning = []
if not depth[i]: # for cases when depth is null
depth[i] = "NA" # Deletion instead of value
warning.append("Deletion")
else:
if depth[i] < int('~{min_depth}'): # warning when coverage is lower than the defined 'min_depth' times
warning.append("Low depth coverage")
if not frequency[i]:
warning.append("No frequency")
else:
if frequency[i] < 0.05: # warning when frequency is lower than 5%
warning.append("Low frequency")
report_fh.write(gene_name[i] + ',' + locus_tag[i] + ',' + variant_substitutions[i] + ',' + confidence[i] + ',' + str(depth[i]) + ',' + str(frequency[i]) + ',' + ';'.join(warning) + '\n')
CODE
>>>
output {
Expand All @@ -172,11 +388,17 @@ task tbprofiler_ont {
String tbprofiler_num_dr_variants = read_string("NUM_DR_VARIANTS")
String tbprofiler_num_other_variants = read_string("NUM_OTHER_VARIANTS")
String tbprofiler_resistance_genes = read_string("RESISTANCE_GENES")
File? tbprofiler_additional_outputs_csv = "tbprofiler_additional_outputs.csv"
File? tbprofiler_laboratorian_report_csv = "tbprofiler_laboratorian_report.csv"
String tbprofiler_gene_name = read_string("GENE_NAME")
String tbprofiler_locus_tag = read_string("LOCUS_TAG")
String tbprofiler_variant_substitutions = read_string("VARIANT_SUBSTITUTIONS")
String tbprofiler_output_seq_method_type = read_string("OUTPUT_SEQ_METHOD_TYPE")
}
runtime {
docker: "~{tbprofiler_docker_image}"
memory: "16 GB"
cpu: 8
cpu: cpu
disks: "local-disk " + disk_size + " SSD"
disk: disk_size + " GB"
maxRetries: 3
Expand Down
12 changes: 12 additions & 0 deletions tasks/utilities/task_broad_terra_tools.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,12 @@ task export_taxon_tables {
String? tbprofiler_sub_lineage
String? tbprofiler_dr_type
String? tbprofiler_resistance_genes
File? tbprofiler_additional_outputs_csv
File? tbprofiler_laboratorian_report_csv
String? tbprofiler_gene_name
String? tbprofiler_locus_tag
String? tbprofiler_variant_substitutions
String? tbprofiler_output_seq_method_type
File? legsta_results
String? legsta_predicted_sbt
String? legsta_version
Expand Down Expand Up @@ -485,6 +491,12 @@ task export_taxon_tables {
"tbprofiler_sub_lineage": "~{tbprofiler_sub_lineage}",
"tbprofiler_dr_type": "~{tbprofiler_dr_type}",
"tbprofiler_resistance_genes": "~{tbprofiler_resistance_genes}",
"tbprofiler_additional_outputs_csv": "~{tbprofiler_additional_outputs_csv}",
"tbprofiler_laboratorian_report_csv": "~{tbprofiler_laboratorian_report_csv}"
"tbprofiler_gene_name": "~{tbprofiler_gene_name}",
"tbprofiler_locus_tag": "~{tbprofiler_locus_tag}",
"tbprofiler_variant_substitutions": "~{tbprofiler_variant_substitutions}",
"tbprofiler_output_seq_method_type": "~{tbprofiler_output_seq_method_type}",
"amrfinderplus_all_report": "~{amrfinderplus_all_report}",
"amrfinderplus_amr_report": "~{amrfinderplus_amr_report}",
"amrfinderplus_stress_report": "~{amrfinderplus_stress_report}",
Expand Down
12 changes: 11 additions & 1 deletion workflows/wf_merlin_magic.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ workflow merlin_magic {
Boolean call_poppunk = true
Boolean read1_is_ont = false
Boolean call_shigeifinder_reads_input = false
Boolean tbprofiler_additional_outputs = false
String output_seq_method_type = "WGS"
}
if (merlin_tag == "Acinetobacter baumannii") {
call kaptive.kaptive {
Expand Down Expand Up @@ -165,7 +167,9 @@ workflow merlin_magic {
input:
read1 = read1,
read2 = read2,
samplename = samplename
samplename = samplename,
tbprofiler_additional_outputs = tbprofiler_additional_outputs,
output_seq_method_type = output_seq_method_type
}
}
if (merlin_tag == "Legionella pneumophila") {
Expand Down Expand Up @@ -340,6 +344,12 @@ workflow merlin_magic {
String? tbprofiler_sub_lineage = tbprofiler.tbprofiler_sub_lineage
String? tbprofiler_dr_type = tbprofiler.tbprofiler_dr_type
String? tbprofiler_resistance_genes = tbprofiler.tbprofiler_resistance_genes
File? tbprofiler_additional_outputs_csv = tbprofiler.tbprofiler_additional_outputs_csv
File? tbprofiler_laboratorian_report_csv = tbprofiler.tbprofiler_laboratorian_report_csv
String? tbprofiler_gene_name = tbprofiler.tbprofiler_gene_name
String? tbprofiler_locus_tag = tbprofiler.tbprofiler_locus_tag
String? tbprofiler_variant_substitutions = tbprofiler.tbprofiler_variant_substitutions
String? tbprofiler_output_seq_method_type = tbprofiler.tbprofiler_output_seq_method_type
# Legionella pneumophila Typing
File? legsta_results = legsta.legsta_results
String? legsta_predicted_sbt = legsta.legsta_predicted_sbt
Expand Down
Loading

0 comments on commit 01a6eb4

Please sign in to comment.