New output columns from TBProfiler Task (#217)

* TBProfiler Task Update to include 5 new optinal columns: tbprofiler_additional_outputs_csv, tbprofiler_gene_name, tbprofiler_locus_tag, tbprofiler_variant_substitutions, tbprofiler_output_seq_method_type. All dependent on the conditional boolean input tbprofiler_additional_outputs * add new tbprofiler outputs to merlin_magic, broad_terra_tools table, theiaprok_illumina_pe and theiaprok_illumina_se * fix bug when annotation field was not present * fix bug on additional csv file where column header was duplicated; restored number of tries to original value * add same changes as tbprofiler task to tbprofiler_ont task * update docker image * fix issue when setting tbprofiler_additional_outputs=false and wdl failing due to missing files * make cpu variable optional * set tbprofiler_additional_outputs variable to false as default * fix optional tbprofiler inputs not showing up on terra table * propagate changes from tbprofiler to tbprofiler_ont * expand list of genes of interest for internal report * add confidence to internal table * add rew csv report file for laboratorian * add new tbprofiler tbprofiler_laboratorian_report_csv to theiaprok_illumina_pe, theiaprok_illumina_se and broad_terra_tools table * add fix for when who_confidence field is missing an annotation, but exists * add exceptions for when annotation field exists but it's empty, add exception for when depth is null instead of 0 * fix number of retries * update low coverage warning to match min_depth defined in the tbprofiler run; update tbprofiler_ont to match tbprofiler task * Add new warnings for: deletion; low depth of coverage and low frequency
theiagen · Mar 16, 2023 · 01a6eb4 · 01a6eb4
1 parent b964b01
commit 01a6eb4
Show file tree

Hide file tree

Showing 5 changed files with 275 additions and 19 deletions.
diff --git a/tasks/species_typing/task_tbprofiler.wdl b/tasks/species_typing/task_tbprofiler.wdl
@@ -6,14 +6,17 @@ task tbprofiler {
     File read1
     File? read2
     String samplename
-    String tbprofiler_docker_image = "staphb/tbprofiler:4.3.0"
+    Boolean tbprofiler_additional_outputs
+    String output_seq_method_type
+    String tbprofiler_docker_image = "staphb/tbprofiler:4.4.2"
     Int disk_size = 100
-    String? mapper = "bwa"
-    String? caller = "bcftools"
-    Int? min_depth = 10
-    Float? min_af = 0.1
-    Float? min_af_pred = 0.1
-    Int? cov_frac_threshold = 1
+    String mapper = "bwa"
+    String caller = "bcftools"
+    Int min_depth = 10
+    Float min_af = 0.1
+    Float min_af_pred = 0.1
+    Int cov_frac_threshold = 1
+    Int cpu = 8 
   }
   command <<<
     # update TBDB
@@ -30,7 +33,7 @@ task tbprofiler {
       INPUT_READS="-1 ~{read1} -2 ~{read2}"
     fi
 
-    # Run Kleborate on the input assembly with the --all flag and output with samplename prefix
+    # Run tb-profiler on the input reads with samplename prefix
     tb-profiler profile \
       ${INPUT_READS} \
       --prefix ~{samplename} \
@@ -46,6 +49,16 @@ task tbprofiler {
     #Collate results
     tb-profiler collate --prefix ~{samplename}
 
+    # touch optional output files because wdl
+    touch GENE_NAME LOCUS_TAG VARIANT_SUBSTITUTIONS OUTPUT_SEQ_METHOD_TYPE
+
+    # transform boolean tbprofiler_additional_outputs into string for python comparison
+    if ~{tbprofiler_additional_outputs}; then
+      export tbprofiler_additional_outputs="true"
+    else 
+      export tbprofiler_additional_outputs="false"
+    fi
+
     python3 <<CODE
     import csv
     with open("./~{samplename}.txt",'r') as tsv_file:
@@ -75,6 +88,92 @@ task tbprofiler {
             res_genes.append(tsv_dict[i])
         res_genes_string=';'.join(res_genes)
         Resistance_Genes.write(res_genes_string)
+    
+    import json
+    import os
+    if (os.environ["tbprofiler_additional_outputs"] == "true"):
+
+      gene_name = []
+      locus_tag = []
+      variant_substitutions = []
+      confidence = []
+      depth = []
+      frequency = []
+
+      with open("./results/~{samplename}.results.json") as results_json_fh:
+        results_json = json.load(results_json_fh)
+        for dr_variant in results_json["dr_variants"]:  # reported mutation by tb-profiler, all confering resistance
+          gene_name.append(dr_variant["gene"])
+          locus_tag.append(dr_variant["locus_tag"])  
+          variant_substitutions.append(dr_variant["type"] + ":" + dr_variant["nucleotide_change"] + "(" + dr_variant["protein_change"] + ")")  # mutation_type:nt_sub(aa_sub)
+          depth.append(dr_variant["depth"])
+          frequency.append(dr_variant["freq"])
+          if "annotation" in dr_variant:
+            try:  # sometimes annotation is an empty list
+              if dr_variant["annotation"][0]["who_confidence"] == "":
+                confidence.append("No WHO annotation")
+              else:
+                confidence.append(dr_variant["annotation"][0]["who_confidence"])
+            except:
+              confidence.append("No WHO annotation")
+          else:
+            confidence.append("No WHO annotation")
+        
+        for other_variant in results_json["other_variants"]:  # mutations not reported by tb-profiler
+          if other_variant["type"] != "synonymous_variant":
+            if other_variant["gene"] == "katG" or other_variant["gene"] == "pncA" or other_variant["gene"] == "rpoB" or other_variant["gene"] == "ethA" or other_variant["gene"] == "gid":  # hardcoded for genes of interest that are reported to always confer resistance when mutated
+              gene_name.append(other_variant["gene"])
+              locus_tag.append(other_variant["locus_tag"])  
+              variant_substitutions.append(other_variant["type"] + ":" + other_variant["nucleotide_change"] + "(" + other_variant["protein_change"] + ")")  # mutation_type:nt_sub(aa_sub)
+              depth.append(other_variant["depth"])
+              frequency.append(other_variant["freq"])
+              if "annotation" in other_variant:
+                try:  # sometimes annotation is an empty list
+                  if other_variant["annotation"][0]["who_confidence"] == "":
+                    confidence.append("No WHO annotation")
+                  else:
+                    confidence.append(other_variant["annotation"][0]["who_confidence"])
+                except:
+                  confidence.append("No WHO annotation")
+              else:
+                confidence.append("No WHO annotation")
+            else:
+              if "annotation" in other_variant:  # check if who annotation field is present
+                for annotation in other_variant["annotation"]:
+                  if annotation["who_confidence"] != "Not assoc w R" or annotation["who_confidence"] != "":
+                    gene_name.append(other_variant["gene"])
+                    locus_tag.append(other_variant["locus_tag"])  
+                    variant_substitutions.append(other_variant["type"] + ":" + other_variant["nucleotide_change"] + "(" + other_variant["protein_change"] + ")")  # mutation_type:nt_sub(aa_sub)
+                    depth.append(other_variant["depth"])
+                    frequency.append(other_variant["freq"])
+                    confidence.append(annotation["who_confidence"])
+        
+        with open("GENE_NAME", "wt") as gene_name_fh:
+          gene_name_fh.write(','.join(gene_name))
+        
+        with open("LOCUS_TAG", "wt") as locus_tag_fh:
+          locus_tag_fh.write(','.join(locus_tag))
+        
+        with open("VARIANT_SUBSTITUTIONS", "wt") as variant_substitutions_fh:
+          variant_substitutions_fh.write(','.join(variant_substitutions))
+        
+        with open("OUTPUT_SEQ_METHOD_TYPE", "wt") as output_seq_method_type_fh:
+          output_seq_method_type_fh.write("~{output_seq_method_type}")
+
+        # file to be ingested into CDPH LIMS system
+        with open("tbprofiler_additional_outputs.csv", "wt") as additional_outputs_csv:
+          additional_outputs_csv.write("tbprofiler_gene_name,tbprofiler_locus_tag,tbprofiler_variant_substitutions,confidence,tbprofiler_output_seq_method_type\n")
+          additional_outputs_csv.write(";".join(gene_name) + "," + ";".join(locus_tag) + "," + ";".join(variant_substitutions) + ',' + ";".join(confidence) + ',' + "~{output_seq_method_type}")
+    
+        # laboratorian report
+        with open("tbprofiler_laboratorian_report.csv", "wt") as report_fh:
+          report_fh.write("tbprofiler_gene_name,tbprofiler_locus_tag,tbprofiler_variant_substitutions,confidence,depth,frequency,warning\n")
+          for i in range(0, len(gene_name)):
+            if not depth[i]:  # for cases when depth is null, it gets converted to 0
+              depth[i] = 0
+            warning = "Low depth coverage" if  depth[i] < int('~{min_depth}') else "" # warning when coverage is lower than the defined 'min_depth' times
+            report_fh.write(gene_name[i] + ',' + locus_tag[i] + ',' + variant_substitutions[i] + ',' + confidence[i] + ',' + str(depth[i]) + ',' + str(frequency[i]) + ',' + warning + '\n')
+
     CODE
   >>>
   output {
@@ -89,14 +188,20 @@ task tbprofiler {
     String tbprofiler_num_dr_variants = read_string("NUM_DR_VARIANTS")
     String tbprofiler_num_other_variants = read_string("NUM_OTHER_VARIANTS")
     String tbprofiler_resistance_genes = read_string("RESISTANCE_GENES")
+    File? tbprofiler_additional_outputs_csv = "tbprofiler_additional_outputs.csv"
+    File? tbprofiler_laboratorian_report_csv = "tbprofiler_laboratorian_report.csv"
+    String tbprofiler_gene_name = read_string("GENE_NAME")
+    String tbprofiler_locus_tag = read_string("LOCUS_TAG")
+    String tbprofiler_variant_substitutions = read_string("VARIANT_SUBSTITUTIONS")
+    String tbprofiler_output_seq_method_type = read_string("OUTPUT_SEQ_METHOD_TYPE")
   }
   runtime {
     docker: "~{tbprofiler_docker_image}"
     memory: "16 GB"
-    cpu: 8
+    cpu: cpu
     disks: "local-disk " + disk_size + " SSD"
     disk: disk_size + " GB"
-    maxRetries: 3
+    maxRetries: 3 
   }
 }
 
@@ -105,14 +210,17 @@ task tbprofiler_ont {
   input {
     File reads
     String samplename
-    String tbprofiler_docker_image = "staphb/tbprofiler:4.3.0"
+    Boolean tbprofiler_additional_outputs
+    String output_seq_method_type
+    String tbprofiler_docker_image = "staphb/tbprofiler:4.4.2"
     Int disk_size = 100
-    String? mapper = "bwa"
-    String? caller = "bcftools"
-    Int? min_depth = 10
-    Float? min_af = 0.1
-    Float? min_af_pred = 0.1
-    Int? cov_frac_threshold = 1
+    String mapper = "bwa"
+    String caller = "bcftools"
+    Int min_depth = 10
+    Float min_af = 0.1
+    Float min_af_pred = 0.1
+    Int cov_frac_threshold = 1
+    Int cpu = 8
   }
   command <<<
     # update TBDB
@@ -129,6 +237,16 @@ task tbprofiler_ont {
     #Collate results
     tb-profiler collate --prefix ~{samplename}
 
+    # touch optional output files because wdl
+    touch GENE_NAME LOCUS_TAG VARIANT_SUBSTITUTIONS OUTPUT_SEQ_METHOD_TYPE
+
+    # transform boolean tbprofiler_additional_outputs into string for python comparison
+    if ~{tbprofiler_additional_outputs}; then
+      export tbprofiler_additional_outputs="true"
+    else 
+      export tbprofiler_additional_outputs="false"
+    fi
+
     python3 <<CODE
     import csv
     with open("./~{samplename}.txt",'r') as tsv_file:
@@ -158,6 +276,104 @@ task tbprofiler_ont {
             res_genes.append(tsv_dict[i])
         res_genes_string=';'.join(res_genes)
         Resistance_Genes.write(res_genes_string)
+    
+    import json
+    import os
+    if (os.environ["tbprofiler_additional_outputs"] == "true"):
+
+      gene_name = []
+      locus_tag = []
+      variant_substitutions = []
+      confidence = []
+      depth = []
+      frequency = []
+
+      with open("./results/~{samplename}.results.json") as results_json_fh:
+        results_json = json.load(results_json_fh)
+        for dr_variant in results_json["dr_variants"]:  # reported mutation by tb-profiler, all confering resistance
+          gene_name.append(dr_variant["gene"])
+          locus_tag.append(dr_variant["locus_tag"])  
+          variant_substitutions.append(dr_variant["type"] + ":" + dr_variant["nucleotide_change"] + "(" + dr_variant["protein_change"] + ")")  # mutation_type:nt_sub(aa_sub)
+          depth.append(dr_variant["depth"])
+          frequency.append(dr_variant["freq"])
+          if "annotation" in dr_variant:
+            try:  # sometimes annotation is an empty list
+              if dr_variant["annotation"][0]["who_confidence"] == "":
+                confidence.append("No WHO annotation")
+              else:
+                confidence.append(dr_variant["annotation"][0]["who_confidence"])
+            except:
+              confidence.append("No WHO annotation")
+          else:
+            confidence.append("No WHO annotation")
+        
+        for other_variant in results_json["other_variants"]:  # mutations not reported by tb-profiler
+          if other_variant["type"] != "synonymous_variant":
+            if other_variant["gene"] == "katG" or other_variant["gene"] == "pncA" or other_variant["gene"] == "rpoB" or other_variant["gene"] == "ethA" or other_variant["gene"] == "gid":  # hardcoded for genes of interest that are reported to always confer resistance when mutated
+              gene_name.append(other_variant["gene"])
+              locus_tag.append(other_variant["locus_tag"])  
+              variant_substitutions.append(other_variant["type"] + ":" + other_variant["nucleotide_change"] + "(" + other_variant["protein_change"] + ")")  # mutation_type:nt_sub(aa_sub)
+              depth.append(other_variant["depth"])
+              frequency.append(other_variant["freq"])
+              if "annotation" in other_variant:
+                try:  # sometimes annotation is an empty list
+                  if other_variant["annotation"][0]["who_confidence"] == "":
+                    confidence.append("No WHO annotation")
+                  else:
+                    confidence.append(other_variant["annotation"][0]["who_confidence"])
+                except:
+                  confidence.append("No WHO annotation")
+              else:
+                confidence.append("No WHO annotation")
+            else:
+              if "annotation" in other_variant:  # check if who annotation field is present
+                for annotation in other_variant["annotation"]:
+                  if annotation["who_confidence"] != "Not assoc w R" or annotation["who_confidence"] != "":
+                    gene_name.append(other_variant["gene"])
+                    locus_tag.append(other_variant["locus_tag"])  
+                    variant_substitutions.append(other_variant["type"] + ":" + other_variant["nucleotide_change"] + "(" + other_variant["protein_change"] + ")")  # mutation_type:nt_sub(aa_sub)
+                    depth.append(other_variant["depth"])
+                    frequency.append(other_variant["freq"])
+                    confidence.append(annotation["who_confidence"])
+        
+        with open("GENE_NAME", "wt") as gene_name_fh:
+          gene_name_fh.write(','.join(gene_name))
+        
+        with open("LOCUS_TAG", "wt") as locus_tag_fh:
+          locus_tag_fh.write(','.join(locus_tag))
+        
+        with open("VARIANT_SUBSTITUTIONS", "wt") as variant_substitutions_fh:
+          variant_substitutions_fh.write(','.join(variant_substitutions))
+        
+        with open("OUTPUT_SEQ_METHOD_TYPE", "wt") as output_seq_method_type_fh:
+          output_seq_method_type_fh.write("~{output_seq_method_type}")
+
+        # file to be ingested into CDPH LIMS system
+        with open("tbprofiler_additional_outputs.csv", "wt") as additional_outputs_csv:
+          additional_outputs_csv.write("tbprofiler_gene_name,tbprofiler_locus_tag,tbprofiler_variant_substitutions,confidence,tbprofiler_output_seq_method_type\n")
+          additional_outputs_csv.write(";".join(gene_name) + "," + ";".join(locus_tag) + "," + ";".join(variant_substitutions) + ',' + ";".join(confidence) + ',' + "~{output_seq_method_type}")
+    
+        # laboratorian report
+        with open("tbprofiler_laboratorian_report.csv", "wt") as report_fh:
+          report_fh.write("tbprofiler_gene_name,tbprofiler_locus_tag,tbprofiler_variant_substitutions,confidence,depth,frequency,warning\n")
+          
+          for i in range(0, len(gene_name)):
+          
+            warning = []
+            if not depth[i]:  # for cases when depth is null
+              depth[i] = "NA"  # Deletion instead of value
+              warning.append("Deletion")
+            else:
+              if depth[i] < int('~{min_depth}'): # warning when coverage is lower than the defined 'min_depth' times
+                warning.append("Low depth coverage") 
+            if not frequency[i]:
+              warning.append("No frequency")
+            else:
+              if frequency[i] < 0.05: # warning when frequency is lower than 5%
+                warning.append("Low frequency") 
+
+            report_fh.write(gene_name[i] + ',' + locus_tag[i] + ',' + variant_substitutions[i] + ',' + confidence[i] + ',' + str(depth[i]) + ',' + str(frequency[i]) + ',' + ';'.join(warning) + '\n')
+
     CODE
   >>>
   output {
@@ -172,11 +388,17 @@ task tbprofiler_ont {
     String tbprofiler_num_dr_variants = read_string("NUM_DR_VARIANTS")
     String tbprofiler_num_other_variants = read_string("NUM_OTHER_VARIANTS")
     String tbprofiler_resistance_genes = read_string("RESISTANCE_GENES")
+    File? tbprofiler_additional_outputs_csv = "tbprofiler_additional_outputs.csv"
+    File? tbprofiler_laboratorian_report_csv = "tbprofiler_laboratorian_report.csv"
+    String tbprofiler_gene_name = read_string("GENE_NAME")
+    String tbprofiler_locus_tag = read_string("LOCUS_TAG")
+    String tbprofiler_variant_substitutions = read_string("VARIANT_SUBSTITUTIONS")
+    String tbprofiler_output_seq_method_type = read_string("OUTPUT_SEQ_METHOD_TYPE")
   }
   runtime {
     docker: "~{tbprofiler_docker_image}"
     memory: "16 GB"
-    cpu: 8
+    cpu: cpu
     disks: "local-disk " + disk_size + " SSD"
     disk: disk_size + " GB"
     maxRetries: 3

diff --git a/tasks/utilities/task_broad_terra_tools.wdl b/tasks/utilities/task_broad_terra_tools.wdl
@@ -220,6 +220,12 @@ task export_taxon_tables {
     String? tbprofiler_sub_lineage
     String? tbprofiler_dr_type
     String? tbprofiler_resistance_genes
+    File? tbprofiler_additional_outputs_csv
+    File? tbprofiler_laboratorian_report_csv
+    String? tbprofiler_gene_name
+    String? tbprofiler_locus_tag
+    String? tbprofiler_variant_substitutions
+    String? tbprofiler_output_seq_method_type
     File? legsta_results
     String? legsta_predicted_sbt
     String? legsta_version
@@ -485,6 +491,12 @@ task export_taxon_tables {
       "tbprofiler_sub_lineage": "~{tbprofiler_sub_lineage}",
       "tbprofiler_dr_type": "~{tbprofiler_dr_type}",
       "tbprofiler_resistance_genes": "~{tbprofiler_resistance_genes}",
+      "tbprofiler_additional_outputs_csv": "~{tbprofiler_additional_outputs_csv}",
+      "tbprofiler_laboratorian_report_csv": "~{tbprofiler_laboratorian_report_csv}"
+      "tbprofiler_gene_name": "~{tbprofiler_gene_name}",
+      "tbprofiler_locus_tag": "~{tbprofiler_locus_tag}",
+      "tbprofiler_variant_substitutions": "~{tbprofiler_variant_substitutions}",
+      "tbprofiler_output_seq_method_type": "~{tbprofiler_output_seq_method_type}",
       "amrfinderplus_all_report": "~{amrfinderplus_all_report}",
       "amrfinderplus_amr_report": "~{amrfinderplus_amr_report}",
       "amrfinderplus_stress_report": "~{amrfinderplus_stress_report}",

diff --git a/workflows/wf_merlin_magic.wdl b/workflows/wf_merlin_magic.wdl
@@ -39,6 +39,8 @@ workflow merlin_magic {
     Boolean call_poppunk = true
     Boolean read1_is_ont = false
     Boolean call_shigeifinder_reads_input = false
+    Boolean tbprofiler_additional_outputs = false
+    String output_seq_method_type = "WGS"
   }
     if (merlin_tag == "Acinetobacter baumannii") {
     call kaptive.kaptive {
@@ -165,7 +167,9 @@ workflow merlin_magic {
       input:
         read1 = read1,
         read2 = read2,
-        samplename = samplename
+        samplename = samplename,
+        tbprofiler_additional_outputs = tbprofiler_additional_outputs,
+        output_seq_method_type = output_seq_method_type
     }
   }
   if (merlin_tag == "Legionella pneumophila") {
@@ -340,6 +344,12 @@ workflow merlin_magic {
   String? tbprofiler_sub_lineage = tbprofiler.tbprofiler_sub_lineage
   String? tbprofiler_dr_type = tbprofiler.tbprofiler_dr_type
   String? tbprofiler_resistance_genes = tbprofiler.tbprofiler_resistance_genes
+  File? tbprofiler_additional_outputs_csv = tbprofiler.tbprofiler_additional_outputs_csv
+  File? tbprofiler_laboratorian_report_csv = tbprofiler.tbprofiler_laboratorian_report_csv
+  String? tbprofiler_gene_name = tbprofiler.tbprofiler_gene_name
+  String? tbprofiler_locus_tag = tbprofiler.tbprofiler_locus_tag
+  String? tbprofiler_variant_substitutions = tbprofiler.tbprofiler_variant_substitutions
+  String? tbprofiler_output_seq_method_type = tbprofiler.tbprofiler_output_seq_method_type
   # Legionella pneumophila Typing
   File? legsta_results = legsta.legsta_results
   String? legsta_predicted_sbt = legsta.legsta_predicted_sbt