Merge pull request #368 from broadinstitute/dp-lineages
multi-sample lineage tasks
dpark01 authored Oct 1, 2021
2 parents e4b400a + 7d9379a commit 687f21f
Showing 9 changed files with 284 additions and 141 deletions.
59 changes: 0 additions & 59 deletions pipes/WDL/tasks/tasks_nextstrain.wdl
@@ -586,65 +586,6 @@ task filter_subsample_sequences {
}
}
task filter_sequences_by_length {
meta {
description: "Filter sequences in a fasta file to enforce a minimum count of non-N bases."
}
input {
File sequences_fasta
Int min_non_N = 1
String docker = "quay.io/broadinstitute/viral-core:2.1.32"
}
parameter_meta {
sequences_fasta: {
description: "Set of sequences in fasta format",
patterns: ["*.fasta", "*.fa"]
}
min_non_N: {
description: "Minimum number of called bases (non-N, non-gap, A, T, C, G, and other non-N ambiguity codes accepted)"
}
}
String out_fname = sub(basename(sequences_fasta), ".fasta", ".filtered.fasta")
command <<<
python3 <<CODE
import Bio.SeqIO
import gzip
n_total = 0
n_kept = 0
open_or_gzopen = lambda *args, **kwargs: gzip.open(*args, **kwargs) if args[0].endswith('.gz') else open(*args, **kwargs)
with open_or_gzopen('~{sequences_fasta}', 'rt') as inf:
with open_or_gzopen('~{out_fname}', 'wt') as outf:
for seq in Bio.SeqIO.parse(inf, 'fasta'):
n_total += 1
ungapseq = seq.seq.ungap().upper()
if (len(ungapseq) - ungapseq.count('N')) >= ~{min_non_N}:
n_kept += 1
Bio.SeqIO.write(seq, outf, 'fasta')
n_dropped = n_total-n_kept
with open('IN_COUNT', 'wt') as outf:
outf.write(str(n_total)+'\n')
with open('OUT_COUNT', 'wt') as outf:
outf.write(str(n_kept)+'\n')
with open('DROP_COUNT', 'wt') as outf:
outf.write(str(n_dropped)+'\n')
CODE
>>>
runtime {
docker: docker
memory: "1 GB"
cpu : 1
disks: "local-disk 300 HDD"
dx_instance_type: "mem1_ssd1_v2_x2"
}
output {
File filtered_fasta = out_fname
Int sequences_in = read_int("IN_COUNT")
Int sequences_dropped = read_int("DROP_COUNT")
Int sequences_out = read_int("OUT_COUNT")
}
}
task filter_sequences_to_list {
meta {
description: "Filter and subsample a sequence set to a specific list of ids in a text file (one id per line)."
145 changes: 139 additions & 6 deletions pipes/WDL/tasks/tasks_sarscov2.wdl
@@ -85,7 +85,7 @@ task nextclade_many_samples {
String basename
String docker = "nextstrain/nextclade:1.2.3"
}
command {
command <<<
set -e
apt-get update
apt-get -y install python3
@@ -112,7 +112,36 @@
--output-json "~{basename}".nextclade.json \
--output-tsv "~{basename}".nextclade.tsv \
--output-tree "~{basename}".nextclade.auspice.json
}
cp genomes.aligned.fasta "~{basename}".nextalign.msa.fasta
python3 <<CODE
# transpose table
import codecs, csv, json
out_maps = {'clade':{}, 'aaSubstitutions':{}, 'aaDeletions':{}}
with codecs.open('~{basename}.nextclade.tsv', 'r', encoding='utf-8') as inf:
with codecs.open('NEXTCLADE_CLADE', 'w', encoding='utf-8') as outf_clade:
with codecs.open('NEXTCLADE_AASUBS', 'w', encoding='utf-8') as outf_aasubs:
with codecs.open('NEXTCLADE_AADELS', 'w', encoding='utf-8') as outf_aadels:
for row in csv.DictReader(inf, delimiter='\t'):
outf_clade.write('\t'.join([row['seqName'], row['clade']])+'\n')
outf_aasubs.write('\t'.join([row['seqName'], row['aaSubstitutions']])+'\n')
outf_aadels.write('\t'.join([row['seqName'], row['aaDeletions']])+'\n')
for k in ('clade','aaSubstitutions','aaDeletions'):
out_maps[k][row['seqName']] = row[k]
with codecs.open('NEXTCLADE_CLADE.json', 'w', encoding='utf-8') as outf:
json.dump(out_maps['clade'], outf)
with codecs.open('NEXTCLADE_AASUBS.json', 'w', encoding='utf-8') as outf:
json.dump(out_maps['aaSubstitutions'], outf)
with codecs.open('NEXTCLADE_AADELS.json', 'w', encoding='utf-8') as outf:
json.dump(out_maps['aaDeletions'], outf)
CODE
# gather runtime metrics
cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC
cat /proc/loadavg > CPU_LOAD
cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES
>>>
runtime {
docker: docker
memory: "14 GB"
@@ -121,10 +150,20 @@
dx_instance_type: "mem1_ssd1_v2_x16"
}
output {
String nextclade_version = read_string("VERSION")
File nextclade_json = "~{basename}.nextclade.json"
File auspice_json = "~{basename}.nextclade.auspice.json"
File nextclade_tsv = "~{basename}.nextclade.tsv"
#Map[String,String] nextclade_clade = read_map("NEXTCLADE_CLADE")
#Map[String,String] aa_subs_csv = read_map("NEXTCLADE_AASUBS")
#Map[String,String] aa_dels_csv = read_map("NEXTCLADE_AADELS")
Map[String,String] nextclade_clade = read_json("NEXTCLADE_CLADE.json")
Map[String,String] aa_subs_csv = read_json("NEXTCLADE_AASUBS.json")
Map[String,String] aa_dels_csv = read_json("NEXTCLADE_AADELS.json")
String nextclade_version = read_string("VERSION")
File nextalign_msa = "~{basename}.nextalign.msa.fasta"
File nextclade_json = "~{basename}.nextclade.json"
File auspice_json = "~{basename}.nextclade.auspice.json"
File nextclade_tsv = "~{basename}.nextclade.tsv"
Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000)
Int runtime_sec = ceil(read_float("UPTIME_SEC"))
String cpu_load = read_string("CPU_LOAD")
}
}
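The batch task's per-sample results are exposed as Map[String,String] outputs keyed by fasta sequence name (the seqName column of the nextclade TSV), round-tripped through JSON files rather than the commented-out read_map() calls. A downstream step can index these maps directly; a minimal sketch, assuming the enclosing workflow holds an Array[String] of ids matching the sequence names (the names here are illustrative, not part of this commit):

scatter (sample_id in sample_ids) {
    # look up this sample's row of the transposed nextclade table
    String clade   = nextclade_many_samples.nextclade_clade[sample_id]
    String aa_subs = nextclade_many_samples.aa_subs_csv[sample_id]
}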
@@ -154,6 +193,7 @@ task pangolin_one_sample {
~{"--min-length " + min_length} \
~{"--max-ambig " + max_ambig} \
--alignment \
--threads $(nproc) \
--verbose
cp sequences.aln.fasta "~{basename}.pangolin_msa.fasta"
@@ -198,6 +238,99 @@ task pangolin_one_sample {
}
}
task pangolin_many_samples {
meta {
description: "Pangolin classification of multiple SARS-CoV-2 samples."
}
input {
Array[File]+ genome_fastas
Int? min_length
Float? max_ambig
Boolean inference_usher=true
String basename
String docker = "quay.io/staphb/pangolin:3.1.11-pangolearn-2021-09-17"
}
command <<<
date | tee DATE
conda list -n pangolin | grep "usher" | awk -F ' +' '{print$1, $2}' | tee VERSION_PANGO_USHER
set -ex
pangolin -v | tee VERSION_PANGOLIN
pangolin -pv | tee VERSION_PANGOLEARN
pangolin --all-versions | tr '\n' ';' | cut -f -5 -d ';' | tee VERSION_PANGOLIN_ALL
cat ~{sep=" " genome_fastas} > unaligned.fasta
pangolin unaligned.fasta \
~{true='--usher' false='' inference_usher} \
--outfile "~{basename}.pangolin_report.csv" \
~{"--min-length " + min_length} \
~{"--max-ambig " + max_ambig} \
--alignment \
--threads $(nproc) \
--verbose
cp sequences.aln.fasta "~{basename}.pangolin_msa.fasta"
python3 <<CODE
import csv, json
# grab output values by column header
with open("~{basename}.pangolin_report.csv", 'rt') as csv_file:
for line in csv.DictReader(csv_file):
with open("VERSION", 'wt') as outf:
pangolin_version=line["pangolin_version"]
version=line["version"]
outf.write(f"pangolin {pangolin_version}; {version}")
break
out_maps = {'lineage':{}, 'conflict':{}, 'note':{}}
with open("~{basename}.pangolin_report.csv", 'rt') as csv_file:
with open("PANGO_LINEAGE", 'wt') as outf_lineage:
with open("PANGOLIN_CONFLICTS", 'wt') as outf_conflicts:
with open("PANGOLIN_NOTES", 'wt') as outf_note:
for row in csv.DictReader(csv_file):
outf_lineage.write('\t'.join([row['taxon'], row['lineage']])+'\n')
outf_conflicts.write('\t'.join([row['taxon'], row['conflict']])+'\n')
outf_note.write('\t'.join([row['taxon'], row['note']])+'\n')
for k in ('lineage','conflict','note'):
out_maps[k][row['taxon']] = row[k]
with open('PANGO_LINEAGE.json', 'wt') as outf:
json.dump(out_maps['lineage'], outf)
with open('PANGOLIN_CONFLICTS.json', 'wt') as outf:
json.dump(out_maps['conflict'], outf)
with open('PANGOLIN_NOTES.json', 'wt') as outf:
json.dump(out_maps['note'], outf)
CODE
# gather runtime metrics
cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC
cat /proc/loadavg > CPU_LOAD
cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES
>>>
runtime {
docker: docker
memory: "14 GB"
cpu: 16
disks: "local-disk 100 HDD"
dx_instance_type: "mem1_ssd1_v2_x16"
}
output {
#Map[String,String] pango_lineage = read_map("PANGO_LINEAGE")
#Map[String,String] pangolin_conflicts = read_map("PANGOLIN_CONFLICTS")
#Map[String,String] pangolin_notes = read_map("PANGOLIN_NOTES")
Map[String,String] pango_lineage = read_json("PANGO_LINEAGE.json")
Map[String,String] pangolin_conflicts = read_json("PANGOLIN_CONFLICTS.json")
Map[String,String] pangolin_notes = read_json("PANGOLIN_NOTES.json")
String date = read_string("DATE")
String version = read_string("VERSION")
String pangolin_docker = docker
String pangolin_versions = read_string("VERSION_PANGOLIN_ALL")
String pangolin_usher_version = read_string("VERSION_PANGO_USHER")
String pangolin_version = read_string("VERSION_PANGOLIN")
String pangolearn_version = read_string("VERSION_PANGOLEARN")
File pango_lineage_report = "~{basename}.pangolin_report.csv"
File msa_fasta = "~{basename}.pangolin_msa.fasta"
Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000)
Int runtime_sec = ceil(read_float("UPTIME_SEC"))
String cpu_load = read_string("CPU_LOAD")
}
}
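As with nextclade_many_samples, the per-sample pangolin outputs are Maps keyed by the taxon column, i.e. the fasta sequence name. A usage sketch under the same assumptions (the caller's namespace and literal values are illustrative, not part of this commit):

call sarscov2.pangolin_many_samples {
    input:
        genome_fastas = assembly_fastas,
        basename      = "batch1"
}
# then, per sample, e.g.:
# pangolin_many_samples.pango_lineage["USA/example-0001/2021"]  ->  "B.1.617.2"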
task sequencing_report {
meta {
64 changes: 62 additions & 2 deletions pipes/WDL/tasks/tasks_utils.wdl
@@ -252,6 +252,7 @@ task tsv_join {
Array[File]+ input_tsvs
String id_col
String out_basename = "merged"
String out_suffix = ".txt"
}
command <<<
@@ -291,15 +292,15 @@
out_ids = list(collections.OrderedDict(((i,0) for i in out_ids)).keys())
# write output
with open(out_basename+'.txt', 'w', newline='') as outf:
with open(out_basename+'~{out_suffix}', 'w', newline='') as outf:
writer = csv.DictWriter(outf, header, delimiter='\t', dialect=csv.unix_dialect, quoting=csv.QUOTE_MINIMAL)
writer.writeheader()
writer.writerows(out_row_by_id[row_id] for row_id in out_ids)
CODE
>>>
output {
File out_tsv = "${out_basename}.txt"
File out_tsv = "~{out_basename}~{out_suffix}"
}
runtime {
@@ -499,3 +500,62 @@ task s3_copy {
disks: "local-disk 1000 HDD"
}
}
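The new out_suffix input to tsv_join near the top of this file makes the merged file's extension configurable rather than hard-coded to ".txt". A minimal call sketch, with illustrative file and column names (only out_suffix is new in this commit):

call utils.tsv_join {
    input:
        input_tsvs   = [assembly_meta_tsv, nextclade_tsv],  # illustrative inputs
        id_col       = "sample",
        out_basename = "merged_metadata",
        out_suffix   = ".tsv"  # default remains ".txt"
}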
task filter_sequences_by_length {
meta {
description: "Filter sequences in a fasta file to enforce a minimum count of non-N bases."
}
input {
File sequences_fasta
Int min_non_N = 1
String docker = "quay.io/broadinstitute/viral-core:2.1.32"
}
parameter_meta {
sequences_fasta: {
description: "Set of sequences in fasta format",
patterns: ["*.fasta", "*.fa"]
}
min_non_N: {
description: "Minimum number of called bases (non-N, non-gap, A, T, C, G, and other non-N ambiguity codes accepted)"
}
}
String out_fname = sub(basename(sequences_fasta), ".fasta", ".filtered.fasta")
command <<<
python3 <<CODE
import Bio.SeqIO
import gzip
n_total = 0
n_kept = 0
open_or_gzopen = lambda *args, **kwargs: gzip.open(*args, **kwargs) if args[0].endswith('.gz') else open(*args, **kwargs)
with open_or_gzopen('~{sequences_fasta}', 'rt') as inf:
with open_or_gzopen('~{out_fname}', 'wt') as outf:
for seq in Bio.SeqIO.parse(inf, 'fasta'):
n_total += 1
ungapseq = seq.seq.ungap().upper()
if (len(ungapseq) - ungapseq.count('N')) >= ~{min_non_N}:
n_kept += 1
Bio.SeqIO.write(seq, outf, 'fasta')
n_dropped = n_total-n_kept
with open('IN_COUNT', 'wt') as outf:
outf.write(str(n_total)+'\n')
with open('OUT_COUNT', 'wt') as outf:
outf.write(str(n_kept)+'\n')
with open('DROP_COUNT', 'wt') as outf:
outf.write(str(n_dropped)+'\n')
CODE
>>>
runtime {
docker: docker
memory: "1 GB"
cpu : 1
disks: "local-disk 300 HDD"
dx_instance_type: "mem1_ssd1_v2_x2"
}
output {
File filtered_fasta = out_fname
Int sequences_in = read_int("IN_COUNT")
Int sequences_dropped = read_int("DROP_COUNT")
Int sequences_out = read_int("OUT_COUNT")
}
}
2 changes: 1 addition & 1 deletion pipes/WDL/workflows/augur_from_assemblies.wdl
@@ -86,7 +86,7 @@ workflow augur_from_assemblies {
infiles = assembly_fastas,
output_name = "all_samples_combined_assembly.fasta"
}
call nextstrain.filter_sequences_by_length {
call utils.filter_sequences_by_length {
input:
sequences_fasta = zcat.combined,
min_non_N = min_unambig_genome
2 changes: 1 addition & 1 deletion pipes/WDL/workflows/mafft_and_snp.wdl
@@ -36,7 +36,7 @@ workflow mafft_and_snp {
infiles = assembly_fastas,
output_name = "all_samples_combined_assembly.fasta.gz"
}
call nextstrain.filter_sequences_by_length {
call utils.filter_sequences_by_length {
input:
sequences_fasta = zcat.combined,
min_non_N = min_unambig_genome
2 changes: 1 addition & 1 deletion pipes/WDL/workflows/mafft_and_snp_annotated.wdl
@@ -37,7 +37,7 @@ workflow mafft_and_snp_annotated {
infiles = assembly_fastas,
output_name = "all_samples_combined_assembly.fasta.gz"
}
call nextstrain.filter_sequences_by_length {
call utils.filter_sequences_by_length {
input:
sequences_fasta = zcat.combined,
min_non_N = min_unambig_genome
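All three workflows above now resolve filter_sequences_by_length through a utils alias instead of nextstrain, matching the task's move into tasks_utils.wdl. The import blocks themselves sit outside these hunks; a plausible form, assuming the repo's usual relative-path convention (an assumption, not shown in this diff):

import "../tasks/tasks_utils.wdl" as utils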