From 03cfdecc5d2446db3903739a102cb8e83e8861a2 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 2 Dec 2022 09:20:43 -0500
Subject: [PATCH 01/29] fix dxWDL namespace clash error

---
 .dockstore.yml                                                | 4 ++--
 .../workflows/{read_depths.wdl => calc_bam_read_depths.wdl}   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)
 rename pipes/WDL/workflows/{read_depths.wdl => calc_bam_read_depths.wdl} (95%)

diff --git a/.dockstore.yml b/.dockstore.yml
index eba30d3a5..00d98dd64 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -279,9 +279,9 @@ workflows:
     primaryDescriptorPath: /pipes/WDL/workflows/sarscov2_lineages.wdl
     testParameterFiles:
     - /test/input/WDL/test_inputs-sarscov2_lineages-local.json
-  - name: read_depths
+  - name: calc_bam_read_depths
     subclass: WDL
-    primaryDescriptorPath: /pipes/WDL/workflows/read_depths.wdl
+    primaryDescriptorPath: /pipes/WDL/workflows/calc_bam_read_depths.wdl
     testParameterFiles:
       - empty.json
   - name: sarscov2_gisaid_ingest
diff --git a/pipes/WDL/workflows/read_depths.wdl b/pipes/WDL/workflows/calc_bam_read_depths.wdl
similarity index 95%
rename from pipes/WDL/workflows/read_depths.wdl
rename to pipes/WDL/workflows/calc_bam_read_depths.wdl
index 19d9c056e..9f0479285 100644
--- a/pipes/WDL/workflows/read_depths.wdl
+++ b/pipes/WDL/workflows/calc_bam_read_depths.wdl
@@ -2,7 +2,7 @@ version 1.0
 
 import "../tasks/tasks_read_utils.wdl" as read_utils
 
-workflow read_depths {
+workflow calc_bam_read_depths {
     meta {
         description: "Generates read depth tables."
         author: "Broad Viral Genomics"

From e0a409d863d4dceb63fa3f2c242310207927f01d Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 2 Dec 2022 09:28:25 -0500
Subject: [PATCH 02/29] drop unused optional param

---
 pipes/WDL/tasks/tasks_assembly.wdl      | 30 ++++++++++---------------
 pipes/WDL/workflows/classify_multi.wdl  |  1 -
 pipes/WDL/workflows/classify_single.wdl |  1 -
 pipes/WDL/workflows/contigs.wdl         |  1 -
 pipes/WDL/workflows/demux_metag.wdl     |  1 -
 pipes/WDL/workflows/demux_plus.wdl      |  1 -
 6 files changed, 12 insertions(+), 23 deletions(-)

diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl
index 3c625f5a7..b02640c66 100644
--- a/pipes/WDL/tasks/tasks_assembly.wdl
+++ b/pipes/WDL/tasks/tasks_assembly.wdl
@@ -9,7 +9,6 @@ task assemble {
       Int      spades_min_contig_len = 0
       String?  spades_options
       
-      String   assembler = "spades"
       Boolean  always_succeed = false
       
       # do this in two steps in case the input doesn't actually have "taxfilt" in the name
@@ -30,28 +29,23 @@ task assemble {
 
         assembly.py --version | tee VERSION
 
-        if [[ "~{assembler}" == "spades" ]]; then
-          assembly.py assemble_spades \
-            ~{reads_unmapped_bam} \
-            ~{trim_clip_db} \
-            ~{sample_name}.assembly1-~{assembler}.fasta \
-            ~{'--nReads=' + spades_n_reads} \
-            ~{true="--alwaysSucceed" false="" always_succeed} \
-            ~{'--minContigLen=' + spades_min_contig_len} \
-            ~{'--spadesOpts="' + spades_options + '"'} \
-            --memLimitGb $mem_in_gb \
-            --outReads=~{sample_name}.subsamp.bam \
-            --loglevel=DEBUG
-        else
-          echo "unrecognized assembler ~{assembler}" >&2
-          exit 1
-        fi
+        assembly.py assemble_spades \
+          ~{reads_unmapped_bam} \
+          ~{trim_clip_db} \
+          ~{sample_name}.assembly1-spades.fasta \
+          ~{'--nReads=' + spades_n_reads} \
+          ~{true="--alwaysSucceed" false="" always_succeed} \
+          ~{'--minContigLen=' + spades_min_contig_len} \
+          ~{'--spadesOpts="' + spades_options + '"'} \
+          --memLimitGb $mem_in_gb \
+          --outReads=~{sample_name}.subsamp.bam \
+          --loglevel=DEBUG
 
         samtools view -c ~{sample_name}.subsamp.bam | tee subsample_read_count >&2
     }
 
     output {
-        File   contigs_fasta        = "~{sample_name}.assembly1-~{assembler}.fasta"
+        File   contigs_fasta        = "~{sample_name}.assembly1-spades.fasta"
         File   subsampBam           = "~{sample_name}.subsamp.bam"
         Int    subsample_read_count = read_int("subsample_read_count")
         String viralngs_version     = read_string("VERSION")
diff --git a/pipes/WDL/workflows/classify_multi.wdl b/pipes/WDL/workflows/classify_multi.wdl
index 507474dd1..dbb45ebcf 100644
--- a/pipes/WDL/workflows/classify_multi.wdl
+++ b/pipes/WDL/workflows/classify_multi.wdl
@@ -101,7 +101,6 @@ workflow classify_multi {
         }
         call assembly.assemble as spades {
             input:
-                assembler          = "spades",
                 reads_unmapped_bam = rmdup_ubam.dedup_bam,
                 trim_clip_db       = trim_clip_db,
                 always_succeed     = true
diff --git a/pipes/WDL/workflows/classify_single.wdl b/pipes/WDL/workflows/classify_single.wdl
index 898c89560..e26c05b34 100644
--- a/pipes/WDL/workflows/classify_single.wdl
+++ b/pipes/WDL/workflows/classify_single.wdl
@@ -93,7 +93,6 @@ workflow classify_single {
     }
     call assembly.assemble as spades {
         input:
-            assembler          = "spades",
             reads_unmapped_bam = rmdup_ubam.dedup_bam,
             trim_clip_db       = trim_clip_db,
             always_succeed     = true
diff --git a/pipes/WDL/workflows/contigs.wdl b/pipes/WDL/workflows/contigs.wdl
index 0f7312be5..08438e337 100644
--- a/pipes/WDL/workflows/contigs.wdl
+++ b/pipes/WDL/workflows/contigs.wdl
@@ -31,7 +31,6 @@ workflow contigs {
 
     call assembly.assemble as spades {
         input:
-            assembler          = "spades",
             reads_unmapped_bam = rmdup_ubam.dedup_bam
     }
 
diff --git a/pipes/WDL/workflows/demux_metag.wdl b/pipes/WDL/workflows/demux_metag.wdl
index 575602c4c..f69c2b2a6 100644
--- a/pipes/WDL/workflows/demux_metag.wdl
+++ b/pipes/WDL/workflows/demux_metag.wdl
@@ -43,7 +43,6 @@ workflow demux_metag {
         }
         call assembly.assemble as spades {
             input:
-                assembler          = "spades",
                 reads_unmapped_bam = rmdup_ubam.dedup_bam,
                 trim_clip_db       = trim_clip_db,
                 always_succeed     = true
diff --git a/pipes/WDL/workflows/demux_plus.wdl b/pipes/WDL/workflows/demux_plus.wdl
index 19a8c0f5e..69e276af3 100644
--- a/pipes/WDL/workflows/demux_plus.wdl
+++ b/pipes/WDL/workflows/demux_plus.wdl
@@ -41,7 +41,6 @@ workflow demux_plus {
         }
         call assembly.assemble as spades {
             input:
-                assembler          = "spades",
                 reads_unmapped_bam = deplete.cleaned_bam,
                 trim_clip_db       = trim_clip_db,
                 always_succeed     = true

From d51e192b48b0036fb4911b2d2fa2a78e9145944c Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 2 Dec 2022 09:34:50 -0500
Subject: [PATCH 03/29] expose min_contig_len at scaffolding step

---
 pipes/WDL/tasks/tasks_assembly.wdl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl
index b02640c66..880c60fbb 100644
--- a/pipes/WDL/tasks/tasks_assembly.wdl
+++ b/pipes/WDL/tasks/tasks_assembly.wdl
@@ -6,7 +6,7 @@ task assemble {
       File     trim_clip_db
       
       Int      spades_n_reads = 10000000
-      Int      spades_min_contig_len = 0
+      Int?     spades_min_contig_len
       String?  spades_options
       
       Boolean  always_succeed = false
@@ -77,6 +77,7 @@ task scaffold {
       Int?         nucmer_max_gap
       Int?         nucmer_min_match
       Int?         nucmer_min_cluster
+      Int?         scaffold_min_contig_len
       Float?       scaffold_min_pct_contig_aligned
 
       Int?         machine_mem_gb
@@ -100,6 +101,7 @@ task scaffold {
           ~{contigs_fasta} \
           ~{sep=' ' reference_genome_fasta} \
           ~{sample_name}.intermediate_scaffold.fasta \
+          ~{'--min_contig_len=' + scaffold_min_contig_len} \
           ~{'--maxgap=' + nucmer_max_gap} \
           ~{'--minmatch=' + nucmer_min_match} \
           ~{'--mincluster=' + nucmer_min_cluster} \

From 2de740d907e86bcc597354fe18956d7c88694b4e Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 2 Dec 2022 10:23:26 -0500
Subject: [PATCH 04/29] alter scaffolding_chosen_ref_name -> _names and make it
 an Array

---
 pipes/WDL/tasks/tasks_assembly.wdl          | 4 ++--
 pipes/WDL/workflows/assemble_denovo.wdl     | 2 +-
 pipes/WDL/workflows/metagenomic_denovo.wdl  | 2 +-
 pipes/WDL/workflows/scaffold_and_refine.wdl | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl
index 880c60fbb..8ef66a37f 100644
--- a/pipes/WDL/tasks/tasks_assembly.wdl
+++ b/pipes/WDL/tasks/tasks_assembly.wdl
@@ -111,7 +111,7 @@ task scaffold {
           --outAlternateContigs ~{sample_name}.scaffolding_alt_contigs.fasta \
           --loglevel=DEBUG
 
-        grep '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | cut -c 2- | tr '\n' '\t' > ~{sample_name}.scaffolding_chosen_ref.txt
+        grep '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | cut -c 2- | cut -f 1 -d ' ' > ~{sample_name}.scaffolding_chosen_refs.txt
 
         assembly.py gapfill_gap2seq \
           ~{sample_name}.intermediate_scaffold.fasta \
@@ -142,7 +142,7 @@ task scaffold {
         File   intermediate_gapfill_fasta            = "~{sample_name}.intermediate_gapfill.fasta"
         Int    assembly_preimpute_length             = read_int("assembly_preimpute_length")
         Int    assembly_preimpute_length_unambiguous = read_int("assembly_preimpute_length_unambiguous")
-        String scaffolding_chosen_ref_name           = read_string("~{sample_name}.scaffolding_chosen_ref.txt")
+        Array[String] scaffolding_chosen_ref_names   = read_lines("~{sample_name}.scaffolding_chosen_refs.txt")
         File   scaffolding_chosen_ref                = "~{sample_name}.scaffolding_chosen_ref.fasta"
         File   scaffolding_stats                     = "~{sample_name}.scaffolding_stats.txt"
         File   scaffolding_alt_contigs               = "~{sample_name}.scaffolding_alt_contigs.fasta"
diff --git a/pipes/WDL/workflows/assemble_denovo.wdl b/pipes/WDL/workflows/assemble_denovo.wdl
index a9ae94021..7ba5b5be1 100644
--- a/pipes/WDL/workflows/assemble_denovo.wdl
+++ b/pipes/WDL/workflows/assemble_denovo.wdl
@@ -129,7 +129,7 @@ workflow assemble_denovo {
     File    intermediate_gapfill_fasta            = scaffold.intermediate_gapfill_fasta
     Int     assembly_preimpute_length             = scaffold.assembly_preimpute_length
     Int     assembly_preimpute_length_unambiguous = scaffold.assembly_preimpute_length_unambiguous
-    String  scaffolding_chosen_ref_name           = scaffold.scaffolding_chosen_ref_name
+    Array[String]  scaffolding_chosen_ref_names   = scaffold.scaffolding_chosen_ref_names
     File    scaffolding_stats                     = scaffold.scaffolding_stats
     File    scaffolding_alt_contigs               = scaffold.scaffolding_alt_contigs
 
diff --git a/pipes/WDL/workflows/metagenomic_denovo.wdl b/pipes/WDL/workflows/metagenomic_denovo.wdl
index 9abdd116e..f73a9aa4b 100644
--- a/pipes/WDL/workflows/metagenomic_denovo.wdl
+++ b/pipes/WDL/workflows/metagenomic_denovo.wdl
@@ -235,7 +235,7 @@ workflow metagenomic_denovo {
     File    intermediate_gapfill_fasta            = scaffold.intermediate_gapfill_fasta
     Int     assembly_preimpute_length             = scaffold.assembly_preimpute_length
     Int     assembly_preimpute_length_unambiguous = scaffold.assembly_preimpute_length_unambiguous
-    String  scaffolding_chosen_ref_name           = scaffold.scaffolding_chosen_ref_name
+    Array[String]  scaffolding_chosen_ref_names   = scaffold.scaffolding_chosen_ref_names
     File    scaffolding_stats                     = scaffold.scaffolding_stats
     File    scaffolding_alt_contigs               = scaffold.scaffolding_alt_contigs
 
diff --git a/pipes/WDL/workflows/scaffold_and_refine.wdl b/pipes/WDL/workflows/scaffold_and_refine.wdl
index b7bfc46aa..adb4411df 100644
--- a/pipes/WDL/workflows/scaffold_and_refine.wdl
+++ b/pipes/WDL/workflows/scaffold_and_refine.wdl
@@ -40,7 +40,7 @@ workflow scaffold_and_refine {
     File   intermediate_gapfill_fasta            = scaffold.intermediate_gapfill_fasta
     Int    assembly_preimpute_length             = scaffold.assembly_preimpute_length
     Int    assembly_preimpute_length_unambiguous = scaffold.assembly_preimpute_length_unambiguous
-    String scaffolding_chosen_ref_name           = scaffold.scaffolding_chosen_ref_name
+    Array[String]  scaffolding_chosen_ref_names  = scaffold.scaffolding_chosen_ref_names
     File   scaffolding_stats                     = scaffold.scaffolding_stats
     File   scaffolding_alt_contigs               = scaffold.scaffolding_alt_contigs
 

From ddf4688ec8b2e3bff62e8d358774cfd7a22e5c17 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 2 Dec 2022 12:24:24 -0500
Subject: [PATCH 05/29] dust off genbank workflow to do more work for you

---
 pipes/WDL/tasks/tasks_ncbi_tools.wdl | 29 ++++++++++++
 pipes/WDL/workflows/genbank.wdl      | 66 ++++++++++++++++------------
 2 files changed, 66 insertions(+), 29 deletions(-)

diff --git a/pipes/WDL/tasks/tasks_ncbi_tools.wdl b/pipes/WDL/tasks/tasks_ncbi_tools.wdl
index 4bbcea347..8605181dd 100644
--- a/pipes/WDL/tasks/tasks_ncbi_tools.wdl
+++ b/pipes/WDL/tasks/tasks_ncbi_tools.wdl
@@ -145,6 +145,35 @@ task Fetch_SRA_to_BAM {
     }
 }
 
+task fetch_genbank_metadata {
+    input {
+        String  genbank_accession
+        String  docker = "quay.io/broadinstitute/ncbi-tools:2.10.7.10"
+    }
+    Int disk_size = 50
+    command <<<
+        set -e
+        esearch -db nuccore -q "~{genbank_accession}" | efetch -db nuccore -format gb -mode xml -json  > gb.json
+        jq -r '[.GBSet.GBSeq."GBSeq_feature-table".GBFeature[0].GBFeature_quals.GBQualifier|.[]| {(.GBQualifier_name): .GBQualifier_value}]|add ' gb.json > metadata.json
+        jq -r '.db_xref' meta.json | grep ^taxon: | cut -f 2 -d : > taxid.txt
+        jq -r '.organism' meta.json > organism.txt
+    >>>
+    output {
+        Map[String,String] metadata = read_json("metadata.json")
+        String taxid = read_string("taxid.txt")
+        String organism = read_string("organism.txt")
+    }
+    runtime {
+        cpu:     1
+        memory:  "1 GB"
+        disks:   "local-disk " + disk_size + " LOCAL"
+        disk:    disk_size + " GB" # TES
+        dx_instance_type: "mem1_ssd1_v2_x2"
+        docker:  docker
+        maxRetries: 2
+    }
+}
+
 task biosample_tsv_filter_preexisting {
     input {
         File           meta_submit_tsv
diff --git a/pipes/WDL/workflows/genbank.wdl b/pipes/WDL/workflows/genbank.wdl
index 3c8a183a8..5d6a1bd99 100644
--- a/pipes/WDL/workflows/genbank.wdl
+++ b/pipes/WDL/workflows/genbank.wdl
@@ -1,7 +1,8 @@
 version 1.0
 
-import "../tasks/tasks_interhost.wdl" as interhost
 import "../tasks/tasks_ncbi.wdl" as ncbi
+import "../tasks/tasks_ncbi_tools.wdl" as ncbi_tools
+import "../tasks/tasks_reports.wdl" as reports
 
 workflow genbank {
 
@@ -13,20 +14,18 @@ workflow genbank {
     }
 
     input {
-        Array[File]+  reference_fastas
-        Array[File]+  reference_feature_tables
-        Array[File]+  assemblies_fasta
+        Array[File]+   assemblies_fasta
+        Array[File]+   alignments_bams
+        Array[String]+ reference_accessions
 
-        String?       author_list # of the form "Lastname,A.B., Lastname,C.,"; optional alternative to names in author_sbt_defaults_yaml
+        String        email_address # required for fetching data from NCBI APIs
+        String        author_list # of the form "Lastname,A.B., Lastname,C.,"; optional alternative to names in author_sbt_defaults_yaml
         File          author_sbt_defaults_yaml # defaults to fill in for author_sbt file (including both author and non-author fields)
         File          author_sbt_j2_template
         File          biosample_attributes
-        Int           taxid
-        File?         coverage_table
-        String?       sequencingTech
+        String        sequencingTech
         String?       comment
-        String?       organism
-        String?       molType='cRNA'
+        String        molType='cRNA'
     }
 
     parameter_meta {
@@ -34,14 +33,10 @@ workflow genbank {
           description: "Genomes to prepare for Genbank submission. One file per genome: all segments/chromosomes included in one file. All fasta files must contain exactly the same number of sequences as reference_fasta (which must equal the number of files in reference_annot_tbl).",
           patterns: ["*.fasta"]
         }
-        reference_fastas: {
-          description: "Reference genome, each segment/chromosome in a separate fasta file, in the exact same count and order as the segments/chromosomes described in genome_fasta. Headers must be Genbank accessions.",
+        reference_accessions: {
+          description: "Reference genome Genbank accessions, each segment/chromosome in the exact same count and order as the segments/chromosomes described in genome_fasta.",
           patterns: ["*.fasta"]
         }
-        reference_feature_tables: {
-          description: "NCBI Genbank feature table, each segment/chromosome in a separate TBL file, in the exact same count and order as the segments/chromosomes described in genome_fasta and reference_fastas. Accession numbers in the TBL files must correspond exactly to those in reference_fasta.",
-          patterns: ["*.tbl"]
-        }
         author_list: {
           description: "A string containing a space-delimited list with of author surnames separated by first name and (optional) middle initial. Ex. 'Lastname,Firstname, Last-hypenated,First,M., Last,F.'"
         }
@@ -56,9 +51,6 @@ workflow genbank {
           description: "A post-submission attributes file from NCBI BioSample, which is available at https://submit.ncbi.nlm.nih.gov/subs/ and clicking on 'Download attributes file with BioSample accessions'.",
           patterns: ["*.txt", "*.tsv"]
         }
-        taxid: {
-          description: "The NCBI taxonomy ID for the species being submitted in this batch (all sequences in this batch must belong to the same taxid). https://www.ncbi.nlm.nih.gov/taxonomy/"
-        }
         coverage_table: {
           description: "A two column tab text file mapping sample IDs (first column) to average sequencing coverage (second column, floating point number).",
           patterns: ["*.txt", "*.tsv"],
@@ -68,10 +60,6 @@ workflow genbank {
           description: "The type of sequencer used to generate reads. NCBI has a controlled vocabulary for this value which can be found here: https://submit.ncbi.nlm.nih.gov/structcomment/nongenomes/",
           category: "common"
         }
-        organism: {
-          description: "The scientific name for the organism being submitted. This is typically the species name and should match the name given by the NCBI Taxonomy database. For more info, see: https://www.ncbi.nlm.nih.gov/Sequin/sequin.hlp.html#Organism",
-          category: "common"
-        }
         molType: {
           description: "The type of molecule being described. This defaults to 'cRNA' as this pipeline is most commonly used for viral submissions, but any value allowed by the INSDC controlled vocabulary may be used here. Valid values are described at http://www.insdc.org/controlled-vocabulary-moltype-qualifier",
           category: "common"
@@ -82,19 +70,39 @@ workflow genbank {
 
     }
 
+    scatter(segment_acc in reference_accessions) {
+      call ncbi_tools.fetch_genbank_metadata {
+        input:
+          genbank_accession = segment_acc
+      }
+      call ncbi.download_annotations {
+        input:
+          accessions = [segment_acc],
+          emailAddress = email_address,
+          combined_out_prefix = segment_acc
+      }
+    }
+
+    call reports.coverage_report {
+      input:
+        mapped_bams = alignments_bams,
+        mapped_bam_idx = []
+    }
+
     call ncbi.biosample_to_genbank {
         input:
             biosample_attributes = biosample_attributes,
-            num_segments         = length(reference_fastas),
-            taxid                = taxid
+            num_segments         = length(reference_accessions),
+            taxid                = fetch_genbank_metadata.taxid[0],
+            s_dropout_note       = false
     }
 
     scatter(assembly in assemblies_fasta) {
         call ncbi.align_and_annot_transfer_single as annot {
             input:
                 genome_fasta             = assembly,
-                reference_fastas         = reference_fastas,
-                reference_feature_tables = reference_feature_tables
+                reference_fastas         = flatten(download_annotations.genomes_fasta),
+                reference_feature_tables = flatten(download_annotations.features_tbl)
         }
     }
 
@@ -112,10 +120,10 @@ workflow genbank {
             authors_sbt        = generate_author_sbt.sbt_file,
             biosampleMap       = biosample_to_genbank.biosample_map,
             genbankSourceTable = biosample_to_genbank.genbank_source_modifier_table,
-            coverage_table     = coverage_table,
+            coverage_table     = coverage_report.coverage_report,
             sequencingTech     = sequencingTech,
             comment            = comment,
-            organism           = organism,
+            organism           = fetch_genbank_metadata.organism[0],
             molType            = molType
     }
 

From cbd49eb4a97e39b5c8a718646afd6dbbda73f6ae Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 2 Dec 2022 13:38:46 -0500
Subject: [PATCH 06/29] allow old coverage table optionally

---
 pipes/WDL/workflows/genbank.wdl               | 16 ++++++++++------
 test/input/WDL/test_inputs-genbank-local.json | 12 ++----------
 2 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/pipes/WDL/workflows/genbank.wdl b/pipes/WDL/workflows/genbank.wdl
index 5d6a1bd99..5190141b8 100644
--- a/pipes/WDL/workflows/genbank.wdl
+++ b/pipes/WDL/workflows/genbank.wdl
@@ -15,9 +15,11 @@ workflow genbank {
 
     input {
         Array[File]+   assemblies_fasta
-        Array[File]+   alignments_bams
         Array[String]+ reference_accessions
 
+        Array[File]   alignments_bams
+        File?         coverage_table
+
         String        email_address # required for fetching data from NCBI APIs
         String        author_list # of the form "Lastname,A.B., Lastname,C.,"; optional alternative to names in author_sbt_defaults_yaml
         File          author_sbt_defaults_yaml # defaults to fill in for author_sbt file (including both author and non-author fields)
@@ -83,10 +85,12 @@ workflow genbank {
       }
     }
 
-    call reports.coverage_report {
-      input:
-        mapped_bams = alignments_bams,
-        mapped_bam_idx = []
+    if(length(alignments_bams)>0) {
+      call reports.coverage_report {
+        input:
+          mapped_bams = alignments_bams,
+          mapped_bam_idx = []
+      }
     }
 
     call ncbi.biosample_to_genbank {
@@ -120,7 +124,7 @@ workflow genbank {
             authors_sbt        = generate_author_sbt.sbt_file,
             biosampleMap       = biosample_to_genbank.biosample_map,
             genbankSourceTable = biosample_to_genbank.genbank_source_modifier_table,
-            coverage_table     = coverage_report.coverage_report,
+            coverage_table     = select_first([coverage_report.coverage_report, coverage_table]),
             sequencingTech     = sequencingTech,
             comment            = comment,
             organism           = fetch_genbank_metadata.organism[0],
diff --git a/test/input/WDL/test_inputs-genbank-local.json b/test/input/WDL/test_inputs-genbank-local.json
index b310a7ef3..9be3c48ba 100644
--- a/test/input/WDL/test_inputs-genbank-local.json
+++ b/test/input/WDL/test_inputs-genbank-local.json
@@ -1,13 +1,8 @@
 {
   "genbank.molType": "cRNA",
+  "genbank.reference_accessions": ["KM821997.1", "KM821998.1"],
   "genbank.coverage_table": "test/input/genbank/coverage_report-RUN1-workflow.txt",
-  "genbank.reference_feature_tables": [
-    "test/input/genbank/KM821997.1.tbl",
-    "test/input/genbank/KM821998.1.tbl"
-  ],
-  "genbank.organism": "Lassa mammarenavirus",
   "genbank.sequencingTech": "Illumina MiSeq",
-  "genbank.taxid": 11620,
   "genbank.biosample_attributes": "test/input/genbank/biosample-attributes-lasv.txt",
   "genbank.prep_genbank.assembly_method": "placeholder assembly software",
   "genbank.prep_genbank.assembly_method_version": "5.4.3.2.1",
@@ -20,9 +15,6 @@
     "test/input/LASV_NGA_2018_0097.fasta",
     "test/input/LASV_NGA_2018_0541.fasta"
   ],
-  "genbank.reference_fastas": [
-    "test/input/genbank/KM821997.1.fasta",
-    "test/input/genbank/KM821998.1.fasta"
-  ]
+  "genbank.email_address": "viral-ngs@broadinstitute.org"
 }
 

From 5845dae9ff907183dc722f7bf8d16c344ec8db6f Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 2 Dec 2022 15:02:46 -0500
Subject: [PATCH 07/29] update cromwell-local and miniwdl-local genbank test inputs

---
 .../cromwell-local/test_inputs-genbank-local.json    | 10 ++--------
 .../WDL/miniwdl-local/test_inputs-genbank-local.json | 12 ++----------
 2 files changed, 4 insertions(+), 18 deletions(-)

diff --git a/test/input/WDL/cromwell-local/test_inputs-genbank-local.json b/test/input/WDL/cromwell-local/test_inputs-genbank-local.json
index cdab9805b..907a8de33 100644
--- a/test/input/WDL/cromwell-local/test_inputs-genbank-local.json
+++ b/test/input/WDL/cromwell-local/test_inputs-genbank-local.json
@@ -1,12 +1,8 @@
 {
   "genbank.molType": "cRNA",
   "genbank.coverage_table": "test/input/genbank/coverage-ma_mgh.txt",
-  "genbank.reference_feature_tables": [
-    "test/input/genbank/MN908947.3.tbl"
-  ],
-  "genbank.organism": "Severe acute respiratory syndrome coronavirus 2",
+  "genbank.reference_accessions": ["MN908947.3"],
   "genbank.sequencingTech": "Illumina NovaSeq",
-  "genbank.taxid": 2697049,
   "genbank.biosample_attributes": "test/input/genbank/sars-cov-2_attributes_updated.txt",
   "genbank.prep_genbank.assembly_method": "placeholder assembly software",
   "genbank.prep_genbank.assembly_method_version": "5.4.3.2.1",
@@ -19,8 +15,6 @@
     "test/input/MA_MGH_00004.fasta",
     "test/input/MA_MGH_00005.fasta"
   ],
-  "genbank.reference_fastas": [
-    "test/input/genbank/MN908947.3.fasta"
-  ]
+  "genbank.email_address": "viral-ngs@broadinstitute.org"
 }
 
diff --git a/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json b/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json
index b310a7ef3..9be3c48ba 100644
--- a/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json
+++ b/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json
@@ -1,13 +1,8 @@
 {
   "genbank.molType": "cRNA",
+  "genbank.reference_accessions": ["KM821997.1", "KM821998.1"],
   "genbank.coverage_table": "test/input/genbank/coverage_report-RUN1-workflow.txt",
-  "genbank.reference_feature_tables": [
-    "test/input/genbank/KM821997.1.tbl",
-    "test/input/genbank/KM821998.1.tbl"
-  ],
-  "genbank.organism": "Lassa mammarenavirus",
   "genbank.sequencingTech": "Illumina MiSeq",
-  "genbank.taxid": 11620,
   "genbank.biosample_attributes": "test/input/genbank/biosample-attributes-lasv.txt",
   "genbank.prep_genbank.assembly_method": "placeholder assembly software",
   "genbank.prep_genbank.assembly_method_version": "5.4.3.2.1",
@@ -20,9 +15,6 @@
     "test/input/LASV_NGA_2018_0097.fasta",
     "test/input/LASV_NGA_2018_0541.fasta"
   ],
-  "genbank.reference_fastas": [
-    "test/input/genbank/KM821997.1.fasta",
-    "test/input/genbank/KM821998.1.fasta"
-  ]
+  "genbank.email_address": "viral-ngs@broadinstitute.org"
 }
 

From 60825459fb5e2688b955f846e899b8689e6d60ad Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 2 Dec 2022 16:39:27 -0500
Subject: [PATCH 08/29] test genbank workflow with empty alignments_bams input

---
 test/input/WDL/cromwell-local/test_inputs-genbank-local.json | 1 +
 test/input/WDL/miniwdl-local/test_inputs-genbank-local.json  | 1 +
 test/input/WDL/test_inputs-genbank-local.json                | 1 +
 3 files changed, 3 insertions(+)

diff --git a/test/input/WDL/cromwell-local/test_inputs-genbank-local.json b/test/input/WDL/cromwell-local/test_inputs-genbank-local.json
index 907a8de33..e6af063fe 100644
--- a/test/input/WDL/cromwell-local/test_inputs-genbank-local.json
+++ b/test/input/WDL/cromwell-local/test_inputs-genbank-local.json
@@ -1,6 +1,7 @@
 {
   "genbank.molType": "cRNA",
   "genbank.coverage_table": "test/input/genbank/coverage-ma_mgh.txt",
+  "genbank.alignments_bams": [],
   "genbank.reference_accessions": ["MN908947.3"],
   "genbank.sequencingTech": "Illumina NovaSeq",
   "genbank.biosample_attributes": "test/input/genbank/sars-cov-2_attributes_updated.txt",
diff --git a/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json b/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json
index 9be3c48ba..1c938870e 100644
--- a/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json
+++ b/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json
@@ -2,6 +2,7 @@
   "genbank.molType": "cRNA",
   "genbank.reference_accessions": ["KM821997.1", "KM821998.1"],
   "genbank.coverage_table": "test/input/genbank/coverage_report-RUN1-workflow.txt",
+  "genbank.alignments_bams": [],
   "genbank.sequencingTech": "Illumina MiSeq",
   "genbank.biosample_attributes": "test/input/genbank/biosample-attributes-lasv.txt",
   "genbank.prep_genbank.assembly_method": "placeholder assembly software",
diff --git a/test/input/WDL/test_inputs-genbank-local.json b/test/input/WDL/test_inputs-genbank-local.json
index 9be3c48ba..1c938870e 100644
--- a/test/input/WDL/test_inputs-genbank-local.json
+++ b/test/input/WDL/test_inputs-genbank-local.json
@@ -2,6 +2,7 @@
   "genbank.molType": "cRNA",
   "genbank.reference_accessions": ["KM821997.1", "KM821998.1"],
   "genbank.coverage_table": "test/input/genbank/coverage_report-RUN1-workflow.txt",
+  "genbank.alignments_bams": [],
   "genbank.sequencingTech": "Illumina MiSeq",
   "genbank.biosample_attributes": "test/input/genbank/biosample-attributes-lasv.txt",
   "genbank.prep_genbank.assembly_method": "placeholder assembly software",

From 5ce0b814445a01e87c30ad9a2022e001eacc551c Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 2 Dec 2022 19:58:21 -0500
Subject: [PATCH 09/29] fix fetch_genbank_metadata json filename; add --forceOverwrite

---
 pipes/WDL/tasks/tasks_ncbi.wdl       | 1 +
 pipes/WDL/tasks/tasks_ncbi_tools.wdl | 8 ++++----
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/pipes/WDL/tasks/tasks_ncbi.wdl b/pipes/WDL/tasks/tasks_ncbi.wdl
index 33b01e8b2..9d6324f01 100644
--- a/pipes/WDL/tasks/tasks_ncbi.wdl
+++ b/pipes/WDL/tasks/tasks_ncbi.wdl
@@ -54,6 +54,7 @@ task download_annotations {
         ./ \
         ${sep=' ' accessions} \
         --combinedFilePrefix "${combined_out_prefix}" \
+        --forceOverwrite \
         --loglevel DEBUG
   }
 
diff --git a/pipes/WDL/tasks/tasks_ncbi_tools.wdl b/pipes/WDL/tasks/tasks_ncbi_tools.wdl
index 8605181dd..1bad0dff2 100644
--- a/pipes/WDL/tasks/tasks_ncbi_tools.wdl
+++ b/pipes/WDL/tasks/tasks_ncbi_tools.wdl
@@ -154,12 +154,12 @@ task fetch_genbank_metadata {
     command <<<
         set -e
         esearch -db nuccore -q "~{genbank_accession}" | efetch -db nuccore -format gb -mode xml -json  > gb.json
-        jq -r '[.GBSet.GBSeq."GBSeq_feature-table".GBFeature[0].GBFeature_quals.GBQualifier|.[]| {(.GBQualifier_name): .GBQualifier_value}]|add ' gb.json > metadata.json
-        jq -r '.db_xref' meta.json | grep ^taxon: | cut -f 2 -d : > taxid.txt
-        jq -r '.organism' meta.json > organism.txt
+        jq -r '[.GBSet.GBSeq."GBSeq_feature-table".GBFeature[0].GBFeature_quals.GBQualifier|.[]|{(.GBQualifier_name): .GBQualifier_value}]|add ' gb.json > "~{genbank_accession}".metadata.json
+        jq -r '.db_xref' "~{genbank_accession}".metadata.json | grep ^taxon: | cut -f 2 -d : > taxid.txt
+        jq -r '.organism' "~{genbank_accession}".metadata.json > organism.txt
     >>>
     output {
-        Map[String,String] metadata = read_json("metadata.json")
+        Map[String,String] metadata = read_json("~{genbank_accession}.metadata.json")
         String taxid = read_string("taxid.txt")
         String organism = read_string("organism.txt")
     }

From da76be531710d4d8324fdbd0028d7bac7a135753 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 2 Dec 2022 20:07:39 -0500
Subject: [PATCH 10/29] try other output

---
 pipes/WDL/workflows/genbank.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipes/WDL/workflows/genbank.wdl b/pipes/WDL/workflows/genbank.wdl
index 5190141b8..600af4146 100644
--- a/pipes/WDL/workflows/genbank.wdl
+++ b/pipes/WDL/workflows/genbank.wdl
@@ -105,7 +105,7 @@ workflow genbank {
         call ncbi.align_and_annot_transfer_single as annot {
             input:
                 genome_fasta             = assembly,
-                reference_fastas         = flatten(download_annotations.genomes_fasta),
+                reference_fastas         = download_annotations.combined_fasta,
                 reference_feature_tables = flatten(download_annotations.features_tbl)
         }
     }

From 2341f07c5b6900ec8e9aa6ef0a4c5e1a3cd26aaa Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Mon, 5 Dec 2022 12:45:49 -0500
Subject: [PATCH 11/29] fix wdl task download_annotations file namespace error
 when combined_out_prefix happens to be an accession; also prevent combined
 fasta from showing up in genomes_fasta

---
 pipes/WDL/tasks/tasks_ncbi.wdl  | 17 +++++++++--------
 pipes/WDL/workflows/genbank.wdl |  3 ++-
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/pipes/WDL/tasks/tasks_ncbi.wdl b/pipes/WDL/tasks/tasks_ncbi.wdl
index 9d6324f01..519a6aa6d 100644
--- a/pipes/WDL/tasks/tasks_ncbi.wdl
+++ b/pipes/WDL/tasks/tasks_ncbi.wdl
@@ -41,25 +41,26 @@ task download_annotations {
     String         docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
   }
 
-  command {
+  command <<<
     set -ex -o pipefail
     ncbi.py --version | tee VERSION
     ncbi.py fetch_feature_tables \
-        ${emailAddress} \
+        ~{emailAddress} \
         ./ \
-        ${sep=' ' accessions} \
+        ~{sep=' ' accessions} \
         --loglevel DEBUG
+    mkdir -p combined
     ncbi.py fetch_fastas \
-        ${emailAddress} \
+        ~{emailAddress} \
         ./ \
-        ${sep=' ' accessions} \
-        --combinedFilePrefix "${combined_out_prefix}" \
+        ~{sep=' ' accessions} \
+        --combinedFilePrefix "combined/~{combined_out_prefix}" \
         --forceOverwrite \
         --loglevel DEBUG
-  }
+  >>>
 
   output {
-    File        combined_fasta   = "${combined_out_prefix}.fasta"
+    File        combined_fasta   = "combined/~{combined_out_prefix}.fasta" # fetch_fastas writes the combined file under combined/ (see --combinedFilePrefix)
     Array[File] genomes_fasta    = glob("*.fasta")
     Array[File] features_tbl     = glob("*.tbl")
     String      viralngs_version = read_string("VERSION")
diff --git a/pipes/WDL/workflows/genbank.wdl b/pipes/WDL/workflows/genbank.wdl
index 600af4146..40cda9a80 100644
--- a/pipes/WDL/workflows/genbank.wdl
+++ b/pipes/WDL/workflows/genbank.wdl
@@ -73,6 +73,7 @@ workflow genbank {
     }
 
     scatter(segment_acc in reference_accessions) {
+      # scatter these calls in order to preserve original order
       call ncbi_tools.fetch_genbank_metadata {
         input:
           genbank_accession = segment_acc
@@ -105,7 +106,7 @@ workflow genbank {
         call ncbi.align_and_annot_transfer_single as annot {
             input:
                 genome_fasta             = assembly,
-                reference_fastas         = download_annotations.combined_fasta,
+                reference_fastas         = flatten(download_annotations.genomes_fasta),
                 reference_feature_tables = flatten(download_annotations.features_tbl)
         }
     }

From 93bd6bf938ea42613df4b69dd4c5f3c7f7a4a427 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Tue, 6 Dec 2022 18:43:52 +0100
Subject: [PATCH 12/29] test a fix for miniwdl

---
 pipes/WDL/tasks/tasks_ncbi_tools.wdl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pipes/WDL/tasks/tasks_ncbi_tools.wdl b/pipes/WDL/tasks/tasks_ncbi_tools.wdl
index 1bad0dff2..00635105d 100644
--- a/pipes/WDL/tasks/tasks_ncbi_tools.wdl
+++ b/pipes/WDL/tasks/tasks_ncbi_tools.wdl
@@ -153,6 +153,7 @@ task fetch_genbank_metadata {
     Int disk_size = 50
     command <<<
         set -e
+        source activate $CONDA_DEFAULT_ENV # for miniwdl / non-login docker runners
         esearch -db nuccore -q "~{genbank_accession}" | efetch -db nuccore -format gb -mode xml -json  > gb.json
         jq -r '[.GBSet.GBSeq."GBSeq_feature-table".GBFeature[0].GBFeature_quals.GBQualifier|.[]|{(.GBQualifier_name): .GBQualifier_value}]|add ' gb.json > "~{genbank_accession}".metadata.json
         jq -r '.db_xref' "~{genbank_accession}".metadata.json | grep ^taxon: | cut -f 2 -d : > taxid.txt

From 0d32530f8680a1ef39172fec1e3b6e91f324846e Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Tue, 6 Dec 2022 19:09:09 +0100
Subject: [PATCH 13/29] try again

---
 pipes/WDL/tasks/tasks_ncbi_tools.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipes/WDL/tasks/tasks_ncbi_tools.wdl b/pipes/WDL/tasks/tasks_ncbi_tools.wdl
index 00635105d..5361a63fc 100644
--- a/pipes/WDL/tasks/tasks_ncbi_tools.wdl
+++ b/pipes/WDL/tasks/tasks_ncbi_tools.wdl
@@ -153,7 +153,7 @@ task fetch_genbank_metadata {
     Int disk_size = 50
     command <<<
         set -e
-        source activate $CONDA_DEFAULT_ENV # for miniwdl / non-login docker runners
+        activate $CONDA_DEFAULT_ENV # for miniwdl / non-login docker runners
         esearch -db nuccore -q "~{genbank_accession}" | efetch -db nuccore -format gb -mode xml -json  > gb.json
         jq -r '[.GBSet.GBSeq."GBSeq_feature-table".GBFeature[0].GBFeature_quals.GBQualifier|.[]|{(.GBQualifier_name): .GBQualifier_value}]|add ' gb.json > "~{genbank_accession}".metadata.json
         jq -r '.db_xref' "~{genbank_accession}".metadata.json | grep ^taxon: | cut -f 2 -d : > taxid.txt

From 4f7464d870c9f98c397bc45895ed1b4ca5d54f1c Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Thu, 8 Dec 2022 09:06:23 +0100
Subject: [PATCH 14/29] try activate conda again

---
 pipes/WDL/tasks/tasks_ncbi_tools.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipes/WDL/tasks/tasks_ncbi_tools.wdl b/pipes/WDL/tasks/tasks_ncbi_tools.wdl
index 5361a63fc..64d0328db 100644
--- a/pipes/WDL/tasks/tasks_ncbi_tools.wdl
+++ b/pipes/WDL/tasks/tasks_ncbi_tools.wdl
@@ -153,7 +153,7 @@ task fetch_genbank_metadata {
     Int disk_size = 50
     command <<<
         set -e
-        activate $CONDA_DEFAULT_ENV # for miniwdl / non-login docker runners
+        source /opt/miniconda/bin/activate $CONDA_DEFAULT_ENV # for miniwdl / non-login docker runners
         esearch -db nuccore -q "~{genbank_accession}" | efetch -db nuccore -format gb -mode xml -json  > gb.json
         jq -r '[.GBSet.GBSeq."GBSeq_feature-table".GBFeature[0].GBFeature_quals.GBQualifier|.[]|{(.GBQualifier_name): .GBQualifier_value}]|add ' gb.json > "~{genbank_accession}".metadata.json
         jq -r '.db_xref' "~{genbank_accession}".metadata.json | grep ^taxon: | cut -f 2 -d : > taxid.txt

From 61d9a41f980e573aeb7a485bdd6c56db50b25aba Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 9 Dec 2022 14:06:27 -0500
Subject: [PATCH 15/29] make assemble_denovo accept multiple input bams

---
 pipes/WDL/workflows/assemble_denovo.wdl           | 15 ++++++++++++---
 .../test_inputs-assemble_denovo-dnanexus.dx.json  |  2 +-
 .../WDL/test_inputs-assemble_denovo-local.json    |  2 +-
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/pipes/WDL/workflows/assemble_denovo.wdl b/pipes/WDL/workflows/assemble_denovo.wdl
index 7ba5b5be1..4b1fe6dff 100644
--- a/pipes/WDL/workflows/assemble_denovo.wdl
+++ b/pipes/WDL/workflows/assemble_denovo.wdl
@@ -15,7 +15,7 @@ workflow assemble_denovo {
   }
 
   input {
-    File         reads_unmapped_bam
+    Array[File]+ reads_unmapped_bams
 
     Array[File]+ reference_genome_fasta
 
@@ -26,11 +26,11 @@ workflow assemble_denovo {
     File?        filter_to_taxon_db
     File         trim_clip_db
 
-    String       sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".cleaned")
+    String       sample_name = basename(basename(reads_unmapped_bams[0], ".bam"), ".cleaned")
   }
 
   parameter_meta {
-    raw_reads_unmapped_bam: { description: "unaligned reads in BAM format", patterns: ["*.bam"] }
+    raw_reads_unmapped_bams: { description: "unaligned reads in BAM format", patterns: ["*.bam"] }
     deplete_bmtaggerDbs: {
        description: "Optional list of databases to use for bmtagger-based depletion. Sequences in fasta format will be indexed on the fly, pre-bmtagger-indexed databases may be provided as tarballs.",
        patterns: ["*.fasta", "*.fasta.gz", "*.tar.gz", "*.tar.lz4", "*.tar.bz2", "*.tar.zst"]
@@ -53,6 +53,15 @@ workflow assemble_denovo {
     }
   }
 
+  if(length(reads_unmapped_bams)>1) {
+      call read_utils.merge_and_reheader_bams as merge_reads {
+          input:
+              in_bams      = reads_unmapped_bams,
+              out_basename = sample_name
+      }
+  }
+  File reads_unmapped_bam = select_first([merge_reads.out_bam, reads_unmapped_bams[0]])
+
   if(length(deplete_bmtaggerDbs) + length(deplete_blastDbs) + length(deplete_bwaDbs) > 0) {
     call taxon_filter.deplete_taxa {
       input:
diff --git a/test/input/WDL/test_inputs-assemble_denovo-dnanexus.dx.json b/test/input/WDL/test_inputs-assemble_denovo-dnanexus.dx.json
index 8bf8b18e1..c182224e0 100644
--- a/test/input/WDL/test_inputs-assemble_denovo-dnanexus.dx.json
+++ b/test/input/WDL/test_inputs-assemble_denovo-dnanexus.dx.json
@@ -1,5 +1,5 @@
 {
-	"stage-common.reads_unmapped_bam": { "$dnanexus_link": { "project": "project-F8PQ6380xf5bK0Qk0YPjB17P", "id": "file-F8F2kVQ09y3Q9Qj14fF806q2" } },
+	"stage-common.reads_unmapped_bams": [ { "$dnanexus_link": { "project": "project-F8PQ6380xf5bK0Qk0YPjB17P", "id": "file-F8F2kVQ09y3Q9Qj14fF806q2" } } ],
 
 	"stage-common.deplete_bmtaggerDbs": [
 		{ "$dnanexus_link": { "project": "project-F8PQ6380xf5bK0Qk0YPjB17P", "id": "file-BYF6g8Q0zjF77x79bGYgJ1Zb" } }
diff --git a/test/input/WDL/test_inputs-assemble_denovo-local.json b/test/input/WDL/test_inputs-assemble_denovo-local.json
index bf4c80a0b..b5c46b913 100644
--- a/test/input/WDL/test_inputs-assemble_denovo-local.json
+++ b/test/input/WDL/test_inputs-assemble_denovo-local.json
@@ -1,5 +1,5 @@
 {
-  "assemble_denovo.reads_unmapped_bam": "test/input/G5012.3.testreads.bam",
+  "assemble_denovo.reads_unmapped_bams": ["test/input/G5012.3.testreads.bam"],
   "assemble_denovo.deplete_blastDbs": [
     "test/input/5kb_human_from_chr6.fasta"
   ],

From 0453414188f3c1d549eca22f01a8fec0154a52ca Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Sat, 10 Dec 2022 12:47:40 -0500
Subject: [PATCH 16/29] add unique_strings and unique_arrays to tasks_utils

---
 pipes/WDL/tasks/tasks_utils.wdl | 44 +++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl
index 22d8f8e20..3e5701918 100644
--- a/pipes/WDL/tasks/tasks_utils.wdl
+++ b/pipes/WDL/tasks/tasks_utils.wdl
@@ -586,6 +586,50 @@ task rename_file {
   }
 }
 
+task unique_strings {
+  input {
+    Array[String]  strings
+  }
+  Int disk_size = 50
+  command {
+    cat ~{write_lines(strings)} | sort | uniq > UNIQUE_OUT
+  }
+  output {
+    Array[String]  sorted_unique = read_lines("UNIQUE_OUT")
+  }
+  runtime {
+    memory: "1 GB"
+    cpu: 1
+    docker: "ubuntu"
+    disks:  "local-disk " + disk_size + " HDD"
+    disk: disk_size + " GB" # TES
+    dx_instance_type: "mem1_ssd1_v2_x2"
+    maxRetries: 2
+  }
+}
+
+task unique_arrays {
+  input {
+    Array[Array[String]]  string_arrays
+  }
+  Int disk_size = 50
+  command {
+    cat ~{write_tsv(string_arrays)} | sort | uniq > UNIQUE_OUT
+  }
+  output {
+    Array[Array[String]]  sorted_unique = read_tsv("UNIQUE_OUT")
+  }
+  runtime {
+    memory: "1 GB"
+    cpu: 1
+    docker: "ubuntu"
+    disks:  "local-disk " + disk_size + " HDD"
+    disk: disk_size + " GB" # TES
+    dx_instance_type: "mem1_ssd1_v2_x2"
+    maxRetries: 2
+  }
+}
+
 task today {
   input {
     String? timezone

From f3f4ec8b1dff8febf5432296a9b03ef71e1e1bd8 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Sat, 10 Dec 2022 13:00:58 -0500
Subject: [PATCH 17/29] add fixed string assembly_method output

---
 pipes/WDL/workflows/assemble_denovo.wdl   | 1 +
 pipes/WDL/workflows/assemble_refbased.wdl | 1 +
 2 files changed, 2 insertions(+)

diff --git a/pipes/WDL/workflows/assemble_denovo.wdl b/pipes/WDL/workflows/assemble_denovo.wdl
index 4b1fe6dff..569af71bb 100644
--- a/pipes/WDL/workflows/assemble_denovo.wdl
+++ b/pipes/WDL/workflows/assemble_denovo.wdl
@@ -157,6 +157,7 @@ workflow assemble_denovo {
     Int     read_pairs_aligned                    = refine.align_to_self_merged_read_pairs_aligned
     Float   bases_aligned                         = refine.align_to_self_merged_bases_aligned
     
+    String  assembly_method = "viral-ngs/assemble_denovo"
     String? deplete_viral_classify_version        = deplete_taxa.viralngs_version
     String? taxfilt_viral_classify_version        = filter_to_taxon.viralngs_version
     String  assemble_viral_assemble_version       = assemble.viralngs_version
diff --git a/pipes/WDL/workflows/assemble_refbased.wdl b/pipes/WDL/workflows/assemble_refbased.wdl
index 00ee8a445..06069f746 100644
--- a/pipes/WDL/workflows/assemble_refbased.wdl
+++ b/pipes/WDL/workflows/assemble_refbased.wdl
@@ -237,6 +237,7 @@ workflow assemble_refbased {
         Float       align_to_self_merged_mean_coverage           = plot_self_coverage.mean_coverage
         File        align_to_self_isnvs_vcf                      = isnvs_self.report_vcf
         
+        String      assembly_method = "viral-ngs/assemble_refbased"
         String      align_to_ref_viral_core_version              = align_to_ref.viralngs_version[0]
         String      ivar_version                                 = ivar_trim.ivar_version[0]
         String      viral_assemble_version                       = call_consensus.viralngs_version

From 59f1a0eceb6df62667f578f961b16bce0b936569 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Sat, 10 Dec 2022 13:08:27 -0500
Subject: [PATCH 18/29] convert reference_accessions to
 reference_accessions_list which is amenable to accepting the output of
 scaffolding_chosen_ref as input

---
 pipes/WDL/tasks/tasks_utils.wdl | 20 ++++++++++++++++++++
 pipes/WDL/workflows/genbank.wdl | 24 ++++++++++++++++++++----
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl
index 3e5701918..f0c5f07e0 100644
--- a/pipes/WDL/tasks/tasks_utils.wdl
+++ b/pipes/WDL/tasks/tasks_utils.wdl
@@ -586,6 +586,26 @@ task rename_file {
   }
 }
 
+task raise { # utility task that always fails: prints `message`, then exits non-zero
+  input {
+    String message = "error!" # text echoed to stdout before the task fails
+  }
+  command {
+    set -e
+    echo "~{message}" # interpolate the WDL input; a bare $message is an unset shell variable
+    exit 1
+  }
+  runtime {
+    memory: "1 GB"
+    cpu: 1
+    docker: "ubuntu"
+    disks:  "local-disk 30 HDD"
+    disk: "30 GB" # TES
+    dx_instance_type: "mem1_ssd1_v2_x2"
+    maxRetries: 2
+  }
+}
+
 task unique_strings {
   input {
     Array[String]  strings
diff --git a/pipes/WDL/workflows/genbank.wdl b/pipes/WDL/workflows/genbank.wdl
index 40cda9a80..55792cb56 100644
--- a/pipes/WDL/workflows/genbank.wdl
+++ b/pipes/WDL/workflows/genbank.wdl
@@ -3,6 +3,7 @@ version 1.0
 import "../tasks/tasks_ncbi.wdl" as ncbi
 import "../tasks/tasks_ncbi_tools.wdl" as ncbi_tools
 import "../tasks/tasks_reports.wdl" as reports
+import "../tasks/tasks_utils.wdl" as utils
 
 workflow genbank {
 
@@ -14,8 +15,8 @@ workflow genbank {
     }
 
     input {
-        Array[File]+   assemblies_fasta
-        Array[String]+ reference_accessions
+        Array[File]+          assemblies_fasta
+        Array[Array[String]]+ reference_accessions_list
 
         Array[File]   alignments_bams
         File?         coverage_table
@@ -35,8 +36,8 @@ workflow genbank {
           description: "Genomes to prepare for Genbank submission. One file per genome: all segments/chromosomes included in one file. All fasta files must contain exactly the same number of sequences as reference_fasta (which must equal the number of files in reference_annot_tbl).",
           patterns: ["*.fasta"]
         }
-        reference_accessions: {
-          description: "Reference genome Genbank accessions, each segment/chromosome in the exact same count and order as the segments/chromosomes described in genome_fasta.",
+        reference_accessions_list: {
+          description: "Reference genome Genbank accessions, each segment/chromosome in the exact same count and order as the segments/chromosomes described in assemblies_fasta. This is allowed to be an Array of such accession lists, but if so, all Array[String]s must be identical to each other or an error will be raised.",
           patterns: ["*.fasta"]
         }
         author_list: {
@@ -72,6 +73,20 @@ workflow genbank {
 
     }
 
+    # take a array-array-string of scaffolding_chosen_refs output -> uniqified list of reference_accessions -- fail if not exactly one unique array-string across all array-array-strings
+    call utils.unique_arrays as unique_references {
+      input:
+        string_arrays = reference_accessions_list
+    }
+    if((length(unique_references.sorted_unique) != 1)
+      || length(unique_references.sorted_unique[0]) < 1) {
+      call utils.raise {
+        input:
+          message = "all Array[String] reference accession lists in reference_accessions_list must be identical!"
+      }
+    }
+    Array[String] reference_accessions = unique_references.sorted_unique[0]
+
     scatter(segment_acc in reference_accessions) {
       # scatter these calls in order to preserve original order
       call ncbi_tools.fetch_genbank_metadata {
@@ -94,6 +109,7 @@ workflow genbank {
       }
     }
 
+    # TO DO dpark: if ! defined biosample_attributes, call ncbi_tools.fetch_biosamples on external ids (where do we get external ids?)
     call ncbi.biosample_to_genbank {
         input:
             biosample_attributes = biosample_attributes,

From 872493022df07f5b2738a379538cc603b105bcd5 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Sat, 10 Dec 2022 13:45:21 -0500
Subject: [PATCH 19/29] fix test input jsons

---
 test/input/WDL/cromwell-local/test_inputs-genbank-local.json | 2 +-
 test/input/WDL/miniwdl-local/test_inputs-genbank-local.json  | 2 +-
 test/input/WDL/test_inputs-genbank-local.json                | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/input/WDL/cromwell-local/test_inputs-genbank-local.json b/test/input/WDL/cromwell-local/test_inputs-genbank-local.json
index e6af063fe..0d0614d01 100644
--- a/test/input/WDL/cromwell-local/test_inputs-genbank-local.json
+++ b/test/input/WDL/cromwell-local/test_inputs-genbank-local.json
@@ -2,7 +2,7 @@
   "genbank.molType": "cRNA",
   "genbank.coverage_table": "test/input/genbank/coverage-ma_mgh.txt",
   "genbank.alignments_bams": [],
-  "genbank.reference_accessions": ["MN908947.3"],
+  "genbank.reference_accessions_list": [["MN908947.3"],["MN908947.3"],["MN908947.3"]],
   "genbank.sequencingTech": "Illumina NovaSeq",
   "genbank.biosample_attributes": "test/input/genbank/sars-cov-2_attributes_updated.txt",
   "genbank.prep_genbank.assembly_method": "placeholder assembly software",
diff --git a/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json b/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json
index 1c938870e..b5edf88d4 100644
--- a/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json
+++ b/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json
@@ -1,6 +1,6 @@
 {
   "genbank.molType": "cRNA",
-  "genbank.reference_accessions": ["KM821997.1", "KM821998.1"],
+  "genbank.reference_accessions_list": [["KM821997.1", "KM821998.1"]],
   "genbank.coverage_table": "test/input/genbank/coverage_report-RUN1-workflow.txt",
   "genbank.alignments_bams": [],
   "genbank.sequencingTech": "Illumina MiSeq",
diff --git a/test/input/WDL/test_inputs-genbank-local.json b/test/input/WDL/test_inputs-genbank-local.json
index 1c938870e..b5edf88d4 100644
--- a/test/input/WDL/test_inputs-genbank-local.json
+++ b/test/input/WDL/test_inputs-genbank-local.json
@@ -1,6 +1,6 @@
 {
   "genbank.molType": "cRNA",
-  "genbank.reference_accessions": ["KM821997.1", "KM821998.1"],
+  "genbank.reference_accessions_list": [["KM821997.1", "KM821998.1"]],
   "genbank.coverage_table": "test/input/genbank/coverage_report-RUN1-workflow.txt",
   "genbank.alignments_bams": [],
   "genbank.sequencingTech": "Illumina MiSeq",

From 1c3d95fa03305fee14734bdeb5504bac31644d23 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Sat, 10 Dec 2022 15:11:18 -0500
Subject: [PATCH 20/29] revert recent change

---
 pipes/WDL/workflows/genbank.wdl               | 24 ++++---------------
 .../test_inputs-genbank-local.json            |  2 +-
 .../test_inputs-genbank-local.json            |  2 +-
 test/input/WDL/test_inputs-genbank-local.json |  2 +-
 4 files changed, 8 insertions(+), 22 deletions(-)

diff --git a/pipes/WDL/workflows/genbank.wdl b/pipes/WDL/workflows/genbank.wdl
index 55792cb56..ce12e38b6 100644
--- a/pipes/WDL/workflows/genbank.wdl
+++ b/pipes/WDL/workflows/genbank.wdl
@@ -3,7 +3,7 @@ version 1.0
 import "../tasks/tasks_ncbi.wdl" as ncbi
 import "../tasks/tasks_ncbi_tools.wdl" as ncbi_tools
 import "../tasks/tasks_reports.wdl" as reports
-import "../tasks/tasks_utils.wdl" as utils
+#import "../tasks/tasks_utils.wdl" as utils
 
 workflow genbank {
 
@@ -15,8 +15,8 @@ workflow genbank {
     }
 
     input {
-        Array[File]+          assemblies_fasta
-        Array[Array[String]]+ reference_accessions_list
+        Array[File]+    assemblies_fasta
+        Array[String]+  reference_accessions
 
         Array[File]   alignments_bams
         File?         coverage_table
@@ -36,8 +36,8 @@ workflow genbank {
           description: "Genomes to prepare for Genbank submission. One file per genome: all segments/chromosomes included in one file. All fasta files must contain exactly the same number of sequences as reference_fasta (which must equal the number of files in reference_annot_tbl).",
           patterns: ["*.fasta"]
         }
-        reference_accessions_list: {
-          description: "Reference genome Genbank accessions, each segment/chromosome in the exact same count and order as the segments/chromosomes described in assemblies_fasta. This is allowed to be an Array of such accession lists, but if so, all Array[String]s must be identical to each other or an error will be raised.",
+        reference_accessions: {
+          description: "Reference genome Genbank accessions, each segment/chromosome in the exact same count and order as the segments/chromosomes described in assemblies_fasta.",
           patterns: ["*.fasta"]
         }
         author_list: {
@@ -73,20 +73,6 @@ workflow genbank {
 
     }
 
-    # take a array-array-string of scaffolding_chosen_refs output -> uniqified list of reference_accessions -- fail if not exactly one unique array-string across all array-array-strings
-    call utils.unique_arrays as unique_references {
-      input:
-        string_arrays = reference_accessions_list
-    }
-    if((length(unique_references.sorted_unique) != 1)
-      || length(unique_references.sorted_unique[0]) < 1) {
-      call utils.raise {
-        input:
-          message = "all Array[String] reference accession lists in reference_accessions_list must be identical!"
-      }
-    }
-    Array[String] reference_accessions = unique_references.sorted_unique[0]
-
     scatter(segment_acc in reference_accessions) {
       # scatter these calls in order to preserve original order
       call ncbi_tools.fetch_genbank_metadata {
diff --git a/test/input/WDL/cromwell-local/test_inputs-genbank-local.json b/test/input/WDL/cromwell-local/test_inputs-genbank-local.json
index 0d0614d01..e6af063fe 100644
--- a/test/input/WDL/cromwell-local/test_inputs-genbank-local.json
+++ b/test/input/WDL/cromwell-local/test_inputs-genbank-local.json
@@ -2,7 +2,7 @@
   "genbank.molType": "cRNA",
   "genbank.coverage_table": "test/input/genbank/coverage-ma_mgh.txt",
   "genbank.alignments_bams": [],
-  "genbank.reference_accessions_list": [["MN908947.3"],["MN908947.3"],["MN908947.3"]],
+  "genbank.reference_accessions": ["MN908947.3"],
   "genbank.sequencingTech": "Illumina NovaSeq",
   "genbank.biosample_attributes": "test/input/genbank/sars-cov-2_attributes_updated.txt",
   "genbank.prep_genbank.assembly_method": "placeholder assembly software",
diff --git a/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json b/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json
index b5edf88d4..1c938870e 100644
--- a/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json
+++ b/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json
@@ -1,6 +1,6 @@
 {
   "genbank.molType": "cRNA",
-  "genbank.reference_accessions_list": [["KM821997.1", "KM821998.1"]],
+  "genbank.reference_accessions": ["KM821997.1", "KM821998.1"],
   "genbank.coverage_table": "test/input/genbank/coverage_report-RUN1-workflow.txt",
   "genbank.alignments_bams": [],
   "genbank.sequencingTech": "Illumina MiSeq",
diff --git a/test/input/WDL/test_inputs-genbank-local.json b/test/input/WDL/test_inputs-genbank-local.json
index b5edf88d4..1c938870e 100644
--- a/test/input/WDL/test_inputs-genbank-local.json
+++ b/test/input/WDL/test_inputs-genbank-local.json
@@ -1,6 +1,6 @@
 {
   "genbank.molType": "cRNA",
-  "genbank.reference_accessions_list": [["KM821997.1", "KM821998.1"]],
+  "genbank.reference_accessions": ["KM821997.1", "KM821998.1"],
   "genbank.coverage_table": "test/input/genbank/coverage_report-RUN1-workflow.txt",
   "genbank.alignments_bams": [],
   "genbank.sequencingTech": "Illumina MiSeq",

From 6b7ac475bd23c27c39b5552f5d30c843e8e38d6d Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Sat, 10 Dec 2022 16:06:45 -0500
Subject: [PATCH 21/29] ironically the prep_genbank only wants a 2-col coverage
 table

---
 pipes/WDL/workflows/genbank.wdl | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/pipes/WDL/workflows/genbank.wdl b/pipes/WDL/workflows/genbank.wdl
index ce12e38b6..b2ae25c11 100644
--- a/pipes/WDL/workflows/genbank.wdl
+++ b/pipes/WDL/workflows/genbank.wdl
@@ -3,7 +3,7 @@ version 1.0
 import "../tasks/tasks_ncbi.wdl" as ncbi
 import "../tasks/tasks_ncbi_tools.wdl" as ncbi_tools
 import "../tasks/tasks_reports.wdl" as reports
-#import "../tasks/tasks_utils.wdl" as utils
+import "../tasks/tasks_utils.wdl" as utils
 
 workflow genbank {
 
@@ -93,6 +93,11 @@ workflow genbank {
           mapped_bams = alignments_bams,
           mapped_bam_idx = []
       }
+      call utils.tsv_drop_cols as coverage_two_col {
+        input:
+          in_tsv = coverage_report.coverage_report,
+          drop_cols = ["aln2self_cov_median", "aln2self_cov_mean_non0", "aln2self_cov_1X", "aln2self_cov_5X", "aln2self_cov_20X", "aln2self_cov_100X"]
+      }
     }
 
     # TO DO dpark: if ! defined biosample_attributes, call ncbi_tools.fetch_biosamples on external ids (where do we get external ids?)
@@ -127,7 +132,7 @@ workflow genbank {
             authors_sbt        = generate_author_sbt.sbt_file,
             biosampleMap       = biosample_to_genbank.biosample_map,
             genbankSourceTable = biosample_to_genbank.genbank_source_modifier_table,
-            coverage_table     = select_first([coverage_report.coverage_report, coverage_table]),
+            coverage_table     = select_first([coverage_two_col.out_tsv, coverage_table]),
             sequencingTech     = sequencingTech,
             comment            = comment,
             organism           = fetch_genbank_metadata.organism[0],

From a581f4d02d8f7166a91502eed6ea40f1f6abe679 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Sun, 11 Dec 2022 23:18:48 -0500
Subject: [PATCH 22/29] add a place for non-filename-friendly sample names to
 come in and rewrite bam headers and fasta headers

---
 pipes/WDL/workflows/assemble_denovo.wdl | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/pipes/WDL/workflows/assemble_denovo.wdl b/pipes/WDL/workflows/assemble_denovo.wdl
index 569af71bb..c202af262 100644
--- a/pipes/WDL/workflows/assemble_denovo.wdl
+++ b/pipes/WDL/workflows/assemble_denovo.wdl
@@ -3,6 +3,7 @@ version 1.0
 import "../tasks/tasks_taxon_filter.wdl" as taxon_filter
 import "../tasks/tasks_read_utils.wdl" as read_utils
 import "../tasks/tasks_assembly.wdl" as assembly
+import "../tasks/tasks_ncbi.wdl" as ncbi
 import "assemble_refbased.wdl" as assemble_refbased
 
 workflow assemble_denovo {
@@ -27,6 +28,7 @@ workflow assemble_denovo {
     File         trim_clip_db
 
     String       sample_name = basename(basename(reads_unmapped_bams[0], ".bam"), ".cleaned")
+    String?      sample_original_name
   }
 
   parameter_meta {
@@ -53,10 +55,11 @@ workflow assemble_denovo {
     }
   }
 
-  if(length(reads_unmapped_bams)>1) {
+  if(length(reads_unmapped_bams)>1 || defined(sample_original_name)) {
       call read_utils.merge_and_reheader_bams as merge_reads {
           input:
               in_bams      = reads_unmapped_bams,
+              sample_name  = sample_original_name,
               out_basename = sample_name
       }
   }
@@ -107,8 +110,16 @@ workflow assemble_denovo {
           sample_name         = sample_name
   }
 
+  if (defined(sample_original_name)) {
+    call ncbi.rename_fasta_header {
+      input:
+        genome_fasta = refine.assembly_fasta,
+        new_name     = select_first([sample_original_name])
+    }
+  }
+
   output {
-    File    final_assembly_fasta                  = refine.assembly_fasta
+    File    final_assembly_fasta                  = select_first([rename_fasta_header.renamed_fasta, refine.assembly_fasta])
     File    aligned_only_reads_bam                = refine.align_to_self_merged_aligned_only_bam
     File    coverage_plot                         = refine.align_to_self_merged_coverage_plot
     Int     assembly_length                       = refine.assembly_length

From 30b2490188ae778f14b4db98d1ab25060c247922 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Mon, 12 Dec 2022 09:40:07 -0500
Subject: [PATCH 23/29] refashion assemble_denovo to parallelize most of its
 pre-assembly steps

---
 pipes/WDL/tasks/tasks_read_utils.wdl          |  40 ++++---
 pipes/WDL/workflows/assemble_denovo.wdl       | 109 +++++++++++-------
 pipes/WDL/workflows/assemble_refbased.wdl     |   4 +-
 .../test_inputs-assemble_denovo-local.json    |   3 +-
 .../test_outputs-assemble_denovo-local.json   |   3 +-
 5 files changed, 97 insertions(+), 62 deletions(-)

diff --git a/pipes/WDL/tasks/tasks_read_utils.wdl b/pipes/WDL/tasks/tasks_read_utils.wdl
index 612619fd7..46dac190e 100644
--- a/pipes/WDL/tasks/tasks_read_utils.wdl
+++ b/pipes/WDL/tasks/tasks_read_utils.wdl
@@ -151,45 +151,53 @@ task merge_and_reheader_bams {
     
     Int disk_size = 750
 
-    command {
+    command <<<
         set -ex -o pipefail
 
         read_utils.py --version | tee VERSION
         mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)
 
-        if [ ${length(in_bams)} -gt 1 ]; then
-            read_utils.py merge_bams ${sep=' ' in_bams} merged.bam --JVMmemory="$mem_in_mb"m --loglevel DEBUG
+        if [ ~{length(in_bams)} -gt 1 ]; then
+            read_utils.py merge_bams ~{sep=' ' in_bams} merged.bam --JVMmemory="$mem_in_mb"m --loglevel DEBUG
         else
             echo "Skipping merge, only one input file"
-            cp ${sep=' ' in_bams} merged.bam
+            cp ~{sep=' ' in_bams} merged.bam
         fi    
 
         # remap all SM values to user specified value
-        if [ -n "${sample_name}" ]; then
+        if [ -n "~{sample_name}" ]; then
           # create sample name remapping table based on existing sample names
-          samtools view -H merged.bam | perl -n -e'/SM:(\S+)/ && print "SM\t$1\t'"${sample_name}"'\n"' | sort | uniq >> reheader_table.txt
+          samtools view -H merged.bam | perl -n -e'/SM:(\S+)/ && print "SM\t$1\t'"~{sample_name}"'\n"' | sort | uniq >> reheader_table.txt
         fi
 
         # remap arbitrary headers using user specified table
-        if [[ -f "${reheader_table}" ]]; then
-          cat "${reheader_table}" >> reheader_table.txt
+        if [[ -f "~{reheader_table}" ]]; then
+          cat "~{reheader_table}" >> reheader_table.txt
         fi
 
         # reheader bam file if requested
         if [ -s reheader_table.txt ]; then
-          read_utils.py reheader_bam merged.bam reheader_table.txt "${out_basename}.bam" --loglevel DEBUG
+          read_utils.py reheader_bam merged.bam reheader_table.txt "~{out_basename}.bam" --loglevel DEBUG
         else
-          mv merged.bam "${out_basename}.bam"
+          mv merged.bam "~{out_basename}.bam"
         fi
-    }
+
+        # summary stats on merged output
+        samtools view -c "~{out_basename}.bam" | tee read_count_merged
+        samtools flagstat "~{out_basename}.bam" | tee "~{out_basename}.bam.flagstat.txt"
+        reports.py fastqc "~{out_basename}.bam" "~{out_basename}.fastqc.html"
+    >>>
 
     output {
-        File   out_bam          = "${out_basename}.bam"
+        File   out_bam          = "~{out_basename}.bam"
+        Int    read_count       = read_int("read_count_merged")
+        File   flagstat         = "~{out_basename}.bam.flagstat.txt"
+        File   fastqc           = "~{out_basename}.fastqc.html"
         String viralngs_version = read_string("VERSION")
     }
 
     runtime {
-        docker: "${docker}"
+        docker: docker
         memory: "3 GB"
         cpu: 2
         disks:  "local-disk " + disk_size + " LOCAL"
@@ -210,7 +218,7 @@ task rmdup_ubam {
     String  method = "mvicuna"
 
     Int?    machine_mem_gb
-    String? docker = "quay.io/broadinstitute/viral-core:2.1.33"
+    String  docker = "quay.io/broadinstitute/viral-core:2.1.33"
   }
 
   Int disk_size = 375
@@ -246,7 +254,7 @@ task rmdup_ubam {
   }
 
   runtime {
-    docker: "${docker}"
+    docker: docker
     memory: select_first([machine_mem_gb, 7]) + " GB"
     cpu:    2
     disks:  "local-disk " + disk_size + " LOCAL"
@@ -306,7 +314,7 @@ task downsample_bams {
   }
 
   runtime {
-    docker: "${docker}"
+    docker: docker
     memory: select_first([machine_mem_gb, 3]) + " GB"
     cpu:    4
     disks:  "local-disk " + disk_size + " LOCAL"
diff --git a/pipes/WDL/workflows/assemble_denovo.wdl b/pipes/WDL/workflows/assemble_denovo.wdl
index c202af262..8189d50c2 100644
--- a/pipes/WDL/workflows/assemble_denovo.wdl
+++ b/pipes/WDL/workflows/assemble_denovo.wdl
@@ -27,7 +27,7 @@ workflow assemble_denovo {
     File?        filter_to_taxon_db
     File         trim_clip_db
 
-    String       sample_name = basename(basename(reads_unmapped_bams[0], ".bam"), ".cleaned")
+    String       out_basename = basename(basename(reads_unmapped_bams[0], ".bam"), ".cleaned")
     String?      sample_original_name
   }
 
@@ -53,61 +53,91 @@ workflow assemble_denovo {
       description: "After denovo assembly, large contigs are scaffolded against a reference genome to determine orientation and to join contigs together, before further polishing by reads. You must supply at least one reference genome (all segments/chromomes in a single fasta file). If more than one reference is provided, contigs will be scaffolded against all of them and the one with the most complete assembly will be chosen for downstream polishing.",
       patterns: ["*.fasta"]
     }
+    out_basename: { description: "a filename-friendly basename for output files" }
+    sample_original_name: { description: "a (possibly filename-unfriendly) sample name for fasta and bam headers" }
   }
 
-  if(length(reads_unmapped_bams)>1 || defined(sample_original_name)) {
-      call read_utils.merge_and_reheader_bams as merge_reads {
+  # parallelize across provided input read files
+  scatter(reads_unmapped_bam in reads_unmapped_bams) {
+
+    # rename SM value in bam header if requested
+    if(defined(sample_original_name)) {
+      call read_utils.merge_and_reheader_bams as renamed_reads {
           input:
-              in_bams      = reads_unmapped_bams,
+              in_bams      = [reads_unmapped_bam],
               sample_name  = sample_original_name,
-              out_basename = sample_name
+              out_basename = out_basename
       }
-  }
-  File reads_unmapped_bam = select_first([merge_reads.out_bam, reads_unmapped_bams[0]])
+    }
+    File reads_unmapped_renamed_bams = select_first([renamed_reads.out_bam, reads_unmapped_bam])
+
+    # deplete host if requested
+    if(length(deplete_bmtaggerDbs) + length(deplete_blastDbs) + length(deplete_bwaDbs) > 0) {
+      call taxon_filter.deplete_taxa {
+        input:
+          raw_reads_unmapped_bam = reads_unmapped_renamed_bams,
+          bmtaggerDbs            = deplete_bmtaggerDbs,
+          blastDbs               = deplete_blastDbs,
+          bwaDbs                 = deplete_bwaDbs
+      }
+    }
+    File reads_depleted_bams = select_first([deplete_taxa.cleaned_bam, reads_unmapped_renamed_bams]) # when depletion is skipped, fall back to the previous stage (the possibly-renamed bam) so the SM rename is not lost
+
+    # select reads if requested
+    if(defined(filter_to_taxon_db)) {
+      call taxon_filter.filter_to_taxon {
+        input:
+          reads_unmapped_bam = reads_depleted_bams,
+          lastal_db_fasta    = select_first([filter_to_taxon_db])
+      }
+    }
+    File reads_taxfilt_bams = select_first([filter_to_taxon.taxfilt_bam, reads_depleted_bams])
 
-  if(length(deplete_bmtaggerDbs) + length(deplete_blastDbs) + length(deplete_bwaDbs) > 0) {
-    call taxon_filter.deplete_taxa {
+    # alignment-free PCR duplicate removal
+    call read_utils.rmdup_ubam {
       input:
-        raw_reads_unmapped_bam = reads_unmapped_bam,
-        bmtaggerDbs            = deplete_bmtaggerDbs,
-        blastDbs               = deplete_blastDbs,
-        bwaDbs                 = deplete_bwaDbs
+        reads_unmapped_bam = reads_taxfilt_bams
     }
   }
 
-  if(defined(filter_to_taxon_db)) {
-    call taxon_filter.filter_to_taxon {
+  # merge all reads into single file
+  call read_utils.merge_and_reheader_bams as merge_dedup_reads {
       input:
-        reads_unmapped_bam = select_first([deplete_taxa.cleaned_bam, reads_unmapped_bam]),
-        lastal_db_fasta    = select_first([filter_to_taxon_db])
-    }
+          in_bams      = rmdup_ubam.dedup_bam,
+          out_basename = out_basename
   }
-
-  call read_utils.rmdup_ubam {
-    input:
-      reads_unmapped_bam = select_first([filter_to_taxon.taxfilt_bam, deplete_taxa.cleaned_bam, reads_unmapped_bam])
+  call read_utils.merge_and_reheader_bams as merge_cleaned_reads {
+      input:
+          in_bams      = reads_depleted_bams,
+          out_basename = out_basename
+  }
+  call read_utils.merge_and_reheader_bams as merge_taxfilt_reads {
+      input:
+          in_bams      = reads_taxfilt_bams,
+          out_basename = out_basename
   }
 
+  # denovo assembly pipeline below
   call assembly.assemble {
     input:
-      reads_unmapped_bam = rmdup_ubam.dedup_bam,
+      reads_unmapped_bam = merge_dedup_reads.out_bam,
       trim_clip_db       = trim_clip_db,
       always_succeed     = true,
-      sample_name        = sample_name
+      sample_name        = out_basename
   }
 
   call assembly.scaffold {
     input:
       contigs_fasta           = assemble.contigs_fasta,
-      reads_bam               = select_first([filter_to_taxon.taxfilt_bam, deplete_taxa.cleaned_bam, reads_unmapped_bam]),
+      reads_bam               = merge_dedup_reads.out_bam, # NOTE(review): previously this received the pre-dedup taxfilt/cleaned reads; confirm scaffolding on deduplicated reads is intended
       reference_genome_fasta  = reference_genome_fasta
   }
 
   call assemble_refbased.assemble_refbased as refine {
       input:
-          reads_unmapped_bams = [rmdup_ubam.dedup_bam],
+          reads_unmapped_bams = reads_depleted_bams, # assemble_refbased will scatter on individual bams
           reference_fasta     = scaffold.scaffold_fasta,
-          sample_name         = sample_name
+          sample_name         = out_basename
   }
 
   if (defined(sample_original_name)) {
@@ -127,18 +157,17 @@ workflow assemble_denovo {
     Int     reads_aligned                         = refine.align_to_self_merged_reads_aligned
     Float   mean_coverage                         = refine.align_to_self_merged_mean_coverage
     
-    File    cleaned_bam                           = select_first([deplete_taxa.cleaned_bam, reads_unmapped_bam])
-    File?   cleaned_fastqc                        = deplete_taxa.cleaned_fastqc
-    Int?    depletion_read_count_pre              = deplete_taxa.depletion_read_count_pre
-    Int?    depletion_read_count_post             = deplete_taxa.depletion_read_count_post
+    File    cleaned_bam                           = merge_cleaned_reads.out_bam
+    File    cleaned_fastqc                        = merge_cleaned_reads.fastqc
+    Int     depletion_read_count_post             = merge_cleaned_reads.read_count
     
-    File?   taxfilt_bam                           = filter_to_taxon.taxfilt_bam
-    File?   taxfilt_fastqc                        = filter_to_taxon.taxfilt_fastqc
-    Int?    filter_read_count_post                = filter_to_taxon.filter_read_count_post
+    File    taxfilt_bam                           = merge_taxfilt_reads.out_bam
+    File    taxfilt_fastqc                        = merge_taxfilt_reads.fastqc
+    Int     filter_read_count_post                = merge_taxfilt_reads.read_count
     
-    File    dedup_bam                             = rmdup_ubam.dedup_bam
-    File    dedup_fastqc                          = rmdup_ubam.dedup_fastqc
-    Int     dedup_read_count_post                 = rmdup_ubam.dedup_read_count_post
+    File    dedup_bam                             = merge_dedup_reads.out_bam
+    File    dedup_fastqc                          = merge_dedup_reads.fastqc
+    Int     dedup_read_count_post                 = merge_dedup_reads.read_count
     
     File    contigs_fasta                         = assemble.contigs_fasta
     File    subsampBam                            = assemble.subsampBam
@@ -162,15 +191,13 @@ workflow assemble_denovo {
 
     File    isnvs_vcf                             = refine.align_to_self_isnvs_vcf
     
-    File    aligned_bam                           = refine.align_to_self_merged_aligned_and_unaligned_bam[0]
-    File    aligned_only_reads_fastqc             = refine.align_to_ref_per_input_fastqc[0]
+    File    aligned_bam                           = refine.align_to_self_merged_aligned_only_bam
+    File    aligned_only_reads_fastqc             = refine.align_to_ref_fastqc
     File    coverage_tsv                          = refine.align_to_self_merged_coverage_tsv
     Int     read_pairs_aligned                    = refine.align_to_self_merged_read_pairs_aligned
     Float   bases_aligned                         = refine.align_to_self_merged_bases_aligned
     
     String  assembly_method = "viral-ngs/assemble_denovo"
-    String? deplete_viral_classify_version        = deplete_taxa.viralngs_version
-    String? taxfilt_viral_classify_version        = filter_to_taxon.viralngs_version
     String  assemble_viral_assemble_version       = assemble.viralngs_version
     String  scaffold_viral_assemble_version       = scaffold.viralngs_version
   }
diff --git a/pipes/WDL/workflows/assemble_refbased.wdl b/pipes/WDL/workflows/assemble_refbased.wdl
index 06069f746..3001f9dfb 100644
--- a/pipes/WDL/workflows/assemble_refbased.wdl
+++ b/pipes/WDL/workflows/assemble_refbased.wdl
@@ -210,9 +210,9 @@ workflow assemble_refbased {
         Array[File] align_to_ref_per_input_aligned_flagstat      = align_to_ref.aligned_bam_flagstat
         Array[Int]  align_to_ref_per_input_reads_provided        = align_to_ref.reads_provided
         Array[Int]  align_to_ref_per_input_reads_aligned         = align_to_ref.reads_aligned
-        Array[File] align_to_ref_per_input_fastqc                = align_to_ref.aligned_only_reads_fastqc
-        
+
         File        align_to_ref_merged_aligned_trimmed_only_bam = aligned_trimmed_bam
+        File        align_to_ref_fastqc                          = select_first([merge_align_to_ref.fastqc, align_to_ref.aligned_only_reads_fastqc[0]])
         File        align_to_ref_merged_coverage_plot            = plot_ref_coverage.coverage_plot
         File        align_to_ref_merged_coverage_tsv             = plot_ref_coverage.coverage_tsv
         Int         align_to_ref_merged_reads_aligned            = plot_ref_coverage.reads_aligned
diff --git a/test/input/WDL/test_inputs-assemble_denovo-local.json b/test/input/WDL/test_inputs-assemble_denovo-local.json
index b5c46b913..e352b2713 100644
--- a/test/input/WDL/test_inputs-assemble_denovo-local.json
+++ b/test/input/WDL/test_inputs-assemble_denovo-local.json
@@ -5,5 +5,6 @@
   ],
   "assemble_denovo.filter_to_taxon_db": "test/input/ebov-makona.fasta",
   "assemble_denovo.trim_clip_db": "test/input/clipDb.fasta",
-  "assemble_denovo.reference_genome_fasta": ["test/input/ebov-makona.fasta"]
+  "assemble_denovo.reference_genome_fasta": ["test/input/ebov-makona.fasta"],
+  "assemble_denovo.sample_original_name": "USA/ this ? is an un-friendly sample_name! / 1999"
 }
diff --git a/test/input/WDL/test_outputs-assemble_denovo-local.json b/test/input/WDL/test_outputs-assemble_denovo-local.json
index a1a502cf1..485bd6a39 100644
--- a/test/input/WDL/test_outputs-assemble_denovo-local.json
+++ b/test/input/WDL/test_outputs-assemble_denovo-local.json
@@ -8,6 +8,5 @@
     "assemble_denovo.assembly_length_unambiguous": 18843,
     "assemble_denovo.assembly_length": 18843,
     "assemble_denovo.filter_read_count_post": 18710,
-    "assemble_denovo.depletion_read_count_post": 18710,
-    "assemble_denovo.depletion_read_count_pre": 18710
+    "assemble_denovo.depletion_read_count_post": 18710
 }

From f919c64af18c8f42c68b2acb6c6f9054cc51695a Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Mon, 12 Dec 2022 09:45:57 -0500
Subject: [PATCH 24/29] fix in metagenomic_denovo

---
 pipes/WDL/workflows/metagenomic_denovo.wdl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pipes/WDL/workflows/metagenomic_denovo.wdl b/pipes/WDL/workflows/metagenomic_denovo.wdl
index f73a9aa4b..bebf5db73 100644
--- a/pipes/WDL/workflows/metagenomic_denovo.wdl
+++ b/pipes/WDL/workflows/metagenomic_denovo.wdl
@@ -248,8 +248,8 @@ workflow metagenomic_denovo {
 
     File    isnvs_vcf                             = refine.align_to_self_isnvs_vcf
     
-    File    aligned_bam                           = refine.align_to_self_merged_aligned_and_unaligned_bam[0]
-    File    aligned_only_reads_fastqc             = refine.align_to_ref_per_input_fastqc[0]
+    File    aligned_bam                           = refine.align_to_self_merged_aligned_only_bam
+    File    aligned_only_reads_fastqc             = refine.align_to_ref_fastqc
     File    coverage_tsv                          = refine.align_to_self_merged_coverage_tsv
     Int     read_pairs_aligned                    = refine.align_to_self_merged_read_pairs_aligned
     Float   bases_aligned                         = refine.align_to_self_merged_bases_aligned

From 3aaadbf81628ccfe9c591ff127a36499c5123442 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Mon, 12 Dec 2022 09:49:40 -0500
Subject: [PATCH 25/29] fix scaffold_and_refine

---
 pipes/WDL/workflows/scaffold_and_refine.wdl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pipes/WDL/workflows/scaffold_and_refine.wdl b/pipes/WDL/workflows/scaffold_and_refine.wdl
index adb4411df..57926284c 100644
--- a/pipes/WDL/workflows/scaffold_and_refine.wdl
+++ b/pipes/WDL/workflows/scaffold_and_refine.wdl
@@ -53,8 +53,8 @@ workflow scaffold_and_refine {
 
     File   isnvsFile                             = refine.align_to_self_isnvs_vcf
     
-    File   aligned_bam                           = refine.align_to_self_merged_aligned_and_unaligned_bam[0]
-    File   aligned_only_reads_fastqc             = refine.align_to_ref_per_input_fastqc[0]
+    File   aligned_bam                           = refine.align_to_self_merged_aligned_only_bam
+    File   aligned_only_reads_fastqc             = refine.align_to_ref_fastqc
     File   coverage_tsv                          = refine.align_to_self_merged_coverage_tsv
     Int    read_pairs_aligned                    = refine.align_to_self_merged_read_pairs_aligned
     Float  bases_aligned                         = refine.align_to_self_merged_bases_aligned

From 670dafaa9ee8e9fd781bad685beb93feb3e0299c Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Mon, 12 Dec 2022 11:03:57 -0500
Subject: [PATCH 26/29] bump viral-assemble and viral-classify docker tags

---
 pipes/WDL/tasks/tasks_assembly.wdl     |  8 ++++----
 pipes/WDL/tasks/tasks_metagenomics.wdl | 16 ++++++++--------
 pipes/WDL/tasks/tasks_reports.wdl      |  4 ++--
 pipes/WDL/tasks/tasks_taxon_filter.wdl |  6 +++---
 requirements-modules.txt               |  4 ++--
 5 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl
index 8ef66a37f..e1b20dc8b 100644
--- a/pipes/WDL/tasks/tasks_assembly.wdl
+++ b/pipes/WDL/tasks/tasks_assembly.wdl
@@ -15,7 +15,7 @@ task assemble {
       String   sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt")
       
       Int?     machine_mem_gb
-      String   docker = "quay.io/broadinstitute/viral-assemble:2.1.20.2"
+      String   docker = "quay.io/broadinstitute/viral-assemble:2.1.33.0"
     }
 
     Int disk_size = 375
@@ -81,7 +81,7 @@ task scaffold {
       Float?       scaffold_min_pct_contig_aligned
 
       Int?         machine_mem_gb
-      String       docker="quay.io/broadinstitute/viral-assemble:2.1.20.2"
+      String       docker="quay.io/broadinstitute/viral-assemble:2.1.33.0"
 
       # do this in multiple steps in case the input doesn't actually have "assembly1-x" in the name
       String       sample_name = basename(basename(contigs_fasta, ".fasta"), ".assembly1-spades")
@@ -424,7 +424,7 @@ task refine_assembly_with_aligned_reads {
       Int      min_coverage = 3
 
       Int?     machine_mem_gb
-      String   docker = "quay.io/broadinstitute/viral-assemble:2.1.20.2"
+      String   docker = "quay.io/broadinstitute/viral-assemble:2.1.33.0"
     }
 
     Int disk_size = 375
@@ -534,7 +534,7 @@ task refine_2x_and_plot {
       String? plot_coverage_novoalign_options = "-r Random -l 40 -g 40 -x 20 -t 100 -k"
 
       Int?    machine_mem_gb
-      String  docker = "quay.io/broadinstitute/viral-assemble:2.1.20.2"
+      String  docker = "quay.io/broadinstitute/viral-assemble:2.1.33.0"
 
       # do this in two steps in case the input doesn't actually have "cleaned" in the name
       String  sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".cleaned")
diff --git a/pipes/WDL/tasks/tasks_metagenomics.wdl b/pipes/WDL/tasks/tasks_metagenomics.wdl
index add2746cc..d9b806193 100644
--- a/pipes/WDL/tasks/tasks_metagenomics.wdl
+++ b/pipes/WDL/tasks/tasks_metagenomics.wdl
@@ -11,7 +11,7 @@ task krakenuniq {
     File        krona_taxonomy_db_tgz  # taxonomy.tab
 
     Int?        machine_mem_gb
-    String      docker = "quay.io/broadinstitute/viral-classify:2.1.20.0"
+    String      docker = "quay.io/broadinstitute/viral-classify:2.1.33.0"
   }
 
   Int disk_size = 750
@@ -140,7 +140,7 @@ task build_krakenuniq_db {
     Int?     zstd_compression_level
 
     Int?     machine_mem_gb
-    String   docker = "quay.io/broadinstitute/viral-classify:2.1.20.0"
+    String   docker = "quay.io/broadinstitute/viral-classify:2.1.33.0"
   }
 
   Int disk_size = 750
@@ -210,7 +210,7 @@ task kraken2 {
     Int?   min_base_qual
 
     Int?   machine_mem_gb
-    String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0"
   }
 
   parameter_meta {
@@ -345,7 +345,7 @@ task build_kraken2_db {
     Int?          zstd_compression_level
 
     Int?          machine_mem_gb
-    String        docker = "quay.io/broadinstitute/viral-classify:2.1.20.0"
+    String        docker = "quay.io/broadinstitute/viral-classify:2.1.33.0"
   }
 
   Int disk_size = 750
@@ -487,7 +487,7 @@ task blastx {
     File   krona_taxonomy_db_tgz
 
     Int?   machine_mem_gb
-    String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0"
   }
 
   parameter_meta {
@@ -577,7 +577,7 @@ task krona {
     Int?         magnitude_column
 
     Int?         machine_mem_gb
-    String       docker = "quay.io/broadinstitute/viral-classify:2.1.20.0"
+    String       docker = "quay.io/broadinstitute/viral-classify:2.1.33.0"
   }
 
   Int disk_size = 50
@@ -684,7 +684,7 @@ task filter_bam_to_taxa {
     String         out_filename_suffix = "filtered"
 
     Int?           machine_mem_gb
-    String         docker = "quay.io/broadinstitute/viral-classify:2.1.20.0"
+    String         docker = "quay.io/broadinstitute/viral-classify:2.1.33.0"
   }
 
   String out_basename = basename(classified_bam, ".bam") + "." + out_filename_suffix
@@ -771,7 +771,7 @@ task kaiju {
     File   krona_taxonomy_db_tgz  # taxonomy/taxonomy.tab
 
     Int?   machine_mem_gb
-    String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0"
   }
 
   String   input_basename = basename(reads_unmapped_bam, ".bam")
diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl
index d5cb61530..b695ea7ea 100644
--- a/pipes/WDL/tasks/tasks_reports.wdl
+++ b/pipes/WDL/tasks/tasks_reports.wdl
@@ -401,7 +401,7 @@ task aggregate_metagenomics_reports {
     String       aggregate_taxlevel_focus                 = "species"
     Int          aggregate_top_N_hits                     = 5
 
-    String       docker = "quay.io/broadinstitute/viral-classify:2.1.20.0"
+    String       docker = "quay.io/broadinstitute/viral-classify:2.1.33.0"
   }
 
   parameter_meta {
@@ -549,7 +549,7 @@ task compare_two_genomes {
     File   genome_two
     String out_basename
 
-    String docker = "quay.io/broadinstitute/viral-assemble:2.1.20.2"
+    String docker = "quay.io/broadinstitute/viral-assemble:2.1.33.0"
   }
 
   Int disk_size = 50
diff --git a/pipes/WDL/tasks/tasks_taxon_filter.wdl b/pipes/WDL/tasks/tasks_taxon_filter.wdl
index 6a654729d..905acf4be 100644
--- a/pipes/WDL/tasks/tasks_taxon_filter.wdl
+++ b/pipes/WDL/tasks/tasks_taxon_filter.wdl
@@ -14,7 +14,7 @@ task deplete_taxa {
 
     Int?         cpu=8
     Int?         machine_mem_gb
-    String       docker = "quay.io/broadinstitute/viral-classify:2.1.20.0"
+    String       docker = "quay.io/broadinstitute/viral-classify:2.1.33.0"
   }
 
   parameter_meta {
@@ -113,7 +113,7 @@ task filter_to_taxon {
     String?  neg_control_prefixes_space_separated = "neg water NTC"
 
     Int?     machine_mem_gb
-    String   docker = "quay.io/broadinstitute/viral-classify:2.1.20.0"
+    String   docker = "quay.io/broadinstitute/viral-classify:2.1.33.0"
   }
 
   # do this in two steps in case the input doesn't actually have "cleaned" in the name
@@ -172,7 +172,7 @@ task build_lastal_db {
     File   sequences_fasta
 
     Int?   machine_mem_gb
-    String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0"
   }
 
   String db_name = basename(sequences_fasta, ".fasta")
diff --git a/requirements-modules.txt b/requirements-modules.txt
index 100a605a7..71870496b 100644
--- a/requirements-modules.txt
+++ b/requirements-modules.txt
@@ -1,6 +1,6 @@
 broadinstitute/viral-core=2.1.33
-broadinstitute/viral-assemble=2.1.20.2
-broadinstitute/viral-classify=2.1.20.0
+broadinstitute/viral-assemble=2.1.33.0
+broadinstitute/viral-classify=2.1.33.0
 broadinstitute/viral-phylo=2.1.20.0
 broadinstitute/py3-bio=0.1.2
 broadinstitute/beast-beagle-cuda=1.10.5pre

From df0cdd7d0a2997f275d864d8c7dacdf40ca3e73b Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Mon, 12 Dec 2022 11:16:56 -0500
Subject: [PATCH 27/29] bump nextclade

---
 pipes/WDL/tasks/tasks_nextstrain.wdl | 4 ++--
 requirements-modules.txt             | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pipes/WDL/tasks/tasks_nextstrain.wdl b/pipes/WDL/tasks/tasks_nextstrain.wdl
index d12608d31..4d71a512c 100644
--- a/pipes/WDL/tasks/tasks_nextstrain.wdl
+++ b/pipes/WDL/tasks/tasks_nextstrain.wdl
@@ -13,7 +13,7 @@ task nextclade_one_sample {
         File? pcr_primers_csv
         File? virus_properties
         String? dataset_name
-        String docker = "nextstrain/nextclade:2.5.0"
+        String docker = "nextstrain/nextclade:2.9.1"
     }
     String basename = basename(genome_fasta, ".fasta")
     Int disk_size = 50
@@ -100,7 +100,7 @@ task nextclade_many_samples {
         String?      dataset_name
         String       basename
         File?        genome_ids_setdefault_blank
-        String       docker = "nextstrain/nextclade:2.5.0"
+        String       docker = "nextstrain/nextclade:2.9.1"
     }
     Int disk_size = 100
     command <<<
diff --git a/requirements-modules.txt b/requirements-modules.txt
index 71870496b..cf317546e 100644
--- a/requirements-modules.txt
+++ b/requirements-modules.txt
@@ -8,4 +8,4 @@ broadinstitute/ncbi-tools=2.10.7.10
 nextstrain/base=build-20211012T204409Z
 andersenlabapps/ivar=1.3.1
 quay.io/staphb/pangolin=4.1.2-pdata-1.14
-nextstrain/nextclade=2.5.0
+nextstrain/nextclade=2.9.1

From c709a2294f11d06cd351ac8e0c9429b93da091bd Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Mon, 12 Dec 2022 14:19:28 -0500
Subject: [PATCH 28/29] bump viral-phylo including upstream bugfixes

---
 pipes/WDL/tasks/tasks_interhost.wdl  |  6 +++---
 pipes/WDL/tasks/tasks_intrahost.wdl  |  6 +++---
 pipes/WDL/tasks/tasks_ncbi.wdl       | 12 ++++++------
 pipes/WDL/tasks/tasks_nextstrain.wdl |  4 ++--
 requirements-modules.txt             |  2 +-
 5 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/pipes/WDL/tasks/tasks_interhost.wdl b/pipes/WDL/tasks/tasks_interhost.wdl
index 7e51f28bc..b65d81f99 100644
--- a/pipes/WDL/tasks/tasks_interhost.wdl
+++ b/pipes/WDL/tasks/tasks_interhost.wdl
@@ -9,7 +9,7 @@ task multi_align_mafft_ref {
     Float?       mafft_gapOpeningPenalty
 
     Int?         machine_mem_gb
-    String       docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
+    String       docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
   }
 
   String         fasta_basename = basename(reference_fasta, '.fasta')
@@ -56,7 +56,7 @@ task multi_align_mafft {
     Float?       mafft_gapOpeningPenalty
 
     Int?         machine_mem_gb
-    String       docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
+    String       docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
   }
 
   Int disk_size = 200
@@ -282,7 +282,7 @@ task merge_vcfs_gatk {
     File        ref_fasta
 
     Int?        machine_mem_gb
-    String      docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
+    String      docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
 
     String      output_prefix = "merged"
   }
diff --git a/pipes/WDL/tasks/tasks_intrahost.wdl b/pipes/WDL/tasks/tasks_intrahost.wdl
index 3735d7d6b..551f7d450 100644
--- a/pipes/WDL/tasks/tasks_intrahost.wdl
+++ b/pipes/WDL/tasks/tasks_intrahost.wdl
@@ -179,7 +179,7 @@ task isnvs_per_sample {
     Boolean removeDoublyMappedReads = true
 
     Int?    machine_mem_gb
-    String  docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
+    String  docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
 
     String  sample_name = basename(basename(basename(mapped_bam, ".bam"), ".all"), ".mapped")
   }
@@ -222,7 +222,7 @@ task isnvs_vcf {
     Boolean        naiveFilter = false
 
     Int?           machine_mem_gb
-    String         docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
+    String         docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
   }
 
   parameter_meta {
@@ -296,7 +296,7 @@ task annotate_vcf_snpeff {
     String?        emailAddress
 
     Int?           machine_mem_gb
-    String         docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
+    String         docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
 
     String         output_basename = basename(basename(in_vcf, ".gz"), ".vcf")
   }
diff --git a/pipes/WDL/tasks/tasks_ncbi.wdl b/pipes/WDL/tasks/tasks_ncbi.wdl
index 519a6aa6d..73510da29 100644
--- a/pipes/WDL/tasks/tasks_ncbi.wdl
+++ b/pipes/WDL/tasks/tasks_ncbi.wdl
@@ -6,7 +6,7 @@ task download_fasta {
     Array[String]+ accessions
     String         emailAddress
 
-    String         docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
+    String         docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
   }
 
   command {
@@ -38,7 +38,7 @@ task download_annotations {
     String         emailAddress
     String         combined_out_prefix
 
-    String         docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
+    String         docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
   }
 
   command <<<
@@ -85,7 +85,7 @@ task annot_transfer {
     File         reference_fasta
     Array[File]+ reference_feature_table
 
-    String       docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
+    String       docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
   }
 
   parameter_meta {
@@ -139,7 +139,7 @@ task align_and_annot_transfer_single {
     Array[File]+ reference_fastas
     Array[File]+ reference_feature_tables
 
-    String       docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
+    String       docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
   }
 
   parameter_meta {
@@ -566,7 +566,7 @@ task biosample_to_genbank {
     File?   filter_to_ids
 
     Boolean s_dropout_note = true
-    String  docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
+    String  docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
   }
   String base = basename(biosample_attributes, ".txt")
   command {
@@ -732,7 +732,7 @@ task prepare_genbank {
     String?      assembly_method_version
 
     Int?         machine_mem_gb
-    String       docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
+    String       docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
   }
 
   parameter_meta {
diff --git a/pipes/WDL/tasks/tasks_nextstrain.wdl b/pipes/WDL/tasks/tasks_nextstrain.wdl
index 4d71a512c..280f2cca0 100644
--- a/pipes/WDL/tasks/tasks_nextstrain.wdl
+++ b/pipes/WDL/tasks/tasks_nextstrain.wdl
@@ -949,7 +949,7 @@ task mafft_one_chr {
         Boolean  large = false
         Boolean  memsavetree = false
 
-        String   docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
+        String   docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
         Int      mem_size = 500
         Int      cpus = 64
     }
@@ -1037,7 +1037,7 @@ task mafft_one_chr_chunked {
         Int      batch_chunk_size = 2000
         Int      threads_per_job = 2
 
-        String   docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
+        String   docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
         Int      mem_size = 32
         Int      cpus = 96
     }
diff --git a/requirements-modules.txt b/requirements-modules.txt
index cf317546e..e4d8b4c44 100644
--- a/requirements-modules.txt
+++ b/requirements-modules.txt
@@ -1,7 +1,7 @@
 broadinstitute/viral-core=2.1.33
 broadinstitute/viral-assemble=2.1.33.0
 broadinstitute/viral-classify=2.1.33.0
-broadinstitute/viral-phylo=2.1.20.0
+broadinstitute/viral-phylo=2.1.20.1
 broadinstitute/py3-bio=0.1.2
 broadinstitute/beast-beagle-cuda=1.10.5pre
 broadinstitute/ncbi-tools=2.10.7.10

From 6484b9ea0e2c317bb1b8eab71d5021f19ab66145 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Mon, 12 Dec 2022 17:58:15 -0500
Subject: [PATCH 29/29] update viral-phylo docker

---
 pipes/WDL/tasks/tasks_interhost.wdl  |  6 +++---
 pipes/WDL/tasks/tasks_intrahost.wdl  |  6 +++---
 pipes/WDL/tasks/tasks_ncbi.wdl       | 12 ++++++------
 pipes/WDL/tasks/tasks_nextstrain.wdl |  4 ++--
 requirements-modules.txt             |  2 +-
 5 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/pipes/WDL/tasks/tasks_interhost.wdl b/pipes/WDL/tasks/tasks_interhost.wdl
index b65d81f99..bf4fb1b2c 100644
--- a/pipes/WDL/tasks/tasks_interhost.wdl
+++ b/pipes/WDL/tasks/tasks_interhost.wdl
@@ -9,7 +9,7 @@ task multi_align_mafft_ref {
     Float?       mafft_gapOpeningPenalty
 
     Int?         machine_mem_gb
-    String       docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
+    String       docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"
   }
 
   String         fasta_basename = basename(reference_fasta, '.fasta')
@@ -56,7 +56,7 @@ task multi_align_mafft {
     Float?       mafft_gapOpeningPenalty
 
     Int?         machine_mem_gb
-    String       docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
+    String       docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"
   }
 
   Int disk_size = 200
@@ -282,7 +282,7 @@ task merge_vcfs_gatk {
     File        ref_fasta
 
     Int?        machine_mem_gb
-    String      docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
+    String      docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"
 
     String      output_prefix = "merged"
   }
diff --git a/pipes/WDL/tasks/tasks_intrahost.wdl b/pipes/WDL/tasks/tasks_intrahost.wdl
index 551f7d450..d2ad85c84 100644
--- a/pipes/WDL/tasks/tasks_intrahost.wdl
+++ b/pipes/WDL/tasks/tasks_intrahost.wdl
@@ -179,7 +179,7 @@ task isnvs_per_sample {
     Boolean removeDoublyMappedReads = true
 
     Int?    machine_mem_gb
-    String  docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
+    String  docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"
 
     String  sample_name = basename(basename(basename(mapped_bam, ".bam"), ".all"), ".mapped")
   }
@@ -222,7 +222,7 @@ task isnvs_vcf {
     Boolean        naiveFilter = false
 
     Int?           machine_mem_gb
-    String         docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
+    String         docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"
   }
 
   parameter_meta {
@@ -296,7 +296,7 @@ task annotate_vcf_snpeff {
     String?        emailAddress
 
     Int?           machine_mem_gb
-    String         docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
+    String         docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"
 
     String         output_basename = basename(basename(in_vcf, ".gz"), ".vcf")
   }
diff --git a/pipes/WDL/tasks/tasks_ncbi.wdl b/pipes/WDL/tasks/tasks_ncbi.wdl
index 73510da29..beddf490e 100644
--- a/pipes/WDL/tasks/tasks_ncbi.wdl
+++ b/pipes/WDL/tasks/tasks_ncbi.wdl
@@ -6,7 +6,7 @@ task download_fasta {
     Array[String]+ accessions
     String         emailAddress
 
-    String         docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
+    String         docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"
   }
 
   command {
@@ -38,7 +38,7 @@ task download_annotations {
     String         emailAddress
     String         combined_out_prefix
 
-    String         docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
+    String         docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"
   }
 
   command <<<
@@ -85,7 +85,7 @@ task annot_transfer {
     File         reference_fasta
     Array[File]+ reference_feature_table
 
-    String       docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
+    String       docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"
   }
 
   parameter_meta {
@@ -139,7 +139,7 @@ task align_and_annot_transfer_single {
     Array[File]+ reference_fastas
     Array[File]+ reference_feature_tables
 
-    String       docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
+    String       docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"
   }
 
   parameter_meta {
@@ -566,7 +566,7 @@ task biosample_to_genbank {
     File?   filter_to_ids
 
     Boolean s_dropout_note = true
-    String  docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
+    String  docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"
   }
   String base = basename(biosample_attributes, ".txt")
   command {
@@ -732,7 +732,7 @@ task prepare_genbank {
     String?      assembly_method_version
 
     Int?         machine_mem_gb
-    String       docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
+    String       docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"
   }
 
   parameter_meta {
diff --git a/pipes/WDL/tasks/tasks_nextstrain.wdl b/pipes/WDL/tasks/tasks_nextstrain.wdl
index 280f2cca0..d077421ce 100644
--- a/pipes/WDL/tasks/tasks_nextstrain.wdl
+++ b/pipes/WDL/tasks/tasks_nextstrain.wdl
@@ -949,7 +949,7 @@ task mafft_one_chr {
         Boolean  large = false
         Boolean  memsavetree = false
 
-        String   docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
+        String   docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"
         Int      mem_size = 500
         Int      cpus = 64
     }
@@ -1037,7 +1037,7 @@ task mafft_one_chr_chunked {
         Int      batch_chunk_size = 2000
         Int      threads_per_job = 2
 
-        String   docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1"
+        String   docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"
         Int      mem_size = 32
         Int      cpus = 96
     }
diff --git a/requirements-modules.txt b/requirements-modules.txt
index e4d8b4c44..cda9fbce8 100644
--- a/requirements-modules.txt
+++ b/requirements-modules.txt
@@ -1,7 +1,7 @@
 broadinstitute/viral-core=2.1.33
 broadinstitute/viral-assemble=2.1.33.0
 broadinstitute/viral-classify=2.1.33.0
-broadinstitute/viral-phylo=2.1.20.1
+broadinstitute/viral-phylo=2.1.20.2
 broadinstitute/py3-bio=0.1.2
 broadinstitute/beast-beagle-cuda=1.10.5pre
 broadinstitute/ncbi-tools=2.10.7.10