From 03cfdecc5d2446db3903739a102cb8e83e8861a2 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 2 Dec 2022 09:20:43 -0500 Subject: [PATCH 01/29] fix dxWDL namespace clash error --- .dockstore.yml | 4 ++-- .../workflows/{read_depths.wdl => calc_bam_read_depths.wdl} | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) rename pipes/WDL/workflows/{read_depths.wdl => calc_bam_read_depths.wdl} (95%) diff --git a/.dockstore.yml b/.dockstore.yml index eba30d3a5..00d98dd64 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -279,9 +279,9 @@ workflows: primaryDescriptorPath: /pipes/WDL/workflows/sarscov2_lineages.wdl testParameterFiles: - /test/input/WDL/test_inputs-sarscov2_lineages-local.json - - name: read_depths + - name: calc_bam_read_depths subclass: WDL - primaryDescriptorPath: /pipes/WDL/workflows/read_depths.wdl + primaryDescriptorPath: /pipes/WDL/workflows/calc_bam_read_depths.wdl testParameterFiles: - empty.json - name: sarscov2_gisaid_ingest diff --git a/pipes/WDL/workflows/read_depths.wdl b/pipes/WDL/workflows/calc_bam_read_depths.wdl similarity index 95% rename from pipes/WDL/workflows/read_depths.wdl rename to pipes/WDL/workflows/calc_bam_read_depths.wdl index 19d9c056e..9f0479285 100644 --- a/pipes/WDL/workflows/read_depths.wdl +++ b/pipes/WDL/workflows/calc_bam_read_depths.wdl @@ -2,7 +2,7 @@ version 1.0 import "../tasks/tasks_read_utils.wdl" as read_utils -workflow read_depths { +workflow calc_bam_read_depths { meta { description: "Generates read depth tables." 
author: "Broad Viral Genomics" From e0a409d863d4dceb63fa3f2c242310207927f01d Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 2 Dec 2022 09:28:25 -0500 Subject: [PATCH 02/29] drop unused optional param --- pipes/WDL/tasks/tasks_assembly.wdl | 30 ++++++++++--------------- pipes/WDL/workflows/classify_multi.wdl | 1 - pipes/WDL/workflows/classify_single.wdl | 1 - pipes/WDL/workflows/contigs.wdl | 1 - pipes/WDL/workflows/demux_metag.wdl | 1 - pipes/WDL/workflows/demux_plus.wdl | 1 - 6 files changed, 12 insertions(+), 23 deletions(-) diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index 3c625f5a7..b02640c66 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -9,7 +9,6 @@ task assemble { Int spades_min_contig_len = 0 String? spades_options - String assembler = "spades" Boolean always_succeed = false # do this in two steps in case the input doesn't actually have "taxfilt" in the name @@ -30,28 +29,23 @@ task assemble { assembly.py --version | tee VERSION - if [[ "~{assembler}" == "spades" ]]; then - assembly.py assemble_spades \ - ~{reads_unmapped_bam} \ - ~{trim_clip_db} \ - ~{sample_name}.assembly1-~{assembler}.fasta \ - ~{'--nReads=' + spades_n_reads} \ - ~{true="--alwaysSucceed" false="" always_succeed} \ - ~{'--minContigLen=' + spades_min_contig_len} \ - ~{'--spadesOpts="' + spades_options + '"'} \ - --memLimitGb $mem_in_gb \ - --outReads=~{sample_name}.subsamp.bam \ - --loglevel=DEBUG - else - echo "unrecognized assembler ~{assembler}" >&2 - exit 1 - fi + assembly.py assemble_spades \ + ~{reads_unmapped_bam} \ + ~{trim_clip_db} \ + ~{sample_name}.assembly1-spades.fasta \ + ~{'--nReads=' + spades_n_reads} \ + ~{true="--alwaysSucceed" false="" always_succeed} \ + ~{'--minContigLen=' + spades_min_contig_len} \ + ~{'--spadesOpts="' + spades_options + '"'} \ + --memLimitGb $mem_in_gb \ + --outReads=~{sample_name}.subsamp.bam \ + --loglevel=DEBUG samtools view -c ~{sample_name}.subsamp.bam | 
tee subsample_read_count >&2 } output { - File contigs_fasta = "~{sample_name}.assembly1-~{assembler}.fasta" + File contigs_fasta = "~{sample_name}.assembly1-spades.fasta" File subsampBam = "~{sample_name}.subsamp.bam" Int subsample_read_count = read_int("subsample_read_count") String viralngs_version = read_string("VERSION") diff --git a/pipes/WDL/workflows/classify_multi.wdl b/pipes/WDL/workflows/classify_multi.wdl index 507474dd1..dbb45ebcf 100644 --- a/pipes/WDL/workflows/classify_multi.wdl +++ b/pipes/WDL/workflows/classify_multi.wdl @@ -101,7 +101,6 @@ workflow classify_multi { } call assembly.assemble as spades { input: - assembler = "spades", reads_unmapped_bam = rmdup_ubam.dedup_bam, trim_clip_db = trim_clip_db, always_succeed = true diff --git a/pipes/WDL/workflows/classify_single.wdl b/pipes/WDL/workflows/classify_single.wdl index 898c89560..e26c05b34 100644 --- a/pipes/WDL/workflows/classify_single.wdl +++ b/pipes/WDL/workflows/classify_single.wdl @@ -93,7 +93,6 @@ workflow classify_single { } call assembly.assemble as spades { input: - assembler = "spades", reads_unmapped_bam = rmdup_ubam.dedup_bam, trim_clip_db = trim_clip_db, always_succeed = true diff --git a/pipes/WDL/workflows/contigs.wdl b/pipes/WDL/workflows/contigs.wdl index 0f7312be5..08438e337 100644 --- a/pipes/WDL/workflows/contigs.wdl +++ b/pipes/WDL/workflows/contigs.wdl @@ -31,7 +31,6 @@ workflow contigs { call assembly.assemble as spades { input: - assembler = "spades", reads_unmapped_bam = rmdup_ubam.dedup_bam } diff --git a/pipes/WDL/workflows/demux_metag.wdl b/pipes/WDL/workflows/demux_metag.wdl index 575602c4c..f69c2b2a6 100644 --- a/pipes/WDL/workflows/demux_metag.wdl +++ b/pipes/WDL/workflows/demux_metag.wdl @@ -43,7 +43,6 @@ workflow demux_metag { } call assembly.assemble as spades { input: - assembler = "spades", reads_unmapped_bam = rmdup_ubam.dedup_bam, trim_clip_db = trim_clip_db, always_succeed = true diff --git a/pipes/WDL/workflows/demux_plus.wdl 
b/pipes/WDL/workflows/demux_plus.wdl index 19a8c0f5e..69e276af3 100644 --- a/pipes/WDL/workflows/demux_plus.wdl +++ b/pipes/WDL/workflows/demux_plus.wdl @@ -41,7 +41,6 @@ workflow demux_plus { } call assembly.assemble as spades { input: - assembler = "spades", reads_unmapped_bam = deplete.cleaned_bam, trim_clip_db = trim_clip_db, always_succeed = true From d51e192b48b0036fb4911b2d2fa2a78e9145944c Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 2 Dec 2022 09:34:50 -0500 Subject: [PATCH 03/29] expose min_contig_len at scaffolding step --- pipes/WDL/tasks/tasks_assembly.wdl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index b02640c66..880c60fbb 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -6,7 +6,7 @@ task assemble { File trim_clip_db Int spades_n_reads = 10000000 - Int spades_min_contig_len = 0 + Int? spades_min_contig_len String? spades_options Boolean always_succeed = false @@ -77,6 +77,7 @@ task scaffold { Int? nucmer_max_gap Int? nucmer_min_match Int? nucmer_min_cluster + Int? scaffold_min_contig_len Float? scaffold_min_pct_contig_aligned Int? 
machine_mem_gb @@ -100,6 +101,7 @@ task scaffold { ~{contigs_fasta} \ ~{sep=' ' reference_genome_fasta} \ ~{sample_name}.intermediate_scaffold.fasta \ + ~{'--min_contig_len=' + scaffold_min_contig_len} \ ~{'--maxgap=' + nucmer_max_gap} \ ~{'--minmatch=' + nucmer_min_match} \ ~{'--mincluster=' + nucmer_min_cluster} \ From 2de740d907e86bcc597354fe18956d7c88694b4e Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 2 Dec 2022 10:23:26 -0500 Subject: [PATCH 04/29] alter scaffolding_chosen_ref_name -> _names and make it an Array --- pipes/WDL/tasks/tasks_assembly.wdl | 4 ++-- pipes/WDL/workflows/assemble_denovo.wdl | 2 +- pipes/WDL/workflows/metagenomic_denovo.wdl | 2 +- pipes/WDL/workflows/scaffold_and_refine.wdl | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index 880c60fbb..8ef66a37f 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -111,7 +111,7 @@ task scaffold { --outAlternateContigs ~{sample_name}.scaffolding_alt_contigs.fasta \ --loglevel=DEBUG - grep '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | cut -c 2- | tr '\n' '\t' > ~{sample_name}.scaffolding_chosen_ref.txt + grep '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | cut -c 2- | cut -f 1 -d ' ' > ~{sample_name}.scaffolding_chosen_refs.txt assembly.py gapfill_gap2seq \ ~{sample_name}.intermediate_scaffold.fasta \ @@ -142,7 +142,7 @@ task scaffold { File intermediate_gapfill_fasta = "~{sample_name}.intermediate_gapfill.fasta" Int assembly_preimpute_length = read_int("assembly_preimpute_length") Int assembly_preimpute_length_unambiguous = read_int("assembly_preimpute_length_unambiguous") - String scaffolding_chosen_ref_name = read_string("~{sample_name}.scaffolding_chosen_ref.txt") + Array[String] scaffolding_chosen_ref_names = read_lines("~{sample_name}.scaffolding_chosen_refs.txt") File scaffolding_chosen_ref = "~{sample_name}.scaffolding_chosen_ref.fasta" File 
scaffolding_stats = "~{sample_name}.scaffolding_stats.txt" File scaffolding_alt_contigs = "~{sample_name}.scaffolding_alt_contigs.fasta" diff --git a/pipes/WDL/workflows/assemble_denovo.wdl b/pipes/WDL/workflows/assemble_denovo.wdl index a9ae94021..7ba5b5be1 100644 --- a/pipes/WDL/workflows/assemble_denovo.wdl +++ b/pipes/WDL/workflows/assemble_denovo.wdl @@ -129,7 +129,7 @@ workflow assemble_denovo { File intermediate_gapfill_fasta = scaffold.intermediate_gapfill_fasta Int assembly_preimpute_length = scaffold.assembly_preimpute_length Int assembly_preimpute_length_unambiguous = scaffold.assembly_preimpute_length_unambiguous - String scaffolding_chosen_ref_name = scaffold.scaffolding_chosen_ref_name + Array[String] scaffolding_chosen_ref_names = scaffold.scaffolding_chosen_ref_names File scaffolding_stats = scaffold.scaffolding_stats File scaffolding_alt_contigs = scaffold.scaffolding_alt_contigs diff --git a/pipes/WDL/workflows/metagenomic_denovo.wdl b/pipes/WDL/workflows/metagenomic_denovo.wdl index 9abdd116e..f73a9aa4b 100644 --- a/pipes/WDL/workflows/metagenomic_denovo.wdl +++ b/pipes/WDL/workflows/metagenomic_denovo.wdl @@ -235,7 +235,7 @@ workflow metagenomic_denovo { File intermediate_gapfill_fasta = scaffold.intermediate_gapfill_fasta Int assembly_preimpute_length = scaffold.assembly_preimpute_length Int assembly_preimpute_length_unambiguous = scaffold.assembly_preimpute_length_unambiguous - String scaffolding_chosen_ref_name = scaffold.scaffolding_chosen_ref_name + Array[String] scaffolding_chosen_ref_names = scaffold.scaffolding_chosen_ref_names File scaffolding_stats = scaffold.scaffolding_stats File scaffolding_alt_contigs = scaffold.scaffolding_alt_contigs diff --git a/pipes/WDL/workflows/scaffold_and_refine.wdl b/pipes/WDL/workflows/scaffold_and_refine.wdl index b7bfc46aa..adb4411df 100644 --- a/pipes/WDL/workflows/scaffold_and_refine.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine.wdl @@ -40,7 +40,7 @@ workflow scaffold_and_refine { File 
intermediate_gapfill_fasta = scaffold.intermediate_gapfill_fasta Int assembly_preimpute_length = scaffold.assembly_preimpute_length Int assembly_preimpute_length_unambiguous = scaffold.assembly_preimpute_length_unambiguous - String scaffolding_chosen_ref_name = scaffold.scaffolding_chosen_ref_name + Array[String] scaffolding_chosen_ref_names = scaffold.scaffolding_chosen_ref_names File scaffolding_stats = scaffold.scaffolding_stats File scaffolding_alt_contigs = scaffold.scaffolding_alt_contigs From ddf4688ec8b2e3bff62e8d358774cfd7a22e5c17 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 2 Dec 2022 12:24:24 -0500 Subject: [PATCH 05/29] dust off genbank workflow to do more work for you --- pipes/WDL/tasks/tasks_ncbi_tools.wdl | 29 ++++++++++++ pipes/WDL/workflows/genbank.wdl | 66 ++++++++++++++++------------ 2 files changed, 66 insertions(+), 29 deletions(-) diff --git a/pipes/WDL/tasks/tasks_ncbi_tools.wdl b/pipes/WDL/tasks/tasks_ncbi_tools.wdl index 4bbcea347..8605181dd 100644 --- a/pipes/WDL/tasks/tasks_ncbi_tools.wdl +++ b/pipes/WDL/tasks/tasks_ncbi_tools.wdl @@ -145,6 +145,35 @@ task Fetch_SRA_to_BAM { } } +task fetch_genbank_metadata { + input { + String genbank_accession + String docker = "quay.io/broadinstitute/ncbi-tools:2.10.7.10" + } + Int disk_size = 50 + command <<< + set -e + esearch -db nuccore -q "~{genbank_accession}" | efetch -db nuccore -format gb -mode xml -json > gb.json + jq -r '[.GBSet.GBSeq."GBSeq_feature-table".GBFeature[0].GBFeature_quals.GBQualifier|.[]| {(.GBQualifier_name): .GBQualifier_value}]|add ' gb.json > metadata.json + jq -r '.db_xref' meta.json | grep ^taxon: | cut -f 2 -d : > taxid.txt + jq -r '.organism' meta.json > organism.txt + >>> + output { + Map[String,String] metadata = read_json("metadata.json") + String taxid = read_string("taxid.txt") + String organism = read_string("organism.txt") + } + runtime { + cpu: 1 + memory: "1 GB" + disks: "local-disk " + disk_size + " LOCAL" + disk: disk_size + " GB" # TES + 
dx_instance_type: "mem1_ssd1_v2_x2" + docker: docker + maxRetries: 2 + } +} + task biosample_tsv_filter_preexisting { input { File meta_submit_tsv diff --git a/pipes/WDL/workflows/genbank.wdl b/pipes/WDL/workflows/genbank.wdl index 3c8a183a8..5d6a1bd99 100644 --- a/pipes/WDL/workflows/genbank.wdl +++ b/pipes/WDL/workflows/genbank.wdl @@ -1,7 +1,8 @@ version 1.0 -import "../tasks/tasks_interhost.wdl" as interhost import "../tasks/tasks_ncbi.wdl" as ncbi +import "../tasks/tasks_ncbi_tools.wdl" as ncbi_tools +import "../tasks/tasks_reports.wdl" as reports workflow genbank { @@ -13,20 +14,18 @@ workflow genbank { } input { - Array[File]+ reference_fastas - Array[File]+ reference_feature_tables - Array[File]+ assemblies_fasta + Array[File]+ assemblies_fasta + Array[File]+ alignments_bams + Array[String]+ reference_accessions - String? author_list # of the form "Lastname,A.B., Lastname,C.,"; optional alternative to names in author_sbt_defaults_yaml + String email_address # required for fetching data from NCBI APIs + String author_list # of the form "Lastname,A.B., Lastname,C.,"; optional alternative to names in author_sbt_defaults_yaml File author_sbt_defaults_yaml # defaults to fill in for author_sbt file (including both author and non-author fields) File author_sbt_j2_template File biosample_attributes - Int taxid - File? coverage_table - String? sequencingTech + String sequencingTech String? comment - String? organism - String? molType='cRNA' + String molType='cRNA' } parameter_meta { @@ -34,14 +33,10 @@ workflow genbank { description: "Genomes to prepare for Genbank submission. One file per genome: all segments/chromosomes included in one file. 
All fasta files must contain exactly the same number of sequences as reference_fasta (which must equal the number of files in reference_annot_tbl).", patterns: ["*.fasta"] } - reference_fastas: { - description: "Reference genome, each segment/chromosome in a separate fasta file, in the exact same count and order as the segments/chromosomes described in genome_fasta. Headers must be Genbank accessions.", + reference_accessions: { + description: "Reference genome Genbank accessions, each segment/chromosome in the exact same count and order as the segments/chromosomes described in genome_fasta.", patterns: ["*.fasta"] } - reference_feature_tables: { - description: "NCBI Genbank feature table, each segment/chromosome in a separate TBL file, in the exact same count and order as the segments/chromosomes described in genome_fasta and reference_fastas. Accession numbers in the TBL files must correspond exactly to those in reference_fasta.", - patterns: ["*.tbl"] - } author_list: { description: "A string containing a space-delimited list with of author surnames separated by first name and (optional) middle initial. Ex. 'Lastname,Firstname, Last-hypenated,First,M., Last,F.'" } @@ -56,9 +51,6 @@ workflow genbank { description: "A post-submission attributes file from NCBI BioSample, which is available at https://submit.ncbi.nlm.nih.gov/subs/ and clicking on 'Download attributes file with BioSample accessions'.", patterns: ["*.txt", "*.tsv"] } - taxid: { - description: "The NCBI taxonomy ID for the species being submitted in this batch (all sequences in this batch must belong to the same taxid). https://www.ncbi.nlm.nih.gov/taxonomy/" - } coverage_table: { description: "A two column tab text file mapping sample IDs (first column) to average sequencing coverage (second column, floating point number).", patterns: ["*.txt", "*.tsv"], @@ -68,10 +60,6 @@ workflow genbank { description: "The type of sequencer used to generate reads. 
NCBI has a controlled vocabulary for this value which can be found here: https://submit.ncbi.nlm.nih.gov/structcomment/nongenomes/", category: "common" } - organism: { - description: "The scientific name for the organism being submitted. This is typically the species name and should match the name given by the NCBI Taxonomy database. For more info, see: https://www.ncbi.nlm.nih.gov/Sequin/sequin.hlp.html#Organism", - category: "common" - } molType: { description: "The type of molecule being described. This defaults to 'cRNA' as this pipeline is most commonly used for viral submissions, but any value allowed by the INSDC controlled vocabulary may be used here. Valid values are described at http://www.insdc.org/controlled-vocabulary-moltype-qualifier", category: "common" @@ -82,19 +70,39 @@ workflow genbank { } + scatter(segment_acc in reference_accessions) { + call ncbi_tools.fetch_genbank_metadata { + input: + genbank_accession = segment_acc + } + call ncbi.download_annotations { + input: + accessions = [segment_acc], + emailAddress = email_address, + combined_out_prefix = segment_acc + } + } + + call reports.coverage_report { + input: + mapped_bams = alignments_bams, + mapped_bam_idx = [] + } + call ncbi.biosample_to_genbank { input: biosample_attributes = biosample_attributes, - num_segments = length(reference_fastas), - taxid = taxid + num_segments = length(reference_accessions), + taxid = fetch_genbank_metadata.taxid[0], + s_dropout_note = false } scatter(assembly in assemblies_fasta) { call ncbi.align_and_annot_transfer_single as annot { input: genome_fasta = assembly, - reference_fastas = reference_fastas, - reference_feature_tables = reference_feature_tables + reference_fastas = flatten(download_annotations.genomes_fasta), + reference_feature_tables = flatten(download_annotations.features_tbl) } } @@ -112,10 +120,10 @@ workflow genbank { authors_sbt = generate_author_sbt.sbt_file, biosampleMap = biosample_to_genbank.biosample_map, genbankSourceTable = 
biosample_to_genbank.genbank_source_modifier_table, - coverage_table = coverage_table, + coverage_table = coverage_report.coverage_report, sequencingTech = sequencingTech, comment = comment, - organism = organism, + organism = fetch_genbank_metadata.organism[0], molType = molType } From cbd49eb4a97e39b5c8a718646afd6dbbda73f6ae Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 2 Dec 2022 13:38:46 -0500 Subject: [PATCH 06/29] allow old coverage table optionally --- pipes/WDL/workflows/genbank.wdl | 16 ++++++++++------ test/input/WDL/test_inputs-genbank-local.json | 12 ++---------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/pipes/WDL/workflows/genbank.wdl b/pipes/WDL/workflows/genbank.wdl index 5d6a1bd99..5190141b8 100644 --- a/pipes/WDL/workflows/genbank.wdl +++ b/pipes/WDL/workflows/genbank.wdl @@ -15,9 +15,11 @@ workflow genbank { input { Array[File]+ assemblies_fasta - Array[File]+ alignments_bams Array[String]+ reference_accessions + Array[File] alignments_bams + File? 
coverage_table + String email_address # required for fetching data from NCBI APIs String author_list # of the form "Lastname,A.B., Lastname,C.,"; optional alternative to names in author_sbt_defaults_yaml File author_sbt_defaults_yaml # defaults to fill in for author_sbt file (including both author and non-author fields) @@ -83,10 +85,12 @@ workflow genbank { } } - call reports.coverage_report { - input: - mapped_bams = alignments_bams, - mapped_bam_idx = [] + if(length(alignments_bams)>0) { + call reports.coverage_report { + input: + mapped_bams = alignments_bams, + mapped_bam_idx = [] + } } call ncbi.biosample_to_genbank { @@ -120,7 +124,7 @@ workflow genbank { authors_sbt = generate_author_sbt.sbt_file, biosampleMap = biosample_to_genbank.biosample_map, genbankSourceTable = biosample_to_genbank.genbank_source_modifier_table, - coverage_table = coverage_report.coverage_report, + coverage_table = select_first([coverage_report.coverage_report, coverage_table]), sequencingTech = sequencingTech, comment = comment, organism = fetch_genbank_metadata.organism[0], diff --git a/test/input/WDL/test_inputs-genbank-local.json b/test/input/WDL/test_inputs-genbank-local.json index b310a7ef3..9be3c48ba 100644 --- a/test/input/WDL/test_inputs-genbank-local.json +++ b/test/input/WDL/test_inputs-genbank-local.json @@ -1,13 +1,8 @@ { "genbank.molType": "cRNA", + "genbank.reference_accessions": ["KM821997.1", "KM821998.1"], "genbank.coverage_table": "test/input/genbank/coverage_report-RUN1-workflow.txt", - "genbank.reference_feature_tables": [ - "test/input/genbank/KM821997.1.tbl", - "test/input/genbank/KM821998.1.tbl" - ], - "genbank.organism": "Lassa mammarenavirus", "genbank.sequencingTech": "Illumina MiSeq", - "genbank.taxid": 11620, "genbank.biosample_attributes": "test/input/genbank/biosample-attributes-lasv.txt", "genbank.prep_genbank.assembly_method": "placeholder assembly software", "genbank.prep_genbank.assembly_method_version": "5.4.3.2.1", @@ -20,9 +15,6 @@ 
"test/input/LASV_NGA_2018_0097.fasta", "test/input/LASV_NGA_2018_0541.fasta" ], - "genbank.reference_fastas": [ - "test/input/genbank/KM821997.1.fasta", - "test/input/genbank/KM821998.1.fasta" - ] + "genbank.email_address": "viral-ngs@broadinstitute.org" } From 5845dae9ff907183dc722f7bf8d16c344ec8db6f Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 2 Dec 2022 15:02:46 -0500 Subject: [PATCH 07/29] update both input files --- .../cromwell-local/test_inputs-genbank-local.json | 10 ++-------- .../WDL/miniwdl-local/test_inputs-genbank-local.json | 12 ++---------- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/test/input/WDL/cromwell-local/test_inputs-genbank-local.json b/test/input/WDL/cromwell-local/test_inputs-genbank-local.json index cdab9805b..907a8de33 100644 --- a/test/input/WDL/cromwell-local/test_inputs-genbank-local.json +++ b/test/input/WDL/cromwell-local/test_inputs-genbank-local.json @@ -1,12 +1,8 @@ { "genbank.molType": "cRNA", "genbank.coverage_table": "test/input/genbank/coverage-ma_mgh.txt", - "genbank.reference_feature_tables": [ - "test/input/genbank/MN908947.3.tbl" - ], - "genbank.organism": "Severe acute respiratory syndrome coronavirus 2", + "genbank.reference_accessions": ["MN908947.3"], "genbank.sequencingTech": "Illumina NovaSeq", - "genbank.taxid": 2697049, "genbank.biosample_attributes": "test/input/genbank/sars-cov-2_attributes_updated.txt", "genbank.prep_genbank.assembly_method": "placeholder assembly software", "genbank.prep_genbank.assembly_method_version": "5.4.3.2.1", @@ -19,8 +15,6 @@ "test/input/MA_MGH_00004.fasta", "test/input/MA_MGH_00005.fasta" ], - "genbank.reference_fastas": [ - "test/input/genbank/MN908947.3.fasta" - ] + "genbank.email_address": "viral-ngs@broadinstitute.org" } diff --git a/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json b/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json index b310a7ef3..9be3c48ba 100644 --- a/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json 
+++ b/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json @@ -1,13 +1,8 @@ { "genbank.molType": "cRNA", + "genbank.reference_accessions": ["KM821997.1", "KM821998.1"], "genbank.coverage_table": "test/input/genbank/coverage_report-RUN1-workflow.txt", - "genbank.reference_feature_tables": [ - "test/input/genbank/KM821997.1.tbl", - "test/input/genbank/KM821998.1.tbl" - ], - "genbank.organism": "Lassa mammarenavirus", "genbank.sequencingTech": "Illumina MiSeq", - "genbank.taxid": 11620, "genbank.biosample_attributes": "test/input/genbank/biosample-attributes-lasv.txt", "genbank.prep_genbank.assembly_method": "placeholder assembly software", "genbank.prep_genbank.assembly_method_version": "5.4.3.2.1", @@ -20,9 +15,6 @@ "test/input/LASV_NGA_2018_0097.fasta", "test/input/LASV_NGA_2018_0541.fasta" ], - "genbank.reference_fastas": [ - "test/input/genbank/KM821997.1.fasta", - "test/input/genbank/KM821998.1.fasta" - ] + "genbank.email_address": "viral-ngs@broadinstitute.org" } From 60825459fb5e2688b955f846e899b8689e6d60ad Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 2 Dec 2022 16:39:27 -0500 Subject: [PATCH 08/29] test with empty input --- test/input/WDL/cromwell-local/test_inputs-genbank-local.json | 1 + test/input/WDL/miniwdl-local/test_inputs-genbank-local.json | 1 + test/input/WDL/test_inputs-genbank-local.json | 1 + 3 files changed, 3 insertions(+) diff --git a/test/input/WDL/cromwell-local/test_inputs-genbank-local.json b/test/input/WDL/cromwell-local/test_inputs-genbank-local.json index 907a8de33..e6af063fe 100644 --- a/test/input/WDL/cromwell-local/test_inputs-genbank-local.json +++ b/test/input/WDL/cromwell-local/test_inputs-genbank-local.json @@ -1,6 +1,7 @@ { "genbank.molType": "cRNA", "genbank.coverage_table": "test/input/genbank/coverage-ma_mgh.txt", + "genbank.alignments_bams": [], "genbank.reference_accessions": ["MN908947.3"], "genbank.sequencingTech": "Illumina NovaSeq", "genbank.biosample_attributes": 
"test/input/genbank/sars-cov-2_attributes_updated.txt", diff --git a/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json b/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json index 9be3c48ba..1c938870e 100644 --- a/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json +++ b/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json @@ -2,6 +2,7 @@ "genbank.molType": "cRNA", "genbank.reference_accessions": ["KM821997.1", "KM821998.1"], "genbank.coverage_table": "test/input/genbank/coverage_report-RUN1-workflow.txt", + "genbank.alignments_bams": [], "genbank.sequencingTech": "Illumina MiSeq", "genbank.biosample_attributes": "test/input/genbank/biosample-attributes-lasv.txt", "genbank.prep_genbank.assembly_method": "placeholder assembly software", diff --git a/test/input/WDL/test_inputs-genbank-local.json b/test/input/WDL/test_inputs-genbank-local.json index 9be3c48ba..1c938870e 100644 --- a/test/input/WDL/test_inputs-genbank-local.json +++ b/test/input/WDL/test_inputs-genbank-local.json @@ -2,6 +2,7 @@ "genbank.molType": "cRNA", "genbank.reference_accessions": ["KM821997.1", "KM821998.1"], "genbank.coverage_table": "test/input/genbank/coverage_report-RUN1-workflow.txt", + "genbank.alignments_bams": [], "genbank.sequencingTech": "Illumina MiSeq", "genbank.biosample_attributes": "test/input/genbank/biosample-attributes-lasv.txt", "genbank.prep_genbank.assembly_method": "placeholder assembly software", From 5ce0b814445a01e87c30ad9a2022e001eacc551c Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 2 Dec 2022 19:58:21 -0500 Subject: [PATCH 09/29] bugfixes --- pipes/WDL/tasks/tasks_ncbi.wdl | 1 + pipes/WDL/tasks/tasks_ncbi_tools.wdl | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pipes/WDL/tasks/tasks_ncbi.wdl b/pipes/WDL/tasks/tasks_ncbi.wdl index 33b01e8b2..9d6324f01 100644 --- a/pipes/WDL/tasks/tasks_ncbi.wdl +++ b/pipes/WDL/tasks/tasks_ncbi.wdl @@ -54,6 +54,7 @@ task download_annotations { ./ \ ${sep=' ' accessions} 
\ --combinedFilePrefix "${combined_out_prefix}" \ + --forceOverwrite \ --loglevel DEBUG } diff --git a/pipes/WDL/tasks/tasks_ncbi_tools.wdl b/pipes/WDL/tasks/tasks_ncbi_tools.wdl index 8605181dd..1bad0dff2 100644 --- a/pipes/WDL/tasks/tasks_ncbi_tools.wdl +++ b/pipes/WDL/tasks/tasks_ncbi_tools.wdl @@ -154,12 +154,12 @@ task fetch_genbank_metadata { command <<< set -e esearch -db nuccore -q "~{genbank_accession}" | efetch -db nuccore -format gb -mode xml -json > gb.json - jq -r '[.GBSet.GBSeq."GBSeq_feature-table".GBFeature[0].GBFeature_quals.GBQualifier|.[]| {(.GBQualifier_name): .GBQualifier_value}]|add ' gb.json > metadata.json - jq -r '.db_xref' meta.json | grep ^taxon: | cut -f 2 -d : > taxid.txt - jq -r '.organism' meta.json > organism.txt + jq -r '[.GBSet.GBSeq."GBSeq_feature-table".GBFeature[0].GBFeature_quals.GBQualifier|.[]|{(.GBQualifier_name): .GBQualifier_value}]|add ' gb.json > "~{genbank_accession}".metadata.json + jq -r '.db_xref' "~{genbank_accession}".metadata.json | grep ^taxon: | cut -f 2 -d : > taxid.txt + jq -r '.organism' "~{genbank_accession}".metadata.json > organism.txt >>> output { - Map[String,String] metadata = read_json("metadata.json") + Map[String,String] metadata = read_json("~{genbank_accession}.metadata.json") String taxid = read_string("taxid.txt") String organism = read_string("organism.txt") } From da76be531710d4d8324fdbd0028d7bac7a135753 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 2 Dec 2022 20:07:39 -0500 Subject: [PATCH 10/29] try other output --- pipes/WDL/workflows/genbank.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/workflows/genbank.wdl b/pipes/WDL/workflows/genbank.wdl index 5190141b8..600af4146 100644 --- a/pipes/WDL/workflows/genbank.wdl +++ b/pipes/WDL/workflows/genbank.wdl @@ -105,7 +105,7 @@ workflow genbank { call ncbi.align_and_annot_transfer_single as annot { input: genome_fasta = assembly, - reference_fastas = flatten(download_annotations.genomes_fasta), + 
reference_fastas = download_annotations.combined_fasta, reference_feature_tables = flatten(download_annotations.features_tbl) } } From 2341f07c5b6900ec8e9aa6ef0a4c5e1a3cd26aaa Mon Sep 17 00:00:00 2001 From: Danny Park Date: Mon, 5 Dec 2022 12:45:49 -0500 Subject: [PATCH 11/29] fix wdl task download_annotations file namespace error when combined_out_prefix happens to be an accession; also prevent combined fasta from showing up in genomes_fasta --- pipes/WDL/tasks/tasks_ncbi.wdl | 17 +++++++++-------- pipes/WDL/workflows/genbank.wdl | 3 ++- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pipes/WDL/tasks/tasks_ncbi.wdl b/pipes/WDL/tasks/tasks_ncbi.wdl index 9d6324f01..519a6aa6d 100644 --- a/pipes/WDL/tasks/tasks_ncbi.wdl +++ b/pipes/WDL/tasks/tasks_ncbi.wdl @@ -41,25 +41,26 @@ task download_annotations { String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0" } - command { + command <<< set -ex -o pipefail ncbi.py --version | tee VERSION ncbi.py fetch_feature_tables \ - ${emailAddress} \ + ~{emailAddress} \ ./ \ - ${sep=' ' accessions} \ + ~{sep=' ' accessions} \ --loglevel DEBUG + mkdir -p combined ncbi.py fetch_fastas \ - ${emailAddress} \ + ~{emailAddress} \ ./ \ - ${sep=' ' accessions} \ - --combinedFilePrefix "${combined_out_prefix}" \ + ~{sep=' ' accessions} \ + --combinedFilePrefix "combined/~{combined_out_prefix}" \ --forceOverwrite \ --loglevel DEBUG - } + >>> output { - File combined_fasta = "${combined_out_prefix}.fasta" + File combined_fasta = "~{combined_out_prefix}.fasta" Array[File] genomes_fasta = glob("*.fasta") Array[File] features_tbl = glob("*.tbl") String viralngs_version = read_string("VERSION") diff --git a/pipes/WDL/workflows/genbank.wdl b/pipes/WDL/workflows/genbank.wdl index 600af4146..40cda9a80 100644 --- a/pipes/WDL/workflows/genbank.wdl +++ b/pipes/WDL/workflows/genbank.wdl @@ -73,6 +73,7 @@ workflow genbank { } scatter(segment_acc in reference_accessions) { + # scatter these calls in order to preserve original order 
call ncbi_tools.fetch_genbank_metadata { input: genbank_accession = segment_acc @@ -105,7 +106,7 @@ workflow genbank { call ncbi.align_and_annot_transfer_single as annot { input: genome_fasta = assembly, - reference_fastas = download_annotations.combined_fasta, + reference_fastas = flatten(download_annotations.genomes_fasta), reference_feature_tables = flatten(download_annotations.features_tbl) } } From 93bd6bf938ea42613df4b69dd4c5f3c7f7a4a427 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Tue, 6 Dec 2022 18:43:52 +0100 Subject: [PATCH 12/29] test a fix for miniwdl --- pipes/WDL/tasks/tasks_ncbi_tools.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/pipes/WDL/tasks/tasks_ncbi_tools.wdl b/pipes/WDL/tasks/tasks_ncbi_tools.wdl index 1bad0dff2..00635105d 100644 --- a/pipes/WDL/tasks/tasks_ncbi_tools.wdl +++ b/pipes/WDL/tasks/tasks_ncbi_tools.wdl @@ -153,6 +153,7 @@ task fetch_genbank_metadata { Int disk_size = 50 command <<< set -e + source activate $CONDA_DEFAULT_ENV # for miniwdl / non-login docker runners esearch -db nuccore -q "~{genbank_accession}" | efetch -db nuccore -format gb -mode xml -json > gb.json jq -r '[.GBSet.GBSeq."GBSeq_feature-table".GBFeature[0].GBFeature_quals.GBQualifier|.[]|{(.GBQualifier_name): .GBQualifier_value}]|add ' gb.json > "~{genbank_accession}".metadata.json jq -r '.db_xref' "~{genbank_accession}".metadata.json | grep ^taxon: | cut -f 2 -d : > taxid.txt From 0d32530f8680a1ef39172fec1e3b6e91f324846e Mon Sep 17 00:00:00 2001 From: Danny Park Date: Tue, 6 Dec 2022 19:09:09 +0100 Subject: [PATCH 13/29] try again --- pipes/WDL/tasks/tasks_ncbi_tools.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/tasks/tasks_ncbi_tools.wdl b/pipes/WDL/tasks/tasks_ncbi_tools.wdl index 00635105d..5361a63fc 100644 --- a/pipes/WDL/tasks/tasks_ncbi_tools.wdl +++ b/pipes/WDL/tasks/tasks_ncbi_tools.wdl @@ -153,7 +153,7 @@ task fetch_genbank_metadata { Int disk_size = 50 command <<< set -e - source activate $CONDA_DEFAULT_ENV # 
for miniwdl / non-login docker runners + activate $CONDA_DEFAULT_ENV # for miniwdl / non-login docker runners esearch -db nuccore -q "~{genbank_accession}" | efetch -db nuccore -format gb -mode xml -json > gb.json jq -r '[.GBSet.GBSeq."GBSeq_feature-table".GBFeature[0].GBFeature_quals.GBQualifier|.[]|{(.GBQualifier_name): .GBQualifier_value}]|add ' gb.json > "~{genbank_accession}".metadata.json jq -r '.db_xref' "~{genbank_accession}".metadata.json | grep ^taxon: | cut -f 2 -d : > taxid.txt From 4f7464d870c9f98c397bc45895ed1b4ca5d54f1c Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 8 Dec 2022 09:06:23 +0100 Subject: [PATCH 14/29] try activate conda again --- pipes/WDL/tasks/tasks_ncbi_tools.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/tasks/tasks_ncbi_tools.wdl b/pipes/WDL/tasks/tasks_ncbi_tools.wdl index 5361a63fc..64d0328db 100644 --- a/pipes/WDL/tasks/tasks_ncbi_tools.wdl +++ b/pipes/WDL/tasks/tasks_ncbi_tools.wdl @@ -153,7 +153,7 @@ task fetch_genbank_metadata { Int disk_size = 50 command <<< set -e - activate $CONDA_DEFAULT_ENV # for miniwdl / non-login docker runners + source /opt/miniconda/bin/activate $CONDA_DEFAULT_ENV # for miniwdl / non-login docker runners esearch -db nuccore -q "~{genbank_accession}" | efetch -db nuccore -format gb -mode xml -json > gb.json jq -r '[.GBSet.GBSeq."GBSeq_feature-table".GBFeature[0].GBFeature_quals.GBQualifier|.[]|{(.GBQualifier_name): .GBQualifier_value}]|add ' gb.json > "~{genbank_accession}".metadata.json jq -r '.db_xref' "~{genbank_accession}".metadata.json | grep ^taxon: | cut -f 2 -d : > taxid.txt From 61d9a41f980e573aeb7a485bdd6c56db50b25aba Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 9 Dec 2022 14:06:27 -0500 Subject: [PATCH 15/29] make assemble_denovo accept multiple input bams --- pipes/WDL/workflows/assemble_denovo.wdl | 15 ++++++++++++--- .../test_inputs-assemble_denovo-dnanexus.dx.json | 2 +- .../WDL/test_inputs-assemble_denovo-local.json | 2 +- 3 files changed,
14 insertions(+), 5 deletions(-) diff --git a/pipes/WDL/workflows/assemble_denovo.wdl b/pipes/WDL/workflows/assemble_denovo.wdl index 7ba5b5be1..4b1fe6dff 100644 --- a/pipes/WDL/workflows/assemble_denovo.wdl +++ b/pipes/WDL/workflows/assemble_denovo.wdl @@ -15,7 +15,7 @@ workflow assemble_denovo { } input { - File reads_unmapped_bam + Array[File]+ reads_unmapped_bams Array[File]+ reference_genome_fasta @@ -26,11 +26,11 @@ workflow assemble_denovo { File? filter_to_taxon_db File trim_clip_db - String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".cleaned") + String sample_name = basename(basename(reads_unmapped_bams[0], ".bam"), ".cleaned") } parameter_meta { - raw_reads_unmapped_bam: { description: "unaligned reads in BAM format", patterns: ["*.bam"] } + raw_reads_unmapped_bams: { description: "unaligned reads in BAM format", patterns: ["*.bam"] } deplete_bmtaggerDbs: { description: "Optional list of databases to use for bmtagger-based depletion. Sequences in fasta format will be indexed on the fly, pre-bmtagger-indexed databases may be provided as tarballs.", patterns: ["*.fasta", "*.fasta.gz", "*.tar.gz", "*.tar.lz4", "*.tar.bz2", "*.tar.zst"] @@ -53,6 +53,15 @@ workflow assemble_denovo { } } + if(length(reads_unmapped_bams)>1) { + call read_utils.merge_and_reheader_bams as merge_reads { + input: + in_bams = reads_unmapped_bams, + out_basename = sample_name + } + } + File reads_unmapped_bam = select_first([merge_reads.out_bam, reads_unmapped_bams[0]]) + if(length(deplete_bmtaggerDbs) + length(deplete_blastDbs) + length(deplete_bwaDbs) > 0) { call taxon_filter.deplete_taxa { input: diff --git a/test/input/WDL/test_inputs-assemble_denovo-dnanexus.dx.json b/test/input/WDL/test_inputs-assemble_denovo-dnanexus.dx.json index 8bf8b18e1..c182224e0 100644 --- a/test/input/WDL/test_inputs-assemble_denovo-dnanexus.dx.json +++ b/test/input/WDL/test_inputs-assemble_denovo-dnanexus.dx.json @@ -1,5 +1,5 @@ { - "stage-common.reads_unmapped_bam": { 
"$dnanexus_link": { "project": "project-F8PQ6380xf5bK0Qk0YPjB17P", "id": "file-F8F2kVQ09y3Q9Qj14fF806q2" } }, + "stage-common.reads_unmapped_bams": [ { "$dnanexus_link": { "project": "project-F8PQ6380xf5bK0Qk0YPjB17P", "id": "file-F8F2kVQ09y3Q9Qj14fF806q2" } } ], "stage-common.deplete_bmtaggerDbs": [ { "$dnanexus_link": { "project": "project-F8PQ6380xf5bK0Qk0YPjB17P", "id": "file-BYF6g8Q0zjF77x79bGYgJ1Zb" } } diff --git a/test/input/WDL/test_inputs-assemble_denovo-local.json b/test/input/WDL/test_inputs-assemble_denovo-local.json index bf4c80a0b..b5c46b913 100644 --- a/test/input/WDL/test_inputs-assemble_denovo-local.json +++ b/test/input/WDL/test_inputs-assemble_denovo-local.json @@ -1,5 +1,5 @@ { - "assemble_denovo.reads_unmapped_bam": "test/input/G5012.3.testreads.bam", + "assemble_denovo.reads_unmapped_bams": ["test/input/G5012.3.testreads.bam"], "assemble_denovo.deplete_blastDbs": [ "test/input/5kb_human_from_chr6.fasta" ], From 0453414188f3c1d549eca22f01a8fec0154a52ca Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 10 Dec 2022 12:47:40 -0500 Subject: [PATCH 16/29] add unique_strings and unique_arrays to tasks_utils --- pipes/WDL/tasks/tasks_utils.wdl | 44 +++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index 22d8f8e20..3e5701918 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -586,6 +586,50 @@ task rename_file { } } +task unique_strings { + input { + Array[String] strings + } + Int disk_size = 50 + command { + cat ~{write_lines(strings)} | sort | uniq > UNIQUE_OUT + } + output { + Array[String] sorted_unique = read_lines("UNIQUE_OUT") + } + runtime { + memory: "1 GB" + cpu: 1 + docker: "ubuntu" + disks: "local-disk " + disk_size + " HDD" + disk: disk_size + " GB" # TES + dx_instance_type: "mem1_ssd1_v2_x2" + maxRetries: 2 + } +} + +task unique_arrays { + input { + Array[Array[String]] string_arrays + } + Int disk_size = 
50 + command { + cat ~{write_tsv(string_arrays)} | sort | uniq > UNIQUE_OUT + } + output { + Array[Array[String]] sorted_unique = read_tsv("UNIQUE_OUT") + } + runtime { + memory: "1 GB" + cpu: 1 + docker: "ubuntu" + disks: "local-disk " + disk_size + " HDD" + disk: disk_size + " GB" # TES + dx_instance_type: "mem1_ssd1_v2_x2" + maxRetries: 2 + } +} + task today { input { String? timezone From f3f4ec8b1dff8febf5432296a9b03ef71e1e1bd8 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 10 Dec 2022 13:00:58 -0500 Subject: [PATCH 17/29] add fixed string assembly_method output --- pipes/WDL/workflows/assemble_denovo.wdl | 1 + pipes/WDL/workflows/assemble_refbased.wdl | 1 + 2 files changed, 2 insertions(+) diff --git a/pipes/WDL/workflows/assemble_denovo.wdl b/pipes/WDL/workflows/assemble_denovo.wdl index 4b1fe6dff..569af71bb 100644 --- a/pipes/WDL/workflows/assemble_denovo.wdl +++ b/pipes/WDL/workflows/assemble_denovo.wdl @@ -157,6 +157,7 @@ workflow assemble_denovo { Int read_pairs_aligned = refine.align_to_self_merged_read_pairs_aligned Float bases_aligned = refine.align_to_self_merged_bases_aligned + String assembly_method = "viral-ngs/assemble_denovo" String? deplete_viral_classify_version = deplete_taxa.viralngs_version String? 
taxfilt_viral_classify_version = filter_to_taxon.viralngs_version String assemble_viral_assemble_version = assemble.viralngs_version diff --git a/pipes/WDL/workflows/assemble_refbased.wdl b/pipes/WDL/workflows/assemble_refbased.wdl index 00ee8a445..06069f746 100644 --- a/pipes/WDL/workflows/assemble_refbased.wdl +++ b/pipes/WDL/workflows/assemble_refbased.wdl @@ -237,6 +237,7 @@ workflow assemble_refbased { Float align_to_self_merged_mean_coverage = plot_self_coverage.mean_coverage File align_to_self_isnvs_vcf = isnvs_self.report_vcf + String assembly_method = "viral-ngs/assemble_refbased" String align_to_ref_viral_core_version = align_to_ref.viralngs_version[0] String ivar_version = ivar_trim.ivar_version[0] String viral_assemble_version = call_consensus.viralngs_version From 59f1a0eceb6df62667f578f961b16bce0b936569 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 10 Dec 2022 13:08:27 -0500 Subject: [PATCH 18/29] convert reference_accessions to reference_accessions_list which is amenable to accepting the output of scaffolding_chosen_ref as input --- pipes/WDL/tasks/tasks_utils.wdl | 20 ++++++++++++++++++++ pipes/WDL/workflows/genbank.wdl | 24 ++++++++++++++++++++---- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index 3e5701918..f0c5f07e0 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -586,6 +586,26 @@ task rename_file { } } +task raise { + input { + String message = "error!" 
+ } + command { + set -e + echo "$message" + exit 1 + } + runtime { + memory: "1 GB" + cpu: 1 + docker: "ubuntu" + disks: "local-disk 30 HDD" + disk: "30 GB" # TES + dx_instance_type: "mem1_ssd1_v2_x2" + maxRetries: 2 + } +} + task unique_strings { input { Array[String] strings diff --git a/pipes/WDL/workflows/genbank.wdl b/pipes/WDL/workflows/genbank.wdl index 40cda9a80..55792cb56 100644 --- a/pipes/WDL/workflows/genbank.wdl +++ b/pipes/WDL/workflows/genbank.wdl @@ -3,6 +3,7 @@ version 1.0 import "../tasks/tasks_ncbi.wdl" as ncbi import "../tasks/tasks_ncbi_tools.wdl" as ncbi_tools import "../tasks/tasks_reports.wdl" as reports +import "../tasks/tasks_utils.wdl" as utils workflow genbank { @@ -14,8 +15,8 @@ workflow genbank { } input { - Array[File]+ assemblies_fasta - Array[String]+ reference_accessions + Array[File]+ assemblies_fasta + Array[Array[String]]+ reference_accessions_list Array[File] alignments_bams File? coverage_table @@ -35,8 +36,8 @@ workflow genbank { description: "Genomes to prepare for Genbank submission. One file per genome: all segments/chromosomes included in one file. All fasta files must contain exactly the same number of sequences as reference_fasta (which must equal the number of files in reference_annot_tbl).", patterns: ["*.fasta"] } - reference_accessions: { - description: "Reference genome Genbank accessions, each segment/chromosome in the exact same count and order as the segments/chromosomes described in genome_fasta.", + reference_accessions_list: { + description: "Reference genome Genbank accessions, each segment/chromosome in the exact same count and order as the segments/chromosomes described in assemblies_fasta. 
This is allowed to be an Array of such accession lists, but if so, all Array[String]s must be identical to each other or an error will be raised.", patterns: ["*.fasta"] } author_list: { @@ -72,6 +73,20 @@ workflow genbank { } + # take a array-array-string of scaffolding_chosen_refs output -> uniqified list of reference_accessions -- fail if not exactly one unique array-string across all array-array-strings + call utils.unique_arrays as unique_references { + input: + string_arrays = reference_accessions_list + } + if((length(unique_references.sorted_unique) != 1) + || length(unique_references.sorted_unique[0]) < 1) { + call utils.raise { + input: + message = "all Array[String] reference accession lists in reference_accessions_list must be identical!" + } + } + Array[String] reference_accessions = unique_references.sorted_unique[0] + scatter(segment_acc in reference_accessions) { # scatter these calls in order to preserve original order call ncbi_tools.fetch_genbank_metadata { @@ -94,6 +109,7 @@ workflow genbank { } } + # TO DO dpark: if ! defined biosample_attributes, call ncbi_tools.fetch_biosamples on external ids (where do we get external ids?) 
call ncbi.biosample_to_genbank { input: biosample_attributes = biosample_attributes, From 872493022df07f5b2738a379538cc603b105bcd5 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 10 Dec 2022 13:45:21 -0500 Subject: [PATCH 19/29] fix test input jsons --- test/input/WDL/cromwell-local/test_inputs-genbank-local.json | 2 +- test/input/WDL/miniwdl-local/test_inputs-genbank-local.json | 2 +- test/input/WDL/test_inputs-genbank-local.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/input/WDL/cromwell-local/test_inputs-genbank-local.json b/test/input/WDL/cromwell-local/test_inputs-genbank-local.json index e6af063fe..0d0614d01 100644 --- a/test/input/WDL/cromwell-local/test_inputs-genbank-local.json +++ b/test/input/WDL/cromwell-local/test_inputs-genbank-local.json @@ -2,7 +2,7 @@ "genbank.molType": "cRNA", "genbank.coverage_table": "test/input/genbank/coverage-ma_mgh.txt", "genbank.alignments_bams": [], - "genbank.reference_accessions": ["MN908947.3"], + "genbank.reference_accessions_list": [["MN908947.3"],["MN908947.3"],["MN908947.3"]], "genbank.sequencingTech": "Illumina NovaSeq", "genbank.biosample_attributes": "test/input/genbank/sars-cov-2_attributes_updated.txt", "genbank.prep_genbank.assembly_method": "placeholder assembly software", diff --git a/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json b/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json index 1c938870e..b5edf88d4 100644 --- a/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json +++ b/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json @@ -1,6 +1,6 @@ { "genbank.molType": "cRNA", - "genbank.reference_accessions": ["KM821997.1", "KM821998.1"], + "genbank.reference_accessions_list": [["KM821997.1", "KM821998.1"]], "genbank.coverage_table": "test/input/genbank/coverage_report-RUN1-workflow.txt", "genbank.alignments_bams": [], "genbank.sequencingTech": "Illumina MiSeq", diff --git a/test/input/WDL/test_inputs-genbank-local.json 
b/test/input/WDL/test_inputs-genbank-local.json index 1c938870e..b5edf88d4 100644 --- a/test/input/WDL/test_inputs-genbank-local.json +++ b/test/input/WDL/test_inputs-genbank-local.json @@ -1,6 +1,6 @@ { "genbank.molType": "cRNA", - "genbank.reference_accessions": ["KM821997.1", "KM821998.1"], + "genbank.reference_accessions_list": [["KM821997.1", "KM821998.1"]], "genbank.coverage_table": "test/input/genbank/coverage_report-RUN1-workflow.txt", "genbank.alignments_bams": [], "genbank.sequencingTech": "Illumina MiSeq", From 1c3d95fa03305fee14734bdeb5504bac31644d23 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 10 Dec 2022 15:11:18 -0500 Subject: [PATCH 20/29] revert recent change --- pipes/WDL/workflows/genbank.wdl | 24 ++++--------------- .../test_inputs-genbank-local.json | 2 +- .../test_inputs-genbank-local.json | 2 +- test/input/WDL/test_inputs-genbank-local.json | 2 +- 4 files changed, 8 insertions(+), 22 deletions(-) diff --git a/pipes/WDL/workflows/genbank.wdl b/pipes/WDL/workflows/genbank.wdl index 55792cb56..ce12e38b6 100644 --- a/pipes/WDL/workflows/genbank.wdl +++ b/pipes/WDL/workflows/genbank.wdl @@ -3,7 +3,7 @@ version 1.0 import "../tasks/tasks_ncbi.wdl" as ncbi import "../tasks/tasks_ncbi_tools.wdl" as ncbi_tools import "../tasks/tasks_reports.wdl" as reports -import "../tasks/tasks_utils.wdl" as utils +#import "../tasks/tasks_utils.wdl" as utils workflow genbank { @@ -15,8 +15,8 @@ workflow genbank { } input { - Array[File]+ assemblies_fasta - Array[Array[String]]+ reference_accessions_list + Array[File]+ assemblies_fasta + Array[String]+ reference_accessions Array[File] alignments_bams File? coverage_table @@ -36,8 +36,8 @@ workflow genbank { description: "Genomes to prepare for Genbank submission. One file per genome: all segments/chromosomes included in one file. 
All fasta files must contain exactly the same number of sequences as reference_fasta (which must equal the number of files in reference_annot_tbl).", patterns: ["*.fasta"] } - reference_accessions_list: { - description: "Reference genome Genbank accessions, each segment/chromosome in the exact same count and order as the segments/chromosomes described in assemblies_fasta. This is allowed to be an Array of such accession lists, but if so, all Array[String]s must be identical to each other or an error will be raised.", + reference_accessions: { + description: "Reference genome Genbank accessions, each segment/chromosome in the exact same count and order as the segments/chromosomes described in assemblies_fasta.", patterns: ["*.fasta"] } author_list: { @@ -73,20 +73,6 @@ workflow genbank { } - # take a array-array-string of scaffolding_chosen_refs output -> uniqified list of reference_accessions -- fail if not exactly one unique array-string across all array-array-strings - call utils.unique_arrays as unique_references { - input: - string_arrays = reference_accessions_list - } - if((length(unique_references.sorted_unique) != 1) - || length(unique_references.sorted_unique[0]) < 1) { - call utils.raise { - input: - message = "all Array[String] reference accession lists in reference_accessions_list must be identical!" 
- } - } - Array[String] reference_accessions = unique_references.sorted_unique[0] - scatter(segment_acc in reference_accessions) { # scatter these calls in order to preserve original order call ncbi_tools.fetch_genbank_metadata { diff --git a/test/input/WDL/cromwell-local/test_inputs-genbank-local.json b/test/input/WDL/cromwell-local/test_inputs-genbank-local.json index 0d0614d01..e6af063fe 100644 --- a/test/input/WDL/cromwell-local/test_inputs-genbank-local.json +++ b/test/input/WDL/cromwell-local/test_inputs-genbank-local.json @@ -2,7 +2,7 @@ "genbank.molType": "cRNA", "genbank.coverage_table": "test/input/genbank/coverage-ma_mgh.txt", "genbank.alignments_bams": [], - "genbank.reference_accessions_list": [["MN908947.3"],["MN908947.3"],["MN908947.3"]], + "genbank.reference_accessions": ["MN908947.3"], "genbank.sequencingTech": "Illumina NovaSeq", "genbank.biosample_attributes": "test/input/genbank/sars-cov-2_attributes_updated.txt", "genbank.prep_genbank.assembly_method": "placeholder assembly software", diff --git a/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json b/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json index b5edf88d4..1c938870e 100644 --- a/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json +++ b/test/input/WDL/miniwdl-local/test_inputs-genbank-local.json @@ -1,6 +1,6 @@ { "genbank.molType": "cRNA", - "genbank.reference_accessions_list": [["KM821997.1", "KM821998.1"]], + "genbank.reference_accessions": ["KM821997.1", "KM821998.1"], "genbank.coverage_table": "test/input/genbank/coverage_report-RUN1-workflow.txt", "genbank.alignments_bams": [], "genbank.sequencingTech": "Illumina MiSeq", diff --git a/test/input/WDL/test_inputs-genbank-local.json b/test/input/WDL/test_inputs-genbank-local.json index b5edf88d4..1c938870e 100644 --- a/test/input/WDL/test_inputs-genbank-local.json +++ b/test/input/WDL/test_inputs-genbank-local.json @@ -1,6 +1,6 @@ { "genbank.molType": "cRNA", - "genbank.reference_accessions_list": 
[["KM821997.1", "KM821998.1"]], + "genbank.reference_accessions": ["KM821997.1", "KM821998.1"], "genbank.coverage_table": "test/input/genbank/coverage_report-RUN1-workflow.txt", "genbank.alignments_bams": [], "genbank.sequencingTech": "Illumina MiSeq", From 6b7ac475bd23c27c39b5552f5d30c843e8e38d6d Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 10 Dec 2022 16:06:45 -0500 Subject: [PATCH 21/29] ironically the prep_genbank only wants a 2-col coverage table --- pipes/WDL/workflows/genbank.wdl | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pipes/WDL/workflows/genbank.wdl b/pipes/WDL/workflows/genbank.wdl index ce12e38b6..b2ae25c11 100644 --- a/pipes/WDL/workflows/genbank.wdl +++ b/pipes/WDL/workflows/genbank.wdl @@ -3,7 +3,7 @@ version 1.0 import "../tasks/tasks_ncbi.wdl" as ncbi import "../tasks/tasks_ncbi_tools.wdl" as ncbi_tools import "../tasks/tasks_reports.wdl" as reports -#import "../tasks/tasks_utils.wdl" as utils +import "../tasks/tasks_utils.wdl" as utils workflow genbank { @@ -93,6 +93,11 @@ workflow genbank { mapped_bams = alignments_bams, mapped_bam_idx = [] } + call utils.tsv_drop_cols as coverage_two_col { + input: + in_tsv = coverage_report.coverage_report, + drop_cols = ["aln2self_cov_median", "aln2self_cov_mean_non0", "aln2self_cov_1X", "aln2self_cov_5X", "aln2self_cov_20X", "aln2self_cov_100X"] + } } # TO DO dpark: if ! defined biosample_attributes, call ncbi_tools.fetch_biosamples on external ids (where do we get external ids?) 
@@ -127,7 +132,7 @@ workflow genbank { authors_sbt = generate_author_sbt.sbt_file, biosampleMap = biosample_to_genbank.biosample_map, genbankSourceTable = biosample_to_genbank.genbank_source_modifier_table, - coverage_table = select_first([coverage_report.coverage_report, coverage_table]), + coverage_table = select_first([coverage_two_col.out_tsv, coverage_table]), sequencingTech = sequencingTech, comment = comment, organism = fetch_genbank_metadata.organism[0], From a581f4d02d8f7166a91502eed6ea40f1f6abe679 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sun, 11 Dec 2022 23:18:48 -0500 Subject: [PATCH 22/29] add a place for non-filename-friendly sample names to come in and rewrite bam headers and fasta headers --- pipes/WDL/workflows/assemble_denovo.wdl | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pipes/WDL/workflows/assemble_denovo.wdl b/pipes/WDL/workflows/assemble_denovo.wdl index 569af71bb..c202af262 100644 --- a/pipes/WDL/workflows/assemble_denovo.wdl +++ b/pipes/WDL/workflows/assemble_denovo.wdl @@ -3,6 +3,7 @@ version 1.0 import "../tasks/tasks_taxon_filter.wdl" as taxon_filter import "../tasks/tasks_read_utils.wdl" as read_utils import "../tasks/tasks_assembly.wdl" as assembly +import "../tasks/tasks_ncbi.wdl" as ncbi import "assemble_refbased.wdl" as assemble_refbased workflow assemble_denovo { @@ -27,6 +28,7 @@ workflow assemble_denovo { File trim_clip_db String sample_name = basename(basename(reads_unmapped_bams[0], ".bam"), ".cleaned") + String? 
sample_original_name } parameter_meta { @@ -53,10 +55,11 @@ workflow assemble_denovo { } } - if(length(reads_unmapped_bams)>1) { + if(length(reads_unmapped_bams)>1 || defined(sample_original_name)) { call read_utils.merge_and_reheader_bams as merge_reads { input: in_bams = reads_unmapped_bams, + sample_name = sample_original_name, out_basename = sample_name } } @@ -107,8 +110,16 @@ workflow assemble_denovo { sample_name = sample_name } + if (defined(sample_original_name)) { + call ncbi.rename_fasta_header { + input: + genome_fasta = refine.assembly_fasta, + new_name = select_first([sample_original_name]) + } + } + output { - File final_assembly_fasta = refine.assembly_fasta + File final_assembly_fasta = select_first([rename_fasta_header.renamed_fasta, refine.assembly_fasta]) File aligned_only_reads_bam = refine.align_to_self_merged_aligned_only_bam File coverage_plot = refine.align_to_self_merged_coverage_plot Int assembly_length = refine.assembly_length From 30b2490188ae778f14b4db98d1ab25060c247922 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Mon, 12 Dec 2022 09:40:07 -0500 Subject: [PATCH 23/29] refashion assemble_denovo to parallelize most of its pre-assembly steps --- pipes/WDL/tasks/tasks_read_utils.wdl | 40 ++++--- pipes/WDL/workflows/assemble_denovo.wdl | 109 +++++++++++------- pipes/WDL/workflows/assemble_refbased.wdl | 4 +- .../test_inputs-assemble_denovo-local.json | 3 +- .../test_outputs-assemble_denovo-local.json | 3 +- 5 files changed, 97 insertions(+), 62 deletions(-) diff --git a/pipes/WDL/tasks/tasks_read_utils.wdl b/pipes/WDL/tasks/tasks_read_utils.wdl index 612619fd7..46dac190e 100644 --- a/pipes/WDL/tasks/tasks_read_utils.wdl +++ b/pipes/WDL/tasks/tasks_read_utils.wdl @@ -151,45 +151,53 @@ task merge_and_reheader_bams { Int disk_size = 750 - command { + command <<< set -ex -o pipefail read_utils.py --version | tee VERSION mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90) - if [ ${length(in_bams)} -gt 1 ]; then - read_utils.py 
merge_bams ${sep=' ' in_bams} merged.bam --JVMmemory="$mem_in_mb"m --loglevel DEBUG + if [ ~{length(in_bams)} -gt 1 ]; then + read_utils.py merge_bams ~{sep=' ' in_bams} merged.bam --JVMmemory="$mem_in_mb"m --loglevel DEBUG else echo "Skipping merge, only one input file" - cp ${sep=' ' in_bams} merged.bam + cp ~{sep=' ' in_bams} merged.bam fi # remap all SM values to user specified value - if [ -n "${sample_name}" ]; then + if [ -n "~{sample_name}" ]; then # create sample name remapping table based on existing sample names - samtools view -H merged.bam | perl -n -e'/SM:(\S+)/ && print "SM\t$1\t'"${sample_name}"'\n"' | sort | uniq >> reheader_table.txt + samtools view -H merged.bam | perl -n -e'/SM:(\S+)/ && print "SM\t$1\t'"~{sample_name}"'\n"' | sort | uniq >> reheader_table.txt fi # remap arbitrary headers using user specified table - if [[ -f "${reheader_table}" ]]; then - cat "${reheader_table}" >> reheader_table.txt + if [[ -f "~{reheader_table}" ]]; then + cat "~{reheader_table}" >> reheader_table.txt fi # reheader bam file if requested if [ -s reheader_table.txt ]; then - read_utils.py reheader_bam merged.bam reheader_table.txt "${out_basename}.bam" --loglevel DEBUG + read_utils.py reheader_bam merged.bam reheader_table.txt "~{out_basename}.bam" --loglevel DEBUG else - mv merged.bam "${out_basename}.bam" + mv merged.bam "~{out_basename}.bam" fi - } + + # summary stats on merged output + samtools view -c "~{out_basename}.bam" | tee read_count_merged + samtools flagstat "~{out_basename}.bam" | tee "~{out_basename}.bam.flagstat.txt" + reports.py fastqc "~{out_basename}.bam" "~{out_basename}.fastqc.html" + >>> output { - File out_bam = "${out_basename}.bam" + File out_bam = "~{out_basename}.bam" + Int read_count = read_int("read_count_merged") + File flagstat = "~{out_basename}.bam.flagstat.txt" + File fastqc = "~{out_basename}.fastqc.html" String viralngs_version = read_string("VERSION") } runtime { - docker: "${docker}" + docker: docker memory: "3 GB" cpu: 2 
disks: "local-disk " + disk_size + " LOCAL" @@ -210,7 +218,7 @@ task rmdup_ubam { String method = "mvicuna" Int? machine_mem_gb - String? docker = "quay.io/broadinstitute/viral-core:2.1.33" + String docker = "quay.io/broadinstitute/viral-core:2.1.33" } Int disk_size = 375 @@ -246,7 +254,7 @@ task rmdup_ubam { } runtime { - docker: "${docker}" + docker: docker memory: select_first([machine_mem_gb, 7]) + " GB" cpu: 2 disks: "local-disk " + disk_size + " LOCAL" @@ -306,7 +314,7 @@ task downsample_bams { } runtime { - docker: "${docker}" + docker: docker memory: select_first([machine_mem_gb, 3]) + " GB" cpu: 4 disks: "local-disk " + disk_size + " LOCAL" diff --git a/pipes/WDL/workflows/assemble_denovo.wdl b/pipes/WDL/workflows/assemble_denovo.wdl index c202af262..8189d50c2 100644 --- a/pipes/WDL/workflows/assemble_denovo.wdl +++ b/pipes/WDL/workflows/assemble_denovo.wdl @@ -27,7 +27,7 @@ workflow assemble_denovo { File? filter_to_taxon_db File trim_clip_db - String sample_name = basename(basename(reads_unmapped_bams[0], ".bam"), ".cleaned") + String out_basename = basename(basename(reads_unmapped_bams[0], ".bam"), ".cleaned") String? sample_original_name } @@ -53,61 +53,91 @@ workflow assemble_denovo { description: "After denovo assembly, large contigs are scaffolded against a reference genome to determine orientation and to join contigs together, before further polishing by reads. You must supply at least one reference genome (all segments/chromomes in a single fasta file). 
If more than one reference is provided, contigs will be scaffolded against all of them and the one with the most complete assembly will be chosen for downstream polishing.", patterns: ["*.fasta"] } + out_basename: { description: "a filename-friendly basename for output files" } + sample_original_name: { description: "a (possibly filename-unfriendly) sample name for fasta and bam headers" } } - if(length(reads_unmapped_bams)>1 || defined(sample_original_name)) { - call read_utils.merge_and_reheader_bams as merge_reads { + # parallelize across provided input read files + scatter(reads_unmapped_bam in reads_unmapped_bams) { + + # rename SM value in bam header if requested + if(defined(sample_original_name)) { + call read_utils.merge_and_reheader_bams as renamed_reads { input: - in_bams = reads_unmapped_bams, + in_bams = [reads_unmapped_bam], sample_name = sample_original_name, - out_basename = sample_name + out_basename = out_basename } - } - File reads_unmapped_bam = select_first([merge_reads.out_bam, reads_unmapped_bams[0]]) + } + File reads_unmapped_renamed_bams = select_first([renamed_reads.out_bam, reads_unmapped_bam]) + + # deplete host if requested + if(length(deplete_bmtaggerDbs) + length(deplete_blastDbs) + length(deplete_bwaDbs) > 0) { + call taxon_filter.deplete_taxa { + input: + raw_reads_unmapped_bam = reads_unmapped_renamed_bams, + bmtaggerDbs = deplete_bmtaggerDbs, + blastDbs = deplete_blastDbs, + bwaDbs = deplete_bwaDbs + } + } + File reads_depleted_bams = select_first([deplete_taxa.cleaned_bam, reads_unmapped_bam]) + + # select reads if requested + if(defined(filter_to_taxon_db)) { + call taxon_filter.filter_to_taxon { + input: + reads_unmapped_bam = reads_depleted_bams, + lastal_db_fasta = select_first([filter_to_taxon_db]) + } + } + File reads_taxfilt_bams = select_first([filter_to_taxon.taxfilt_bam, reads_depleted_bams]) - if(length(deplete_bmtaggerDbs) + length(deplete_blastDbs) + length(deplete_bwaDbs) > 0) { - call taxon_filter.deplete_taxa { + 
# alignment-free PCR duplicate removal + call read_utils.rmdup_ubam { input: - raw_reads_unmapped_bam = reads_unmapped_bam, - bmtaggerDbs = deplete_bmtaggerDbs, - blastDbs = deplete_blastDbs, - bwaDbs = deplete_bwaDbs + reads_unmapped_bam = reads_taxfilt_bams } } - if(defined(filter_to_taxon_db)) { - call taxon_filter.filter_to_taxon { + # merge all reads into single file + call read_utils.merge_and_reheader_bams as merge_dedup_reads { input: - reads_unmapped_bam = select_first([deplete_taxa.cleaned_bam, reads_unmapped_bam]), - lastal_db_fasta = select_first([filter_to_taxon_db]) - } + in_bams = rmdup_ubam.dedup_bam, + out_basename = out_basename } - - call read_utils.rmdup_ubam { - input: - reads_unmapped_bam = select_first([filter_to_taxon.taxfilt_bam, deplete_taxa.cleaned_bam, reads_unmapped_bam]) + call read_utils.merge_and_reheader_bams as merge_cleaned_reads { + input: + in_bams = reads_depleted_bams, + out_basename = out_basename + } + call read_utils.merge_and_reheader_bams as merge_taxfilt_reads { + input: + in_bams = reads_taxfilt_bams, + out_basename = out_basename } + # denovo assembly pipeline below call assembly.assemble { input: - reads_unmapped_bam = rmdup_ubam.dedup_bam, + reads_unmapped_bam = merge_dedup_reads.out_bam, trim_clip_db = trim_clip_db, always_succeed = true, - sample_name = sample_name + sample_name = out_basename } call assembly.scaffold { input: contigs_fasta = assemble.contigs_fasta, - reads_bam = select_first([filter_to_taxon.taxfilt_bam, deplete_taxa.cleaned_bam, reads_unmapped_bam]), + reads_bam = merge_dedup_reads.out_bam, reference_genome_fasta = reference_genome_fasta } call assemble_refbased.assemble_refbased as refine { input: - reads_unmapped_bams = [rmdup_ubam.dedup_bam], + reads_unmapped_bams = reads_depleted_bams, # assemble_refbased will scatter on individual bams reference_fasta = scaffold.scaffold_fasta, - sample_name = sample_name + sample_name = out_basename } if (defined(sample_original_name)) { @@ -127,18 +157,17 
@@ workflow assemble_denovo { Int reads_aligned = refine.align_to_self_merged_reads_aligned Float mean_coverage = refine.align_to_self_merged_mean_coverage - File cleaned_bam = select_first([deplete_taxa.cleaned_bam, reads_unmapped_bam]) - File? cleaned_fastqc = deplete_taxa.cleaned_fastqc - Int? depletion_read_count_pre = deplete_taxa.depletion_read_count_pre - Int? depletion_read_count_post = deplete_taxa.depletion_read_count_post + File cleaned_bam = merge_cleaned_reads.out_bam + File cleaned_fastqc = merge_cleaned_reads.fastqc + Int depletion_read_count_post = merge_cleaned_reads.read_count - File? taxfilt_bam = filter_to_taxon.taxfilt_bam - File? taxfilt_fastqc = filter_to_taxon.taxfilt_fastqc - Int? filter_read_count_post = filter_to_taxon.filter_read_count_post + File taxfilt_bam = merge_taxfilt_reads.out_bam + File taxfilt_fastqc = merge_taxfilt_reads.fastqc + Int filter_read_count_post = merge_taxfilt_reads.read_count - File dedup_bam = rmdup_ubam.dedup_bam - File dedup_fastqc = rmdup_ubam.dedup_fastqc - Int dedup_read_count_post = rmdup_ubam.dedup_read_count_post + File dedup_bam = merge_dedup_reads.out_bam + File dedup_fastqc = merge_dedup_reads.fastqc + Int dedup_read_count_post = merge_dedup_reads.read_count File contigs_fasta = assemble.contigs_fasta File subsampBam = assemble.subsampBam @@ -162,15 +191,13 @@ workflow assemble_denovo { File isnvs_vcf = refine.align_to_self_isnvs_vcf - File aligned_bam = refine.align_to_self_merged_aligned_and_unaligned_bam[0] - File aligned_only_reads_fastqc = refine.align_to_ref_per_input_fastqc[0] + File aligned_bam = refine.align_to_self_merged_aligned_only_bam + File aligned_only_reads_fastqc = refine.align_to_ref_fastqc File coverage_tsv = refine.align_to_self_merged_coverage_tsv Int read_pairs_aligned = refine.align_to_self_merged_read_pairs_aligned Float bases_aligned = refine.align_to_self_merged_bases_aligned String assembly_method = "viral-ngs/assemble_denovo" - String? 
deplete_viral_classify_version = deplete_taxa.viralngs_version - String? taxfilt_viral_classify_version = filter_to_taxon.viralngs_version String assemble_viral_assemble_version = assemble.viralngs_version String scaffold_viral_assemble_version = scaffold.viralngs_version } diff --git a/pipes/WDL/workflows/assemble_refbased.wdl b/pipes/WDL/workflows/assemble_refbased.wdl index 06069f746..3001f9dfb 100644 --- a/pipes/WDL/workflows/assemble_refbased.wdl +++ b/pipes/WDL/workflows/assemble_refbased.wdl @@ -210,9 +210,9 @@ workflow assemble_refbased { Array[File] align_to_ref_per_input_aligned_flagstat = align_to_ref.aligned_bam_flagstat Array[Int] align_to_ref_per_input_reads_provided = align_to_ref.reads_provided Array[Int] align_to_ref_per_input_reads_aligned = align_to_ref.reads_aligned - Array[File] align_to_ref_per_input_fastqc = align_to_ref.aligned_only_reads_fastqc - + File align_to_ref_merged_aligned_trimmed_only_bam = aligned_trimmed_bam + File align_to_ref_fastqc = select_first([merge_align_to_ref.fastqc, align_to_ref.aligned_only_reads_fastqc[0]]) File align_to_ref_merged_coverage_plot = plot_ref_coverage.coverage_plot File align_to_ref_merged_coverage_tsv = plot_ref_coverage.coverage_tsv Int align_to_ref_merged_reads_aligned = plot_ref_coverage.reads_aligned diff --git a/test/input/WDL/test_inputs-assemble_denovo-local.json b/test/input/WDL/test_inputs-assemble_denovo-local.json index b5c46b913..e352b2713 100644 --- a/test/input/WDL/test_inputs-assemble_denovo-local.json +++ b/test/input/WDL/test_inputs-assemble_denovo-local.json @@ -5,5 +5,6 @@ ], "assemble_denovo.filter_to_taxon_db": "test/input/ebov-makona.fasta", "assemble_denovo.trim_clip_db": "test/input/clipDb.fasta", - "assemble_denovo.reference_genome_fasta": ["test/input/ebov-makona.fasta"] + "assemble_denovo.reference_genome_fasta": ["test/input/ebov-makona.fasta"], + "assemble_denovo.sample_original_name": "USA/ this ? is an un-friendly sample_name! 
/ 1999" } diff --git a/test/input/WDL/test_outputs-assemble_denovo-local.json b/test/input/WDL/test_outputs-assemble_denovo-local.json index a1a502cf1..485bd6a39 100644 --- a/test/input/WDL/test_outputs-assemble_denovo-local.json +++ b/test/input/WDL/test_outputs-assemble_denovo-local.json @@ -8,6 +8,5 @@ "assemble_denovo.assembly_length_unambiguous": 18843, "assemble_denovo.assembly_length": 18843, "assemble_denovo.filter_read_count_post": 18710, - "assemble_denovo.depletion_read_count_post": 18710, - "assemble_denovo.depletion_read_count_pre": 18710 + "assemble_denovo.depletion_read_count_post": 18710 } From f919c64af18c8f42c68b2acb6c6f9054cc51695a Mon Sep 17 00:00:00 2001 From: Danny Park Date: Mon, 12 Dec 2022 09:45:57 -0500 Subject: [PATCH 24/29] fix in metagenomic_denovo --- pipes/WDL/workflows/metagenomic_denovo.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipes/WDL/workflows/metagenomic_denovo.wdl b/pipes/WDL/workflows/metagenomic_denovo.wdl index f73a9aa4b..bebf5db73 100644 --- a/pipes/WDL/workflows/metagenomic_denovo.wdl +++ b/pipes/WDL/workflows/metagenomic_denovo.wdl @@ -248,8 +248,8 @@ workflow metagenomic_denovo { File isnvs_vcf = refine.align_to_self_isnvs_vcf - File aligned_bam = refine.align_to_self_merged_aligned_and_unaligned_bam[0] - File aligned_only_reads_fastqc = refine.align_to_ref_per_input_fastqc[0] + File aligned_bam = refine.align_to_self_merged_aligned_only_bam + File aligned_only_reads_fastqc = refine.align_to_ref_fastqc File coverage_tsv = refine.align_to_self_merged_coverage_tsv Int read_pairs_aligned = refine.align_to_self_merged_read_pairs_aligned Float bases_aligned = refine.align_to_self_merged_bases_aligned From 3aaadbf81628ccfe9c591ff127a36499c5123442 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Mon, 12 Dec 2022 09:49:40 -0500 Subject: [PATCH 25/29] fix scaffold_and_refine --- pipes/WDL/workflows/scaffold_and_refine.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/pipes/WDL/workflows/scaffold_and_refine.wdl b/pipes/WDL/workflows/scaffold_and_refine.wdl index adb4411df..57926284c 100644 --- a/pipes/WDL/workflows/scaffold_and_refine.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine.wdl @@ -53,8 +53,8 @@ workflow scaffold_and_refine { File isnvsFile = refine.align_to_self_isnvs_vcf - File aligned_bam = refine.align_to_self_merged_aligned_and_unaligned_bam[0] - File aligned_only_reads_fastqc = refine.align_to_ref_per_input_fastqc[0] + File aligned_bam = refine.align_to_self_merged_aligned_only_bam + File aligned_only_reads_fastqc = refine.align_to_ref_fastqc File coverage_tsv = refine.align_to_self_merged_coverage_tsv Int read_pairs_aligned = refine.align_to_self_merged_read_pairs_aligned Float bases_aligned = refine.align_to_self_merged_bases_aligned From 670dafaa9ee8e9fd781bad685beb93feb3e0299c Mon Sep 17 00:00:00 2001 From: Danny Park Date: Mon, 12 Dec 2022 11:03:57 -0500 Subject: [PATCH 26/29] bump viral-assemble and viral-classify docker tags --- pipes/WDL/tasks/tasks_assembly.wdl | 8 ++++---- pipes/WDL/tasks/tasks_metagenomics.wdl | 16 ++++++++-------- pipes/WDL/tasks/tasks_reports.wdl | 4 ++-- pipes/WDL/tasks/tasks_taxon_filter.wdl | 6 +++--- requirements-modules.txt | 4 ++-- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index 8ef66a37f..e1b20dc8b 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -15,7 +15,7 @@ task assemble { String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt") Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-assemble:2.1.20.2" + String docker = "quay.io/broadinstitute/viral-assemble:2.1.33.0" } Int disk_size = 375 @@ -81,7 +81,7 @@ task scaffold { Float? scaffold_min_pct_contig_aligned Int? 
machine_mem_gb - String docker="quay.io/broadinstitute/viral-assemble:2.1.20.2" + String docker="quay.io/broadinstitute/viral-assemble:2.1.33.0" # do this in multiple steps in case the input doesn't actually have "assembly1-x" in the name String sample_name = basename(basename(contigs_fasta, ".fasta"), ".assembly1-spades") @@ -424,7 +424,7 @@ task refine_assembly_with_aligned_reads { Int min_coverage = 3 Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-assemble:2.1.20.2" + String docker = "quay.io/broadinstitute/viral-assemble:2.1.33.0" } Int disk_size = 375 @@ -534,7 +534,7 @@ task refine_2x_and_plot { String? plot_coverage_novoalign_options = "-r Random -l 40 -g 40 -x 20 -t 100 -k" Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-assemble:2.1.20.2" + String docker = "quay.io/broadinstitute/viral-assemble:2.1.33.0" # do this in two steps in case the input doesn't actually have "cleaned" in the name String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".cleaned") diff --git a/pipes/WDL/tasks/tasks_metagenomics.wdl b/pipes/WDL/tasks/tasks_metagenomics.wdl index add2746cc..d9b806193 100644 --- a/pipes/WDL/tasks/tasks_metagenomics.wdl +++ b/pipes/WDL/tasks/tasks_metagenomics.wdl @@ -11,7 +11,7 @@ task krakenuniq { File krona_taxonomy_db_tgz # taxonomy.tab Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0" } Int disk_size = 750 @@ -140,7 +140,7 @@ task build_krakenuniq_db { Int? zstd_compression_level Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0" } Int disk_size = 750 @@ -210,7 +210,7 @@ task kraken2 { Int? min_base_qual Int? 
machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0" } parameter_meta { @@ -345,7 +345,7 @@ task build_kraken2_db { Int? zstd_compression_level Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0" } Int disk_size = 750 @@ -487,7 +487,7 @@ task blastx { File krona_taxonomy_db_tgz Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0" } parameter_meta { @@ -577,7 +577,7 @@ task krona { Int? magnitude_column Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0" } Int disk_size = 50 @@ -684,7 +684,7 @@ task filter_bam_to_taxa { String out_filename_suffix = "filtered" Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0" } String out_basename = basename(classified_bam, ".bam") + "." + out_filename_suffix @@ -771,7 +771,7 @@ task kaiju { File krona_taxonomy_db_tgz # taxonomy/taxonomy.tab Int? 
machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0" } String input_basename = basename(reads_unmapped_bam, ".bam") diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl index d5cb61530..b695ea7ea 100644 --- a/pipes/WDL/tasks/tasks_reports.wdl +++ b/pipes/WDL/tasks/tasks_reports.wdl @@ -401,7 +401,7 @@ task aggregate_metagenomics_reports { String aggregate_taxlevel_focus = "species" Int aggregate_top_N_hits = 5 - String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0" } parameter_meta { @@ -549,7 +549,7 @@ task compare_two_genomes { File genome_two String out_basename - String docker = "quay.io/broadinstitute/viral-assemble:2.1.20.2" + String docker = "quay.io/broadinstitute/viral-assemble:2.1.33.0" } Int disk_size = 50 diff --git a/pipes/WDL/tasks/tasks_taxon_filter.wdl b/pipes/WDL/tasks/tasks_taxon_filter.wdl index 6a654729d..905acf4be 100644 --- a/pipes/WDL/tasks/tasks_taxon_filter.wdl +++ b/pipes/WDL/tasks/tasks_taxon_filter.wdl @@ -14,7 +14,7 @@ task deplete_taxa { Int? cpu=8 Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0" } parameter_meta { @@ -113,7 +113,7 @@ task filter_to_taxon { String? neg_control_prefixes_space_separated = "neg water NTC" Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0" } # do this in two steps in case the input doesn't actually have "cleaned" in the name @@ -172,7 +172,7 @@ task build_lastal_db { File sequences_fasta Int? 
machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0" } String db_name = basename(sequences_fasta, ".fasta") diff --git a/requirements-modules.txt b/requirements-modules.txt index 100a605a7..71870496b 100644 --- a/requirements-modules.txt +++ b/requirements-modules.txt @@ -1,6 +1,6 @@ broadinstitute/viral-core=2.1.33 -broadinstitute/viral-assemble=2.1.20.2 -broadinstitute/viral-classify=2.1.20.0 +broadinstitute/viral-assemble=2.1.33.0 +broadinstitute/viral-classify=2.1.33.0 broadinstitute/viral-phylo=2.1.20.0 broadinstitute/py3-bio=0.1.2 broadinstitute/beast-beagle-cuda=1.10.5pre From df0cdd7d0a2997f275d864d8c7dacdf40ca3e73b Mon Sep 17 00:00:00 2001 From: Danny Park Date: Mon, 12 Dec 2022 11:16:56 -0500 Subject: [PATCH 27/29] bump nextclade --- pipes/WDL/tasks/tasks_nextstrain.wdl | 4 ++-- requirements-modules.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pipes/WDL/tasks/tasks_nextstrain.wdl b/pipes/WDL/tasks/tasks_nextstrain.wdl index d12608d31..4d71a512c 100644 --- a/pipes/WDL/tasks/tasks_nextstrain.wdl +++ b/pipes/WDL/tasks/tasks_nextstrain.wdl @@ -13,7 +13,7 @@ task nextclade_one_sample { File? pcr_primers_csv File? virus_properties String? dataset_name - String docker = "nextstrain/nextclade:2.5.0" + String docker = "nextstrain/nextclade:2.9.1" } String basename = basename(genome_fasta, ".fasta") Int disk_size = 50 @@ -100,7 +100,7 @@ task nextclade_many_samples { String? dataset_name String basename File? 
genome_ids_setdefault_blank - String docker = "nextstrain/nextclade:2.5.0" + String docker = "nextstrain/nextclade:2.9.1" } Int disk_size = 100 command <<< diff --git a/requirements-modules.txt b/requirements-modules.txt index 71870496b..cf317546e 100644 --- a/requirements-modules.txt +++ b/requirements-modules.txt @@ -8,4 +8,4 @@ broadinstitute/ncbi-tools=2.10.7.10 nextstrain/base=build-20211012T204409Z andersenlabapps/ivar=1.3.1 quay.io/staphb/pangolin=4.1.2-pdata-1.14 -nextstrain/nextclade=2.5.0 +nextstrain/nextclade=2.9.1 From c709a2294f11d06cd351ac8e0c9429b93da091bd Mon Sep 17 00:00:00 2001 From: Danny Park Date: Mon, 12 Dec 2022 14:19:28 -0500 Subject: [PATCH 28/29] bump viral-phylo including upstream bugfixes --- pipes/WDL/tasks/tasks_interhost.wdl | 6 +++--- pipes/WDL/tasks/tasks_intrahost.wdl | 6 +++--- pipes/WDL/tasks/tasks_ncbi.wdl | 12 ++++++------ pipes/WDL/tasks/tasks_nextstrain.wdl | 4 ++-- requirements-modules.txt | 2 +- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/pipes/WDL/tasks/tasks_interhost.wdl b/pipes/WDL/tasks/tasks_interhost.wdl index 7e51f28bc..b65d81f99 100644 --- a/pipes/WDL/tasks/tasks_interhost.wdl +++ b/pipes/WDL/tasks/tasks_interhost.wdl @@ -9,7 +9,7 @@ task multi_align_mafft_ref { Float? mafft_gapOpeningPenalty Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" } String fasta_basename = basename(reference_fasta, '.fasta') @@ -56,7 +56,7 @@ task multi_align_mafft { Float? mafft_gapOpeningPenalty Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" } Int disk_size = 200 @@ -282,7 +282,7 @@ task merge_vcfs_gatk { File ref_fasta Int? 
machine_mem_gb - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" String output_prefix = "merged" } diff --git a/pipes/WDL/tasks/tasks_intrahost.wdl b/pipes/WDL/tasks/tasks_intrahost.wdl index 3735d7d6b..551f7d450 100644 --- a/pipes/WDL/tasks/tasks_intrahost.wdl +++ b/pipes/WDL/tasks/tasks_intrahost.wdl @@ -179,7 +179,7 @@ task isnvs_per_sample { Boolean removeDoublyMappedReads = true Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" String sample_name = basename(basename(basename(mapped_bam, ".bam"), ".all"), ".mapped") } @@ -222,7 +222,7 @@ task isnvs_vcf { Boolean naiveFilter = false Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" } parameter_meta { @@ -296,7 +296,7 @@ task annotate_vcf_snpeff { String? emailAddress Int? 
machine_mem_gb - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" String output_basename = basename(basename(in_vcf, ".gz"), ".vcf") } diff --git a/pipes/WDL/tasks/tasks_ncbi.wdl b/pipes/WDL/tasks/tasks_ncbi.wdl index 519a6aa6d..73510da29 100644 --- a/pipes/WDL/tasks/tasks_ncbi.wdl +++ b/pipes/WDL/tasks/tasks_ncbi.wdl @@ -6,7 +6,7 @@ task download_fasta { Array[String]+ accessions String emailAddress - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" } command { @@ -38,7 +38,7 @@ task download_annotations { String emailAddress String combined_out_prefix - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" } command <<< @@ -85,7 +85,7 @@ task annot_transfer { File reference_fasta Array[File]+ reference_feature_table - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" } parameter_meta { @@ -139,7 +139,7 @@ task align_and_annot_transfer_single { Array[File]+ reference_fastas Array[File]+ reference_feature_tables - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" } parameter_meta { @@ -566,7 +566,7 @@ task biosample_to_genbank { File? filter_to_ids Boolean s_dropout_note = true - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" } String base = basename(biosample_attributes, ".txt") command { @@ -732,7 +732,7 @@ task prepare_genbank { String? assembly_method_version Int? 
machine_mem_gb - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" } parameter_meta { diff --git a/pipes/WDL/tasks/tasks_nextstrain.wdl b/pipes/WDL/tasks/tasks_nextstrain.wdl index 4d71a512c..280f2cca0 100644 --- a/pipes/WDL/tasks/tasks_nextstrain.wdl +++ b/pipes/WDL/tasks/tasks_nextstrain.wdl @@ -949,7 +949,7 @@ task mafft_one_chr { Boolean large = false Boolean memsavetree = false - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" Int mem_size = 500 Int cpus = 64 } @@ -1037,7 +1037,7 @@ task mafft_one_chr_chunked { Int batch_chunk_size = 2000 Int threads_per_job = 2 - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" Int mem_size = 32 Int cpus = 96 } diff --git a/requirements-modules.txt b/requirements-modules.txt index cf317546e..e4d8b4c44 100644 --- a/requirements-modules.txt +++ b/requirements-modules.txt @@ -1,7 +1,7 @@ broadinstitute/viral-core=2.1.33 broadinstitute/viral-assemble=2.1.33.0 broadinstitute/viral-classify=2.1.33.0 -broadinstitute/viral-phylo=2.1.20.0 +broadinstitute/viral-phylo=2.1.20.1 broadinstitute/py3-bio=0.1.2 broadinstitute/beast-beagle-cuda=1.10.5pre broadinstitute/ncbi-tools=2.10.7.10 From 6484b9ea0e2c317bb1b8eab71d5021f19ab66145 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Mon, 12 Dec 2022 17:58:15 -0500 Subject: [PATCH 29/29] update viral-phylo docker --- pipes/WDL/tasks/tasks_interhost.wdl | 6 +++--- pipes/WDL/tasks/tasks_intrahost.wdl | 6 +++--- pipes/WDL/tasks/tasks_ncbi.wdl | 12 ++++++------ pipes/WDL/tasks/tasks_nextstrain.wdl | 4 ++-- requirements-modules.txt | 2 +- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/pipes/WDL/tasks/tasks_interhost.wdl b/pipes/WDL/tasks/tasks_interhost.wdl index b65d81f99..bf4fb1b2c 100644 --- a/pipes/WDL/tasks/tasks_interhost.wdl +++ 
b/pipes/WDL/tasks/tasks_interhost.wdl @@ -9,7 +9,7 @@ task multi_align_mafft_ref { Float? mafft_gapOpeningPenalty Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2" } String fasta_basename = basename(reference_fasta, '.fasta') @@ -56,7 +56,7 @@ task multi_align_mafft { Float? mafft_gapOpeningPenalty Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2" } Int disk_size = 200 @@ -282,7 +282,7 @@ task merge_vcfs_gatk { File ref_fasta Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2" String output_prefix = "merged" } diff --git a/pipes/WDL/tasks/tasks_intrahost.wdl b/pipes/WDL/tasks/tasks_intrahost.wdl index 551f7d450..d2ad85c84 100644 --- a/pipes/WDL/tasks/tasks_intrahost.wdl +++ b/pipes/WDL/tasks/tasks_intrahost.wdl @@ -179,7 +179,7 @@ task isnvs_per_sample { Boolean removeDoublyMappedReads = true Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2" String sample_name = basename(basename(basename(mapped_bam, ".bam"), ".all"), ".mapped") } @@ -222,7 +222,7 @@ task isnvs_vcf { Boolean naiveFilter = false Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2" } parameter_meta { @@ -296,7 +296,7 @@ task annotate_vcf_snpeff { String? emailAddress Int? 
machine_mem_gb - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2" String output_basename = basename(basename(in_vcf, ".gz"), ".vcf") } diff --git a/pipes/WDL/tasks/tasks_ncbi.wdl b/pipes/WDL/tasks/tasks_ncbi.wdl index 73510da29..beddf490e 100644 --- a/pipes/WDL/tasks/tasks_ncbi.wdl +++ b/pipes/WDL/tasks/tasks_ncbi.wdl @@ -6,7 +6,7 @@ task download_fasta { Array[String]+ accessions String emailAddress - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2" } command { @@ -38,7 +38,7 @@ task download_annotations { String emailAddress String combined_out_prefix - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2" } command <<< @@ -85,7 +85,7 @@ task annot_transfer { File reference_fasta Array[File]+ reference_feature_table - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2" } parameter_meta { @@ -139,7 +139,7 @@ task align_and_annot_transfer_single { Array[File]+ reference_fastas Array[File]+ reference_feature_tables - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2" } parameter_meta { @@ -566,7 +566,7 @@ task biosample_to_genbank { File? filter_to_ids Boolean s_dropout_note = true - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2" } String base = basename(biosample_attributes, ".txt") command { @@ -732,7 +732,7 @@ task prepare_genbank { String? assembly_method_version Int? 
machine_mem_gb - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2" } parameter_meta { diff --git a/pipes/WDL/tasks/tasks_nextstrain.wdl b/pipes/WDL/tasks/tasks_nextstrain.wdl index 280f2cca0..d077421ce 100644 --- a/pipes/WDL/tasks/tasks_nextstrain.wdl +++ b/pipes/WDL/tasks/tasks_nextstrain.wdl @@ -949,7 +949,7 @@ task mafft_one_chr { Boolean large = false Boolean memsavetree = false - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2" Int mem_size = 500 Int cpus = 64 } @@ -1037,7 +1037,7 @@ task mafft_one_chr_chunked { Int batch_chunk_size = 2000 Int threads_per_job = 2 - String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.1" + String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2" Int mem_size = 32 Int cpus = 96 } diff --git a/requirements-modules.txt b/requirements-modules.txt index e4d8b4c44..cda9fbce8 100644 --- a/requirements-modules.txt +++ b/requirements-modules.txt @@ -1,7 +1,7 @@ broadinstitute/viral-core=2.1.33 broadinstitute/viral-assemble=2.1.33.0 broadinstitute/viral-classify=2.1.33.0 -broadinstitute/viral-phylo=2.1.20.1 +broadinstitute/viral-phylo=2.1.20.2 broadinstitute/py3-bio=0.1.2 broadinstitute/beast-beagle-cuda=1.10.5pre broadinstitute/ncbi-tools=2.10.7.10