From d87908236656f9abe835bb6098c9fa4f996c6f30 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Wed, 21 Dec 2022 09:25:57 -0500 Subject: [PATCH 1/2] allow classify_single to merge multiple input replicate bams --- pipes/WDL/tasks/tasks_read_utils.wdl | 2 +- pipes/WDL/workflows/classify_single.wdl | 17 ++++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/pipes/WDL/tasks/tasks_read_utils.wdl b/pipes/WDL/tasks/tasks_read_utils.wdl index 46dac190e..dd1db3a69 100644 --- a/pipes/WDL/tasks/tasks_read_utils.wdl +++ b/pipes/WDL/tasks/tasks_read_utils.wdl @@ -144,7 +144,7 @@ task merge_and_reheader_bams { Array[File]+ in_bams String? sample_name File? reheader_table - String out_basename + String out_basename = basename(in_bams[0], ".bam") String docker = "quay.io/broadinstitute/viral-core:2.1.33" } diff --git a/pipes/WDL/workflows/classify_single.wdl b/pipes/WDL/workflows/classify_single.wdl index e26c05b34..3d7d6f9c2 100644 --- a/pipes/WDL/workflows/classify_single.wdl +++ b/pipes/WDL/workflows/classify_single.wdl @@ -14,7 +14,7 @@ workflow classify_single { } input { - File reads_bam + Array[File]+ reads_bams File ncbi_taxdump_tgz @@ -26,8 +26,8 @@ workflow classify_single { } parameter_meta { - reads_bam: { - description: "Reads to classify. May be unmapped or mapped or both, paired-end or single-end.", + reads_bams: { + description: "Reads to classify. May be unmapped or mapped or both, paired-end or single-end. 
Multiple input files will be merged first.", patterns: ["*.bam"] } spikein_db: { @@ -52,9 +52,12 @@ workflow classify_single { } } - call reports.fastqc as fastqc_raw { - input: reads_bam = reads_bam + call read_utils.merge_and_reheader_bams as merge_raw_reads { + input: + in_bams = reads_bams } + File reads_bam = merge_raw_reads.out_bam + call reports.align_and_count as spikein { input: reads_bam = reads_bam, @@ -110,6 +113,10 @@ workflow classify_single { File kraken2_summary_report = kraken2.kraken2_summary_report File kraken2_krona_plot = kraken2.krona_report_html + File raw_fastqc = merge_raw_reads.fastqc + File cleaned_fastqc = fastqc_cleaned.fastqc_html + File spikein_report = spikein.report + String spikein_tophit = spikein.top_hit_id String kraken2_viral_classify_version = kraken2.viralngs_version String deplete_viral_classify_version = deplete.viralngs_version From e986cc1d807f5ea305db48755040809e9db3f503 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Wed, 21 Dec 2022 10:05:01 -0500 Subject: [PATCH 2/2] update task kraken2 wdl syntax and bump default ram to 72gb --- pipes/WDL/tasks/tasks_metagenomics.wdl | 66 +++++++++++++------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/pipes/WDL/tasks/tasks_metagenomics.wdl b/pipes/WDL/tasks/tasks_metagenomics.wdl index d9b806193..127d354b4 100644 --- a/pipes/WDL/tasks/tasks_metagenomics.wdl +++ b/pipes/WDL/tasks/tasks_metagenomics.wdl @@ -209,7 +209,7 @@ task kraken2 { Float? confidence_threshold Int? min_base_qual - Int? 
machine_mem_gb + Int machine_mem_gb = 72 String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0" } String out_basename = basename(basename(reads_bam, '.bam'), '.fasta') Int disk_size = 750 - command { + command <<< set -ex -o pipefail if [ -z "$TMPDIR" ]; then # decompress DB to $DB_DIR read_utils.py extract_tarball \ - ${kraken2_db_tgz} $DB_DIR/kraken2 \ + "~{kraken2_db_tgz}" $DB_DIR/kraken2 \ --loglevel=DEBUG du -hs $DB_DIR/kraken2 # unpack krona taxonomy.tab - if [[ ${krona_taxonomy_db_tgz} == *.tar.* ]]; then + if [[ "~{krona_taxonomy_db_tgz}" == *.tar.* ]]; then read_utils.py extract_tarball \ - ${krona_taxonomy_db_tgz} $DB_DIR/krona \ + "~{krona_taxonomy_db_tgz}" $DB_DIR/krona \ --loglevel=DEBUG & # we don't need this until later else - if [[ "${krona_taxonomy_db_tgz}" == *.zst ]]; then - cat "${krona_taxonomy_db_tgz}" | zstd -d > $DB_DIR/krona/taxonomy.tab & - elif [[ "${krona_taxonomy_db_tgz}" == *.gz ]]; then - cat "${krona_taxonomy_db_tgz}" | pigz -dc > $DB_DIR/krona/taxonomy.tab & - elif [[ "${krona_taxonomy_db_tgz}" == *.bz2 ]]; then - cat "${krona_taxonomy_db_tgz}" | bzip -dc > $DB_DIR/krona/taxonomy.tab & + if [[ "~{krona_taxonomy_db_tgz}" == *.zst ]]; then + cat "~{krona_taxonomy_db_tgz}" | zstd -d > $DB_DIR/krona/taxonomy.tab & + elif [[ "~{krona_taxonomy_db_tgz}" == *.gz ]]; then + cat "~{krona_taxonomy_db_tgz}" | pigz -dc > $DB_DIR/krona/taxonomy.tab & + elif [[ "~{krona_taxonomy_db_tgz}" == *.bz2 ]]; then + cat "~{krona_taxonomy_db_tgz}" | bzip2 -dc > $DB_DIR/krona/taxonomy.tab & else - cp "${krona_taxonomy_db_tgz}" $DB_DIR/krona/taxonomy.tab & + cp "~{krona_taxonomy_db_tgz}" $DB_DIR/krona/taxonomy.tab & fi fi metagenomics.py --version | tee VERSION - if [[ ${reads_bam} == *.bam ]]; then + if [[ "~{reads_bam}" == *.bam ]]; then metagenomics.py kraken2 \ $DB_DIR/kraken2 \ - ${reads_bam} \ - --outReads "${out_basename}".kraken2.reads.txt \ - --outReports
"${out_basename}".kraken2.report.txt \ - ${"--confidence " + confidence_threshold} \ - ${"--min_base_qual " + min_base_qual} \ + "~{reads_bam}" \ + --outReads "~{out_basename}".kraken2.reads.txt \ + --outReports "~{out_basename}".kraken2.report.txt \ + ~{"--confidence " + confidence_threshold} \ + ~{"--min_base_qual " + min_base_qual} \ --loglevel=DEBUG else # fasta input file: call kraken2 directly kraken2 \ --db $DB_DIR/kraken2 \ - ${reads_bam} \ - --output "${out_basename}".kraken2.reads.txt \ - --report "${out_basename}".kraken2.report.txt \ - ${"--confidence " + confidence_threshold} \ - ${"--min_base_qual " + min_base_qual} + "~{reads_bam}" \ + --output "~{out_basename}".kraken2.reads.txt \ + --report "~{out_basename}".kraken2.report.txt \ + ~{"--confidence " + confidence_threshold} \ + ~{"--min_base_qual " + min_base_qual} fi wait # for krona_taxonomy_db_tgz to download and extract - pigz "${out_basename}".kraken2.reads.txt & + pigz "~{out_basename}".kraken2.reads.txt & metagenomics.py krona \ - "${out_basename}".kraken2.report.txt \ + "~{out_basename}".kraken2.report.txt \ $DB_DIR/krona \ - "${out_basename}".kraken2.krona.html \ - --sample_name "${out_basename}" \ + "~{out_basename}".kraken2.krona.html \ + --sample_name "~{out_basename}" \ --noRank --noHits --inputType kraken2 \ --loglevel=DEBUG wait # pigz reads.txt - } + >>> output { - File kraken2_reads_report = "${out_basename}.kraken2.reads.txt.gz" - File kraken2_summary_report = "${out_basename}.kraken2.report.txt" - File krona_report_html = "${out_basename}.kraken2.krona.html" + File kraken2_reads_report = "~{out_basename}.kraken2.reads.txt.gz" + File kraken2_summary_report = "~{out_basename}.kraken2.report.txt" + File krona_report_html = "~{out_basename}.kraken2.krona.html" String viralngs_version = read_string("VERSION") } runtime { - docker: "${docker}" - memory: select_first([machine_mem_gb, 52]) + " GB" + docker: docker + memory: machine_mem_gb + " GB" cpu: 8 disks: "local-disk " + disk_size + " 
LOCAL" disk: disk_size + " GB" # TESs