From 34a64fb93c37c9169f2632ffb65092d6e9c65d10 Mon Sep 17 00:00:00 2001 From: Curtis Kapsak Date: Mon, 4 Nov 2024 11:41:45 -0500 Subject: [PATCH 1/4] [Mercury] bump mercury docker to 1.0.9: bugfix for GISAID metadata covv_coverage column (#661) * bump mercury docker to 1.0.9 * download_terra_table task: added set -euo pipefail, increased default memory to 2GB, ensured premptible is off and enabled memory retry feature for very large terra tables that need more RAM to download * updated docs with new mercury docker default and memory values for download_terra_table task * update mercury docs to list correct column used for assembly_mean_coverage_column_name * corrected another old mention of clearlabs_assembly_coverage --- .../phylogenetic_construction/czgenepi_prep.md | 2 +- .../public_data_sharing/mercury_prep_n_batch.md | 12 ++++++------ .../data_export/task_download_terra_table.wdl | 7 ++++++- tasks/utilities/submission/task_mercury.wdl | 2 +- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/docs/workflows/phylogenetic_construction/czgenepi_prep.md b/docs/workflows/phylogenetic_construction/czgenepi_prep.md index 27b0913b6..26036fcdd 100644 --- a/docs/workflows/phylogenetic_construction/czgenepi_prep.md +++ b/docs/workflows/phylogenetic_construction/czgenepi_prep.md @@ -26,7 +26,7 @@ This workflow runs on the set level. | czgenepi_prep | **terra_table_name** | String | The name of the Terra table where the data is hosted | | Required | | czgenepi_prep | **terra_project_name** | String | The name of the Terra project where the data is hosted | | Required | | czgenepi_prep | **terra_workspace_name** | String | The name of the Terra workspace where the data is hosted | | Required | -| download_terra_table | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 10 | Optional | +| download_terra_table | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | | download_terra_table | **docker** | String | The Docker container to use for the task | quay.io/theiagen/terra-tools:2023-06-21 | Optional | | download_terra_table | **disk_size** | String | The size of the disk used when running this task | 1 | Optional | | download_terra_table | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | diff --git a/docs/workflows/public_data_sharing/mercury_prep_n_batch.md b/docs/workflows/public_data_sharing/mercury_prep_n_batch.md index 459d3be88..4fcc48d36 100644 --- a/docs/workflows/public_data_sharing/mercury_prep_n_batch.md +++ b/docs/workflows/public_data_sharing/mercury_prep_n_batch.md @@ -56,7 +56,7 @@ To help users collect all required metadata, we have created the following Excel | --- | --- | --- | --- | --- | | `read1_column_name` | `"read1_dehosted"` | `"clearlabs_fastq_gz"` | `"reads_dehosted"` | `"reads_dehosted"` | | `assembly_fasta_column_name` | `"assembly_fasta"` | `"clearlabs_fasta"` | `"assembly_fasta"` | `"clearlabs_fasta"` | - | `assembly_mean_coverage_column_name` | `"assembly_mean_coverage"` | `"clearlabs_assembly_coverage"` | `"assembly_mean_coverage"` | `"clearlabs_assembly_coverage"` | + | `assembly_mean_coverage_column_name` | `"assembly_mean_coverage"` | `"clearlabs_sequencing_depth"` | `"assembly_mean_coverage"` | `"clearlabs_sequencing_depth"` | ### Inputs @@ -77,16 +77,16 @@ This workflow runs on the set-level. | download_terra_table | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | | download_terra_table | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 10 | Optional | | download_terra_table | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-06-21 | Optional | -| download_terra_table | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 1 | Optional | +| download_terra_table | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | | mercury | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | -| mercury | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | -| mercury | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/mercury:1.0.7 | Optional | -| mercury | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| mercury | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| mercury | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/mercury:1.0.9 | Optional | +| mercury | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | | mercury | **number_N_threshold** | Int | Only for "sars-cov-2" submissions; used to filter out any samples that contain more than the indicated number of Ns in the assembly file | 5000 | Optional | | mercury | **single_end** | Boolean | Set to true if your data is single-end; this ensures that a read2 column is not included in the metadata | FALSE | Optional | | mercury | **skip_county** | Boolean | Use if your Terra table contains a county column that you do not want to include in your submission. | FALSE | Optional | | mercury | **usa_territory** | Boolean | If true, the "state" column will be used in place of the "country" column. For example, if "state" is Puerto Rico, then the GISAID virus name will be `hCoV-19/Puerto Rico//`. The NCBI `geo_loc_name` will be "USA: Puerto Rico". This optional Boolean variable should only be used with clear understanding of what it does. | FALSE | Optional | -| mercury | **using_clearlabs_data** | Boolean | When set to true will change read1_dehosted → clearlabs_fastq_gz; assembly_fasta → clearlabs_fasta; assembly_mean_coverage → clearlabs_assembly_coverage | FALSE | Optional | +| mercury | **using_clearlabs_data** | Boolean | When set to `true` will change `read1_dehosted` → `clearlabs_fastq_gz`; `assembly_fasta` → `clearlabs_fasta`; `assembly_mean_coverage` → `clearlabs_sequencing_depth` | FALSE | Optional | | mercury | **using_reads_dehosted** | Boolean | When set to true will only change read1_dehosted → reads_dehosted. Takes priority over the replacement for read1_dehosted made with the using_clearlabs_data Boolean input | FALSE | Optional | | mercury | **vadr_alert_limit** | Int | Only for "sars-cov-2" submissions; used to filter out any samples that contain more than the indicated number of vadr alerts | 0 | Optional | | mercury_prep_n_batch | **authors_sbt** | File | Only for "mpox" submissions; a file that contains author information. This file can be created here: | | Optional | diff --git a/tasks/utilities/data_export/task_download_terra_table.wdl b/tasks/utilities/data_export/task_download_terra_table.wdl index 3068de730..e2019a910 100644 --- a/tasks/utilities/data_export/task_download_terra_table.wdl +++ b/tasks/utilities/data_export/task_download_terra_table.wdl @@ -12,11 +12,14 @@ task download_terra_table { String terra_workspace_name String terra_project_name Int disk_size = 10 - Int memory = 1 + Int memory = 2 Int cpu = 1 String docker = "us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-06-21" } command <<< + # set -euo pipefail to avoid silent failure + set -euo pipefail + python3 /scripts/export_large_tsv/export_large_tsv.py --project ~{terra_project_name} --workspace ~{terra_workspace_name} --entity_type ~{terra_table_name} --tsv_filename "~{terra_table_name}.tsv" >>> output { @@ -29,5 +32,7 @@ task download_terra_table { disks: "local-disk " + disk_size + " HDD" disk: disk_size + " GB" dx_instance_type: "mem1_ssd1_v2_x2" + preemptible: 0 # this task may take a long time and shouldn't be preempted + maxRetries: 3 } } \ No newline at end of file diff --git a/tasks/utilities/submission/task_mercury.wdl b/tasks/utilities/submission/task_mercury.wdl index d445e92aa..481645bb2 100644 --- a/tasks/utilities/submission/task_mercury.wdl +++ b/tasks/utilities/submission/task_mercury.wdl @@ -23,7 +23,7 @@ task mercury { Int cpu = 2 Int disk_size = 100 Int memory = 8 - String docker = "us-docker.pkg.dev/general-theiagen/theiagen/mercury:1.0.8" + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/mercury:1.0.9" } meta { volatile: true From 3dda02c9f292792d06fa3cd4c7ab2daea89154c9 Mon Sep 17 00:00:00 2001 From: Fraser Combe Date: Mon, 4 Nov 2024 12:36:35 -0600 Subject: [PATCH 2/4] [TheiaCov] wfs add percentage_mapped_reads (#641) * Added percentage_mapped_reads output to ivar_consensus.wdl and updated stats_n_coverage task * update mapped reads trying read_float * get read numbers from stats file * get read numbers from stats filev2 * change from bc to awk for calculation * update awk * metric output txt instead of csv * reswitchack to read_string output t * percentage mapped reads based on trimmed bam file theiacov_ont * update theiacov-ont for mapped reads * pass output ivar cons mapped reads to wf for terra output * perc mapped reads output flu track PE, ONT and clearlabs and doc update * updated namings outputs cov_ONT and removed extra call assembly metrics * change naming output stat n coverage task * update flu mapped reads perc variable name * make theiacov_ont conditional output flu mapped reads * wdl does not support if cond in output change to select first * wdl does not support if cond in output change to select first * combine flu and non flu into same mapped reads output * correct assembled reads call * update mdsums * update clearlabs for statncov call * float? * mdsums and pe wf update flu irma defined * mdsum pe * move to flue track * tidy output pe and ont * update strings and provide default values * clean tab/spaces echo * my fav commit mdsums! * remove extra space because it was bothering me * adding a space because i am crazy * update flu output * update outputs * remove string in name * correctly name percentage mapped reads * correct output in flu track wdl * update mdsums * primtrim output * another mdsum --------- Co-authored-by: Sage Wright --- .../genomic_characterization/theiacov.md | 1 + .../task_assembly_metrics.wdl | 35 +++++++++++++++++-- .../theiacov/test_wf_theiacov_clearlabs.yml | 4 +-- .../theiacov/test_wf_theiacov_illumina_pe.yml | 4 +-- .../theiacov/test_wf_theiacov_illumina_se.yml | 4 +-- .../theiacov/test_wf_theiacov_ont.yml | 4 +-- workflows/theiacov/wf_theiacov_clearlabs.wdl | 2 ++ .../theiacov/wf_theiacov_illumina_pe.wdl | 5 +-- .../theiacov/wf_theiacov_illumina_se.wdl | 2 ++ workflows/theiacov/wf_theiacov_ont.wdl | 4 ++- workflows/utilities/wf_flu_track.wdl | 7 +++- workflows/utilities/wf_ivar_consensus.wdl | 4 ++- 12 files changed, 61 insertions(+), 15 deletions(-) diff --git a/docs/workflows/genomic_characterization/theiacov.md b/docs/workflows/genomic_characterization/theiacov.md index 2e72b86d4..ba7eac3c4 100644 --- a/docs/workflows/genomic_characterization/theiacov.md +++ b/docs/workflows/genomic_characterization/theiacov.md @@ -1157,6 +1157,7 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) | pangolin_notes | String | Lineage notes as determined by Pangolin | CL, FASTA, ONT, PE, SE | | pangolin_versions | String | All Pangolin software and database versions | CL, FASTA, ONT, PE, SE | | percent_reference_coverage | Float | Percent coverage of the reference genome after performing primer trimming; calculated as assembly_length_unambiguous / length of the reference genome (SC2: 29903) x 100 | CL, FASTA, ONT, PE, SE | +| percentage_mapped_reads | String | Percentage of reads that successfully aligned to the reference genome. This value is calculated by number of mapped reads / total number of reads x 100. | ONT, PE, SE | | primer_bed_name | String | Name of the primer bed files used for primer trimming | CL, ONT, PE, SE | | primer_trimmed_read_percent | Float | Percentage of read data with primers trimmed as determined by iVar trim | PE, SE | | qc_check | String | The results of the QC Check task | CL, FASTA, ONT, PE, SE | diff --git a/tasks/quality_control/basic_statistics/task_assembly_metrics.wdl b/tasks/quality_control/basic_statistics/task_assembly_metrics.wdl index 9fe5f843b..762db23cf 100644 --- a/tasks/quality_control/basic_statistics/task_assembly_metrics.wdl +++ b/tasks/quality_control/basic_statistics/task_assembly_metrics.wdl @@ -14,11 +14,11 @@ task stats_n_coverage { samtools --version | head -n1 | tee VERSION samtools stats ~{bamfile} > ~{samplename}.stats.txt - samtools coverage ~{bamfile} -m -o ~{samplename}.cov.hist samtools coverage ~{bamfile} -o ~{samplename}.cov.txt samtools flagstat ~{bamfile} > ~{samplename}.flagstat.txt + # Extracting coverage, depth, meanbaseq, and meanmapq coverage=$(cut -f 6 ~{samplename}.cov.txt | tail -n 1) depth=$(cut -f 7 ~{samplename}.cov.txt | tail -n 1) meanbaseq=$(cut -f 8 ~{samplename}.cov.txt | tail -n 1) @@ -33,6 +33,34 @@ task stats_n_coverage { echo $depth | tee DEPTH echo $meanbaseq | tee MEANBASEQ echo $meanmapq | tee MEANMAPQ + + # Parsing stats.txt for total and mapped reads + total_reads=$(grep "^SN" ~{samplename}.stats.txt | grep "raw total sequences:" | cut -f 3) + mapped_reads=$(grep "^SN" ~{samplename}.stats.txt | grep "reads mapped:" | cut -f 3) + + # Check for empty values and set defaults to avoid errors + if [ -z "$total_reads" ]; then total_reads="1"; fi # Avoid division by zero + if [ -z "$mapped_reads" ]; then mapped_reads="0"; fi + + # Calculate the percentage of mapped reads + percentage_mapped_reads=$(awk "BEGIN {printf \"%.2f\", ($mapped_reads / $total_reads) * 100}") + + # If the percentage calculation fails, default to 0.0 + if [ -z "$percentage_mapped_reads" ]; then percentage_mapped_reads="0.0"; fi + + # Output the result + echo $percentage_mapped_reads | tee PERCENTAGE_MAPPED_READS + + #output all metrics in one txt file + # Output header row (for CSV) + echo -e "Statistic\tValue" > ~{samplename}_metrics.txt + + # Output each statistic as a row + echo -e "Coverage\t$coverage" >> ~{samplename}_metrics.txt + echo -e "Depth\t$depth" >> ~{samplename}_metrics.txt + echo -e "Mean Base Quality\t$meanbaseq" >> ~{samplename}_metrics.txt + echo -e "Mean Mapping Quality\t$meanmapq" >> ~{samplename}_metrics.txt + echo -e "Percentage Mapped Reads\t$percentage_mapped_reads" >> ~{samplename}_metrics.txt >>> output { String date = read_string("DATE") @@ -45,6 +73,9 @@ task stats_n_coverage { Float depth = read_string("DEPTH") Float meanbaseq = read_string("MEANBASEQ") Float meanmapq = read_string("MEANMAPQ") + Float percentage_mapped_reads = read_string("PERCENTAGE_MAPPED_READS") + File metrics_txt = "~{samplename}_metrics.txt" + } runtime { docker: docker @@ -55,4 +86,4 @@ task stats_n_coverage { preemptible: 0 maxRetries: 3 } -} \ No newline at end of file +} diff --git a/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml b/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml index 599fec45e..fe20263a2 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml @@ -318,7 +318,7 @@ - path: miniwdl_run/call-pangolin4/work/clearlabs.pangolin_report.csv md5sum: 151390c419b00ca44eb83e2bbfb96996 - path: miniwdl_run/call-stats_n_coverage/command - md5sum: 51da320ddc7de2ffeb263f0ddd85ced6 + md5sum: ac020678f99ac145b11d3dbc7b9fe9ba - path: miniwdl_run/call-stats_n_coverage/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-stats_n_coverage/outputs.json @@ -350,7 +350,7 @@ - path: miniwdl_run/call-stats_n_coverage/work/clearlabs.stats.txt md5sum: bfed5344c91ce6f4db1f688cac0a3ab9 - path: miniwdl_run/call-stats_n_coverage_primtrim/command - md5sum: a84f90b8877babe54bf8c068d244fbe8 + md5sum: 2974f886e1959cd5eaae5e495c91f7cc - path: miniwdl_run/call-stats_n_coverage_primtrim/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-stats_n_coverage_primtrim/outputs.json diff --git a/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml b/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml index 4c7542334..dfef9c994 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml @@ -205,7 +205,7 @@ md5sum: 511e696afe25f8b96a84d68ec5a8af8a # stats n coverage primer trim - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/command - md5sum: 260c3887be6d99b18caf6d3914c5737f + md5sum: 1c61b89c2a94e87518a6679a04885341 - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/outputs.json @@ -288,7 +288,7 @@ md5sum: 6c63395a125f8618334b8af2de4e2d88 # stats n coverage - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/command - md5sum: 1ffac4cc3e9bdd84a0f9228e8e5ca5d9 + md5sum: e49a297b1c0eb195a2acd80f00672668 - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/outputs.json diff --git a/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml b/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml index 0742c19a9..22453bdd5 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml @@ -157,7 +157,7 @@ md5sum: 603c3cbc771ca910b96d3c032aafe7c9 # stats n coverage primer trim - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/command - md5sum: 67cac223adcf059a9dfaa9f28ed34f68 + md5sum: affacdcfda48ad5e371a4510f19520bd - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/outputs.json @@ -240,7 +240,7 @@ md5sum: 03c5ecf22fdfdb6b240ac3880281a056 # stats n coverage - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/command - md5sum: 3dacccb252429a0ff46c079a75a09377 + md5sum: cb4de0e459b3fada21bcf08a8dbea89f - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/outputs.json diff --git a/tests/workflows/theiacov/test_wf_theiacov_ont.yml b/tests/workflows/theiacov/test_wf_theiacov_ont.yml index 6077323b9..333f39b90 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_ont.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_ont.yml @@ -225,7 +225,7 @@ md5sum: 32c0be4fb7f3030bf9c74c0a836d4f2e - path: miniwdl_run/call-raw_check_reads/work/_miniwdl_inputs/0/ont.fastq.gz - path: miniwdl_run/call-stats_n_coverage/command - md5sum: 93414eacbbb9d7c4813bb54a8a507078 + md5sum: fbd85e82af1bbfaa734a13a9c1394300 - path: miniwdl_run/call-stats_n_coverage/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-stats_n_coverage/outputs.json @@ -250,7 +250,7 @@ - path: miniwdl_run/call-stats_n_coverage/work/ont.flagstat.txt - path: miniwdl_run/call-stats_n_coverage/work/ont.stats.txt - path: miniwdl_run/call-stats_n_coverage_primtrim/command - md5sum: c6e7de70dfdbb1858229e44777b84110 + md5sum: 3689a902aa96e8c132e6ef4946699e61 - path: miniwdl_run/call-stats_n_coverage_primtrim/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-stats_n_coverage_primtrim/outputs.json diff --git a/workflows/theiacov/wf_theiacov_clearlabs.wdl b/workflows/theiacov/wf_theiacov_clearlabs.wdl index 8932c983f..0368bef95 100644 --- a/workflows/theiacov/wf_theiacov_clearlabs.wdl +++ b/workflows/theiacov/wf_theiacov_clearlabs.wdl @@ -207,6 +207,8 @@ workflow theiacov_clearlabs { Int number_Degenerate = consensus_qc.number_Degenerate Int number_Total = consensus_qc.number_Total Float percent_reference_coverage = consensus_qc.percent_reference_coverage + # Percentage mapped reads + Float percentage_mapped_reads = stats_n_coverage.percentage_mapped_reads # SC2 specific coverage outputs Float? sc2_s_gene_mean_coverage = gene_coverage.sc2_s_gene_depth Float? sc2_s_gene_percent_coverage = gene_coverage.sc2_s_gene_percent_coverage diff --git a/workflows/theiacov/wf_theiacov_illumina_pe.wdl b/workflows/theiacov/wf_theiacov_illumina_pe.wdl index ece3c201a..5f5e5b651 100644 --- a/workflows/theiacov/wf_theiacov_illumina_pe.wdl +++ b/workflows/theiacov/wf_theiacov_illumina_pe.wdl @@ -144,7 +144,7 @@ workflow theiacov_illumina_pe { trim_primers = trim_primers } } - # perform flu-specific tasks + # for flu organisms call flu_track if (organism_parameters.standardized_organism == "flu") { call run_flu_track.flu_track { input: @@ -439,6 +439,7 @@ workflow theiacov_illumina_pe { # QC_Check Results String? qc_check = qc_check_task.qc_check File? qc_standard = qc_check_task.qc_standard - + # Capture percentage_mapped_reads from ivar_consensus task or flu_track task + String percentage_mapped_reads = select_first([ivar_consensus.percentage_mapped_reads, flu_track.percentage_mapped_reads, ""]) } } diff --git a/workflows/theiacov/wf_theiacov_illumina_se.wdl b/workflows/theiacov/wf_theiacov_illumina_se.wdl index fa1044c24..4ae59f5dc 100644 --- a/workflows/theiacov/wf_theiacov_illumina_se.wdl +++ b/workflows/theiacov/wf_theiacov_illumina_se.wdl @@ -317,5 +317,7 @@ workflow theiacov_illumina_se { # QC_Check Results String? qc_check = qc_check_task.qc_check File? qc_standard = qc_check_task.qc_standard + # Capture percentage_mapped_reads from ivar_consensus task + String? percentage_mapped_reads = ivar_consensus.percentage_mapped_reads } } \ No newline at end of file diff --git a/workflows/theiacov/wf_theiacov_ont.wdl b/workflows/theiacov/wf_theiacov_ont.wdl index 984934062..7d8d29ad4 100644 --- a/workflows/theiacov/wf_theiacov_ont.wdl +++ b/workflows/theiacov/wf_theiacov_ont.wdl @@ -149,7 +149,7 @@ workflow theiacov_ont { standardized_organism = organism_parameters.standardized_organism, seq_method = seq_method } - } + } # nanoplot for basic QC metrics call nanoplot_task.nanoplot as nanoplot_raw { input: @@ -427,5 +427,7 @@ workflow theiacov_ont { # QC_Check Results String? qc_check = qc_check_task.qc_check File? qc_standard = qc_check_task.qc_standard + # Non-flu specific outputs + String percentage_mapped_reads = select_first([stats_n_coverage_primtrim.percentage_mapped_reads, stats_n_coverage.percentage_mapped_reads, flu_track.percentage_mapped_reads, ""]) } } \ No newline at end of file diff --git a/workflows/utilities/wf_flu_track.wdl b/workflows/utilities/wf_flu_track.wdl index 71e8e952d..6bb2f8c85 100644 --- a/workflows/utilities/wf_flu_track.wdl +++ b/workflows/utilities/wf_flu_track.wdl @@ -114,6 +114,10 @@ workflow flu_track { } # combine HA & NA assembly coverages String ha_na_assembly_coverage_string = "HA: " + select_first([ha_assembly_coverage.depth, ""]) + ", NA: " + select_first([na_assembly_coverage.depth, ""]) + + # combine HA & NA mapped reads percentages + String ha_na_percentage_mapped_reads = "HA: " + select_first([ha_assembly_coverage.percentage_mapped_reads, ""]) + ", NA: " + select_first([na_assembly_coverage.percentage_mapped_reads, ""]) + # ABRICATE will run if assembly is provided, or was generated with IRMA if (defined(irma.irma_assemblies) && defined(irma.irma_assembly_fasta)){ call abricate.abricate_flu { @@ -249,13 +253,14 @@ workflow flu_track { File? irma_mp_segment_fasta = irma.seg_mp_assembly File? irma_np_segment_fasta = irma.seg_np_assembly File? irma_ns_segment_fasta = irma.seg_ns_assembly - Array[File] irma_assemblies = irma.irma_assemblies Array[File] irma_vcfs = irma.irma_vcfs Array[File] irma_bams = irma.irma_bams File? irma_ha_bam = irma.seg_ha_bam File? irma_na_bam = irma.seg_na_bam String ha_na_assembly_coverage = ha_na_assembly_coverage_string + # calulate mapped reads percentage for flu samples + String percentage_mapped_reads = ha_na_percentage_mapped_reads # GenoFLU outputs String? genoflu_version = genoflu.genoflu_version String? genoflu_genotype = genoflu.genoflu_genotype diff --git a/workflows/utilities/wf_ivar_consensus.wdl b/workflows/utilities/wf_ivar_consensus.wdl index 8dadbd2b9..4e7112943 100644 --- a/workflows/utilities/wf_ivar_consensus.wdl +++ b/workflows/utilities/wf_ivar_consensus.wdl @@ -153,6 +153,8 @@ workflow ivar_consensus { String meanmapq_trim = select_first([stats_n_coverage_primtrim.meanmapq, stats_n_coverage.meanmapq,""]) String assembly_mean_coverage = select_first([stats_n_coverage_primtrim.depth, stats_n_coverage.depth,""]) String samtools_version_stats = stats_n_coverage.samtools_version - + + # Assembly metrics + String percentage_mapped_reads = select_first([stats_n_coverage_primtrim.percentage_mapped_reads, stats_n_coverage.percentage_mapped_reads,""]) } } From 656634447bde7784b9b01c73a75be74e80d8e19e Mon Sep 17 00:00:00 2001 From: Fraser Combe Date: Tue, 5 Nov 2024 11:00:00 -0600 Subject: [PATCH 3/4] Update MIDAS database documentation in TheiaProk (#667) --- .../genomic_characterization/theiaprok.md | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/docs/workflows/genomic_characterization/theiaprok.md b/docs/workflows/genomic_characterization/theiaprok.md index 35422e658..2b5f5308d 100644 --- a/docs/workflows/genomic_characterization/theiaprok.md +++ b/docs/workflows/genomic_characterization/theiaprok.md @@ -686,7 +686,34 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al - relative_abundance: estimated relative abundance of species in metagenome The value in the `midas_primary_genus` column is derived by ordering the rows in order of "relative_abundance" and identifying the genus of top species in the "species_id" column (Salmonella). The value in the `midas_secondary_genus` column is derived from the genus of the second-most prevalent genus in the "species_id" column (Citrobacter). The `midas_secondary_genus_abundance` column is the "relative_abundance" of the second-most prevalent genus (0.009477003). The `midas_secondary_genus_coverage` is the "coverage" of the second-most prevalent genus (0.995216227). - + + **MIDAS Reference Database Overview** + + The **MIDAS reference database** is a comprehensive tool for identifying bacterial species in metagenomic and bacterial isolate WGS data. It includes several layers of genomic data, helping detect species abundance and potential contaminants. + + **Key Components of the MIDAS Database** + + 1. **Species Groups**: + - MIDAS clusters bacterial genomes based on 96.5% sequence identity, forming over 5,950 species groups from 31,007 genomes. These groups align with the gold-standard species definition (95% ANI), ensuring highly accurate species identification. + + 2. **Genomic Data Structure**: + - **Marker Genes**: Contains 15 universal single-copy genes used to estimate species abundance. + - **Representative Genome**: Each species group has a selected representative genome, which minimizes genetic variation and aids in accurate SNP identification. + - **Pan-genome**: The database includes clusters of non-redundant genes, with options for multi-level clustering (e.g., 99%, 95%, 90% identity), enabling MIDAS to identify gene content within strains at various clustering thresholds. + + 3. **Taxonomic Annotation**: + - Genomes are annotated based on consensus Latin names. Discrepancies in name assignments may occur due to factors like unclassified genomes or genus-level ambiguities. + + --- + + **Using the Default MIDAS Database** + + TheiaProk uses the pre-loaded MIDAS database in Terra (see input table for current version) by default for bacterial species detection in metagenomic data, requiring no additional setup. + + **How to Set Up the Default MIDAS Database** + + Users can also build their own custom MIDAS database if they want to include specific genomes or configurations. This custom database can replace the default MIDAS database used in Terra. To build a custom MIDAS database, follow the [MIDAS GitHub guide on building a custom database](https://github.com/snayfach/MIDAS/blob/master/docs/build_db.md). Once the database is built, users can upload it to a Google Cloud Storage bucket or Terra workkspace and provide the link to the database in the `midas_db` input variable. + Alternatively to `MIDAS`, the `Kraken2` task can also be turned on through setting the `call_kraken` input variable as `true` for the identification of reads to detect contamination with non-target taxa. Kraken2 is a bioinformatics tool originally designed for metagenomic applications. It has additionally proven valuable for validating taxonomic assignments and checking contamination of single-species (e.g. bacterial isolate) whole genome sequence data. A database must be provided if this optional module is activated, through the kraken_db optional input. A list of suggested databases can be found on [Kraken2 standalone documentation](../standalone/kraken2.md). From 4bcf3f2c98e56a482e5b9d993bd2a4231fc825fa Mon Sep 17 00:00:00 2001 From: James Richard Otieno Date: Wed, 6 Nov 2024 11:59:27 -0500 Subject: [PATCH 4/4] Add Snippy_Variants QC outputs to Snippy_Tree and Snippy_Sreamline workflow outputs (#592) * capture qc metrics from snipy variants in snippy_tree and snippy_streamline * adding combined qc output to snippy streamline * correct output * merge conflicts outputs snippy tidy up * updates qc metrics logic and elif * update qc metrics combined so output is a tsv file anme in snippytree wf * update docs for snippy streamline and snippy tree.md * forgot the comma * update error messages snippy variants * updates streamline_fasta and docs * forgot the comma * update documentation snippy qc description * moved task snippy variants up a few lines * update docs for qc metrics output table and task dropdown * update streamline fasta docs --------- Co-authored-by: fraser-combe --- .../snippy_streamline.md | 27 +++++++++++++ .../snippy_streamline_fasta.md | 29 ++++++++++++++ .../phylogenetic_construction/snippy_tree.md | 27 +++++++++++++ .../snippy_variants.md | 7 ++++ .../task_snippy_variants.wdl | 40 +++++++++++++++++-- .../phylogenetics/wf_snippy_streamline.wdl | 4 +- .../wf_snippy_streamline_fasta.wdl | 4 +- workflows/phylogenetics/wf_snippy_tree.wdl | 12 ++++++ .../standalone_modules/wf_snippy_variants.wdl | 1 + 9 files changed, 146 insertions(+), 5 deletions(-) diff --git a/docs/workflows/phylogenetic_construction/snippy_streamline.md b/docs/workflows/phylogenetic_construction/snippy_streamline.md index ed70f28be..c794be4c8 100644 --- a/docs/workflows/phylogenetic_construction/snippy_streamline.md +++ b/docs/workflows/phylogenetic_construction/snippy_streamline.md @@ -173,6 +173,32 @@ For all cases: `Snippy_Variants` aligns reads for each sample against the reference genome. As part of `Snippy_Streamline`, the only output from this workflow is the `snippy_variants_outdir_tarball` which is provided in the set-level data table. Please see the full documentation for [Snippy_Variants](./snippy_variants.md) for more information. +??? task "snippy_variants (qc_metrics output)" + + ##### snippy_variants {#snippy_variants} + + This task runs Snippy to perform SNP analysis on individual samples. It extracts QC metrics from the Snippy output for each sample and saves them in per-sample TSV files (`snippy_variants_qc_metrics`). These per-sample QC metrics include the following columns: + + - **samplename**: The name of the sample. + - **reads_aligned_to_reference**: The number of reads that aligned to the reference genome. + - **total_reads**: The total number of reads in the sample. + - **percent_reads_aligned**: The percentage of reads that aligned to the reference genome. + - **variants_total**: The total number of variants detected between the sample and the reference genome. + - **percent_ref_coverage**: The percentage of the reference genome covered by reads with a depth greater than or equal to the `min_coverage` threshold (default is 10). + - **#rname**: Reference sequence name (e.g., chromosome or contig name). + - **startpos**: Starting position of the reference sequence. + - **endpos**: Ending position of the reference sequence. + - **numreads**: Number of reads covering the reference sequence. + - **covbases**: Number of bases with coverage. + - **coverage**: Percentage of the reference sequence covered (depth ≥ 1). + - **meandepth**: Mean depth of coverage over the reference sequence. + - **meanbaseq**: Mean base quality over the reference sequence. + - **meanmapq**: Mean mapping quality over the reference sequence. + + These per-sample QC metrics are then combined into a single file (`snippy_combined_qc_metrics`) in the downstream `snippy_tree_wf` workflow. The combined QC metrics file includes the same columns as above for all samples. Note that the last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. + + **Note:** The per-sample QC metrics provide valuable insights into the quality and coverage of your sequencing data relative to the reference genome. Monitoring these metrics can help identify samples with low coverage, poor alignment, or potential issues that may affect downstream analyses. + ??? task "Snippy_Tree workflow" ##### Snippy_Tree {#snippy_tree} @@ -194,6 +220,7 @@ For all cases: | snippy_centroid_version | String | Centroid version used | | snippy_cg_snp_matrix | File | CSV file of core genome pairwise SNP distances between samples, calculated from the final alignment | | snippy_concatenated_variants | File | The concatenated variants file | +| snippy_combined_qc_metrics | File | Combined QC metrics file containing concatenated QC metrics from all samples. | | snippy_filtered_metadata | File | TSV recording the columns of the Terra data table that were used in the summarize_data task | | snippy_final_alignment | File | Final alignment (FASTA file) used to generate the tree (either after snippy alignment, gubbins recombination removal, and/or core site selection with SNP-sites) | | snippy_final_tree | File | Final phylogenetic tree produced by Snippy_Streamline | diff --git a/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md b/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md index 0e9680518..352d5a55c 100644 --- a/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md +++ b/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md @@ -37,6 +37,34 @@ The `Snippy_Streamline_FASTA` workflow is an all-in-one approach to generating a **If reference genomes have multiple contigs, they will not be compatible with using Gubbins** to mask recombination in the phylogenetic tree. The automatic selection of a reference genome by the workflow may result in a reference with multiple contigs. In this case, an alternative reference genome should be sought. +### Workflow Tasks + +??? task "snippy_variants (qc_metrics output)" + + ##### snippy_variants {#snippy_variants} + + This task runs Snippy to perform SNP analysis on individual samples. It extracts QC metrics from the Snippy output for each sample and saves them in per-sample TSV files (`snippy_variants_qc_metrics`). These per-sample QC metrics include the following columns: + + - **samplename**: The name of the sample. + - **reads_aligned_to_reference**: The number of reads that aligned to the reference genome. + - **total_reads**: The total number of reads in the sample. + - **percent_reads_aligned**: The percentage of reads that aligned to the reference genome. + - **variants_total**: The total number of variants detected between the sample and the reference genome. + - **percent_ref_coverage**: The percentage of the reference genome covered by reads with a depth greater than or equal to the `min_coverage` threshold (default is 10). + - **#rname**: Reference sequence name (e.g., chromosome or contig name). + - **startpos**: Starting position of the reference sequence. + - **endpos**: Ending position of the reference sequence. + - **numreads**: Number of reads covering the reference sequence. + - **covbases**: Number of bases with coverage. + - **coverage**: Percentage of the reference sequence covered (depth ≥ 1). + - **meandepth**: Mean depth of coverage over the reference sequence. + - **meanbaseq**: Mean base quality over the reference sequence. + - **meanmapq**: Mean mapping quality over the reference sequence. + + These per-sample QC metrics are then combined into a single file (`snippy_combined_qc_metrics`) in the downstream `snippy_tree_wf` workflow. The combined QC metrics file includes the same columns as above for all samples. Note that the last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. + + **Note:** The per-sample QC metrics provide valuable insights into the quality and coverage of your sequencing data relative to the reference genome. Monitoring these metrics can help identify samples with low coverage, poor alignment, or potential issues that may affect downstream analyses. + ### Inputs
@@ -123,6 +151,7 @@ The `Snippy_Streamline_FASTA` workflow is an all-in-one approach to generating a | snippy_centroid_samplename | String | Name of the centroid sample | | snippy_centroid_version | String | Centroid version used | | snippy_cg_snp_matrix | File | CSV file of core genome pairwise SNP distances between samples, calculated from the final alignment | +| snippy_combined_qc_metrics | File | Combined QC metrics file containing concatenated QC metrics from all samples. | | snippy_concatenated_variants | File | The concatenated variants file | | snippy_filtered_metadata | File | TSV recording the columns of the Terra data table that were used in the summarize_data task | | snippy_final_alignment | File | Final alignment (FASTA file) used to generate the tree (either after snippy alignment, gubbins recombination removal, and/or core site selection with SNP-sites) | diff --git a/docs/workflows/phylogenetic_construction/snippy_tree.md b/docs/workflows/phylogenetic_construction/snippy_tree.md index 4c9c7b02b..d6c0a272b 100644 --- a/docs/workflows/phylogenetic_construction/snippy_tree.md +++ b/docs/workflows/phylogenetic_construction/snippy_tree.md @@ -310,6 +310,32 @@ Sequencing data used in the Snippy_Tree workflow must: | Task | task_shared_variants.wdl | | Software Source Code | [task_shared_variants.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/phylogenetic_inference/utilities/task_shared_variants.wdl) | +??? task "snippy_variants (qc_metrics output)" + + ##### snippy_variants {#snippy_variants} + + This task runs Snippy to perform SNP analysis on individual samples. It extracts QC metrics from the Snippy output for each sample and saves them in per-sample TSV files (`snippy_variants_qc_metrics`). These per-sample QC metrics include the following columns: + + - **samplename**: The name of the sample. + - **reads_aligned_to_reference**: The number of reads that aligned to the reference genome. + - **total_reads**: The total number of reads in the sample. + - **percent_reads_aligned**: The percentage of reads that aligned to the reference genome. + - **variants_total**: The total number of variants detected between the sample and the reference genome. + - **percent_ref_coverage**: The percentage of the reference genome covered by reads with a depth greater than or equal to the `min_coverage` threshold (default is 10). + - **#rname**: Reference sequence name (e.g., chromosome or contig name). + - **startpos**: Starting position of the reference sequence. + - **endpos**: Ending position of the reference sequence. + - **numreads**: Number of reads covering the reference sequence. + - **covbases**: Number of bases with coverage. + - **coverage**: Percentage of the reference sequence covered (depth ≥ 1). + - **meandepth**: Mean depth of coverage over the reference sequence. + - **meanbaseq**: Mean base quality over the reference sequence. + - **meanmapq**: Mean mapping quality over the reference sequence. + + These per-sample QC metrics are then combined into a single file (`snippy_combined_qc_metrics`) in the downstream `snippy_tree_wf` workflow. The combined QC metrics file includes the same columns as above for all samples. Note that the last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. + + **Note:** The per-sample QC metrics provide valuable insights into the quality and coverage of your sequencing data relative to the reference genome. Monitoring these metrics can help identify samples with low coverage, poor alignment, or potential issues that may affect downstream analyses. + ### Outputs
@@ -318,6 +344,7 @@ Sequencing data used in the Snippy_Tree workflow must: |---|---|---| | snippy_cg_snp_matrix | File | CSV file of core genome pairwise SNP distances between samples, calculated from the final alignment | | snippy_concatenated_variants | File | Concatenated snippy_results file across all samples in the set | +| snippy_combined_qc_metrics | File | Combined QC metrics file containing concatenated QC metrics from all samples. | | snippy_filtered_metadata | File | TSV recording the columns of the Terra data table that were used in the summarize_data task | | snippy_final_alignment | File | Final alignment (FASTA file) used to generate the tree (either after snippy alignment, gubbins recombination removal, and/or core site selection with SNP-sites) | | snippy_final_tree | File | Newick tree produced from the final alignment. Depending on user input for core_genome, the tree could be a core genome tree (default when core_genome is true) or whole genome tree (if core_genome is false) | diff --git a/docs/workflows/phylogenetic_construction/snippy_variants.md b/docs/workflows/phylogenetic_construction/snippy_variants.md index b62d1fbb3..4ec73569a 100644 --- a/docs/workflows/phylogenetic_construction/snippy_variants.md +++ b/docs/workflows/phylogenetic_construction/snippy_variants.md @@ -62,6 +62,13 @@ The `Snippy_Variants` workflow aligns single-end or paired-end reads (in FASTQ f `Snippy_Variants` uses the snippy tool to align reads to the reference and call SNPs, MNPs and INDELs according to optional input parameters. The output includes a file of variants that is then queried using the `grep` bash command to identify any mutations in specified genes or annotations of interest. The query string MUST match the gene name or annotation as specified in the GenBank file and provided in the output variant file in the `snippy_results` column. +Additionally, `Snippy_Variants` extracts quality control (QC) metrics from the Snippy output for each sample. These per-sample QC metrics are saved in TSV files (`snippy_variants_qc_metrics`). The QC metrics include: + +- **Percentage of reads aligned to the reference genome** (`snippy_variants_percent_reads_aligned`). +- **Percentage of the reference genome covered at or above the specified depth threshold** (`snippy_variants_percent_ref_coverage`). + +These per-sample QC metrics can be combined into a single file (`snippy_combined_qc_metrics`) in downstream workflows, such as `snippy_tree_wf`, providing an overview of QC metrics across all samples. + ### Outputs !!! tip "Visualize your outputs in IGV" diff --git a/tasks/gene_typing/variant_detection/task_snippy_variants.wdl b/tasks/gene_typing/variant_detection/task_snippy_variants.wdl index a8da380b4..eea38ac1d 100644 --- a/tasks/gene_typing/variant_detection/task_snippy_variants.wdl +++ b/tasks/gene_typing/variant_detection/task_snippy_variants.wdl @@ -89,19 +89,52 @@ task snippy_variants { if [ "$reference_length" -eq 0 ]; then echo "Could not compute percent reference coverage: reference length is 0" > PERCENT_REF_COVERAGE else - # compute percent reference coverage - echo $reference_length_passed_depth $reference_length | awk '{ print ($1/$2)*100 }' > PERCENT_REF_COVERAGE + echo $reference_length_passed_depth $reference_length | awk '{ printf("%.2f", ($1/$2)*100) }' > PERCENT_REF_COVERAGE fi # Compute percentage of reads aligned reads_aligned=$(cat READS_ALIGNED_TO_REFERENCE) total_reads=$(samtools view -c "~{samplename}/~{samplename}.bam") + echo $total_reads > TOTAL_READS if [ "$total_reads" -eq 0 ]; then echo "Could not compute percent reads aligned: total reads is 0" > PERCENT_READS_ALIGNED else - echo $reads_aligned $total_reads | awk '{ print ($1/$2)*100 }' > PERCENT_READS_ALIGNED + echo $reads_aligned $total_reads | awk '{ printf("%.2f", ($1/$2)*100) }' > PERCENT_READS_ALIGNED fi + # Create QC metrics file + line_count=$(wc -l < "~{samplename}/~{samplename}_coverage.tsv") + # Check the number of lines in the coverage file, to consider scenarios e.g. for V. cholerae that has two chromosomes and therefore coverage metrics per chromosome + if [ "$line_count" -eq 2 ]; then + head -n 1 "~{samplename}/~{samplename}_coverage.tsv" | tr ' ' '\t' > COVERAGE_HEADER + sed -n '2p' "~{samplename}/~{samplename}_coverage.tsv" | tr ' ' '\t' > COVERAGE_VALUES + elif [ "$line_count" -gt 2 ]; then + # Multiple chromosomes (header + multiple data lines) + header=$(head -n 1 "~{samplename}/~{samplename}_coverage.tsv") + output_header="" + output_values="" + # while loop to iterate over each line in the coverage file + while read -r line; do + if [ -z "$output_header" ]; then + output_header="$header" + output_values="$line" + else + output_header="$output_header\t$header" + output_values="$output_values\t$line" + fi + done < <(tail -n +2 "~{samplename}/~{samplename}_coverage.tsv") + echo "$output_header" | tr ' ' '\t' > COVERAGE_HEADER + echo "$output_values" | tr ' ' '\t' > COVERAGE_VALUES + else + # Coverage file has insufficient data + echo "Coverage file has insufficient data." > COVERAGE_HEADER + echo "" > COVERAGE_VALUES + fi + + # Build the QC metrics file + echo -e "samplename\treads_aligned_to_reference\ttotal_reads\tpercent_reads_aligned\tvariants_total\tpercent_ref_coverage\t$(cat COVERAGE_HEADER)" > "~{samplename}/~{samplename}_qc_metrics.tsv" + echo -e "~{samplename}\t$reads_aligned\t$total_reads\t$(cat PERCENT_READS_ALIGNED)\t$(cat VARIANTS_TOTAL)\t$(cat PERCENT_REF_COVERAGE)\t$(cat COVERAGE_VALUES)" >> "~{samplename}/~{samplename}_qc_metrics.tsv" + >>> output { String snippy_variants_version = read_string("VERSION") @@ -120,6 +153,7 @@ task snippy_variants { String snippy_variants_ref_length = read_string("REFERENCE_LENGTH") String snippy_variants_ref_length_passed_depth = read_string("REFERENCE_LENGTH_PASSED_DEPTH") String snippy_variants_percent_ref_coverage = read_string("PERCENT_REF_COVERAGE") + File snippy_variants_qc_metrics = "~{samplename}/~{samplename}_qc_metrics.tsv" String snippy_variants_percent_reads_aligned = read_string("PERCENT_READS_ALIGNED") } runtime { diff --git a/workflows/phylogenetics/wf_snippy_streamline.wdl b/workflows/phylogenetics/wf_snippy_streamline.wdl index cc453cf8a..f62043518 100644 --- a/workflows/phylogenetics/wf_snippy_streamline.wdl +++ b/workflows/phylogenetics/wf_snippy_streamline.wdl @@ -50,7 +50,8 @@ workflow snippy_streamline { tree_name = tree_name_updated, snippy_variants_outdir_tarball = snippy_variants_wf.snippy_variants_outdir_tarball, samplenames = samplenames, - reference_genome_file = select_first([reference_genome_file, ncbi_datasets_download_genome_accession.ncbi_datasets_assembly_fasta]) + reference_genome_file = select_first([reference_genome_file, ncbi_datasets_download_genome_accession.ncbi_datasets_assembly_fasta]), + snippy_variants_qc_metrics = snippy_variants_wf.snippy_variants_qc_metrics } call versioning.version_capture { input: @@ -111,5 +112,6 @@ workflow snippy_streamline { File? snippy_filtered_metadata = snippy_tree_wf.snippy_filtered_metadata File? snippy_concatenated_variants = snippy_tree_wf.snippy_concatenated_variants File? snippy_shared_variants_table = snippy_tree_wf.snippy_shared_variants_table + File? snippy_combined_qc_metrics = snippy_tree_wf.snippy_combined_qc_metrics } } \ No newline at end of file diff --git a/workflows/phylogenetics/wf_snippy_streamline_fasta.wdl b/workflows/phylogenetics/wf_snippy_streamline_fasta.wdl index 1cc3568ab..d4b580691 100644 --- a/workflows/phylogenetics/wf_snippy_streamline_fasta.wdl +++ b/workflows/phylogenetics/wf_snippy_streamline_fasta.wdl @@ -48,7 +48,8 @@ workflow snippy_streamline_fasta { tree_name = tree_name_updated, snippy_variants_outdir_tarball = snippy_variants_wf.snippy_variants_outdir_tarball, samplenames = samplenames, - reference_genome_file = select_first([reference_genome_file, ncbi_datasets_download_genome_accession.ncbi_datasets_assembly_fasta]) + reference_genome_file = select_first([reference_genome_file, ncbi_datasets_download_genome_accession.ncbi_datasets_assembly_fasta]), + snippy_variants_qc_metrics = snippy_variants_wf.snippy_variants_qc_metrics } call versioning.version_capture { input: @@ -106,5 +107,6 @@ workflow snippy_streamline_fasta { File? snippy_filtered_metadata = snippy_tree_wf.snippy_filtered_metadata File? snippy_concatenated_variants = snippy_tree_wf.snippy_concatenated_variants File? snippy_shared_variants_table = snippy_tree_wf.snippy_shared_variants_table + File? snippy_combined_qc_metrics = snippy_tree_wf.snippy_combined_qc_metrics } } \ No newline at end of file diff --git a/workflows/phylogenetics/wf_snippy_tree.wdl b/workflows/phylogenetics/wf_snippy_tree.wdl index 5fcc04a0a..acd4ec53f 100644 --- a/workflows/phylogenetics/wf_snippy_tree.wdl +++ b/workflows/phylogenetics/wf_snippy_tree.wdl @@ -23,6 +23,7 @@ workflow snippy_tree_wf { Boolean use_gubbins = true Boolean core_genome = true Boolean call_shared_variants = true + Array[File]? snippy_variants_qc_metrics String? data_summary_terra_project String? data_summary_terra_workspace @@ -186,6 +187,14 @@ workflow snippy_tree_wf { concatenated_file_name = tree_name_updated } } + if (defined(snippy_variants_qc_metrics)) { + call file_handling.cat_files as concatenate_qc_metrics { + input: + files_to_cat = select_first([snippy_variants_qc_metrics]), + concatenated_file_name = tree_name_updated + "_combined_qc_metrics.tsv", + skip_extra_headers = true + } + } call versioning.version_capture { input: } @@ -233,5 +242,8 @@ workflow snippy_tree_wf { # shared snps outputs File? snippy_concatenated_variants = concatenate_variants.concatenated_variants File? snippy_shared_variants_table = shared_variants.shared_variants_table + + # combined qc metrics + File? snippy_combined_qc_metrics = concatenate_qc_metrics.concatenated_files } } diff --git a/workflows/standalone_modules/wf_snippy_variants.wdl b/workflows/standalone_modules/wf_snippy_variants.wdl index 381043679..e53e5f770 100644 --- a/workflows/standalone_modules/wf_snippy_variants.wdl +++ b/workflows/standalone_modules/wf_snippy_variants.wdl @@ -70,6 +70,7 @@ workflow snippy_variants_wf { File snippy_variants_coverage_tsv = snippy_variants.snippy_variants_coverage_tsv Int snippy_variants_num_variants = snippy_variants.snippy_variants_num_variants Float snippy_variants_percent_ref_coverage = snippy_variants.snippy_variants_percent_ref_coverage + File snippy_variants_qc_metrics = snippy_variants.snippy_variants_qc_metrics Float snippy_variants_percent_reads_aligned = snippy_variants.snippy_variants_percent_reads_aligned # snippy gene query outputs String? snippy_variants_query = snippy_gene_query.snippy_variants_query