diff --git a/docs/workflows/genomic_characterization/freyja.md b/docs/workflows/genomic_characterization/freyja.md index f93428521..c5b15c54d 100644 --- a/docs/workflows/genomic_characterization/freyja.md +++ b/docs/workflows/genomic_characterization/freyja.md @@ -117,6 +117,7 @@ This workflow runs on the sample level. | freyja | **number_bootstraps** | Int | The number of bootstraps to perform (only used if bootstrap = true) | 100 | Optional | | freyja | **update_db** | Boolean | Updates the Freyja reference files (the usher barcodes and lineage metadata files) but will not save them as output (use Freyja_Update for that purpose). If set to true, the `freyja_lineage_metadata` and `freyja_usher_barcodes` files are not required. | FALSE | Optional | | freyja_fastq | **depth_cutoff** | Int | The minimum coverage depth with which to exclude sites below this value and group identical barcodes | 10 | Optional | +| freyja_fastq | **kraken2_target_organism** | String | The organism whose abundance the user wants to check in their reads. This should be a proper taxonomic name recognized by the Kraken database. | "Severe acute respiratory syndrome coronavirus 2" | Optional | | freyja_fastq | **ont** | Boolean | Indicates if the input data is derived from an ONT instrument. 
| FALSE | Optional | | freyja_fastq | **read2** | File | The raw reverse-facing FASTQ file (Illumina only) | | Optional | | freyja_fastq | **trimmomatic_minlen** | Int | The minimum length cut-off when performing read cleaning | 25 | Optional | @@ -371,8 +372,8 @@ The main output file used in subsequent Freyja workflows is found under the `fre | kraken_human_dehosted | Float | Percent of human read data detected using the Kraken2 software after host removal | ONT, PE, SE | | kraken_report | File | Full Kraken report | ONT, PE, SE | | kraken_report_dehosted | File | Full Kraken report after host removal | ONT, PE, SE | -| kraken_sc2 | Float | Percent of SARS-CoV-2 read data detected using the Kraken2 software | ONT, PE, SE | -| kraken_sc2_dehosted | Float | Percent of SARS-CoV-2 read data detected using the Kraken2 software after host removal | ONT, PE, SE | +| kraken_sc2 | String | Percent of SARS-CoV-2 read data detected using the Kraken2 software | ONT, PE, SE | +| kraken_sc2_dehosted | String | Percent of SARS-CoV-2 read data detected using the Kraken2 software after host removal | ONT, PE, SE | | kraken_version | String | Version of Kraken software used | ONT, PE, SE | | minimap2_docker | String | Docker image used to run minimap2 | ONT | | minimap2_version | String | Version of minimap2 used | ONT | diff --git a/docs/workflows/genomic_characterization/theiacov.md b/docs/workflows/genomic_characterization/theiacov.md index 74f5de668..812ccc53d 100644 --- a/docs/workflows/genomic_characterization/theiacov.md +++ b/docs/workflows/genomic_characterization/theiacov.md @@ -221,14 +221,14 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) | ivar_consensus | **stats_n_coverage_primtrim_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | SE,PE | HIV, MPXV, WNV, rsv_a, rsv_b, sars-cov-2 | | kraken2_dehosted | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | CL | sars-cov-2 | | kraken2_dehosted | **disk_size** | 
Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | CL | sars-cov-2 | -| kraken2_dehosted | **docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.0.8-beta_hv | Optional | CL | sars-cov-2 | -| kraken2_dehosted | **kraken2_db** | String | The database used to run Kraken2 | /kraken2-db | Optional | CL | sars-cov-2 | +| kraken2_dehosted | **docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.1.2-no-db | Optional | CL | sars-cov-2 | +| kraken2_dehosted | **kraken2_db** | File | The database used to run Kraken2. Must contain viral and human sequences. | "gs://theiagen-large-public-files-rp/terra/databases/kraken2/kraken2_humanGRCh38_viralRefSeq_20240828.tar.gz" | Optional | CL | sars-cov-2 | | kraken2_dehosted | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | CL | sars-cov-2 | | kraken2_dehosted | **read2** | File | Internal component, do not modify | | Do not modify, Optional | CL | sars-cov-2 | | kraken2_raw | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | CL | sars-cov-2 | | kraken2_raw | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | CL | sars-cov-2 | -| kraken2_raw | **docker_image** | Int | Docker container used in this task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.0.8-beta_hv | Optional | CL | sars-cov-2 | -| kraken2_raw | **kraken2_db** | String | The database used to run Kraken2 | /kraken2-db | Optional | CL | sars-cov-2 | +| kraken2_raw | **docker_image** | String | Docker container used in this task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.1.2-no-db | Optional | CL | sars-cov-2 | +| kraken2_raw | **kraken2_db** | File | The database used to run Kraken2. Must contain viral and human sequences. 
| "gs://theiagen-large-public-files-rp/terra/databases/kraken2/kraken2_humanGRCh38_viralRefSeq_20240828.tar.gz" | Optional | CL | sars-cov-2 | | kraken2_raw | **memory** | String | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | CL | sars-cov-2 | | kraken2_raw | **read_processing** | String | The tool used for trimming of primers from reads. Options are trimmomatic and fastp | trimmomatic | Optional | | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | kraken2_raw | **read2** | File | Internal component, do not modify | | Do not modify, Optional | CL | sars-cov-2 | @@ -300,8 +300,8 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) | qc_check_task | **gambit_predicted_taxon** | String | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | qc_check_task | **kraken_human** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | | | qc_check_task | **kraken_human_dehosted** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | | -| qc_check_task | **kraken_sc2** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | -| qc_check_task | **kraken_sc2_dehosted** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **kraken_sc2** | String | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **kraken_sc2_dehosted** | String | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | qc_check_task | **kraken_target_organism** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, 
ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | qc_check_task | **kraken_target_organism_dehosted** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | qc_check_task | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | @@ -341,7 +341,7 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) | read_QC_trim | **call_midas** | Boolean | True/False variable that determines if the MIDAS task should be called. | TRUE | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | read_QC_trim | **downsampling_coverage** | Float | The desired coverage to sub-sample the reads to with RASUSA | 150 | Optional | ONT | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | read_QC_trim | **fastp_args** | String | Additional fastp task arguments | --detect_adapter_for_pe -g -5 20 -3 20 | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | -| read_QC_trim | **kraken_db** | File | The database used to run Kraken2 | /kraken2-db | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| read_QC_trim | **kraken_db** | File | The database used to run Kraken2. Must contain viral and human sequences. 
| "gs://theiagen-large-public-files-rp/terra/databases/kraken2/kraken2_humanGRCh38_viralRefSeq_20240828.tar.gz" | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | read_QC_trim | **kraken_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | read_QC_trim | **kraken_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | read_QC_trim | **midas_db** | File | The database used by the MIDAS task | gs://theiagen-public-files-rp/terra/theiaprok-files/midas/midas_db_v1.2.tar.gz | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | @@ -487,6 +487,7 @@ The `organism_parameters` sub-workflow is the first step in all TheiaCoV workflo |---|---|---| | gene_locations_bed_file | sars-cov-2 | `"gs://theiagen-public-files-rp/terra/sars-cov-2-files/sc2_gene_locations.bed"` | | genome_length_input | sars-cov-2 | `29903` | + | kraken_target_organism_input | sars-cov-2 | `"Severe acute respiratory syndrome coronavirus 2"` | | nextclade_dataset_name_input | sars-cov-2 | `"nextstrain/sars-cov-2/wuhan-hu-1/orfs"` | | nextclade_dataset_tag_input | sars-cov-2 | `"2024-11-19--14-18-53Z"` | | pangolin_docker_image | sars-cov-2 | `"us-docker.pkg.dev/general-theiagen/staphb/pangolin:4.3.1-pdata-1.31 "`| @@ -580,7 +581,7 @@ The `organism_parameters` sub-workflow is the first step in all TheiaCoV workflo | **Overwrite Variable Name** | **Organism** | **Default Value** | |---|---|---| | genome_length_input | rsv_a | 16000 | - | kraken_target_organism | rsv_a | Respiratory syncytial virus | + | kraken_target_organism | rsv_a | "Human respiratory syncytial virus A" | | nextclade_dataset_name_input | rsv_a | nextstrain/rsv/a/EPI_ISL_412866 | | nextclade_dataset_tag_input | rsv_a | "2024-11-27--02-51-00Z" | | reference_genome | rsv_a | 
gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_a.fasta | @@ -596,7 +597,7 @@ The `organism_parameters` sub-workflow is the first step in all TheiaCoV workflo | **Overwrite Variable Name** | **Organism** | **Default Value** | |---|---|---| | genome_length_input | rsv_b | 16000 | - | kraken_target_organism | rsv_b | "Human orthopneumovirus" | + | kraken_target_organism | rsv_b | "human respiratory syncytial virus" | | nextclade_dataset_name_input | rsv_b | nextstrain/rsv/b/EPI_ISL_1653999 | | nextclade_dataset_tag_input | rsv_b | "2024-11-27--02-51-00Z" | | reference_genome | rsv_b | gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_b.fasta | @@ -726,7 +727,7 @@ All input reads are processed through "core tasks" in the TheiaCoV Illumina, ONT Kraken2 is run on the set of raw reads, provided as input, as well as the set of clean reads that are resulted from the `read_QC_trim` workflow !!! info "Database-dependent" - TheiaCoV automatically uses a viral-specific Kraken2 database. + TheiaCoV automatically uses a Kraken2 database containing viral and human sequences. This database was generated in-house from RefSeq's viral sequence collection and the human genome (GRCh38). It is available at `gs://theiagen-large-public-files-rp/terra/databases/kraken2/kraken2_humanGRCh38_viralRefSeq_20240828.tar.gz`. !!! techdetails "Kraken2 Technical Details" @@ -776,7 +777,7 @@ All input reads are processed through "core tasks" in the TheiaCoV Illumina, ONT Kraken2 is run on the set of raw reads, provided as input, as well as the set of clean reads that are resulted from the `read_QC_trim` workflow !!! info "Database-dependent" - TheiaCoV automatically uses a viral-specific Kraken2 database. + TheiaCoV automatically uses a Kraken2 database containing viral and human sequences. This database was generated in-house from RefSeq's viral sequence collection and the human genome (GRCh38). It is available at `gs://theiagen-large-public-files-rp/terra/databases/kraken2/kraken2_humanGRCh38_viralRefSeq_20240828.tar.gz`. !!! 
techdetails "Kraken2 Technical Details" @@ -1122,8 +1123,8 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) | kraken_human_dehosted | Float | Percent of human read data detected using the Kraken2 software after host removal | CL, ONT, PE | | kraken_report | File | Full Kraken report | CL, ONT, PE, SE | | kraken_report_dehosted | File | Full Kraken report after host removal | CL, ONT, PE | -| kraken_sc2 | Float | Percent of SARS-CoV-2 read data detected using the Kraken2 software | CL, ONT, PE, SE | -| kraken_sc2_dehosted | Float | Percent of SARS-CoV-2 read data detected using the Kraken2 software after host removal | CL, ONT, PE | +| kraken_sc2 | String | Percent of SARS-CoV-2 read data detected using the Kraken2 software | CL, ONT, PE, SE | +| kraken_sc2_dehosted | String | Percent of SARS-CoV-2 read data detected using the Kraken2 software after host removal | CL, ONT, PE | | kraken_target_organism | String | Percent of target organism read data detected using the Kraken2 software | CL, ONT, PE, SE | | kraken_target_organism_dehosted | String | Percent of target organism read data detected using the Kraken2 software after host removal | CL, ONT, PE | | kraken_target_organism_name | String | The name of the target organism; e.g., "Monkeypox" or "Human immunodeficiency virus" | CL, ONT, PE, SE | diff --git a/docs/workflows/standalone/ncbi_scrub.md b/docs/workflows/standalone/ncbi_scrub.md index e82b3feea..65537070d 100644 --- a/docs/workflows/standalone/ncbi_scrub.md +++ b/docs/workflows/standalone/ncbi_scrub.md @@ -23,6 +23,7 @@ There are three Kraken2 workflows: | dehost_pe or dehost_se | **read1** | File | | | Required | PE, SE | | dehost_pe or dehost_se | **read2** | File | | | Required | PE | | dehost_pe or dehost_se | **samplename** | String | | | Required | PE, SE | +| dehost_pe or dehost_se | **target_organism** | String | Target organism for Kraken2 reporting | "Severe acute respiratory syndrome coronavirus 2" | Optional | PE, SE | | kraken2 | **cpu** 
| Int | Number of CPUs to allocate to the task | 4 | Optional | PE, SE | | kraken2 | **disk_size** | Int | Amount of storage (in GB) to allocate to the task. Increase this when using large (>30GB kraken2 databases such as the "k2_standard" database) | 100 | Optional | PE, SE | | kraken2 | **docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.0.8-beta_hv | Optional | PE, SE | diff --git a/tasks/taxon_id/contamination/task_kraken2.wdl b/tasks/taxon_id/contamination/task_kraken2.wdl index fb1522c75..4a43106f6 100644 --- a/tasks/taxon_id/contamination/task_kraken2.wdl +++ b/tasks/taxon_id/contamination/task_kraken2.wdl @@ -5,25 +5,39 @@ task kraken2_theiacov { File read1 File? read2 String samplename - String kraken2_db = "/kraken2-db" + File kraken2_db = "gs://theiagen-large-public-files-rp/terra/databases/kraken2/kraken2_humanGRCh38_viralRefSeq_20240828.tar.gz" Int cpu = 4 Int memory = 8 String? target_organism Int disk_size = 100 - String docker_image = "us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.0.8-beta_hv" + String docker_image = "us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.1.2-no-db" } command <<< # date and version control date | tee DATE kraken2 --version | head -n1 | tee VERSION num_reads=$(ls *fastq.gz 2> /dev/nul | wc -l) + + # Decompress the Kraken2 database + mkdir db + tar -C ./db/ -xzvf ~{kraken2_db} + if ! [ -z ~{read2} ]; then mode="--paired" fi echo $mode - kraken2 $mode \ + + # determine if reads are compressed + if [[ ~{read1} == *.gz ]]; then + echo "Reads are compressed..." 
+ compressed="--gzip-compressed" + fi + echo $compressed + + # Run Kraken2 + kraken2 $mode $compressed \ --threads ~{cpu} \ - --db ~{kraken2_db} \ + --db ./db/ \ ~{read1} ~{read2} \ --report ~{samplename}_kraken2_report.txt \ --output ~{samplename}.classifiedreads.txt @@ -31,22 +45,29 @@ task kraken2_theiacov { # Compress and cleanup gzip ~{samplename}.classifiedreads.txt + # capture human percentage percentage_human=$(grep "Homo sapiens" ~{samplename}_kraken2_report.txt | cut -f 1) - # | tee PERCENT_HUMAN - percentage_sc2=$(grep "Severe acute respiratory syndrome coronavirus 2" ~{samplename}_kraken2_report.txt | cut -f1 ) - # | tee PERCENT_COV if [ -z "$percentage_human" ] ; then percentage_human="0" ; fi - if [ -z "$percentage_sc2" ] ; then percentage_sc2="0" ; fi echo $percentage_human | tee PERCENT_HUMAN - echo $percentage_sc2 | tee PERCENT_SC2 - # capture target org percentage + + # capture target org percentage if [ ! -z "~{target_organism}" ]; then echo "Target org designated: ~{target_organism}" - percent_target_organism=$(grep "~{target_organism}" ~{samplename}_kraken2_report.txt | cut -f1 | head -n1 ) - if [ -z "$percent_target_organism" ] ; then percent_target_organism="0" ; fi - else + # if target organisms is sc2, report it in a special legacy column called PERCENT_SC2 + if [[ "~{target_organism}" == "Severe acute respiratory syndrome coronavirus 2" ]]; then + percentage_sc2=$(grep "Severe acute respiratory syndrome coronavirus 2" ~{samplename}_kraken2_report.txt | cut -f1 ) + percent_target_organism="" + if [ -z "$percentage_sc2" ] ; then percentage_sc2="0" ; fi + else + percentage_sc2="" + percent_target_organism=$(grep "~{target_organism}" ~{samplename}_kraken2_report.txt | cut -f1 | head -n1 ) + if [ -z "$percent_target_organism" ] ; then percent_target_organism="0" ; fi + fi + else percent_target_organism="" + percentage_sc2="" fi + echo $percentage_sc2 | tee PERCENT_SC2 echo $percent_target_organism | tee PERCENT_TARGET_ORGANISM >>> @@ -55,7 
+76,7 @@ task kraken2_theiacov { String version = read_string("VERSION") File kraken_report = "~{samplename}_kraken2_report.txt" Float percent_human = read_float("PERCENT_HUMAN") - Float percent_sc2 = read_float("PERCENT_SC2") + String percent_sc2 = read_string("PERCENT_SC2") String percent_target_organism = read_string("PERCENT_TARGET_ORGANISM") String? kraken_target_organism = target_organism File kraken2_classified_report = "~{samplename}.classifiedreads.txt.gz" @@ -205,30 +226,37 @@ task kraken2_parse_classified { CODE # theiacov parsing blocks - percent human, sc2 and target organism + # capture human percentage percentage_human=$(grep "Homo sapiens" ~{samplename}.report_parsed.txt | cut -f 1) - percentage_sc2=$(grep "Severe acute respiratory syndrome coronavirus 2" ~{samplename}.report_parsed.txt | cut -f1 ) - if [ -z "$percentage_human" ] ; then percentage_human="0" ; fi - if [ -z "$percentage_sc2" ] ; then percentage_sc2="0" ; fi echo $percentage_human | tee PERCENT_HUMAN - echo $percentage_sc2 | tee PERCENT_SC2 - # capture target org percentage - if [ ! -z "~{target_organism}" ]; then + # capture target org percentage + if [ ! 
-z "~{target_organism}" ]; then echo "Target org designated: ~{target_organism}" - percent_target_organism=$(grep "~{target_organism}" ~{samplename}.report_parsed.txt | cut -f1 | head -n1 ) - if [ -z "$percent_target_organism" ] ; then percent_target_organism="0" ; fi - else + # if target organisms is sc2, report it in a special legacy column called PERCENT_SC2 + if [[ "~{target_organism}" == "Severe acute respiratory syndrome coronavirus 2" ]]; then + percentage_sc2=$(grep "Severe acute respiratory syndrome coronavirus 2" ~{samplename}.report_parsed.txt | cut -f1 ) + percent_target_organism="" + if [ -z "$percentage_sc2" ] ; then percentage_sc2="0" ; fi + else + percentage_sc2="" + percent_target_organism=$(grep "~{target_organism}" ~{samplename}.report_parsed.txt | cut -f1 | head -n1 ) + if [ -z "$percent_target_organism" ] ; then percent_target_organism="0" ; fi + fi + else percent_target_organism="" + percentage_sc2="" fi - echo $percent_target_organism | tee PERCENT_TARGET_ORG + echo $percentage_sc2 | tee PERCENT_SC2 + echo $percent_target_organism | tee PERCENT_TARGET_ORGANISM >>> output { File kraken_report = "~{samplename}.report_parsed.txt" Float percent_human = read_float("PERCENT_HUMAN") - Float percent_sc2 = read_float("PERCENT_SC2") - String percent_target_organism = read_string("PERCENT_TARGET_ORG") + String percent_sc2 = read_string("PERCENT_SC2") + String percent_target_organism = read_string("PERCENT_TARGET_ORGANISM") String? 
kraken_target_organism = target_organism } runtime { diff --git a/tests/data/theiacov/databases/github_kraken2_test_db.tar.gz b/tests/data/theiacov/databases/github_kraken2_test_db.tar.gz new file mode 100644 index 000000000..4dc2a5ec2 Binary files /dev/null and b/tests/data/theiacov/databases/github_kraken2_test_db.tar.gz differ diff --git a/tests/inputs/theiacov/wf_theiacov_clearlabs.json b/tests/inputs/theiacov/wf_theiacov_clearlabs.json index 10351330a..ff8983dd2 100644 --- a/tests/inputs/theiacov/wf_theiacov_clearlabs.json +++ b/tests/inputs/theiacov/wf_theiacov_clearlabs.json @@ -3,5 +3,7 @@ "theiacov_clearlabs.read1": "tests/data/theiacov/fastqs/clearlabs/clearlabs.fastq.gz", "theiacov_clearlabs.primer_bed": "tests/data/theiacov/primers/artic-v3.primers.bed", "theiacov_clearlabs.reference_genome": "tests/data/theiacov/reference/MN908947.fasta", - "theiacov_clearlabs.organism_parameters.gene_locations_bed_file": "tests/inputs/sc2_gene_locations.bed" + "theiacov_clearlabs.organism_parameters.gene_locations_bed_file": "tests/inputs/sc2_gene_locations.bed", + "theiacov_clearlabs.kraken2_raw.kraken2_db": "tests/data/theiacov/databases/github_kraken2_test_db.tar.gz", + "theiacov_clearlabs.kraken2_dehosted.kraken2_db": "tests/data/theiacov/databases/github_kraken2_test_db.tar.gz" } diff --git a/tests/inputs/theiacov/wf_theiacov_illumina_pe.json b/tests/inputs/theiacov/wf_theiacov_illumina_pe.json index 467bcf94d..d57d12ad4 100644 --- a/tests/inputs/theiacov/wf_theiacov_illumina_pe.json +++ b/tests/inputs/theiacov/wf_theiacov_illumina_pe.json @@ -5,5 +5,6 @@ "theiacov_illumina_pe.primer_bed": "tests/data/theiacov/primers/artic-v3.primers.bed", "theiacov_illumina_pe.reference_genome": "tests/data/theiacov/reference/MN908947.fasta", "theiacov_illumina_pe.reference_gff": "tests/inputs/completely-empty-for-test.txt", - "theiacov_illumina_pe.reference_gene_locations_bed": "tests/inputs/sc2_gene_locations.bed" + "theiacov_illumina_pe.reference_gene_locations_bed": 
"tests/inputs/sc2_gene_locations.bed", + "theiacov_illumina_pe.read_QC_trim.kraken_db": "tests/data/theiacov/databases/github_kraken2_test_db.tar.gz" } diff --git a/tests/inputs/theiacov/wf_theiacov_illumina_se.json b/tests/inputs/theiacov/wf_theiacov_illumina_se.json index b9b4381de..7bc27de4b 100644 --- a/tests/inputs/theiacov/wf_theiacov_illumina_se.json +++ b/tests/inputs/theiacov/wf_theiacov_illumina_se.json @@ -4,5 +4,6 @@ "theiacov_illumina_se.primer_bed": "tests/data/theiacov/primers/artic-v3.primers.bed", "theiacov_illumina_se.reference_genome": "tests/data/theiacov/reference/MN908947.fasta", "theiacov_illumina_se.reference_gff": "tests/inputs/completely-empty-for-test.txt", - "theiacov_illumina_se.reference_gene_locations_bed": "tests/inputs/sc2_gene_locations.bed" + "theiacov_illumina_se.reference_gene_locations_bed": "tests/inputs/sc2_gene_locations.bed", + "theiacov_illumina_se.read_QC_trim.kraken_db": "tests/data/theiacov/databases/github_kraken2_test_db.tar.gz" } diff --git a/tests/inputs/theiacov/wf_theiacov_ont.json b/tests/inputs/theiacov/wf_theiacov_ont.json index 4c551d73b..055ca29d0 100644 --- a/tests/inputs/theiacov/wf_theiacov_ont.json +++ b/tests/inputs/theiacov/wf_theiacov_ont.json @@ -3,5 +3,6 @@ "theiacov_ont.read1": "tests/data/theiacov/fastqs/ont/ont.fastq.gz", "theiacov_ont.primer_bed": "tests/data/theiacov/primers/artic-v3.primers.bed", "theiacov_ont.reference_genome": "tests/data/theiacov/reference/MN908947.fasta", - "theiacov_ont.reference_gene_locations_bed": "tests/inputs/sc2_gene_locations.bed" + "theiacov_ont.reference_gene_locations_bed": "tests/inputs/sc2_gene_locations.bed", + "theiacov_ont.read_qc_trim.kraken_db": "tests/data/theiacov/databases/github_kraken2_test_db.tar.gz" } diff --git a/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml b/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml index f8f24919d..83d78611b 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml +++ 
b/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml @@ -147,7 +147,7 @@ - path: miniwdl_run/call-fastq_scan_raw_reads/work/clearlabs_fastq-scan.json md5sum: 869dd2e934c600bba35f30f08e2da7c9 - path: miniwdl_run/call-kraken2_dehosted/command - md5sum: 0f9db3341b5f58fb8d145d6d94222827 + md5sum: 4306699c67306b103561adf31c3754e3 - path: miniwdl_run/call-kraken2_dehosted/inputs.json contains: ["read1", "samplename"] - path: miniwdl_run/call-kraken2_dehosted/outputs.json @@ -159,18 +159,18 @@ contains: ["wdl", "theiacov_clearlabs", "kraken2_dehosted", "done"] - path: miniwdl_run/call-kraken2_dehosted/work/DATE - path: miniwdl_run/call-kraken2_dehosted/work/PERCENT_HUMAN - md5sum: 4fd4dcef994592f9865e9bc8807f32f4 + md5sum: 897316929176464ebc9ad085f31e7284 - path: miniwdl_run/call-kraken2_dehosted/work/PERCENT_SC2 - md5sum: 9fc4759d176a0e0d240c418dbaaafeb2 + md5sum: 86b6b8aa9ad17f169f04c02b0e2bf1b1 - path: miniwdl_run/call-kraken2_dehosted/work/PERCENT_TARGET_ORGANISM md5sum: 68b329da9893e34099c7d8ad5cb9c940 - path: miniwdl_run/call-kraken2_dehosted/work/VERSION - md5sum: 379b99c23325315c502e74614c035e7d + md5sum: 7ad46f90cd0ffa94f32a6e06299ed05c - path: miniwdl_run/call-kraken2_dehosted/work/_miniwdl_inputs/0/clearlabs_R1_dehosted.fastq.gz - path: miniwdl_run/call-kraken2_dehosted/work/clearlabs_kraken2_report.txt - md5sum: 35841fa2d77ec202c275b1de548b8d98 + md5sum: b66dbcf8d229c1b6fcfff4dd786068bd - path: miniwdl_run/call-kraken2_raw/command - md5sum: a9dabf08bff8e183fd792901ce24fc57 + md5sum: d6e217901b67290466eec97f13564022 - path: miniwdl_run/call-kraken2_raw/inputs.json contains: ["read1", "samplename"] - path: miniwdl_run/call-kraken2_raw/outputs.json @@ -182,16 +182,16 @@ contains: ["wdl", "theiacov_clearlabs", "kraken2_raw", "done"] - path: miniwdl_run/call-kraken2_raw/work/DATE - path: miniwdl_run/call-kraken2_raw/work/PERCENT_HUMAN - md5sum: 4fd4dcef994592f9865e9bc8807f32f4 + md5sum: 897316929176464ebc9ad085f31e7284 - path: 
miniwdl_run/call-kraken2_raw/work/PERCENT_SC2 - md5sum: 9fc4759d176a0e0d240c418dbaaafeb2 + md5sum: 86b6b8aa9ad17f169f04c02b0e2bf1b1 - path: miniwdl_run/call-kraken2_raw/work/PERCENT_TARGET_ORGANISM md5sum: 68b329da9893e34099c7d8ad5cb9c940 - path: miniwdl_run/call-kraken2_raw/work/VERSION - md5sum: 379b99c23325315c502e74614c035e7d + md5sum: 7ad46f90cd0ffa94f32a6e06299ed05c - path: miniwdl_run/call-kraken2_raw/work/_miniwdl_inputs/0/clearlabs.fastq.gz - path: miniwdl_run/call-kraken2_raw/work/clearlabs_kraken2_report.txt - md5sum: 35841fa2d77ec202c275b1de548b8d98 + md5sum: b66dbcf8d229c1b6fcfff4dd786068bd - path: miniwdl_run/call-ncbi_scrub_se/command contains: ["read1", "scrubber", "gzip"] - path: miniwdl_run/call-ncbi_scrub_se/inputs.json diff --git a/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml b/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml index 98a1e0ad9..d2e0c64f9 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml @@ -83,7 +83,7 @@ - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/VERSION # kraken2 dehosted - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_dehosted/command - md5sum: 2031501aaf268d2987b6dbc3b8b32dfa + md5sum: 24a53d050f62bf377558e76cce42ca71 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_dehosted/inputs.json contains: ["read1", "read2", "samplename"] - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_dehosted/outputs.json @@ -96,14 +96,14 @@ - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_dehosted/work/PERCENT_HUMAN md5sum: 897316929176464ebc9ad085f31e7284 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_dehosted/work/PERCENT_SC2 - md5sum: 494a4bf9ab740c0a0fab64f670549883 + md5sum: 6baf8bb11094b9011d8dc34e66743712 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_dehosted/work/PERCENT_TARGET_ORGANISM md5sum: 68b329da9893e34099c7d8ad5cb9c940 - path: 
miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_dehosted/work/SRR13687078_kraken2_report.txt - md5sum: 2ccc036a9a93b3cf096a5c4dda49a579 + md5sum: 565954ac2bb6ef427754de3b43430728 # kraken2 raw - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/command - md5sum: a16205bdb8cf133a112c4552e8f67f97 + md5sum: 717f1ade3930083c4ca023b999c3bdff - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/inputs.json contains: ["read1", "samplename"] - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/outputs.json @@ -114,13 +114,13 @@ - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/task.log contains: ["wdl", "theiacov_illumina_pe", "kraken2_theiacov_raw", "done"] - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/PERCENT_HUMAN - md5sum: 414f4efa514540a2527a4f27124575f2 + md5sum: 897316929176464ebc9ad085f31e7284 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/PERCENT_SC2 - md5sum: 2bf2d20f083d8fa09abf6c25f8970e2e + md5sum: cfefab882d84cf0f2a1bde9c19eec318 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/PERCENT_TARGET_ORGANISM md5sum: 68b329da9893e34099c7d8ad5cb9c940 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/SRR13687078_kraken2_report.txt - md5sum: 3544d9ca35d45093c03cdead46677765 + md5sum: 8ea92e13d401e1c955336edfdcd4f1ba # ncbi scrub - path: miniwdl_run/call-read_QC_trim/call-ncbi_scrub_pe/command md5sum: 8c7ca800fa98305009cfb9116a4b60b8 diff --git a/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml b/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml index 3879c19ee..362fa45d0 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml @@ -73,7 +73,7 @@ - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/VERSION # kraken2 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/command - md5sum: ca22e45a62c5c26c4447cdafe75a26ab + 
md5sum: 3478232c364dc1cf01b6b0300400c26c - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/inputs.json contains: ["read1", "samplename"] - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/outputs.json @@ -84,13 +84,13 @@ - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/task.log contains: ["wdl", "theiacov_illumina_se", "kraken2_theiacov_raw", "done"] - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/PERCENT_HUMAN - md5sum: 1576d5d341223ea9d44b0b8a213bb9da + md5sum: 4fd4dcef994592f9865e9bc8807f32f4 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/PERCENT_SC2 - md5sum: 7cc2eb659e21f15fa902b11812eae1f6 + md5sum: adbe14d7547234f3743f80907ed33179 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/PERCENT_TARGET_ORGANISM md5sum: 68b329da9893e34099c7d8ad5cb9c940 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/ERR6319327_kraken2_report.txt - md5sum: 9a089b8920e55c9cc7bc8cd7d18f9a8e + md5sum: cb58af9eb139d109b55ce65d6d2344d6 # clean read screen - path: miniwdl_run/call-clean_check_reads/command md5sum: 80a361915a627e86743baacfc383b2b5 diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml index aad099a4e..38f979119 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml @@ -623,11 +623,11 @@ - path: miniwdl_run/wdl/tasks/taxon_id/task_gambit.wdl md5sum: 2aa70eab24868920f6c28843dd3b5613 - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_kraken2.wdl - md5sum: 0ea83681884800bda1e3c4e116f2b19d + md5sum: 43dd0613df879f91a2f3144e27b38a71 - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_midas.wdl md5sum: 64caaaff5910ac0036e2659434500962 - path: miniwdl_run/wdl/tasks/utilities/data_export/task_broad_terra_tools.wdl - md5sum: 850ad97598aca5c28eb36e6a5c13c2fc + md5sum: 
8c97c5bd65e2787239f12ef425d479ae - path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl md5sum: d8db687487a45536d4837a540ed2a135 - path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml index 6a7e2a86a..e12ec8ec0 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml @@ -586,16 +586,16 @@ - path: miniwdl_run/wdl/tasks/taxon_id/task_gambit.wdl md5sum: 2aa70eab24868920f6c28843dd3b5613 - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_kraken2.wdl - md5sum: 0ea83681884800bda1e3c4e116f2b19d + md5sum: 43dd0613df879f91a2f3144e27b38a71 - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_midas.wdl md5sum: 64caaaff5910ac0036e2659434500962 - path: miniwdl_run/wdl/tasks/utilities/data_export/task_broad_terra_tools.wdl - md5sum: 850ad97598aca5c28eb36e6a5c13c2fc + md5sum: 8c97c5bd65e2787239f12ef425d479ae - path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_se.wdl md5sum: 4111a758490174325ae8ea52a95319e9 - path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl md5sum: ea5cff6eff8c2c42046cf2eae6f16b6f - path: miniwdl_run/wdl/workflows/utilities/wf_read_QC_trim_se.wdl - md5sum: a7ef5a7a38dd60ff2edf699ae6808ebb + md5sum: 09d9f68b9ca8bf94b6145ff9bed2edd1 - path: miniwdl_run/workflow.log contains: ["wdl", "theiaprok_illumina_se", "NOTICE", "done"] diff --git a/workflows/freyja/wf_freyja_fastq.wdl b/workflows/freyja/wf_freyja_fastq.wdl index 7b46a204c..2e0fe755e 100644 --- a/workflows/freyja/wf_freyja_fastq.wdl +++ b/workflows/freyja/wf_freyja_fastq.wdl @@ -22,6 +22,7 @@ workflow freyja_fastq { String samplename Int? 
depth_cutoff Boolean ont = false + String kraken2_target_organism = "Severe acute respiratory syndrome coronavirus 2" } if (defined(read2)) { call read_qc_pe.read_QC_trim_pe as read_QC_trim_pe { @@ -30,7 +31,8 @@ workflow freyja_fastq { read1 = read1, read2 = select_first([read2]), trim_min_length = trimmomatic_min_length, - workflow_series = "theiacov" + workflow_series = "theiacov", + target_organism = kraken2_target_organism } } if (! defined(read2) && ! ont) { @@ -39,7 +41,8 @@ workflow freyja_fastq { samplename = samplename, read1 = read1, trim_min_length = trimmomatic_min_length, - workflow_series = "theiacov" + workflow_series = "theiacov", + target_organism = kraken2_target_organism } } if (ont) { @@ -57,7 +60,8 @@ workflow freyja_fastq { input: samplename = samplename, read1 = read1, - workflow_series = "theiacov" + workflow_series = "theiacov", + target_organism = kraken2_target_organism } call nanoplot_task.nanoplot as nanoplot_clean { input: @@ -177,10 +181,10 @@ workflow freyja_fastq { # Read QC - kraken outputs - all String kraken_version = select_first([read_QC_trim_pe.kraken_version, read_QC_trim_se.kraken_version, read_QC_trim_ont.kraken_version]) Float kraken_human = select_first([read_QC_trim_pe.kraken_human, read_QC_trim_se.kraken_human, read_QC_trim_ont.kraken_human]) - Float kraken_sc2 = select_first([read_QC_trim_pe.kraken_sc2, read_QC_trim_se.kraken_sc2, read_QC_trim_ont.kraken_sc2]) + String kraken_sc2 = select_first([read_QC_trim_pe.kraken_sc2, read_QC_trim_se.kraken_sc2, read_QC_trim_ont.kraken_sc2]) String kraken_report = select_first([read_QC_trim_pe.kraken_report, read_QC_trim_se.kraken_report, read_QC_trim_ont.kraken_report]) Float kraken_human_dehosted = select_first([read_QC_trim_pe.kraken_human_dehosted, read_QC_trim_se.kraken_human_dehosted, read_QC_trim_ont.kraken_human_dehosted]) - Float kraken_sc2_dehosted = select_first([read_QC_trim_pe.kraken_sc2_dehosted, read_QC_trim_se.kraken_sc2_dehosted, 
read_QC_trim_ont.kraken_sc2_dehosted]) + String kraken_sc2_dehosted = select_first([read_QC_trim_pe.kraken_sc2_dehosted, read_QC_trim_se.kraken_sc2_dehosted, read_QC_trim_ont.kraken_sc2_dehosted]) File kraken_report_dehosted = select_first([read_QC_trim_pe.kraken_report_dehosted, read_QC_trim_se.kraken_report_dehosted, read_QC_trim_ont.kraken_report_dehosted]) # Read Alignment - bwa outputs String? bwa_version = bwa.bwa_version diff --git a/workflows/theiacov/updates/wf_ncbi_scrub_pe.wdl b/workflows/theiacov/updates/wf_ncbi_scrub_pe.wdl index 3cbedd30a..6d4acd8ca 100644 --- a/workflows/theiacov/updates/wf_ncbi_scrub_pe.wdl +++ b/workflows/theiacov/updates/wf_ncbi_scrub_pe.wdl @@ -9,6 +9,7 @@ workflow dehost_pe { String samplename File read1 File read2 + String target_organism = "Severe acute respiratory syndrome coronavirus 2" } call ncbi_scrub.ncbi_scrub_pe { input: @@ -20,7 +21,8 @@ workflow dehost_pe { input: samplename = samplename, read1 = ncbi_scrub_pe.read1_dehosted, - read2 = ncbi_scrub_pe.read2_dehosted + read2 = ncbi_scrub_pe.read2_dehosted, + target_organism = target_organism } call versioning.version_capture { input: @@ -33,7 +35,7 @@ workflow dehost_pe { Int ncbi_scrub_human_spots_removed = ncbi_scrub_pe.human_spots_removed String ncbi_scrub_docker = ncbi_scrub_pe.ncbi_scrub_docker Float kraken_human_dehosted = kraken2.percent_human - Float kraken_sc2_dehosted = kraken2.percent_sc2 + String kraken_sc2_dehosted = kraken2.percent_sc2 File kraken_report_dehosted = kraken2.kraken_report String kraken_version_dehosted = kraken2.version } diff --git a/workflows/theiacov/updates/wf_ncbi_scrub_se.wdl b/workflows/theiacov/updates/wf_ncbi_scrub_se.wdl index 7e0a25d88..23a8d707d 100644 --- a/workflows/theiacov/updates/wf_ncbi_scrub_se.wdl +++ b/workflows/theiacov/updates/wf_ncbi_scrub_se.wdl @@ -8,6 +8,7 @@ workflow dehost_se { input { String samplename File read1 + String target_organism = "Severe acute respiratory syndrome coronavirus 2" } call 
ncbi_scrub.ncbi_scrub_se { input: @@ -17,7 +18,8 @@ workflow dehost_se { call kraken.kraken2_theiacov as kraken2 { input: samplename = samplename, - read1 = ncbi_scrub_se.read1_dehosted + read1 = ncbi_scrub_se.read1_dehosted, + target_organism = target_organism } call versioning.version_capture { input: @@ -29,7 +31,7 @@ workflow dehost_se { String ncbi_scrub_docker = ncbi_scrub_se.ncbi_scrub_docker Int ncbi_scrub_human_spots_removed = ncbi_scrub_se.human_spots_removed Float kraken_human_dehosted = kraken2.percent_human - Float kraken_sc2_dehosted = kraken2.percent_sc2 + String kraken_sc2_dehosted = kraken2.percent_sc2 String kraken_version_dehosted = kraken2.version File kraken_report_dehosted = kraken2.kraken_report } diff --git a/workflows/theiacov/wf_theiacov_clearlabs.wdl b/workflows/theiacov/wf_theiacov_clearlabs.wdl index 5774e02f7..d63f61c0f 100644 --- a/workflows/theiacov/wf_theiacov_clearlabs.wdl +++ b/workflows/theiacov/wf_theiacov_clearlabs.wdl @@ -176,12 +176,12 @@ workflow theiacov_clearlabs { # Read QC - kraken outputs String kraken_version = kraken2_raw.version Float kraken_human = kraken2_raw.percent_human - Float kraken_sc2 = kraken2_raw.percent_sc2 + String kraken_sc2 = kraken2_raw.percent_sc2 String kraken_target_organism = kraken2_raw.percent_target_organism String kraken_target_organism_name = organism_parameters.kraken_target_organism File kraken_report = kraken2_raw.kraken_report Float kraken_human_dehosted = kraken2_dehosted.percent_human - Float kraken_sc2_dehosted = kraken2_dehosted.percent_sc2 + String kraken_sc2_dehosted = kraken2_dehosted.percent_sc2 String kraken_target_organism_dehosted = kraken2_dehosted.percent_target_organism File kraken_report_dehosted = kraken2_dehosted.kraken_report # Read Alignment - Artic consensus outputs diff --git a/workflows/theiacov/wf_theiacov_illumina_pe.wdl b/workflows/theiacov/wf_theiacov_illumina_pe.wdl index 29585659e..7bf1fc36a 100644 --- a/workflows/theiacov/wf_theiacov_illumina_pe.wdl +++ 
b/workflows/theiacov/wf_theiacov_illumina_pe.wdl @@ -293,12 +293,12 @@ workflow theiacov_illumina_pe { # Read QC - kraken outputs String? kraken_version = read_QC_trim.kraken_version Float? kraken_human = read_QC_trim.kraken_human - Float? kraken_sc2 = read_QC_trim.kraken_sc2 + String? kraken_sc2 = read_QC_trim.kraken_sc2 String? kraken_target_organism = read_QC_trim.kraken_target_organism String? kraken_target_organism_name = read_QC_trim.kraken_target_organism_name File? kraken_report = read_QC_trim.kraken_report Float? kraken_human_dehosted = read_QC_trim.kraken_human_dehosted - Float? kraken_sc2_dehosted = read_QC_trim.kraken_sc2_dehosted + String? kraken_sc2_dehosted = read_QC_trim.kraken_sc2_dehosted String? kraken_target_organism_dehosted = read_QC_trim.kraken_target_organism_dehosted File? kraken_report_dehosted = read_QC_trim.kraken_report_dehosted # Read Alignment - bwa outputs diff --git a/workflows/theiacov/wf_theiacov_illumina_se.wdl b/workflows/theiacov/wf_theiacov_illumina_se.wdl index 0de516664..0a92ef2fc 100644 --- a/workflows/theiacov/wf_theiacov_illumina_se.wdl +++ b/workflows/theiacov/wf_theiacov_illumina_se.wdl @@ -236,12 +236,12 @@ workflow theiacov_illumina_se { # Read QC - kraken outputs String? kraken_version = read_QC_trim.kraken_version Float? kraken_human = read_QC_trim.kraken_human - Float? kraken_sc2 = read_QC_trim.kraken_sc2 + String? kraken_sc2 = read_QC_trim.kraken_sc2 String? kraken_target_organism = read_QC_trim.kraken_target_organism String? kraken_target_organism_name = read_QC_trim.kraken_target_organism_name File? kraken_report = read_QC_trim.kraken_report Float? kraken_human_dehosted = read_QC_trim.kraken_human_dehosted - Float? kraken_sc2_dehosted = read_QC_trim.kraken_sc2_dehosted + String? kraken_sc2_dehosted = read_QC_trim.kraken_sc2_dehosted String? kraken_target_organism_dehosted = read_QC_trim.kraken_target_organism_dehosted File? 
kraken_report_dehosted = read_QC_trim.kraken_report_dehosted # Read Alignment - bwa outputs diff --git a/workflows/theiacov/wf_theiacov_ont.wdl b/workflows/theiacov/wf_theiacov_ont.wdl index 7d8d29ad4..d2bab2ad7 100644 --- a/workflows/theiacov/wf_theiacov_ont.wdl +++ b/workflows/theiacov/wf_theiacov_ont.wdl @@ -288,12 +288,12 @@ workflow theiacov_ont { String? kraken_target_organism_name = read_qc_trim.kraken_target_organism_name # Read QC - kraken outputs raw Float? kraken_human = read_qc_trim.kraken_human - Float? kraken_sc2 = read_qc_trim.kraken_sc2 + String? kraken_sc2 = read_qc_trim.kraken_sc2 String? kraken_target_organism = read_qc_trim.kraken_target_organism File? kraken_report = read_qc_trim.kraken_report # Read QC - kraken outputs dehosted Float? kraken_human_dehosted = read_qc_trim.kraken_human_dehosted - Float? kraken_sc2_dehosted = read_qc_trim.kraken_sc2_dehosted + String? kraken_sc2_dehosted = read_qc_trim.kraken_sc2_dehosted String? kraken_target_organism_dehosted = read_qc_trim.kraken_target_organism_dehosted File? 
kraken_report_dehosted = read_qc_trim.kraken_report_dehosted # Read Alignment - Artic consensus outputs diff --git a/workflows/utilities/wf_organism_parameters.wdl b/workflows/utilities/wf_organism_parameters.wdl index 6c2dfc849..65c513cb8 100644 --- a/workflows/utilities/wf_organism_parameters.wdl +++ b/workflows/utilities/wf_organism_parameters.wdl @@ -54,6 +54,7 @@ workflow organism_parameters { String sc2_gene_locations_bed = "gs://theiagen-public-files-rp/terra/sars-cov-2-files/sc2_gene_locations.bed" String sc2_nextclade_ds_tag = "2024-11-19--14-18-53Z" String sc2_nextclade_ds_name = "nextstrain/sars-cov-2/wuhan-hu-1/orfs" + String sc2_kraken_target_organism = "Severe acute respiratory syndrome coronavirus 2" String sc2_pangolin_docker = "us-docker.pkg.dev/general-theiagen/staphb/pangolin:4.3.1-pdata-1.31" Int sc2_genome_len = 29903 Int sc2_vadr_max_length = 30000 @@ -200,7 +201,7 @@ workflow organism_parameters { String rsv_a_nextclade_ds_tag = "2024-11-27--02-51-00Z" String rsv_a_nextclade_ds_name = "nextstrain/rsv/a/EPI_ISL_412866" Int rsv_a_genome_len = 15500 - String rsv_a_kraken_target_organism = "Respiratory syncytial virus" + String rsv_a_kraken_target_organism = "Human respiratory syncytial virus A" String rsv_a_vadr_options = "-r --mkey rsv --xnocomp" Int rsv_a_vadr_max_length = 15500 Int rsv_a_vadr_skip_length = 5000 @@ -224,7 +225,7 @@ workflow organism_parameters { String rsv_b_nextclade_ds_tag = "2024-11-27--02-51-00Z" String rsv_b_nextclade_ds_name = "nextstrain/rsv/b/EPI_ISL_1653999" Int rsv_b_genome_len = 15500 - String rsv_b_kraken_target_organism = "Human orthopneumovirus" + String rsv_b_kraken_target_organism = "human respiratory syncytial virus" String rsv_b_vadr_options = "-r --mkey rsv --xnocomp" Int rsv_b_vadr_max_length = 15500 Int rsv_b_vadr_skip_length = 5000 @@ -279,7 +280,7 @@ workflow organism_parameters { Int vadr_memory = select_first([vadr_mem, sc2_vadr_memory, mpox_vadr_memory, wnv_vadr_memory, flu_vadr_memory, 
rsv_a_vadr_memory, rsv_b_vadr_memory, 0]) Int vadr_skiplength = select_first([vadr_skip_length, sc2_vadr_skip_length, mpox_vadr_skip_length, wnv_vadr_skip_length, flu_vadr_skip_length, rsv_a_vadr_skip_length, rsv_b_vadr_skip_length, 0]) # kraken options - String kraken_target_organism = select_first([kraken_target_organism_input, mpox_kraken_target_organism, wnv_kraken_target_organism, hiv_v1_target_organism, hiv_v2_target_organism, rsv_a_kraken_target_organism, rsv_b_kraken_target_organism, ""]) + String kraken_target_organism = select_first([kraken_target_organism_input, sc2_kraken_target_organism, mpox_kraken_target_organism, wnv_kraken_target_organism, hiv_v1_target_organism, hiv_v2_target_organism, rsv_a_kraken_target_organism, rsv_b_kraken_target_organism, ""]) # augur options Int augur_min_num_unambig = select_first([min_num_unambig, mpox_min_num_unambig, flu_min_num_unambig, rsv_a_min_num_unambig, rsv_b_min_num_unambig, 0]) File augur_clades_tsv = select_first([clades_tsv, h1n1_ha_clades_tsv, h3n2_ha_clades_tsv, vic_ha_clades_tsv, yam_ha_clades_tsv, h5n1_ha_clades_tsv, rsv_a_clades_tsv, rsv_b_clades_tsv, mpox_clades_tsv, "gs://theiagen-public-files-rp/terra/augur-defaults/minimal-clades.tsv"]) diff --git a/workflows/utilities/wf_read_QC_trim_ont.wdl b/workflows/utilities/wf_read_QC_trim_ont.wdl index 5b84562aa..8cc609346 100644 --- a/workflows/utilities/wf_read_QC_trim_ont.wdl +++ b/workflows/utilities/wf_read_QC_trim_ont.wdl @@ -56,7 +56,11 @@ workflow read_QC_trim_ont { input: samplename = samplename, read1 = read1, - target_organism = target_organism + target_organism = target_organism, + kraken2_db = kraken_db, + disk_size = kraken_disk_size, + memory = kraken_memory, + cpu = kraken_cpu } call kraken2.kraken2_parse_classified as kraken2_recalculate_abundances_raw { input: @@ -69,7 +73,11 @@ workflow read_QC_trim_ont { input: samplename = samplename, read1 = ncbi_scrub_se.read1_dehosted, - target_organism = target_organism + target_organism = 
target_organism, + kraken2_db = kraken_db, + disk_size = kraken_disk_size, + memory = kraken_memory, + cpu = kraken_cpu } call kraken2.kraken2_parse_classified as kraken2_recalculate_abundances_dehosted { input: @@ -126,16 +134,16 @@ workflow read_QC_trim_ont { # ncbi scrub outputs File? read1_dehosted = ncbi_scrub_se.read1_dehosted - # kraken2 - theiacov and theiapro + # kraken2 - theiacov and theiaprok String kraken_version = select_first([kraken2_raw.version, kraken2_se.kraken2_version, ""]) String kraken_docker = select_first([kraken2_raw.docker, kraken2_se.kraken2_docker, ""]) Float? kraken_human = kraken2_recalculate_abundances_raw.percent_human - Float? kraken_sc2 = kraken2_recalculate_abundances_raw.percent_sc2 + String? kraken_sc2 = kraken2_recalculate_abundances_raw.percent_sc2 String? kraken_target_organism = kraken2_recalculate_abundances_raw.percent_target_organism String? kraken_target_organism_name = kraken2_raw.kraken_target_organism String kraken_report = select_first([kraken2_recalculate_abundances_raw.kraken_report, kraken2_recalculate_abundances.kraken_report, ""]) Float? kraken_human_dehosted = kraken2_recalculate_abundances_dehosted.percent_human - Float? kraken_sc2_dehosted = kraken2_recalculate_abundances_dehosted.percent_sc2 + String? kraken_sc2_dehosted = kraken2_recalculate_abundances_dehosted.percent_sc2 String? kraken_target_organism_dehosted = kraken2_recalculate_abundances_dehosted.percent_target_organism File? 
kraken_report_dehosted = kraken2_recalculate_abundances_dehosted.kraken_report String kraken_database = select_first([kraken2_raw.database, kraken2_se.kraken2_database, kraken_db_warning, ""]) diff --git a/workflows/utilities/wf_read_QC_trim_pe.wdl b/workflows/utilities/wf_read_QC_trim_pe.wdl index 0d6090036..ee921bc12 100644 --- a/workflows/utilities/wf_read_QC_trim_pe.wdl +++ b/workflows/utilities/wf_read_QC_trim_pe.wdl @@ -52,14 +52,22 @@ workflow read_QC_trim_pe { samplename = samplename, read1 = read1, read2 = read2, - target_organism = target_organism + target_organism = target_organism, + kraken2_db = kraken_db, + disk_size = kraken_disk_size, + memory = kraken_memory, + cpu = kraken_cpu } call kraken.kraken2_theiacov as kraken2_theiacov_dehosted { input: samplename = samplename, read1 = select_first([ncbi_scrub_pe.read1_dehosted]), read2 = ncbi_scrub_pe.read2_dehosted, - target_organism = target_organism + target_organism = target_organism, + kraken2_db = kraken_db, + disk_size = kraken_disk_size, + memory = kraken_memory, + cpu = kraken_cpu } } if (read_processing == "trimmomatic") { @@ -196,11 +204,11 @@ workflow read_QC_trim_pe { # kraken2 - theiacov and theiaprok String kraken_version = select_first([kraken2_theiacov_raw.version, kraken2_standalone.kraken2_version, ""]) Float? kraken_human = kraken2_theiacov_raw.percent_human - Float? kraken_sc2 = kraken2_theiacov_raw.percent_sc2 + String? kraken_sc2 = kraken2_theiacov_raw.percent_sc2 String? kraken_target_organism = kraken2_theiacov_raw.percent_target_organism String kraken_report = select_first([kraken2_theiacov_raw.kraken_report, kraken2_standalone.kraken2_report, ""]) Float? kraken_human_dehosted = kraken2_theiacov_dehosted.percent_human - Float? kraken_sc2_dehosted = kraken2_theiacov_dehosted.percent_sc2 + String? kraken_sc2_dehosted = kraken2_theiacov_dehosted.percent_sc2 String? kraken_target_organism_dehosted = kraken2_theiacov_dehosted.percent_target_organism String? 
kraken_target_organism_name = target_organism File? kraken_report_dehosted = kraken2_theiacov_dehosted.kraken_report diff --git a/workflows/utilities/wf_read_QC_trim_se.wdl b/workflows/utilities/wf_read_QC_trim_se.wdl index af147b512..f82d3aae3 100644 --- a/workflows/utilities/wf_read_QC_trim_se.wdl +++ b/workflows/utilities/wf_read_QC_trim_se.wdl @@ -100,13 +100,21 @@ workflow read_QC_trim_se { input: samplename = samplename, read1 = read1, - target_organism = target_organism + target_organism = target_organism, + kraken2_db = kraken_db, + disk_size = kraken_disk_size, + memory = kraken_memory, + cpu = kraken_cpu } call kraken.kraken2_theiacov as kraken2_theiacov_dehosted { input: samplename = samplename, read1 = select_first([ncbi_scrub_se.read1_dehosted]), - target_organism = target_organism + target_organism = target_organism, + kraken2_db = kraken_db, + disk_size = kraken_disk_size, + memory = kraken_memory, + cpu = kraken_cpu } } if ("~{workflow_series}" == "theiaprok") { @@ -163,11 +171,11 @@ workflow read_QC_trim_se { # kraken2 - raw and dehosted String kraken_version = select_first([kraken2_theiacov_raw.version, kraken2_standalone.kraken2_version, ""]) Float? kraken_human = kraken2_theiacov_raw.percent_human - Float? kraken_sc2 = kraken2_theiacov_raw.percent_sc2 + String? kraken_sc2 = kraken2_theiacov_raw.percent_sc2 String? kraken_target_organism = kraken2_theiacov_raw.percent_target_organism String kraken_report = select_first([kraken2_theiacov_raw.kraken_report, kraken2_standalone.kraken2_report, ""]) Float? kraken_human_dehosted = kraken2_theiacov_dehosted.percent_human - Float? kraken_sc2_dehosted = kraken2_theiacov_dehosted.percent_sc2 + String? kraken_sc2_dehosted = kraken2_theiacov_dehosted.percent_sc2 String? kraken_target_organism_dehosted = kraken2_theiacov_dehosted.percent_target_organism String? kraken_target_organism_name = target_organism File? kraken_report_dehosted = kraken2_theiacov_dehosted.kraken_report