diff --git a/.github/workflows/build-documentation.yml b/.github/workflows/build-documentation.yml new file mode 100644 index 000000000..2bd74be50 --- /dev/null +++ b/.github/workflows/build-documentation.yml @@ -0,0 +1,38 @@ +name: build-documentation +on: + push: + branches: + - main + +permissions: + contents: write + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - name: Checkout main + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Setup python + uses: actions/setup-python@v4 + with: + python-version: 3.x + + - name: Cache dependencies + uses: actions/cache@v2 + with: + key: ${{ github.ref }} + path: .cache + + - name: Install dependencies + run: | + pip install mkdocs-material mkdocs-material-extensions mkdocs-git-revision-date-localized-plugin mike mkdocs-glightbox + + - name: Build documentation + run: | + git config user.name sage-wright + git config user.email sage.wright@theiagen.com + mike deploy --push main \ No newline at end of file diff --git a/.github/workflows/build-version-release.yml b/.github/workflows/build-version-release.yml new file mode 100644 index 000000000..4f9ac3b03 --- /dev/null +++ b/.github/workflows/build-version-release.yml @@ -0,0 +1,39 @@ +name: version-documentation +on: + release: + types: [published] + +permissions: + contents: write + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - name: Checkout main + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Setup python + uses: actions/setup-python@v4 + with: + python-version: 3.x + + - name: Cache dependencies + uses: actions/cache@v2 + with: + key: ${{ github.ref }} + path: .cache + + - name: Install dependencies + run: | + pip install mkdocs-material mkdocs-material-extensions mkdocs-git-revision-date-localized-plugin mike mkdocs-glightbox + + - name: Build documentation + run: | + git config user.name sage-wright + git config user.email sage.wright@theiagen.com + LATEST_RELEASE=$(curl -sL 
https://api.github.com/repos/theiagen/public_health_bioinformatics/releases/latest | jq -r ".tag_name") + mike deploy --push --update-aliases ${LATEST_RELEASE} latest + mike set-default --push latest \ No newline at end of file diff --git a/README.md b/README.md index 060d0ec75..0eb976b16 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ The Public Health Bioinformatics Bioinformatics repository contains workflows fo ## Introduction -**More information about the steps undertaken in these workflows is available via the [Theiagen Public Resources Documentation](https://theiagen.notion.site/Theiagen-Public-Health-Resources-a4bd134b0c5c4fe39870e21029a30566).** +**More information about the steps undertaken in these workflows is available via the [Theiagen Public Resources Documentation](https://theiagen.github.io/public_health_bioinformatics/latest/).** Support for running these workflows can be sought by raising a [GitHub issue](https://github.com/theiagen/public_health_bioinformatics/issues/new) or by contacting Theiagen at support@theiagen.com. @@ -16,13 +16,13 @@ The PHB repository contains workflows for the characterization, genomic epidemio All workflows in the PHB repository end with `_PHB` in order to differentiate them from earlier versions and from the original tools they incorporate. -Briefly, the main *genomic characterization* workflows are split by pathogen type: +Briefly, the main _genomic characterization_ workflows are split by pathogen type: 1. **Viral** (***TheiaCoV*** workflows) 2. **Bacterial** (***TheiaProk*** workflows) -3. **Fungal** (***TheiaEuk*** workflows) +3. **Fungal** (*[**TheiaEuk**](../workflows/genomic_characterization/theiaeuk.md)* workflows) -Many more workflows are available, and are documented in detail in the [Theiagen Public Resources Documentation](https://theiagen.notion.site/Theiagen-Public-Health-Resources-a4bd134b0c5c4fe39870e21029a30566). 
+Many more workflows are available, and are documented in detail in the [Theiagen Public Resources Documentation](https://theiagen.github.io/public_health_bioinformatics/latest/). ## On the Shoulder of Giants @@ -83,8 +83,8 @@ The authors declare no conflict of interest. Please cite this paper if publishing work using any workflows: -> Libuit, Kevin G., Emma L. Doughty, James R. Otieno, Frank Ambrosio, Curtis J. Kapsak, Emily A. Smith, Sage M. Wright, et al. 2023. “Accelerating Bioinformatics Implementation in Public Health.” Microbial Genomics 9 (7). https://doi.org/10.1099/mgen.0.001051. +> Libuit, Kevin G., Emma L. Doughty, James R. Otieno, Frank Ambrosio, Curtis J. Kapsak, Emily A. Smith, Sage M. Wright, et al. 2023. "Accelerating Bioinformatics Implementation in Public Health." Microbial Genomics 9 (7). https://doi.org/10.1099/mgen.0.001051. Alternatively, please cite this paper if using the TheiaEuk workflow: -> Ambrosio, Frank, Michelle Scribner, Sage Wright, James Otieno, Emma Doughty, Andrew Gorzalski, Danielle Siao, et al. 2023. “TheiaEuk: A Species-Agnostic Bioinformatics Workflow for Fungal Genomic Characterization.” Frontiers in Public Health 11. https://doi.org/10.3389/fpubh.2023.1198213. +> Ambrosio, Frank, Michelle Scribner, Sage Wright, James Otieno, Emma Doughty, Andrew Gorzalski, Danielle Siao, et al. 2023. "TheiaEuk: A Species-Agnostic Bioinformatics Workflow for Fungal Genomic Characterization." Frontiers in Public Health 11. https://doi.org/10.3389/fpubh.2023.1198213. 
diff --git a/docs/assets/figures/Core_Gene_SNP.png b/docs/assets/figures/Core_Gene_SNP.png new file mode 100644 index 000000000..b6a393f85 Binary files /dev/null and b/docs/assets/figures/Core_Gene_SNP.png differ diff --git a/docs/assets/figures/Create_Terra_Table_example1.png b/docs/assets/figures/Create_Terra_Table_example1.png new file mode 100644 index 000000000..5118bc881 Binary files /dev/null and b/docs/assets/figures/Create_Terra_Table_example1.png differ diff --git a/docs/assets/figures/Create_Terra_Table_example2.png b/docs/assets/figures/Create_Terra_Table_example2.png new file mode 100644 index 000000000..55dd20bb2 Binary files /dev/null and b/docs/assets/figures/Create_Terra_Table_example2.png differ diff --git a/docs/assets/figures/Find_Shared_Variants_PHB.png b/docs/assets/figures/Find_Shared_Variants_PHB.png new file mode 100644 index 000000000..9b3d61fec Binary files /dev/null and b/docs/assets/figures/Find_Shared_Variants_PHB.png differ diff --git a/docs/assets/figures/Freyja_FASTQ.png b/docs/assets/figures/Freyja_FASTQ.png new file mode 100644 index 000000000..1789c8c53 Binary files /dev/null and b/docs/assets/figures/Freyja_FASTQ.png differ diff --git a/docs/assets/figures/Freyja_figure2.png b/docs/assets/figures/Freyja_figure2.png new file mode 100644 index 000000000..0ad9b24da Binary files /dev/null and b/docs/assets/figures/Freyja_figure2.png differ diff --git a/docs/assets/figures/Freyja_figure3.png b/docs/assets/figures/Freyja_figure3.png new file mode 100644 index 000000000..4a2a8f167 Binary files /dev/null and b/docs/assets/figures/Freyja_figure3.png differ diff --git a/docs/assets/figures/Kraken2.png b/docs/assets/figures/Kraken2.png new file mode 100644 index 000000000..617500bc4 Binary files /dev/null and b/docs/assets/figures/Kraken2.png differ diff --git a/docs/assets/figures/Lyve_Set.png b/docs/assets/figures/Lyve_Set.png new file mode 100644 index 000000000..d9a3a9375 Binary files /dev/null and b/docs/assets/figures/Lyve_Set.png 
differ diff --git a/docs/assets/figures/Snippy_Streamline.png b/docs/assets/figures/Snippy_Streamline.png new file mode 100644 index 000000000..b373d3a3f Binary files /dev/null and b/docs/assets/figures/Snippy_Streamline.png differ diff --git a/docs/assets/figures/Snippy_Streamline_FASTA.png b/docs/assets/figures/Snippy_Streamline_FASTA.png new file mode 100644 index 000000000..05b752caa Binary files /dev/null and b/docs/assets/figures/Snippy_Streamline_FASTA.png differ diff --git a/docs/assets/figures/Snippy_Variants.png b/docs/assets/figures/Snippy_Variants.png new file mode 100644 index 000000000..679fc163e Binary files /dev/null and b/docs/assets/figures/Snippy_Variants.png differ diff --git a/docs/assets/figures/TheiaCoV.png b/docs/assets/figures/TheiaCoV.png new file mode 100644 index 000000000..553a0f321 Binary files /dev/null and b/docs/assets/figures/TheiaCoV.png differ diff --git a/docs/assets/figures/TheiaEuk_Illumina_PE.png b/docs/assets/figures/TheiaEuk_Illumina_PE.png new file mode 100644 index 000000000..9e131bf6a Binary files /dev/null and b/docs/assets/figures/TheiaEuk_Illumina_PE.png differ diff --git a/docs/assets/figures/TheiaMeta_Illumina_PE.png b/docs/assets/figures/TheiaMeta_Illumina_PE.png new file mode 100644 index 000000000..61c937acc Binary files /dev/null and b/docs/assets/figures/TheiaMeta_Illumina_PE.png differ diff --git a/docs/assets/figures/TheiaProk.png b/docs/assets/figures/TheiaProk.png new file mode 100644 index 000000000..693f902c3 Binary files /dev/null and b/docs/assets/figures/TheiaProk.png differ diff --git a/docs/assets/figures/TheiaValidate.png b/docs/assets/figures/TheiaValidate.png new file mode 100644 index 000000000..2976186df Binary files /dev/null and b/docs/assets/figures/TheiaValidate.png differ diff --git a/docs/assets/figures/Workflow_Relationships.png b/docs/assets/figures/Workflow_Relationships.png new file mode 100644 index 000000000..3bfda7512 Binary files /dev/null and 
b/docs/assets/figures/Workflow_Relationships.png differ diff --git a/docs/assets/figures/Workflow_Relationships_dark.png b/docs/assets/figures/Workflow_Relationships_dark.png new file mode 100644 index 000000000..6e4676264 Binary files /dev/null and b/docs/assets/figures/Workflow_Relationships_dark.png differ diff --git a/docs/assets/figures/basespace_fetch/step1.png b/docs/assets/figures/basespace_fetch/step1.png new file mode 100644 index 000000000..0cb74c400 Binary files /dev/null and b/docs/assets/figures/basespace_fetch/step1.png differ diff --git a/docs/assets/figures/basespace_fetch/step10.png b/docs/assets/figures/basespace_fetch/step10.png new file mode 100644 index 000000000..2674e2e86 Binary files /dev/null and b/docs/assets/figures/basespace_fetch/step10.png differ diff --git a/docs/assets/figures/basespace_fetch/step11.png b/docs/assets/figures/basespace_fetch/step11.png new file mode 100644 index 000000000..2964add37 Binary files /dev/null and b/docs/assets/figures/basespace_fetch/step11.png differ diff --git a/docs/assets/figures/basespace_fetch/step2.png b/docs/assets/figures/basespace_fetch/step2.png new file mode 100644 index 000000000..452ae0e06 Binary files /dev/null and b/docs/assets/figures/basespace_fetch/step2.png differ diff --git a/docs/assets/figures/basespace_fetch/step3.png b/docs/assets/figures/basespace_fetch/step3.png new file mode 100644 index 000000000..dc110090d Binary files /dev/null and b/docs/assets/figures/basespace_fetch/step3.png differ diff --git a/docs/assets/figures/basespace_fetch/step4.png b/docs/assets/figures/basespace_fetch/step4.png new file mode 100644 index 000000000..80166b949 Binary files /dev/null and b/docs/assets/figures/basespace_fetch/step4.png differ diff --git a/docs/assets/figures/basespace_fetch/step5.png b/docs/assets/figures/basespace_fetch/step5.png new file mode 100644 index 000000000..5da5b66df Binary files /dev/null and b/docs/assets/figures/basespace_fetch/step5.png differ diff --git 
a/docs/assets/figures/basespace_fetch/step6.png b/docs/assets/figures/basespace_fetch/step6.png new file mode 100644 index 000000000..2f71b04a2 Binary files /dev/null and b/docs/assets/figures/basespace_fetch/step6.png differ diff --git a/docs/assets/figures/basespace_fetch/step7.png b/docs/assets/figures/basespace_fetch/step7.png new file mode 100644 index 000000000..9122a5659 Binary files /dev/null and b/docs/assets/figures/basespace_fetch/step7.png differ diff --git a/docs/assets/figures/basespace_fetch/step8.png b/docs/assets/figures/basespace_fetch/step8.png new file mode 100644 index 000000000..d24d56124 Binary files /dev/null and b/docs/assets/figures/basespace_fetch/step8.png differ diff --git a/docs/assets/figures/basespace_fetch/step9.png b/docs/assets/figures/basespace_fetch/step9.png new file mode 100644 index 000000000..c347cc382 Binary files /dev/null and b/docs/assets/figures/basespace_fetch/step9.png differ diff --git a/docs/assets/figures/example_krona_report.png b/docs/assets/figures/example_krona_report.png new file mode 100644 index 000000000..9d48ce9a7 Binary files /dev/null and b/docs/assets/figures/example_krona_report.png differ diff --git a/docs/assets/figures/example_phandango_coloring.png b/docs/assets/figures/example_phandango_coloring.png new file mode 100644 index 000000000..a82b9c99d Binary files /dev/null and b/docs/assets/figures/example_phandango_coloring.png differ diff --git a/docs/assets/files/GPSC_README_PopPUNK2.txt b/docs/assets/files/GPSC_README_PopPUNK2.txt new file mode 100644 index 000000000..68b6126d2 --- /dev/null +++ b/docs/assets/files/GPSC_README_PopPUNK2.txt @@ -0,0 +1,37 @@ +GPSC assignment + +Install PopPUNK 2.4 as per instructions at PopPUNK documentation and download +the GPS reference database and the GPS designation. + +Files required to run GPSC assignment using PopPUNK 2.4: +1. A 2-column tab-delimited file containing sample name and path to the + corresponding assembly (no header) +2. 
GPS reference database +3. GPS designation + +output directory name is assigned using --output +number of threads can be changed using –threads + +Run GPSC assignment: + +poppunk_assign --db GPS_v6 \ + --distances GPS_v6/GPS_v6.dists \ + --query <2-column path to assembly> \ + --output \ + --external-clustering \ + GPS_v6_external_clusters.csv + +Outputs: + +_clusters.csv: popPUNK clusters with dataset specific nomenclature +_external_clusters.csv: GPSC v6 scheme designations + +Novel Clusters are assigned NA in the _external_clusters.csv as they have +not been defined in the v6 dataset used to designate the GPSCs. Please email: +globalpneumoseq@gmail.com to have novel clusters added to the database and a +GPSC cluster name assigned after you have checked for low level contamination +which may contribute to biased accessory distances. + +Merged clusters: Unsampled diversity may represent missing variation linking two +clusters. GPSCs are then merged. For example if GPSC23 and GPSC362 merged, the +GPSC would be then reported as GPSC23, with a merge history of GPSC23;362. 
diff --git a/docs/assets/files/TheiaEuk_qc_check_template.tsv b/docs/assets/files/TheiaEuk_qc_check_template.tsv new file mode 100644 index 000000000..246355c36 --- /dev/null +++ b/docs/assets/files/TheiaEuk_qc_check_template.tsv @@ -0,0 +1,18 @@ +taxon num_reads_raw1 num_reads_raw2 num_reads_clean1 num_reads_clean2 est_coverage_raw est_coverage_clean combined_mean_q_raw r1_mean_q_raw r2_mean_q_raw combined_mean_readlength_raw r1_mean_readlength_raw r2_mean_readlength_raw combined_mean_q_clean r1_mean_q_clean r2_mean_q_clean combined_mean_readlength_clean r1_mean_readlength_clean r2_mean_readlength_clean assembly_length_min assembly_length_max number_contigs n50_value quast_gc_percent_min quast_gc_percent_max busco_completeness +Aspergillus_terreus 1000000 1000000 1000000 1000000 30 30 30 30 30 135 135 135 30 30 30 135 135 135 27960000 31840000 1000 10000 95 +Aspergillus_flavus 1000000 1000000 1000000 1000000 30 30 30 30 30 135 135 135 30 30 30 135 135 135 35090000 42930000 1000 10000 95 +Aspergillus_fumigatus 1000000 1000000 1000000 1000000 30 30 30 30 30 135 135 135 30 30 30 135 135 135 27360000 33400000 1000 10000 95 +Candida_albicans 1000000 1000000 1000000 1000000 30 30 30 30 30 135 135 135 30 30 30 135 135 135 11650000 18700000 1000 10000 95 +Candida_auris 1000000 1000000 1000000 1000000 30 30 30 30 30 135 135 135 30 30 30 135 135 135 12100000 14250000 1000 10000 43 47 95 +Candida_glabrata 1000000 1000000 1000000 1000000 30 30 30 30 30 135 135 135 30 30 30 135 135 135 11810000 14520000 1000 10000 95 +Candida_parapsilosis 1000000 1000000 1000000 1000000 30 30 30 30 30 135 135 135 30 30 30 135 135 135 11570000 13330000 1000 10000 95 +Candida_tropicalis 1000000 1000000 1000000 1000000 30 30 30 30 30 135 135 135 30 30 30 135 135 135 14350000 15730000 1000 10000 95 +Clavispora_lusitaniae 1000000 1000000 1000000 1000000 30 30 30 30 30 135 135 135 30 30 30 135 135 135 11910000 12380000 1000 10000 95 +Coccidioides_immitis 1000000 1000000 1000000 1000000 30 30 30 30 
30 135 135 135 30 30 30 135 135 135 27470000 28950000 1000 10000 95 +Cryptococcus_gattii_VGI 1000000 1000000 1000000 1000000 30 30 30 30 30 135 135 135 30 30 30 135 135 135 17230000 18370000 1000 10000 95 +Cryptococcus_neoformans 1000000 1000000 1000000 1000000 30 30 30 30 30 135 135 135 30 30 30 135 135 135 17990000 19750000 1000 10000 95 +Fusarium 1000000 1000000 1000000 1000000 30 30 30 30 30 135 135 135 30 30 30 135 135 135 26090000 72920000 1000 10000 95 +Kluyveromyces_marxianus 1000000 1000000 1000000 1000000 30 30 30 30 30 135 135 135 30 30 30 135 135 135 9555000 13340000 1000 10000 95 +Pichia_kudriavzevii 1000000 1000000 1000000 1000000 30 30 30 30 30 135 135 135 30 30 30 135 135 135 72470000 12940000 1000 10000 95 +Yarrowia_lipolytica 1000000 1000000 1000000 1000000 30 30 30 30 30 135 135 135 30 30 30 135 135 135 19720000 21250000 1000 10000 95 +Candida_haemuloni 1000000 1000000 1000000 1000000 30 30 30 30 30 135 135 135 30 30 30 135 135 135 12590000 13310000 1000 10000 95 diff --git a/docs/assets/files/TheiaProk_FASTA_qc_check_template.tsv b/docs/assets/files/TheiaProk_FASTA_qc_check_template.tsv new file mode 100644 index 000000000..cd9a8d3b7 --- /dev/null +++ b/docs/assets/files/TheiaProk_FASTA_qc_check_template.tsv @@ -0,0 +1,15 @@ +taxon assembly_length_min assembly_length_max number_contigs n50_value quast_gc_percent_min quast_gc_percent_max busco_completeness ani_highest_percent ani_highest_percent_bases_aligned +Listeria_monocytogenes 2800000 3200000 500 10000 95 92 70 +Escherichia_coli 4900000 6000000 500 10000 95 92 70 +Shigella 4200000 4900000 500 10000 95 92 70 +Salmonella 4400000 5700000 500 10000 95 92 70 +Campylobacter 1400000 2200000 500 10000 95 92 70 +Vibrio_cholerae 3800000 4300000 500 10000 95 92 70 +Vibrio_parahaemolyticus 4900000 5500000 500 10000 95 92 70 +Vibrio_vulnificus 4700000 5300000 500 10000 95 92 70 +Pseudomonas 500 10000 95 +Streptococcus 500 10000 95 +Acinetobacter 500 10000 95 +Mycobacterium 500 10000 95 +Legionella 500 
10000 95 +Klebsiella 500 10000 95 diff --git a/docs/assets/files/TheiaProk_Illumina_PE_qc_check_template.tsv b/docs/assets/files/TheiaProk_Illumina_PE_qc_check_template.tsv new file mode 100644 index 000000000..10b3928dc --- /dev/null +++ b/docs/assets/files/TheiaProk_Illumina_PE_qc_check_template.tsv @@ -0,0 +1,15 @@ +taxon num_reads_raw1 num_reads_raw2 num_reads_clean1 num_reads_clean2 est_coverage_raw est_coverage_clean combined_mean_q_raw r1_mean_q_raw r2_mean_q_raw combined_mean_readlength_raw r1_mean_readlength_raw r2_mean_readlength_raw combined_mean_q_clean r1_mean_q_clean r2_mean_q_clean combined_mean_readlength_clean r1_mean_readlength_clean r2_mean_readlength_clean assembly_length_min assembly_length_max midas_secondary_genus_abundance number_contigs n50_value quast_gc_percent_min quast_gc_percent_max busco_completeness ani_highest_percent ani_highest_percent_bases_aligned +Listeria_monocytogenes 100000 100000 100000 100000 20 30 135 30 135 2800000 3200000 0.01 92 70 +Escherichia_coli 100000 100000 100000 100000 40 30 135 30 135 4900000 6000000 0.01 92 70 +Shigella 100000 100000 100000 100000 40 30 135 30 135 4200000 4900000 0.01 92 70 +Salmonella 100000 100000 100000 100000 30 30 135 30 135 4400000 5700000 0.01 92 70 +Campylobacter 100000 100000 100000 100000 20 30 135 30 135 1400000 2200000 0.01 92 70 +Vibrio_cholerae 100000 100000 100000 100000 40 30 135 30 135 3800000 4300000 0.01 92 70 +Vibrio_parahaemolyticus 100000 100000 100000 100000 40 30 135 30 135 4900000 5500000 0.01 92 70 +Vibrio_vulnificus 100000 100000 100000 100000 40 30 135 30 135 4700000 5300000 0.01 92 70 +Pseudomonas 100000 100000 100000 100000 30 30 30 30 30 135 135 135 30 30 30 135 135 135 500 10000 95 +Streptococcus 100000 100000 100000 100000 30 30 30 30 30 135 135 135 30 30 30 135 135 135 500 10000 95 +Acinetobacter 100000 100000 100000 100000 30 30 30 30 30 135 135 135 30 30 30 135 135 135 500 10000 95 +Mycobacterium 100000 100000 100000 100000 30 30 30 30 30 135 135 135 30 30 
30 135 135 135 500 10000 95 +Legionella 100000 100000 100000 100000 30 30 30 30 30 135 135 135 30 30 30 135 135 135 500 10000 95 +Klebsiella 100000 100000 100000 100000 30 30 30 30 30 135 135 135 30 30 30 135 135 135 500 10000 95 diff --git a/docs/assets/files/example_excluded_samples.tsv b/docs/assets/files/example_excluded_samples.tsv new file mode 100644 index 000000000..bddf9dfb0 --- /dev/null +++ b/docs/assets/files/example_excluded_samples.tsv @@ -0,0 +1,10 @@ +Samples excluded for quality thresholds: +sample_name message +sample2 VADR skipped due to poor assembly +sample3 VADR number alerts too high: 3 greater than limit of 0 +sample4 Number of Ns was too high: 10000 greater than limit of 5000 + +Samples excluded for missing required metadata (will have empty values in indicated columns): +tablename_id organism country library_layout +sample5 paired +sample6 SARS-CoV-2 USA diff --git a/docs/assets/files/theiavalidate/example_exact_differences.tsv b/docs/assets/files/theiavalidate/example_exact_differences.tsv new file mode 100644 index 000000000..eec432560 --- /dev/null +++ b/docs/assets/files/theiavalidate/example_exact_differences.tsv @@ -0,0 +1,8 @@ + columnA-string columnA-string columnB-set columnB-set columnC-ignore columnC-ignore columnD-float columnD-float columnE-missing columnE-missing + example_table1.tsv example_table2.tsv example_table1.tsv example_table2.tsv example_table1.tsv example_table2.tsv example_table1.tsv example_table2.tsv example_table1.tsv example_table2.tsv +samples +sample1 item1,item2,item3 item1,item3,item2 cheese cheesecake 1000.0 999.0 +sample2 option1 option2 item1,item3,item2 item1,item2,item3 cheesecake batter +sample3 option2 option1 item1,item2,item3 item1,item2 cake cheese 14.0 24.0 present +sample4 item2,item1 item1,item2 3492.0 728.0 +sample5 item1,item2 item1,item2,item3 3.0 4.0 diff --git a/docs/assets/files/theiavalidate/example_summary.pdf b/docs/assets/files/theiavalidate/example_summary.pdf new file mode 100644 
index 000000000..2a2cfcea5 Binary files /dev/null and b/docs/assets/files/theiavalidate/example_summary.pdf differ diff --git a/docs/assets/files/theiavalidate/example_validation_criteria_differences.tsv b/docs/assets/files/theiavalidate/example_validation_criteria_differences.tsv new file mode 100644 index 000000000..eec8f23b0 --- /dev/null +++ b/docs/assets/files/theiavalidate/example_validation_criteria_differences.tsv @@ -0,0 +1,7 @@ +Column columnB-set columnB-set columnD-float columnD-float columnE-missing columnE-missing +Table example_table1.tsv example_table2.tsv example_table1.tsv example_table2.tsv example_table1.tsv example_table2.tsv +sample1 +sample2 +sample3 item1,item2,item3 item1,item2 14.0 24.0 present +sample4 3492.0 728.0 +sample5 item1,item2 item1,item2,item3 3.0 4.0 diff --git a/docs/assets/files/theiavalidate/filtered_example_table1.tsv b/docs/assets/files/theiavalidate/filtered_example_table1.tsv new file mode 100644 index 000000000..db8e7d067 --- /dev/null +++ b/docs/assets/files/theiavalidate/filtered_example_table1.tsv @@ -0,0 +1,6 @@ +samples columnA-string columnB-set columnC-ignore columnD-float columnE-missing +sample1 option1 item1,item2,item3 cheese 1000 present +sample2 option1 item1,item3,item2 cheesecake 12 present +sample3 option2 item1,item2,item3 cake 14 present +sample4 option1 item2,item1 cakebatter 3492 +sample5 option2 item1,item2 batter 3 present diff --git a/docs/assets/files/theiavalidate/filtered_example_table2.tsv b/docs/assets/files/theiavalidate/filtered_example_table2.tsv new file mode 100644 index 000000000..fed068ce2 --- /dev/null +++ b/docs/assets/files/theiavalidate/filtered_example_table2.tsv @@ -0,0 +1,6 @@ +samples columnA-string columnB-set columnC-ignore columnD-float columnE-missing +sample1 option1 item1,item3,item2 cheesecake 999 present +sample2 option2 item1,item2,item3 batter 12 present +sample3 option1 item1,item2 cheese 24 +sample4 option1 item1,item2 cakebatter 728 +sample5 option2 
item1,item2,item3 batter 4 present diff --git a/docs/assets/files/theiavalidate/theiacov-validation-criteria.txt b/docs/assets/files/theiavalidate/theiacov-validation-criteria.txt new file mode 100644 index 000000000..504222f3f --- /dev/null +++ b/docs/assets/files/theiavalidate/theiacov-validation-criteria.txt @@ -0,0 +1,24 @@ +column criteria +abricate_flu_subtype EXACT +abricate_flu_type EXACT +assembly_length_unambiguous 0.01 +assembly_mean_coverage 0.01 +irma_subtype EXACT +irma_type EXACT +kraken_human EXACT +kraken_human_dehosted EXACT +kraken_sc2 EXACT +kraken_sc2_dehosted EXACT +kraken_target_org EXACT +kraken_target_org_dehosted EXACT +nextclade_aa_dels SET +nextclade_aa_subs SET +nextclade_clade EXACT +nextclade_lineage EXACT +nextclade_tamiflu_resistance_aa_subs SET +num_reads_clean1 EXACT +num_reads_clean2 EXACT +number_N 0.01 +pango_lineage EXACT +percent_reference_coverage 0.01 +vadr_num_alerts EXACT diff --git a/docs/assets/files/theiavalidate/theiaeuk-validation-criteria.txt b/docs/assets/files/theiavalidate/theiaeuk-validation-criteria.txt new file mode 100644 index 000000000..28237910b --- /dev/null +++ b/docs/assets/files/theiavalidate/theiaeuk-validation-criteria.txt @@ -0,0 +1,13 @@ +column criteria +assembly_length 0.01 +busco_results EXACT +clade_type EXACT +est_coverage_clean 0.01 +est_coverage_raw 0.01 +gambit_predicted_taxon EXACT +n50_value 0.01 +num_reads_clean1 EXACT +num_reads_clean2 EXACT +number_contigs 0.01 +quast_gc_percent 0.01 +theiaeuk_snippy_variants_hits SET diff --git a/docs/assets/files/theiavalidate/theiaprok-validation-criteria.txt b/docs/assets/files/theiavalidate/theiaprok-validation-criteria.txt new file mode 100644 index 000000000..3f1e8f11a --- /dev/null +++ b/docs/assets/files/theiavalidate/theiaprok-validation-criteria.txt @@ -0,0 +1,56 @@ +column criteria +abricate_abaum_plasmid_type_genes SET +agrvate_agr_group EXACT +amrfinderplus_amr_core_genes SET +amrfinderplus_amr_plus_genes SET +amrfinderplus_stress_genes 
SET +amrfinderplus_virulence_genes SET +ani_highest_percent 0.01 +ani_top_species_match EXACT +assembly_length 0.01 +busco_results EXACT +ectyper_predicted_serotype EXACT +emmtypingtool_emm_type EXACT +est_coverage_clean 0.01 +est_coverage_raw 0.01 +gambit_predicted_taxon EXACT +genotyphi_final_genotype EXACT +hicap_genes SET +hicap_serotype EXACT +kaptive_k_type EXACT +kleborate_genomic_resistance_mutations SET +kleborate_key_resistance_genes SET +kleborate_mlst_sequence_type EXACT +legsta_predicted_sbt EXACT +lissero_serotype SET +meningotype_serogroup EXACT +midas_primary_genus EXACT +midas_secondary_genus EXACT +midas_secondary_genus_abundance 0.01 +n50_value 0.01 +ngmaster_ngmast_sequence_type EXACT +ngmaster_ngstar_sequence_type EXACT +num_reads_clean1 EXACT +num_reads_clean2 EXACT +number_contigs 0.01 +pasty_serogroup EXACT +pbptyper_predicted_1A_2B_2X EXACT +plasmidfinder_plasmids SET +poppunk_gps_cluster EXACT +seqsero2_predicted_serotype EXACT +seroba_ariba_serotype EXACT +seroba_serotype EXACT +serotypefinder_serotype EXACT +shigatyper_ipaB_presence_absence EXACT +shigatyper_predicted_serotype EXACT +shigeifinder_cluster EXACT +shigeifinder_serotype EXACT +sistr_predicted_serotype EXACT +sonneityping_final_genotype EXACT +spatyper_type EXACT +srst2_vibrio_serogroup EXACT +staphopiasccmec_types_and_mecA_presence EXACT +tbprofiler_main_lineage EXACT +tbprofiler_resistance_genes SET +ts_mlst_predicted_st EXACT +virulencefinder_hits EXACT \ No newline at end of file diff --git a/docs/assets/logos/Theiagen-Logo-White.png b/docs/assets/logos/Theiagen-Logo-White.png new file mode 100644 index 000000000..a30ce5c7b Binary files /dev/null and b/docs/assets/logos/Theiagen-Logo-White.png differ diff --git a/docs/assets/logos/Theiagen-Symbol-Standard-01.png b/docs/assets/logos/Theiagen-Symbol-Standard-01.png new file mode 100644 index 000000000..77611799e Binary files /dev/null and b/docs/assets/logos/Theiagen-Symbol-Standard-01.png differ diff --git 
a/docs/assets/metadata_formatters/Mercury_Prep_N_Batch_MPXV_Metadata_Formatter_2022_12_23.xlsx b/docs/assets/metadata_formatters/Mercury_Prep_N_Batch_MPXV_Metadata_Formatter_2022_12_23.xlsx new file mode 100644 index 000000000..67cf24dcb Binary files /dev/null and b/docs/assets/metadata_formatters/Mercury_Prep_N_Batch_MPXV_Metadata_Formatter_2022_12_23.xlsx differ diff --git a/docs/assets/metadata_formatters/Mercury_Prep_N_Batch_SC2_Metadata_Formatter_2023_05_22.xlsx b/docs/assets/metadata_formatters/Mercury_Prep_N_Batch_SC2_Metadata_Formatter_2023_05_22.xlsx new file mode 100644 index 000000000..b998c6a9d Binary files /dev/null and b/docs/assets/metadata_formatters/Mercury_Prep_N_Batch_SC2_Metadata_Formatter_2023_05_22.xlsx differ diff --git a/docs/assets/metadata_formatters/Terra_2_NCBI-MICROBE-metadata-2022-07-11.xlsx b/docs/assets/metadata_formatters/Terra_2_NCBI-MICROBE-metadata-2022-07-11.xlsx new file mode 100644 index 000000000..acf92a08f Binary files /dev/null and b/docs/assets/metadata_formatters/Terra_2_NCBI-MICROBE-metadata-2022-07-11.xlsx differ diff --git a/docs/assets/metadata_formatters/Terra_2_NCBI-PATHOGEN-metadata-2024-04-30.xlsx b/docs/assets/metadata_formatters/Terra_2_NCBI-PATHOGEN-metadata-2024-04-30.xlsx new file mode 100644 index 000000000..9c07ce4cb Binary files /dev/null and b/docs/assets/metadata_formatters/Terra_2_NCBI-PATHOGEN-metadata-2024-04-30.xlsx differ diff --git a/docs/assets/metadata_formatters/Terra_2_NCBI-VIRUS-metadata-2022-09-09.xlsx b/docs/assets/metadata_formatters/Terra_2_NCBI-VIRUS-metadata-2022-09-09.xlsx new file mode 100644 index 000000000..be006fa90 Binary files /dev/null and b/docs/assets/metadata_formatters/Terra_2_NCBI-VIRUS-metadata-2022-09-09.xlsx differ diff --git a/docs/assets/new_workflow_template.md b/docs/assets/new_workflow_template.md new file mode 100644 index 000000000..9e7ef6799 --- /dev/null +++ b/docs/assets/new_workflow_template.md @@ -0,0 +1,44 @@ +# Workflow Name + +## Quick Facts + +| **Workflow 
Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Workflow Type](../../workflows_overview/workflows_type.md/#link-to-workflow-type) | [Applicable Kingdom](../../workflows_overview/workflows_kingdom.md/#link-to-applicable-kingdom) | PHB | | | + +## Workflow_Name_On_Terra + +Description of the workflow. + +### Inputs + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| task_name | **variable_name** | Type | Description | Default Value | Required/Optional | + +### Workflow Tasks + +Description of the workflow tasks + +??? task "`tool_name`: Description of tool" + Description of the task + + !!! techdetails "Tool Name Technical Details" + | | Links | + | --- | --- | + | Task | [link to task on GitHub] | + | Software Source Code | [link to tool's source code] | + | Software Documentation | [link to tool's documentation] | + | Original Publication | [link to tool's publication] | + +### Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| variable_name | Type | Description | + +## References (if applicable) + +> reference1 + +> reference2 diff --git a/docs/assets/sops/TG-AUGUR-01_Augur_v1.pdf b/docs/assets/sops/TG-AUGUR-01_Augur_v1.pdf new file mode 100644 index 000000000..e13a3d48d Binary files /dev/null and b/docs/assets/sops/TG-AUGUR-01_Augur_v1.pdf differ diff --git a/docs/assets/sops/TG-FLU-PE_SOP_Flu_IlluminaPE_v1.pdf b/docs/assets/sops/TG-FLU-PE_SOP_Flu_IlluminaPE_v1.pdf new file mode 100644 index 000000000..d0832b78c Binary files /dev/null and b/docs/assets/sops/TG-FLU-PE_SOP_Flu_IlluminaPE_v1.pdf differ diff --git a/docs/assets/sops/TG-FREY-01_FreyjaFASTQ_v2.pdf b/docs/assets/sops/TG-FREY-01_FreyjaFASTQ_v2.pdf new file mode 100644 index 000000000..929da20fd Binary files /dev/null and b/docs/assets/sops/TG-FREY-01_FreyjaFASTQ_v2.pdf differ diff --git 
a/docs/assets/sops/TG-FREY-02_FreyjaPlot_v3.pdf b/docs/assets/sops/TG-FREY-02_FreyjaPlot_v3.pdf new file mode 100644 index 000000000..8f6dae922 Binary files /dev/null and b/docs/assets/sops/TG-FREY-02_FreyjaPlot_v3.pdf differ diff --git a/docs/assets/sops/TG-FREY-03-SOP_FreyjaDashboard_v2.pdf b/docs/assets/sops/TG-FREY-03-SOP_FreyjaDashboard_v2.pdf new file mode 100644 index 000000000..6e9f8a83c Binary files /dev/null and b/docs/assets/sops/TG-FREY-03-SOP_FreyjaDashboard_v2.pdf differ diff --git a/docs/assets/sops/TG-FREY-04_FreyjaUpdate_v2.pdf b/docs/assets/sops/TG-FREY-04_FreyjaUpdate_v2.pdf new file mode 100644 index 000000000..b294f8a1f Binary files /dev/null and b/docs/assets/sops/TG-FREY-04_FreyjaUpdate_v2.pdf differ diff --git a/docs/assets/sops/TG-GISAID-01_Terra2GISAID_v2.pdf b/docs/assets/sops/TG-GISAID-01_Terra2GISAID_v2.pdf new file mode 100644 index 000000000..28900cb98 Binary files /dev/null and b/docs/assets/sops/TG-GISAID-01_Terra2GISAID_v2.pdf differ diff --git a/docs/assets/sops/TG-SC2-CL_SC2_ClearLabs_v3.pdf b/docs/assets/sops/TG-SC2-CL_SC2_ClearLabs_v3.pdf new file mode 100644 index 000000000..27aa97732 Binary files /dev/null and b/docs/assets/sops/TG-SC2-CL_SC2_ClearLabs_v3.pdf differ diff --git a/docs/assets/sops/TG-SC2-FST_Analyzing_SC2_Using_TheiaCoV_FASTA_PHB_v2.pdf b/docs/assets/sops/TG-SC2-FST_Analyzing_SC2_Using_TheiaCoV_FASTA_PHB_v2.pdf new file mode 100644 index 000000000..f39fa83c6 Binary files /dev/null and b/docs/assets/sops/TG-SC2-FST_Analyzing_SC2_Using_TheiaCoV_FASTA_PHB_v2.pdf differ diff --git a/docs/assets/sops/TG-SC2-ONT_Analyzing_SC2_Using_TheiaCov_ONT_PHB_v2.pdf b/docs/assets/sops/TG-SC2-ONT_Analyzing_SC2_Using_TheiaCov_ONT_PHB_v2.pdf new file mode 100644 index 000000000..6913f12f5 Binary files /dev/null and b/docs/assets/sops/TG-SC2-ONT_Analyzing_SC2_Using_TheiaCov_ONT_PHB_v2.pdf differ diff --git a/docs/assets/sops/TG-SC2-PE_SC2_TheiaCoV_IlluminaPE_v3.pdf b/docs/assets/sops/TG-SC2-PE_SC2_TheiaCoV_IlluminaPE_v3.pdf new 
file mode 100644 index 000000000..6e81bded2 Binary files /dev/null and b/docs/assets/sops/TG-SC2-PE_SC2_TheiaCoV_IlluminaPE_v3.pdf differ diff --git a/docs/assets/sops/TG-SC2-SE_SC2_TheiaCoV_IlluminaSE_v3.pdf b/docs/assets/sops/TG-SC2-SE_SC2_TheiaCoV_IlluminaSE_v3.pdf new file mode 100644 index 000000000..dd0c081a3 Binary files /dev/null and b/docs/assets/sops/TG-SC2-SE_SC2_TheiaCoV_IlluminaSE_v3.pdf differ diff --git a/docs/assets/sops/TG-TER-03_GettingStartedInTerra_v3.pdf b/docs/assets/sops/TG-TER-03_GettingStartedInTerra_v3.pdf new file mode 100644 index 000000000..851a922e9 Binary files /dev/null and b/docs/assets/sops/TG-TER-03_GettingStartedInTerra_v3.pdf differ diff --git a/docs/assets/sops/TG-TER-04_BaseSpaceFetch_v2.pdf b/docs/assets/sops/TG-TER-04_BaseSpaceFetch_v2.pdf new file mode 100644 index 000000000..465043cb3 Binary files /dev/null and b/docs/assets/sops/TG-TER-04_BaseSpaceFetch_v2.pdf differ diff --git a/docs/contributing/code_contribution.md b/docs/contributing/code_contribution.md new file mode 100644 index 000000000..cb7ba5727 --- /dev/null +++ b/docs/contributing/code_contribution.md @@ -0,0 +1,283 @@ +# PHB Code Contributions + +Theiagen Genomics’ [**Public Health Bioinformatics (PHB)**](https://github.com/theiagen/public_health_bioinformatics) workflows are written in [WDL](https://github.com/openwdl/wdl), a language for specifying data processing workflows with a human-readable and writable syntax. Contributions to the workflows contained in the repository are warmly welcomed. + +This document gives coding conventions for the WDL code comprising the workflow and task development for PHB. This style guide evolves over time as additional conventions are identified and past conventions are rendered obsolete by changes in the language itself. + +Style guide inspired by Scott Frazer’s [WDL Best Practices Style Guide](https://gist.github.com/scottfrazer/aa4ab1945a6a4c331211). 
+ +## General Guidelines + +- Put tasks and workflows in separate files in the appropriate folders. +- Always add a description as metadata + + ```bash + meta { + description: "This tool does X" + } + ``` + +- Ensure that the docker container is locked to a given version, not `latest` + + ```bash + String docker = "quay.io/docker_image:version" + ``` + +- Preferentially use containers [`Google's Artifact Registry`](https://console.cloud.google.com/artifacts/docker/general-theiagen/us) rather than those from [`quay.io`](http://quay.io) or [`dockerhub`](https://hub.docker.com/) +- Use 2-space indents (no tabs) + + ```bash + # perform action + if [ this ]; then + action1(variable) + fi + ``` + +- Do not use line break for opening braces +- Use single space when defining input/output variables & runtime attributes (`output {` instead of `output{`) +- Use single-line breaks between non-intended constructs +- Enclose task commands with triple angle brackets (`<<< ... >>>`) +- Consistently use white space with variables (`this = that` *not* `this= that` (unless a bash variable where `this=that` is required)) + +## Task Blocks + +The task should contain the following sections. Include _single_ spaces between input, command, output, and runtime closing and opening curly brackets. + +```bash +input { + +} +command <<< + +>>> +output { + +} +runtime { + +} +``` + +??? toggle "`input` block" + - The following conventions are used to expose docker, CPU, memory, and disk size + + ```bash + input { + String docker = "..." 
+ Int cpu = x + Int memory = y + Int disk_size = z + } + ``` + + - If additional arguments should be allowed to be passed to the task, this input should follow the convention below: + + ```bash + input { + String args = "" + } + ``` + + - Input and output lists should not be formatted to have the equal sign aligned, but instead use a single space before and after the `=` + + ```bash + output1_x = string1 + output2_that_does_y = string2 + ``` + + - Ensure the docker container is exposed as an input and as an output string + + ```bash + input { + String docker = "" + } + ... + output { + String XX_docker = docker + } + runtime { + docker: docker + } + ``` + +??? toggle "`command` block" + - Ensure use of line breaks between different sections of code to improve readability + + ```bash + # if this, perform action 1 + if [ this ]; then + action1(variable) + fi + + # if that, perform action 2 + if [ that ]; then + action2(variable) + fi + ``` + + - Split command calls into multiple lines if they have user input variables and/or if the length of the command is very long to avoid text wrapping and/or side-scrolling, e.g. + - Use indentation as appropriate + + ```bash + tool \ + --option1 ~{option1} \ + --option2 ~{option2} \ + ... + --option999 ~{option999} + ``` + + - Add comments that + - Explain what the optional parameters are + - Provide links to the tool documentation so future readers of the code know where to find that information + - Explain what non-intuitive bash/python text wrangling actions do, e.g. + + ```bash + ## awk for gene column ($6) to grab subtype ($15) + cat ~{file} | awk -F '\t' '{if ($6=="M1") print $15}' > FLU_TYPE + ``` + +??? toggle "`output` block" + - File types should be clearly stated in the output name variables + + ```bash + output1_csv = file1.csv + output2_tsv = file2.tsv + ``` + + - Ensure the docker container is exposed as an output string, e.g. + + ```bash + input { + String docker + } + ... 
+ output { + String XX_docker = docker + } + runtime { + docker: docker + } + ``` + +??? toggle "`runtime` block" + - Always use a docker container + +## Workflow Blocks + +The workflow/sub-workflow file should contain: + +- a block of `import` statements (alphabetical order), + - When a workflow imports a task, make sure that it is imported under a different name than the task it is calling +- a `workflow` block with + - an `input` section + - `call` sections for specified tasks + - an `output` section + +Example formatting is shown below. + +??? toggle "wf_example_wf.wdl" + + ```bash + import "../tasks/task_task1.wdl" as task1_task + import "../tasks/task_task2.wdl" as task2_task + + import "../workflows/wf_subworkflow.wdl" as subworkflow + + workflow example_wf { + input { + String input + String task1_docker = "us-docker.pkg.dev/general-theiagen/task_1:version" + String task2_docker = "us-docker.pkg.dev/general-theiagen//task_2:version" + String? hidden_task3_argument + String? hidden_task3_docker + String? hidden_task4_argument + String? hidden_task4_docker + } + call task1_task.task1 { + input: + input = input, + docker = task1_docker + } + call task2_task.task2 { + input: + input = input, + docker = task2_docker + } + call subworkflow.subworkflow { + input: + input = input + } + output { + # Task 1 outputs + File task1_out_csv = task1.output_csv + String task1_version = task1.version + String task1_docker = task1.docker + # Task 2 outputs + File task2_out_tsv = task2.output_tsv + String task2_version = task2.version + String task2_docker = task2.docker + # Subworkflow outputs + File task3_out_tsv = subworkflow.task3_out_tsv + String task3_version = subworkflow.task3_version + String task3_docker = subworkflow.task3_docker + } + } + ``` + + +??? 
toggle "wf_subworkflow.wdl" + ```bash + import "../tasks/task_task3.wdl" as task3_task + import "../tasks/task_task4.wdl" as task4_task + + workflow subworkflow { + input { + String input + + # optional inputs for tasks inside subworkflows cannot + # be seen on Terra, so make them available at the subworkflow + # level so they can be modified by a Terra user + String? task3_argument + String? task3_docker + } + call task3_task.task3 { + input: + input = input, + args = task3_argument, + docker = task3_docker + } + output { + File task3_out_tsv = task3.output_tsv + String task3_version = task3.version + String task3_docker = task3.docker + } + } + ``` + +--- + +??? toggle "`input` section" + - Optional inputs that should be able to be edited by the user, such as docker containers should be exposed on the workflow level as in the example + - In the case of subworkflows, all optional inputs should be exposed on the workflow level so that they can be modified by users on Terra + +??? toggle "`call` task sections" + - There should be no blank lines between tasks in workflows + + ```bash + task A { + } + task B { + } + ``` + + - Label a group of outputs by the source/species for organizational purposes when a workflow has many different outputs + + ```ebnf + output { + ... + # task99 outputs + String task99_output + String task99_file + ... + } + ``` diff --git a/docs/contributing/doc_contribution.md b/docs/contributing/doc_contribution.md new file mode 100644 index 000000000..8a32abf56 --- /dev/null +++ b/docs/contributing/doc_contribution.md @@ -0,0 +1,161 @@ +# PHB Documentation Contribution Guide + +The documentation for PHB is hosted in the `docs/` directory. This documentation is written in Markdown and is built using [MkDocs](https://www.mkdocs.org/) and the [Material for MkDocs](https://squidfunk.github.io/mkdocs-material/) theme.
+ +This guide is intended to provide a brief overview of the documentation structure and how to contribute to the documentation, including standard language and formatting conventions. + +## Local Installation & Live Previews + +Since the documentation is built off of the `main` branch, it is highly recommended to preview your changes before making a PR. You can do this by installing the necessary packages and previewing ("serving") the documentation locally. + +To test your documentation changes, you will need to have the following packages installed on your local VM: + +```bash +pip install mkdocs-material mkdocs-material-extensions mkdocs-git-revision-date-localized-plugin mike mkdocs-glightbox +``` + +The live preview server can be activated by running the following command: + +```bash +mkdocs serve +``` + +This will prompt you to open your browser to the appropriate local host address (by default, localhost:8000). Every time you save a change, the documentation will automatically update in the browser. + +### VSCode Extensions + +Here are some VSCode Extensions that can help you write and edit your markdown files (and allow you to preview changes without running the server, though formatting will suffer): + +- [Markdown Preview Enhanced (Yiyi Wang)](https://marketplace.visualstudio.com/items?itemName=shd101wyy.markdown-preview-enhanced) - This extension is good for previewing markdown files in VSCode, but is **not** good at rendering any of the more advanced features such as callouts or tables. +- [Markdown All in One (Yu Zhang)](https://marketplace.visualstudio.com/items?itemName=yzhang.markdown-all-in-one) - This extension allows you to use regular word-processing short-cuts to format your markdown files, like Ctrl-B to bold text, Ctrl-I for italics without having to manually type the `**` or `_` characters.
+- [markdownlint (David Anson)](https://marketplace.visualstudio.com/items?itemName=DavidAnson.vscode-markdownlint) - This extension will help you catch any formatting errors in your markdown files. + +### Helpful Websites + +- [Excel to Markdown Table](https://tableconvert.com/excel-to-markdown) - This website will convert an Excel table into markdown format, which can be copied and pasted into your markdown file. +- [Material for MkDocs Reference](https://squidfunk.github.io/mkdocs-material/reference/) - This is the official reference for the Material for MkDocs theme, which will help you understand how to use the theme's features. + +## Documentation Structure + +A brief description of the documentation structure is as follows: + +- `docs/` - Contains the Markdown files for the documentation. + - `assets/` - Contains images and other files used in the documentation. + - `figures/` - Contains images, figures, and workflow diagrams used in the documentation. For workflows that contain many images (such as BaseSpace_Fetch), it is recommended to create a subdirectory for the workflow. + - `files/` - Contains files that are used in the documentation. This may include example outputs or templates. For workflows that contain many files (such as TheiaValidate), it is recommended to create a subdirectory for the workflow. + - `logos/` - Contains Theiagen logos and symbols used in the documentation. + - `metadata_formatters/` - Contains the most up-to-date metadata formatters for our submission workflows. + - `new_workflow_template.md` - A template for adding a new workflow page to the documentation. + - `contributing/` - Contains the Markdown files for our contribution guides, such as this file + - `javascripts/` - Contains JavaScript files used in the documentation. + - `tablesort.js` - A JavaScript file used to enable table sorting in the documentation.
+ - `overrides/` - Contains HTMLs used to override theme defaults + - `main.html` - Contains the HTML used to display a warning when the latest version is not selected + - `stylesheets/` - Contains CSS files used in the documentation. + - `extra.css` - A custom CSS file used to style the documentation; contains all custom theme elements (scrollable tables, resizable columns, Theiagen colors), and custom admonitions. + - `workflows/` - Contains the Markdown files for each workflow, organized into subdirectories by workflow category + - `workflows_overview/` - Contains the Markdown files for the overview tables for each display type: alphabetically, by applicable kingdom, and by workflow type. + - `index.md` - The home/landing page for our documentation. + +### Adding a Page for a New Workflow {#new-page} + +If you are adding a new workflow, there are a number of things to do in order to include the page in the documentation: + +1. Add a page with the title of the workflow to appropriate subdirectory in `docs/workflows/`. Feel free to use the template found in the `assets/` folder. +2. Collect the following information for your new workflow: + - Workflow Name - Link the name with a relative path to the workflow page in appropriate `docs/workflows/` subdirectory + - Workflow Description - Brief description of the workflow + - Applicable Kingdom - Options: "Any taxa", "Bacteria", "Mycotics", "Viral" + - Workflow Level (_on Terra_) - Options: "Sample-level", "Set-level", or neither + - Command-line compatibility - Options: "Yes", "No", and/or "Some optional features incompatible" + - The version where the last known changes occurred (likely the upcoming version if it is a new workflow) + - Link to the workflow on Dockstore (if applicable) - Workflow name linked to the information tab on Dockstore. +3. Format this information in a table. +4. 
Copy the previously gathered information to ==**ALL THREE**== overview tables in `docs/workflows_overview/`: + - `workflows_alphabetically.md` - Add the workflow in the appropriate spot based on the workflow name. + - `workflows_kingdom.md` - Add the workflow in the appropriate spot(s) based on the kingdom(s) the workflow is applicable to. Make sure it is added alphabetically within the appropriate subsection(s). + - `workflows_type.md` - Add the workflow in the appropriate spot based on the workflow type. Make sure it is added alphabetically within the appropriate subsection. +5. Copy the path to the workflow to ==**ALL**== of the appropriate locations in the `mkdocs.yml` file (under the `nav:` section) in the main directory of this repository. These should be the exact same spots as in the overview tables but without additional information. This ensures the workflow can be accessed from the navigation sidebar. + +## Standard Language & Formatting Conventions + +In order to maintain cohesive documentation, the following language and formatting conventions should be followed: + +### Language Conventions + +The following language conventions should be followed when writing documentation: + +- The documentation should be written in American English (sorry to our friends across the pond!) +- **The following variables should receive the following descriptions**: + - `cpu` - Number of CPUs to allocate to the task + - `disk_size` - Amount of storage (in GB) to allocate to the task + - `docker` or `docker_image` - The Docker container to use for the task + - `memory` - Amount of memory/RAM (in GB) to allocate to the task + +### Formatting Conventions + +- **Bold Text** - Use `**bold text**` to indicate text that should be bolded. +- _Italicized Text_ - Use `_italicized text_` to indicate text that should be italicized. +- ==Highlighted Text== - Use `==highlighted text==` to indicate text that should be highlighted.
+- `Code` - Use \`code\` to indicate text that should be formatted as code. +- ^^Underlined Text^^ - Use `^^underlined text^^` to indicate text that should be underlined (works with our theme; not all Markdown renderers support this). +- > Citations + - Use a `>` to activate quote formatting for a citation. Make sure to separate multiple citations with a comment line (`<!-- -->`) to prevent the citations from running together. +- Callouts/Admonitions - These features are called "call-outs" in Notion, but are "Admonitions" in MkDocs. [I highly recommend referring to the Material for MkDocs documentation page on Admonitions to learn how best to use this feature](https://squidfunk.github.io/mkdocs-material/reference/admonitions/). Use the following syntax to create a callout: + + ```markdown + !!! note + This is a note. Observe I am indented with four spaces. + ``` + + Please see the [Admonition documentation](https://squidfunk.github.io/mkdocs-material/reference/admonitions/) for more information on how to change the title, enable toggles, and more. + + The following custom callout types are supported _in addition to the standard admonitions supported by our theme_ [more information here](https://squidfunk.github.io/mkdocs-material/reference/admonitions/#supported-types): + + !!! dna + This is a DNA admonition. Admire the cute green DNA emoji. You can create this with the `!!! dna` syntax. + + ???+ toggle + This is a toggle-able section. The emoji is an arrow pointing to the right downward. You can create this with the `??? toggle` syntax. I have added a `+` at the end of the question marks to make it open by default. + + ???+ task + This is a toggle-able section **for a workflow task**. The emoji is a gear. Use the `??? task` syntax to create this admonition. Use `!!! task` if you want to have it be permanently expanded. I have added a `+` at the end of the question marks to make this admonition open by default and still enable its collapse. + + !!!
caption + This is a caption. The emoji is a painting. You can create this with the `!!! caption` syntax. This is used to enclose an image in a box and looks nice. A caption can be added beneath the picture and will also look nice. + + !!! techdetails + This is where you will put technical details for a workflow task. You can create this with the `!!! techdetails` syntax. + +- Images - Use the following syntax to insert an image: + + ```markdown + !!! caption "Image Title" + ![Alt Text](/path/to/image.png) + ``` + +- Indentation - **_FOUR_** spaces are required instead of the typical two. This is a side effect of using this theme. If you use two spaces, the list and/or indentations will not render correctly. This will make your linter sad :( + + ```markdown + - first item + - second item + - third item + ``` + +- Tables - Use the following syntax to create a table + + ```markdown + | Header 1 | Header 2 | Header 3 | + |---|---|---| + | value 1 | value2 | value3 | + ``` + + Note that this is not a "pretty" markdown table. This is because the spacing would be crazy in the markdown file, especially for tables with a lot of text and/or columns. The table will render correctly in the documentation. + +- Links - Use the following syntax to create a link. This works for both files and websites. If linking a file, use the relative path. + + ```markdown + [Link Text](https://www.example.com) + ``` + +- End all pages with an empty line diff --git a/docs/getting_started/commandline.md b/docs/getting_started/commandline.md new file mode 100644 index 000000000..a02eb07b7 --- /dev/null +++ b/docs/getting_started/commandline.md @@ -0,0 +1,346 @@ +# Getting Started with the Command-Line + +!!! dna "What is WDL?" + Running workflows on the command-line requires the direct use of the WDL (Workflow Description Language). As the name suggests, this is the workflow management language that is used to write and execute workflows.
Frank has put together a great video describing 📺 [**WDL Task and Workflow Files**](https://www.youtube.com/watch?v=DNtdra59Y6o) and you can find full instructions below on running these WDL workflows. + +## Step 1: Obtain the Workflow and Data + +You will need to have access to the WDL workflow file (.wdl) and any associated input files (such as reference genomes, input data files, etc.). To do this, complete the following steps: + +### 1. Install Git (if not already installed) + +If you don't already have Git installed on your system, you will need to install it. Here's how you can install Git on some common operating systems: + +??? toggle "Linux (Ubuntu/Debian)" + + ```bash + sudo apt update + sudo apt install git + ``` + +??? toggle "macOS" + + Git is usually pre-installed on macOS. However, you can install or update it using Homebrew: + + ```bash + brew install git + ``` + +??? toggle "Windows" + + Download and install Git from the official website: + +### 2. Clone the Repository + +1. Open your terminal. +2. Create a directory where you want to store the cloned repository and navigate to it. + + ```bash + mkdir /path/to/your/desired/new/directory + cd /path/to/your/desired/new/directory + ``` + +3. Clone the repository from GitHub using the following command: + + ```bash + git clone https://github.com/theiagen/public_health_bioinformatics.git + ``` + +4. After running the command, Git will download all the repository files and set up a local copy in the directory you specified. + +### 3. Navigate to the Cloned Repository + +1. Change your working directory to the newly cloned repository: + + ```bash + cd public_health_bioinformatics + ``` + +2. You're now inside the cloned repository's directory. Here, you should find all the files and directories from the GitHub repository. + +### 4. 
Verify the Cloned Repository + +You can verify that the repository has been cloned successfully by listing the contents of the current directory using the `ls` (on Linux/macOS) or `dir` (on Windows) command: + +```bash +ls +``` + +This should display the files and directories within the repository. + +Congratulations! You've successfully cloned the repository from GitHub to your local command-line environment. You're now ready to proceed with running the bioinformatics analysis workflows using WDL as described in subsequent steps. + +## Step 2: Install docker and miniWDL + +Docker and miniwdl will be required for command-line execution. We will check if these are installed on your system and if not, install them now. + +1. Open your terminal. +2. Navigate to the directory where your workflow and input files are located using the `cd` command: + + ```bash + cd /path/to/your/workflow/directory + ``` + +3. Check if Docker is installed: + + ```bash + docker --version + ``` + + If Docker is not installed, follow the official installation guide for your operating system: [**https://docs.docker.com/get-docker/**](https://docs.docker.com/get-docker/) + +4. Check if **`miniwdl`** is installed: + + ```bash + miniwdl --version + ``` + + If **`miniwdl`** is not installed, you can install it using pip: + + ```bash + pip install miniwdl + ``` + +## Step 3: Set up the input.json file for your WDL workflow + +In a WDL (Workflow Description Language) workflow, an input JSON file is used to provide attributes (values/files etc) for input variables into the workflow. The names of the input variables must match the names of inputs specified in the workflow file. The workflow files can be found within the git repository that you cloned. Each input variable can have a specific type of attribute, such as String, File, Int, Boolean, Array, etc. Here's a detailed outline of how to specify different types of input variables in an input JSON file: + +??? 
toggle "String Input" + To specify a string input, use the name of the input variable as the key and provide the corresponding string value. Example: + + ```json + { + "sampleName": "VirusSample1", + "primerSequence": "ACGTGTCAG" + } + ``` + +??? toggle "File Input" + To specify a file input, provide the path to the input file relative to the directory where you run the `miniwdl` command. Example: + + ```json + { + "inputFastq": "data/sample.fastq", + "referenceGenome": "reference/genome.fasta" + } + + ``` + +??? toggle "Int Input" + To specify an integer input, provide the integer value. These do not require quotation marks. Example: + + ```json + { + "minReadLength": 50, + "maxThreads": 8 + } + ``` + +??? toggle "Boolean Input" + To specify a boolean input, use `true` or `false` (lowercase). Example: + + ```json + { + "useQualityFiltering": true, + "useDuplicateRemoval": false + } + ``` + +??? toggle "Array Input" + + To specify an array input, provide the values as an array. Example: + + ```json + { + "sampleList": ["Sample1", "Sample2", "Sample3"], + "thresholds": [0.1, 0.05, 0.01] + } + ``` + +## Step 4: Execute the Workflow + +Run the workflow using `miniwdl` with the following command, replacing `your_workflow.wdl` with the actual filename of your WDL workflow and `input.json` with the filename of your input JSON file. + +```bash +miniwdl run your_workflow.wdl --input input.json +``` + +## Step 5: Monitor Workflow Progress + +You can monitor the progress of the workflow by checking the console output for updates and log messages. This can help you identify any potential issues or errors during execution. + +??? tip "Tips for monitoring your workflow" + + ##### Tips for monitoring workflow progress {#tips-for-monitoring} + + After you've started the workflow using the **`miniwdl run`** command, you'll see various messages appearing in the terminal. These messages provide information about the various steps of the workflow as they are executed. 
Monitoring this output is crucial for ensuring that the workflow is progressing as expected. + + The console output will typically show: + + 1. **Task Execution:** You will see messages related to the execution of individual tasks defined in your workflow. These messages will include details about the task's name, input values, and progress. + 2. **Logging Information:** Workflow tasks often generate log messages to provide information about what they are doing. These logs might include details about software versions, input data, intermediate results, and more. + 3. **Execution Progress:** The output will indicate which tasks have completed and which ones are currently running. This helps you track the overall progress of the workflow. + 4. **Error Messages:** If there are any errors or issues during task execution, they will be displayed in the console output. These error messages can help you identify problems and troubleshoot them. + 5. **Timing Information:** You might also see timing information for each task, indicating how long they took to execute. This can help you identify tasks that might be taking longer than expected. + + **Example Console Output:** + + Here's an example of what the console output might look like while the workflow is running: + + ```bash + + Running: task1 + Running: task2 + Completed: task1 (Duration: 5s) + Running: task3 + Error: task2 (Exit Code: 1) + Running: task4 + ... + + ``` + + In this example, you can see that **`task1`** completed successfully in 5 seconds, but **`task2`** encountered an error and exited with a non-zero exit code. This kind of output provides insight into the progress and status of the workflow. + + **What to Look For:** + + As you monitor the console output, pay attention to: + + - **Successful Task Completion:** Look for messages indicating tasks that have completed successfully. This ensures that the workflow is progressing as intended. 
+ - **Error Messages:** Keep an eye out for any error messages or tasks that exit with non-zero exit codes. These indicate issues that need attention. + - **Task Order:** The order of task messages can provide insights into the workflow's logic and execution flow. + - **Timing:** Notice how long each task takes to complete. If a task takes significantly longer than expected, it might indicate a problem. + + **Early Troubleshooting:** + + If you encounter errors or unexpected behavior, the console output can provide valuable information for troubleshooting. You can search for the specific error messages to understand the problem and take appropriate action, such as correcting input values, adjusting parameters, or addressing software dependencies. + + Monitoring the workflow progress through the console output is an essential practice for successful execution. It allows you to track the status of individual tasks, identify errors, and ensure that your analysis is proceeding as planned. Regularly reviewing the output will help you address any issues and improve the efficiency of your bioinformatics workflow. + +??? tip "What to do if you need to cancel a run" + ##### Canceling a Running Workflow {#canceling-a-run} + + Canceling a running workflow is an important step in case you need to stop the execution due to errors, unexpected behavior, or any other reason. If you're using `miniwdl` to run your workflow, here's how you can cancel a workflow run while it's in progress: + + 1. **Ctrl + C**: The simplest way to cancel a running command in the terminal is to press `Ctrl + C`. This sends an interrupt signal to the running process, which should gracefully terminate it. However, keep in mind that this might not work for all scenarios, and some tasks might not be able to cleanly terminate. + 2. **Terminate Docker Containers**: If your workflow involves Docker containers, you might need to ensure that any Docker containers launched by the workflow are also terminated. 
To do this, you can manually stop the Docker containers associated with the workflow. You can use the `docker ps` command to list running containers and `docker stop <container_id>` to stop a specific container. + 3. **Kill the miniwdl Process**: If the `Ctrl + C` approach doesn't work, you might need to explicitly kill the `miniwdl` process running in the terminal. To do this, you can use the `kill` command. First, find the process ID (PID) of the `miniwdl` process by running: + + ```bash + ps aux | grep miniwdl + + ``` + + Identify the PID in the output and then run: + + ```bash + kill -9 <PID> + + ``` + + This forcefully terminates the process. + + 4. **Clean Up Intermediate Files**: Depending on the workflow and how tasks are structured, there might be intermediate files or resources that were generated before the cancellation. You might need to manually clean up these files to free up disk space. + 5. **Check for Workflow-Specific Cancellation**: Some workflows might have specific mechanisms to handle cancellation. Refer to the workflow documentation or user guide to understand if there's a recommended way to cancel the workflow gracefully. + 6. **Check for Any Remaining Resources**: After canceling the workflow, it's a good practice to check for any remaining resources that might need to be cleaned up. This could include temporary files, Docker images, or other resources that were created during the workflow's execution. + + Remember that canceling a workflow might leave the system in an inconsistent state, especially if some tasks were partially executed. After canceling, it's a good idea to review the output and logs to identify any cleanup actions you might need to take. + + It's important to approach workflow cancellation carefully, as abruptly terminating processes can potentially lead to data loss or other unintended consequences. Always make sure you understand the workflow's behavior and any potential side effects of cancellation before proceeding. 
+ +## Step 6: Review Output + +Once the workflow completes successfully, you will find the output files and results in the designated output directory as defined in your WDL workflow. + +??? toggle "Substep 1: Locate the Output Directory" + + Before you begin reviewing outputs, make sure you know where the output directory of your workflow is located. This is typically specified in the workflow configuration or input JSON file. Navigate to this directory using the **`cd`** command in your terminal. + + ```bash + cd /path/to/your/output/directory + ``` + +??? toggle "Substep 2: Logs" + + Logs are a valuable source of information about what happened during each step of the workflow. Each task in the workflow might generate its own log file. Here's how to review logs: + + 1. Use the **`ls`** command to list the files in the output directory: + + ```bash + + ls + + ``` + + 2. Look for log files with names that correspond to the tasks in your workflow. These files often have a **`.log`** extension. + 3. Open a log file using a text editor like **`less`** or **`cat`**: + + ```bash + + less task_name.log + + ``` + + Use the arrow keys to navigate through the log, and press **`q`** to exit. + + 4. Inspect the log for messages related to the task's execution, input values, software versions, and any errors or warnings that might have occurred. + +??? toggle "Substep 3: stderr (Standard Error) and stdout (Standard Output)" + + stderr and stdout are streams where processes write error messages and standard output, respectively. These are often redirected to files during workflow execution. Here's how to review them: + + 1. Use the **`ls`** command to list the files in the output directory. + 2. Look for files with names like **`task_name.err`** (for stderr) and **`task_name.out`** (for stdout). + 3. 
Open the files using a text editor: + + ```bash + + less task_name.err + less task_name.out + + ``` + + These files might contain additional information about the task's execution, errors, and output generated during the analysis. + +??? toggle "Substep 4: Reviewing Output Files" + + Workflow tasks might generate various types of output files, such as plots, reports, or data files. Here's how to review them: + + 1. Use the **`ls`** command to list the files in the output directory. + 2. Identify the files generated by your workflow tasks. + 3. Depending on the file type, you can use different tools to open and view them. For example, you might use **`less`** or a text editor for text-based files, or an image viewer for image files. + +??? toggle "Substep 5: Interpretation and Troubleshooting" + + As you review the outputs, keep these points in mind: + + - **Successful Execution:** Look for indicators of successful task execution, such as expected messages, correct output files, and absence of error messages. + - **Errors and Warnings:** Pay close attention to any error or warning messages in logs, stderr, or stdout. These can help you identify issues that need troubleshooting. + - **Input Values and Parameters:** Verify that input values and parameters were correctly passed to tasks. Incorrect input can lead to unexpected behavior. + - **Software Versions:** Check if the versions of the tools and software used in the workflow match what you expected. + - **Intermediate Outputs:** Review intermediate outputs generated by tasks. These might provide insights into the workflow's progress and results. + +??? toggle "Substep 6: Make Notes and Take Action" + + As you review the outputs, make notes of any issues, errors, or unexpected behavior you encounter. Depending on the severity of the issues, you might need to: + + - Adjust input parameters. + - Re-run specific tasks. + - Debug and troubleshoot errors. + - Consult the workflow documentation. 
+ - Reach out to the Theiagen Genomics bioinformatics experts for assistance (support@theiagen.com). + +!!! info "Output Review Conclusion" + + Reviewing the outputs of your bioinformatics workflow is a critical step to ensure the quality of your analysis. Logs, stderr, stdout, and generated output files provide valuable insights into the execution process and results. By carefully reviewing these outputs and addressing any issues, you can enhance the reliability and accuracy of your bioinformatics analysis. + +## Step 7: Troubleshooting and Debugging + +1. If the workflow encounters errors or fails to execute properly, review the error messages in the terminal. +2. Check for any missing input files, incorrect paths, or issues related to software dependencies. +3. Double-check your input JSON file to ensure that all required inputs are correctly specified. + +Congratulations! You have successfully executed a bioinformatics analysis workflow using WDL on the command-line. This tutorial covered the basic steps to run a WDL workflow using the `miniwdl` command-line tool. + +Remember that the specific steps and commands might vary depending on the details of your workflow, software versions, and environment. Be sure to consult the documentation for `miniwdl`, WDL, and any other tools you're using for more advanced usage and troubleshooting. + +Happy analyzing! diff --git a/docs/getting_started/terra.md b/docs/getting_started/terra.md new file mode 100644 index 000000000..616dd7f02 --- /dev/null +++ b/docs/getting_started/terra.md @@ -0,0 +1,98 @@ +# Getting Started with Terra + +!!! dna "Our Approach" + Theiagen’s approach to genomic analysis in public health typically uses the [Terra](https://terra.bio/) platform to run workflows that undertake bioinformatic analysis, then uses other platforms for visualization of the resulting data. 
This is described in more depth in our paper [_Accelerating bioinformatics implementation in public health_](https://www.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.001051), and the application of this approach for genomic surveillance of SARS-CoV-2 in California is described in the paper [_Pathogen genomics in public health laboratories: successes, challenges, and lessons learned from California’s SARS-CoV-2 Whole-Genome Sequencing Initiative, California COVIDNet_](https://www.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.001027). + +!!! tip "" + **When undertaking genomic analysis using Terra and other data visualization platforms, it is essential to consider the necessary and appropriate workflows and resources for your analysis. To help you make these choices, take a look at the relationship between the most commonly used Theiagen workflows, and the descriptions of the major stages in genomic data analysis below.** + + !!! caption "Analysis Approaches for Genomic Data" + ![The relationship between the various PHB workflows](../assets/figures/Workflow_Relationships.png#only-light){data-description="This diagram shows the Theiagen workflows (green boxes) available for analysis of genomic data in public health and the workflows that may be used consecutively (arrows). The blue boxes describe the major functions that these workflows undertake. The yellow boxes show functions that may be undertaken independently of workflows on Terra."} + ![The relationship between the various PHB workflows](../assets/figures/Workflow_Relationships_dark.png#only-dark){data-description="This diagram shows the Theiagen workflows (green boxes) available for analysis of genomic data in public health and the workflows that may be used consecutively (arrows). The blue boxes describe the major functions that these workflows undertake. 
The yellow boxes show functions that may be undertaken independently of workflows on Terra."} + + This diagram shows the Theiagen workflows (green boxes) available for analysis of genomic data in public health and the workflows that may be used consecutively (arrows). The blue boxes describe the major functions that these workflows undertake. The yellow boxes show functions that may be undertaken independently of workflows on Terra. + +## Data Import to Terra + +To start using Terra for data analysis, you will first need to import your data into your workspace. There are multiple ways to do this: + +- **Using Terra’s native features to upload data from your local computer or link to data that’s already in a Google bucket** +- Data import workflows + - Using the [SRA_Fetch](../workflows/data_import/sra_fetch.md) workflow to import publicly available data from any repository in the [INSDC](https://www.insdc.org/) (including with [SRA](https://www.ncbi.nlm.nih.gov/sra), [ENA](https://www.ebi.ac.uk/ena/browser/home) and [DRA](https://www.ddbj.nig.ac.jp/dra/index-e.html)) + - Using the [Assembly_Fetch ](../workflows/data_import/assembly_fetch.md) workflow to import publicly available genome assemblies from [NCBI](https://www.ncbi.nlm.nih.gov/datasets/) + - Using the [BaseSpace_Fetch](../workflows/data_import/basespace_fetch.md) workflow to import data from your [Illumina BaseSpace](https://basespace.illumina.com/) account + - Using the [Create_Terra_Table](../workflows/data_import/create_terra_table.md) workflow to help create your data table after manual upload to your Terra workspace (or a Google Cloud Storage Bucket) + +!!! 
example "SOPs for ==importing data into a Terra workspace==" + + | SOP | SOP Version | PHB Version Compatibility | + |---|---|---| + | [Uploading Data, Creating Metadata Tables and TSV files, and Importing Workflows](../assets/sops/TG-TER-03_GettingStartedInTerra_v3.pdf) | v3 | v1.3.0, v2+ | + | [Linking BaseSpace and Importing BaseSpace Reads to Terra](../assets/sops/TG-TER-04_BaseSpaceFetch_v2.pdf) | v3 | v1.3.0, v2+ | + +## Genome assembly, QC, and characterization + +### TheiaX workflows + +The TheiaX workflows are used for genome assembly, quality control, and characterization. The [TheiaCoV Workflow Series](../workflows/genomic_characterization/theiacov.md), [TheiaProk Workflow Series](../workflows/genomic_characterization/theiaprok.md), and [TheiaEuk Workflow Series](../workflows/genomic_characterization/theiaeuk.md) workflows are intended for viral, bacterial, and fungal pathogens, respectively. [TheiaMeta Workflow Series](../workflows/genomic_characterization/theiameta.md) is intended for the analysis of a single taxon from metagenomic data. + +!!! example "SOPs for the ==TheiaX workflows==" + + ??? toggle "For analyzing ==SARS-CoV-2==" + | SOP | SOP Version | PHB Version Compatibility | + |---|---|---| + | [Analyze SARS-COV-2 using TheiaCoV_Illumina_PE_PHB](../assets/sops/TG-SC2-PE_SC2_TheiaCoV_IlluminaPE_v3.pdf) | v3 | v2+ | + | [Analyze SARS-COV-2 using TheiaCoV_Illumina_SE_PHB](../assets/sops/TG-SC2-SE_SC2_TheiaCoV_IlluminaSE_v3.pdf) | v3 | v2+ | + | [Analyze SARS-COV-2 using TheiaCoV_ClearLabs](../assets/sops/TG-SC2-CL_SC2_ClearLabs_v3.pdf) | v3 | v2+ | + | [Analyze SARS-COV-2 using TheiaCoV_ONT](../assets/sops/TG-SC2-ONT_Analyzing_SC2_Using_TheiaCov_ONT_PHB_v2.pdf) | v2 | v1.x+ | + | [Analyzing SARS-CoV-2 using TheiaCoV_FASTA](../assets/sops/TG-SC2-FST_Analyzing_SC2_Using_TheiaCoV_FASTA_PHB_v2.pdf) | v2 | v1.x+ | + + ??? 
toggle "For analyzing ==influenza==" + | SOP | SOP Version | PHB Version Compatibility | + |---|---|---| + | [Analyzing Flu Data in Terra using TheiaCov_Illumina_PE and Augur Workflows](../assets/sops/TG-FLU-PE_SOP_Flu_IlluminaPE_v1.pdf) | v1 | v1.x+ | + +### Quality evaluation + +The TheiaX workflows will generate various quality metrics. These should be evaluated relative to quality thresholds that have been agreed upon within your laboratory or sequencing program and define the sufficient quality characteristics for a genome and sequence data to be used. For the [TheiaCoV Workflow Series](../workflows/genomic_characterization/theiacov.md), [TheiaProk Workflow Series](../workflows/genomic_characterization/theiaprok.md), and [TheiaEuk Workflow Series](../workflows/genomic_characterization/theiaeuk.md) workflows, this quality evaluation may be undertaken using the optional `QC_check` task. Full instructions for the use of this task may be found on the relevant workflow page. Some quality metrics are not evaluated by the `QC_check` task and should be evaluated manually. + +Genomes that fail to meet agreed quality thresholds should not be used. Results for characterization of these genomes may be inaccurate or unreliable. The inclusion of poor-quality genomes in downstream comparative analyses will bias their results. Samples that fail to meet QC thresholds will need to be re-sequenced and sample processing may need to be repeated (e.g. culture-based isolation of clonal bacteria, DNA/RNA extraction, and processing for sequencing). + +### Update workflows for SARS-CoV-2 genomes + +Workflows are available for updating the Pangolin and VADR assignments made to SARS-CoV-2 genomes. The [Pangolin Update](../workflows/genomic_characterization/pangolin_update.md) workflow accounts for the delay in assigning names to newly emerging lineages that you may have already sequenced. 
The [VADR_Update](../workflows/genomic_characterization/vadr_update.md) workflow similarly accounts for features that have been newly identified in SARS-CoV-2 genomes when assessing genome quality with VADR. + +## Phylogenetics + +### Phylogenetic construction + +Phylogenetic trees are constructed to assess the evolutionary relationships between sequences in the tree. These evolutionary relationships are often used as a proxy for epidemiological relationships, and sometimes for inferring transmission between isolation sources. + +There are various methods for constructing phylogenetic trees, depending on the sequencing data being used, the organism being analyzed and how it evolved, what you would like to infer from the tree, and the computational resources available for the tree construction. Theiagen has a number of workflows for constructing phylogenetic trees. For full details of these workflows, please see [Guide to Phylogenetics](https://www.notion.so/Guide-to-Phylogenetics-c997fe59e3f0423aa8a73eeccccd1b92?pvs=21) which includes advice on the appropriate tree-building workflows and phylogenetic visualization approaches. + +!!! example "SOPs for ==phylogenetic construction==" + | SOP | SOP Version | PHB Version Compatibility | + |---|---|---| + | [Analyzing Flu Data in Terra using TheiaCov_Illumina_PE and Augur Workflows](../assets/sops/TG-FLU-PE_SOP_Flu_IlluminaPE_v1.pdf) | v1 | v1.x+ | + | [Analyzing Phylogenetic Relationships in Terra using Theiagen’s Augur Workflows](../assets/sops/TG-AUGUR-01_Augur_v1.pdf) | v1 | v1.x+ | + +### Phylogenetic placement + +Phylogenetic placement is used to place your own sequences onto an existing phylogenetic tree. This may be used to find the closest relatives to your sequence(s). More details, including phylogenetic visualization approaches can be found in [Guide to Phylogenetics](https://www.notion.so/Guide-to-Phylogenetics-c997fe59e3f0423aa8a73eeccccd1b92?pvs=21) + +## Public Data Sharing + +!!! 
example "SOPs for ==data submissions==" + | SOP | SOP Version | PHB Version Compatibility | + |---|---|---| + | [Submitting SC2 Sequence Data to GISAID using Theiagen’s Terra 2 GISAID Workflow](../assets/sops/TG-GISAID-01_Terra2GISAID_v2.pdf) | v2 | v2+ | + +## SARS-CoV-2 Metagenomic Analysis + +!!! example "SOPs for ==SARS-CoV-2 metagenomic data analysis==" + | SOP | SOP Version | PHB Version Compatibility | + |---|---|---| + | [Analyzing SARS-CoV-2 Metagenomic Samples using Freyja FASTQ](../assets/sops/TG-FREY-01_FreyjaFASTQ_v2.pdf) | v2 | v2+ | + | [Plotting SARS-CoV-2 Metagenomic Sample Data using Freyja Plot](../assets/sops/TG-FREY-02_FreyjaPlot_v3.pdf) | v3 | v2+ | + | [Creating a Dashboard Visualization of SARS-CoV-2 Metagenomic Samples using Freyja Dashboard](../assets/sops/TG-FREY-03-SOP_FreyjaDashboard_v2.pdf) | v2 | v2+ | + | [Creating Static Reference Files for Freyja Analysis in Terra using Freyja Update](../assets/sops/TG-FREY-04_FreyjaUpdate_v2.pdf) | v2 | v2+ | diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 000000000..492fcccd9 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,121 @@ +--- +title: Home +--- + +## Purpose & Workflows + +The PHB repository contains workflows for the characterization, genomic epidemiology, and sharing of pathogen genomes of public health concern. Workflows are available for viruses, bacteria, and fungi. + +All workflows in the PHB repository end with `_PHB` in order to differentiate them from earlier versions and from the original tools they +incorporate. + +
[Explore our workflows](workflows_overview/workflows_type.md){ .md-button .md-button--primary }
+ +
+ +-
[Command-line Users](getting_started/commandline.md){ .md-button .md-button--secondary }
+ + --- + + Learn how to use our workflows on the command-line! + +-
[Terra Users](getting_started/terra.md){ .md-button .md-button--secondary } + + --- + + Learn how to use our workflows on Terra! + +
+ +!!! dna "Our Open Source Philosophy" + PHB source code is publicly available on [GitHub](https://github.com/theiagen/public_health_bioinformatics) and available under [GNU Affero General Public License v3.0](https://github.com/theiagen/public_health_viral_genomics/blob/main/LICENSE)! + + All workflows can be imported directly to [Terra](https://terra.bio/) via the [**Dockstore PHB collection**](https://dockstore.org/organizations/Theiagen/collections/public-health-bioinformatics)! + + You can also use our workflows on the command-line. Please see our guide on how to get started [**here**](getting_started/commandline.md)! + +When undertaking genomic analysis using the command-line, via Terra, or other data visualization platforms, it is essential to consider the necessary and appropriate workflows and resources for your analysis. To help you make these choices, take a look at the relationship between the most commonly used Theiagen workflows. + +!!! caption "Analysis Approaches for Genomic Data" + ![The relationship between the various PHB workflows](assets/figures/Workflow_Relationships.png#only-light){data-description="This diagram shows the Theiagen workflows (green boxes) available for analysis of genomic data in public health and the workflows that may be used consecutively (arrows). The blue boxes describe the major functions that these workflows undertake. The yellow boxes show functions that may be undertaken independently of workflows on Terra."} + ![The relationship between the various PHB workflows](assets/figures/Workflow_Relationships_dark.png#only-dark){data-description="This diagram shows the Theiagen workflows (green boxes) available for analysis of genomic data in public health and the workflows that may be used consecutively (arrows). The blue boxes describe the major functions that these workflows undertake. 
The yellow boxes show functions that may be undertaken independently of workflows on Terra."} + + This diagram shows the Theiagen workflows (green boxes) available for analysis of genomic data in public health and the workflows that may be used consecutively (arrows). The blue boxes describe the major functions that these workflows undertake. The yellow boxes show functions that may be undertaken independently of workflows on Terra. + +### PHB development is a cycle + +We continuously work to improve our codebase and usability of our workflows by the public health community, so changes from version to version are expected. This documentation page reflects the state of the workflow at the version stated in the title. + +!!! dna "What's new?" + You can see the changes since PHB v2.1.0 [**here**](https://theiagen.notion.site/Public-Health-Bioinformatics-v2-2-0-Minor-Release-Notes-9b2781f27b8d4b69949f8fc1ef04868d?pvs=4)! + +## Contributing to the PHB Repository + +We warmly welcome contributions to this repository! Our style guide may be found [here](contributing/code_contribution.md) for convenience of formatting. + +If you would like to submit suggested code changes to our workflows, you may add or modify the WDL files and submit pull requests to the [PHB GitHub](https://github.com/theiagen/public_health_bioinformatics) repository. + +You can expect a careful review of every PR and feedback as needed before merging, just like we do for PRs submitted by the Theiagen team. Our PR template can help prepare you for the review process. As always, reach out with any questions! We love receiving feedback and contributions from the community. When your PR is merged, we'll add your name to the contributors list below! 
+ +## Authorship & Responsibility + +### Authorship + +(Ordered by contribution [# of lines changed] as of 2024-08-01) + +- **Sage Wright** ([@sage-wright](https://github.com/sage-wright)) - Conceptualization, Software, Validation, Supervision +- **Inês Mendes** ([@cimendes](https://github.com/cimendes)) - Software, Validation +- **Curtis Kapsak** ([@kapsakcj](https://github.com/kapsakcj)) - Conceptualization, Software, Validation +- **James Otieno** ([@jrotieno](https://github.com/jrotieno)) - Software, Validation +- **Frank Ambrosio** ([@frankambrosio3](https://github.com/frankambrosio3)) - Conceptualization, Software, Validation +- **Michelle Scribner** ([@michellescribner](https://github.com/michellescribner)) - Software, Validation +- **Kevin Libuit** ([@kevinlibuit](https://github.com/kevinlibuit)) - Conceptualization, Project Administration, Software, Validation, Supervision +- **Emma Doughty** ([@emmadoughty](https://github.com/emmadoughty)) - Software, Validation +- **Andrew Page** ([@andrewjpage](https://github.com/andrewjpage)) - Project Administration, Software, Supervision +- **Andrew Lang** ([@AndrewLangVt](https://github.com/AndrewLangVt)) - Software, Supervision +- **Kelsey Kropp** ([@kelseykropp](https://github.com/kelseykropp)) - Validation +- **Emily Smith** ([@emily-smith1](https://github.com/emily-smith1)) - Validation +- **Joel Sevinsky** ([@sevinsky](https://github.com/sevinsky)) - Conceptualization, Project Administration, Supervision + +### External Contributors + +We would like to gratefully acknowledge the following individuals from the public health community for their contributions to the PHB repository: + +- **Robert Petit** ([@rpetit3](https://github.com/rpetit3)) +- **Ash O'Farrel** ([@aofarrel](https://github.com/aofarrel)) +- **Sam Baird** ([@sam-baird](https://github.com/sam-baird)) +- **Holly Halstead** ([@HNHalstead](https://github.com/HNHalstead)) + +### On the Shoulder of Giants + +The PHB repository would not be possible 
without its predecessors. We would like to acknowledge the following repositories, individuals, and contributors for their influence on the development of these workflows: + +The PHB repository originated from collaborative work with Andrew Lang, PhD & his [Genomic Analysis WDL workflows](https://github.com/AndrewLangvt/genomic_analyses). The workflows and task development were influenced by The Broad's [Viral Pipes](https://github.com/broadinstitute/viral-pipelines) repository. The TheiaCoV workflows for viral genomic characterization were influenced by UPHL's [Cecret](https://github.com/UPHL-BioNGS/Cecret) & StaPH-B's [Monroe](https://staph-b.github.io/staphb_toolkit/workflow_docs/monroe/) workflows. The TheiaProk workflows for bacterial genomic characterization were influenced by Robert Petit's [bactopia](https://github.com/bactopia/bactopia). Most importantly, the PHB user community drove the development of these workflows and we are grateful for their feedback and contributions. + +If you would like to provide feedback, please raise a [GitHub issue](https://github.com/theiagen/public_health_bioinformatics/issues/new) or contact us at support@theiagen.com. + +### Maintaining PHB Pipelines + +Theiagen Genomics has committed to maintaining these workflows for the foreseeable future. These workflows are written using a standard workflow language (WDL) and use Docker images based on the [StaPH-B Docker Builds](https://github.com/StaPH-B/docker-builds). New versions that include bug fixes and additional features are released on a quarterly basis, with urgent bug fixes released as needed. Each version is accompanied by detailed release notes to lower the barrier of pipeline upkeep from the public health community at large. + +### Point of Contact + +If you have any questions or concerns, please raise a [GitHub issue](https://github.com/theiagen/public_health_bioinformatics/issues/new) or email Theiagen's general support at support@theiagen.com. 
+ +### Conflict of Interest + +The authors declare no conflict of interest. + +## Citation + +Please cite this paper if publishing work using any workflows: + +> Libuit, Kevin G., Emma L. Doughty, James R. Otieno, Frank Ambrosio, Curtis J. Kapsak, Emily A. Smith, Sage M. Wright, et al. 2023. "Accelerating Bioinformatics Implementation in Public Health." Microbial Genomics 9 (7). https://doi.org/10.1099/mgen.0.001051. + +Alternatively, please cite this paper if using the TheiaEuk workflow: + +> Ambrosio, Frank, Michelle Scribner, Sage Wright, James Otieno, Emma Doughty, Andrew Gorzalski, Danielle Siao, et al. 2023. "TheiaEuk: A Species-Agnostic Bioinformatics Workflow for Fungal Genomic Characterization." Frontiers in Public Health 11. https://doi.org/10.3389/fpubh.2023.1198213. + +## About Theiagen + +Theiagen develops bioinformatics solutions for public health labs, and then trains and supports scientists to use them. If you would like to work with Theiagen, please [get in contact](https://theiagen.com/team-up-with-theiagen/). diff --git a/docs/javascripts/tablesort.js b/docs/javascripts/tablesort.js new file mode 100644 index 000000000..69c0ff3e8 --- /dev/null +++ b/docs/javascripts/tablesort.js @@ -0,0 +1,6 @@ +document$.subscribe(function() { + var tables = document.querySelectorAll("article table:not([class])") + tables.forEach(function(table) { + new Tablesort(table) + }) +}) \ No newline at end of file diff --git a/docs/overrides/main.html b/docs/overrides/main.html new file mode 100644 index 000000000..54a833dfd --- /dev/null +++ b/docs/overrides/main.html @@ -0,0 +1,13 @@ +{% extends "base.html" %} + +{% block outdated %} + You're not viewing the latest version release. + + Click here to go to the latest version release. + +{% endblock %} + + +{% block announce %} +
🏗️ I'm under construction! Pardon the dust while we remodel! 👷
+{% endblock %} diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css new file mode 100644 index 000000000..0ff83e58d --- /dev/null +++ b/docs/stylesheets/extra.css @@ -0,0 +1,188 @@ +/* color scheme */ +[data-md-color-scheme="light"] { + --md-primary-bg-color: #ffffff; + --md-primary-fg-color: #116eb7; + --md-accent-bg-color: #ffffff; + --md-accent-fg-color: #1da74a; + th { + background: #ffffff; + color: #262626; + } +} +[data-md-color-scheme="light"] img[src$="#only-dark"], +[data-md-color-scheme="light"] img[src$="#gh-dark-mode-only"] { + display: none; /* Hide dark images in light mode */ +} +[data-md-color-scheme="slate"] img[src$="#only-light"], +[data-md-color-scheme="slate"] img[src$="#gh-light-mode-only"] { + display: none; /* Hide light images in dark mode */ +} +[data-md-color-scheme="slate"] { + --md-primary-bg-color: #1da74a; + --md-primary-fg-color: #262626; + --md-accent-bg-color: #ffffff; + --md-accent-fg-color: #1da74a; + th { + background: #1F2029; + color: #ffffff; + } +} + +/* dna admonition */ +:root { + --md-admonition-icon--dna: url('data:image/svg+xml;charset=utf-8,') +} +.md-typeset .admonition.dna, +.md-typeset details.dna { + border-color: rgb(43, 155, 70); +} +.md-typeset .dna > .admonition-title, +.md-typeset .dna > summary { + background-color: rgba(43, 155, 70, 0.1); +} +.md-typeset .dna > .admonition-title::before, +.md-typeset .dna > summary::before { + background-color: #1da74a; + -webkit-mask-image: var(--md-admonition-icon--dna); + mask-image: var(--md-admonition-icon--dna); +} +.md-typeset .dna > h5 { + /* this renders the heading readable by screen readers but does not display it */ + margin: -1px; padding: 0; border: 0; + visibility: hidden; + margin-top: -15px; /* this makes the header in the TOC jump to the top of the admonition box */ + position: relative; +} + +/* technical details admonition */ +:root { + --md-admonition-icon--techdetails: url('data:image/svg+xml;charset=utf-8,') +} +.md-typeset 
.admonition.techdetails, +.md-typeset details.techdetails { + border-color: rgb(70, 18, 116); +} +.md-typeset .admonition.techdetails { + max-width: auto; + margin-inline: auto; +} +.md-typeset .techdetails > .admonition-title, +.md-typeset .techdetails > summary { + background-color: rgba(132, 89, 168, 0.1) +} +.md-typeset .techdetails > .admonition-title::before, +.md-typeset .techdetails > summary::before { + background-color: rgb(70, 18, 116);; + -webkit-mask-image: var(--md-admonition-icon--techdetails); + mask-image: var(--md-admonition-icon--techdetails); +} + +/* text toggle admonition */ +:root { + --md-admonition-icon--toggle: url('data:image/svg+xml;charset=utf-8,') +} +.md-typeset .admonition.toggle, +.md-typeset details.toggle { + border-color: rgba(255, 255, 255, 0); +} +.md-typeset .toggle > .admonition-title, +.md-typeset .toggle > summary { + background-color: rgba(255, 255, 255, 0); +} +.md-typeset .toggle > .admonition-title::before, +.md-typeset .toggle > summary::before { + background-color: rgb(125, 125, 125); + -webkit-mask-image: var(--md-admonition-icon--toggle); + mask-image: var(--md-admonition-icon--toggle); +} +.md-typeset .toggle > h5 { + /* this renders the heading readable by screen readers but does not display it */ + margin: -1px; padding: 0; border: 0; + visibility: hidden; + margin-top: -15px; /* this makes the header in the TOC jump to the top of the admonition box */ + position: relative; +} + +/* task toggle admonition */ +:root { + --md-admonition-icon--task: url('data:image/svg+xml;charset=utf-8,') +} +.md-typeset .admonition.task, +.md-typeset details.task { + border-color: rgba(255, 255, 255, 0); +} +.md-typeset .task > .admonition-title, +.md-typeset .task > summary { + background-color: rgba(255, 255, 255, 0); +} +.md-typeset .task > .admonition-title::before, +.md-typeset .task > summary::before { + background-color: rgb(125, 125, 125); + -webkit-mask-image: var(--md-admonition-icon--task); + mask-image: 
var(--md-admonition-icon--task); +} +.md-typeset .task > h5 { + /* this renders the heading readable by screen readers but does not display it */ + margin: -1px; padding: 0; border: 0; + visibility: hidden; + margin-top: -15px; /* this makes the header in the TOC jump to the top of the admonition box */ + position: relative; +} + +/* caption admonition */ +:root { + --md-admonition-icon--caption: url('data:image/svg+xml;charset=utf-8,') +} +.md-typeset .admonition.caption, +.md-typeset details.caption { + border-color: rgb(135, 142, 143); +} +.md-typeset .caption > .admonition-title, +.md-typeset .caption > summary { + background-color: rgba(72, 119, 127, 0.1); +} +.md-typeset .caption > .admonition-title::before, +.md-typeset .caption > summary::before { + background-color: rgb(135, 142, 143); + -webkit-mask-image: var(--md-admonition-icon--caption); + mask-image: var(--md-admonition-icon--caption); +} +.md-typeset .caption > h5 { + /* this renders the heading readable by screen readers but does not display it */ + margin: -1px; padding: 0; border: 0; + visibility: hidden; + margin-top: -15px; /* this makes the header in the TOC jump to the top of the admonition box */ + position: relative; +} + +/* edit the tip admonition */ +.md-typeset .tip > h5 { + /* this renders the heading readable by screen readers but does not display it */ + margin: -1px; padding: 0; border: 0; + visibility: hidden; + margin-top: -15px; /* this makes the header in the TOC jump to the top of the admonition box */ + position: relative; +} + +/* content width */ +.md-grid { + max-width: initial; +} + +/* scrollbar and resizable columns in tables */ +table { + overflow-y: scroll; + max-height: 500px; + display: block; +} +th { + position: sticky; + top: 0; + resize: horizontal; + overflow: auto; +} +td { + word-break: break-all; +} + + diff --git a/docs/workflows/data_export/concatenate_column_content.md b/docs/workflows/data_export/concatenate_column_content.md new file mode 100644 index 
000000000..5534128ef --- /dev/null +++ b/docs/workflows/data_export/concatenate_column_content.md @@ -0,0 +1,40 @@ +--- +title: Concatenate_Column_Content +--- + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Exporting Data From Terra](../../workflows_overview/workflows_type.md/#exporting-data-from-terra) | [Any taxa](../../workflows_overview/workflows_kingdom.md/#any-taxa) | PHB v2.1.0 | Yes | Set-level | + +## Concatenate_Column_Content_PHB + +This set-level workflow will create a file containing all of the items from a given column in a Terra Data Table. This is useful when you want to investigate many results files. There is a video available with more information about the Concatenate_Column_Content workflow: **📺 [Workflow Focus: Concatenate_Column_Content](https://www.youtube.com/watch?v=T5Gnj9BtC9I)** + +### Inputs + +This workflow runs on the set level. + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| concatenate_column_content | **concatenated_file_name** | String | The name of the output file. ***Include the extension***, such as ".fasta" or ".txt". | | Required | +| concatenate_column_content | **files_to_cat** | Array[File] | The column that has the files you want to concatenate. 
| | Required | +| cat_files | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| cat_files | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| cat_files | **docker_image** | String | The Docker container to use for the task |s-docker.pkg.dev/general-theiagen/theiagen/utility:1.1" | Optional | +| cat_files | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| cat_files | **skip_extra_headers** | Boolean | If the files you are concatenating have identical headers, you can include only the first instance of the header and skip all of the others so they do not appear duplicated in the concatenated file. To activate this, set to true. | false | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Outputs + +!!! info + Please note that if you run this workflow on the same Terra set (the same group of samples can be included in multiple Terra sets), the results will overwrite each other. We recommend either (1) renaming the output variable, or (2) creating a new set every time you run the workflow. + +| **Variable** | **Type** | **Description** | +|---|---|---| +| concatenated_files | File | The file containing all of the items from the column you selected. 
| +| concatenate_column_content_version | String | The version of the repository the workflow is hosted in | +| concatenate_column_content_analysis_date | String | The date the workflow was run | diff --git a/docs/workflows/data_export/transfer_column_content.md b/docs/workflows/data_export/transfer_column_content.md new file mode 100644 index 000000000..475e4bfb6 --- /dev/null +++ b/docs/workflows/data_export/transfer_column_content.md @@ -0,0 +1,48 @@ +--- +title: Transfer_Column_Content +--- + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Exporting Data From Terra](../../workflows_overview/workflows_type.md/#exporting-data-from-terra) | [Any taxa](../../workflows_overview/workflows_kingdom.md/#any-taxa) | PHB v1.3.0 | Yes | Set-level | + +## Transfer_Column_Content_PHB + +This set-level workflow will transfer all of the items from a given column in a Terra Data Table to a single GCP storage bucket location. This is useful when you want to transfer many files to another GCP storage bucket (can be a Terra workspace storage bucket or a non-Terra storage bucket). + +!!! note + This workflow requires that the user's Terra pet-service account has sufficient privileges to read and write to the target storage bucket. + + - If the target bucket **is associated with a Terra workspace**, the workspace OWNER/administrator must grant WRITER privileges with the Terra workspace. + - If the target bucket **is not associated with a Terra workspace** (i.e. GCP storage bucket), the user's Terra pet-service account (or their Terra PROXY account) must be granted the ability to read and write to the bucket (Storage Object Admin google privileges) + +!!! note + If using Transfer_column_content workflow version 1.3.0 or higher, the call-caching feature of Terra has been DISABLED to ensure that the workflow is run from the beginning and data is transferred fresh. 
Call-caching will not be enabled, even if the user checks the box ✅ in the Terra workflow interface. + +### Inputs + +This workflow runs on the set level. + +| **Terra Task name** | **input_variable** | **Type** | **Description** | **Default attribute** | **Status** | +|---|---|---|---|---|---| +| transfer_column_content | **files_to_transfer** | Array[File] | The column that has the files you want to concatenate. | | Required | +| transfer_column_content | **target_bucket** | String | The GS URI of the target storage bucket. Note: **Do not include spaces**, but **do** include the `gs://` at the beginning of the bucket URI | | Required | +| transfer_files | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| transfer_files | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| transfer_files | **docker_image** | String | The docker image used to perform the file transfer. | us-docker.pkg.dev/general-theiagen/theiagen/utility:1.1" | Optional | +| transfer_files | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Outputs + +!!! info + Please note that if you run this workflow on the same Terra set (the same group of samples can be included in multiple Terra sets), the results will overwrite each other. We recommend either (1) renaming the output variable, or (2) creating a new set every time you run the workflow. 
+ +| **Variable** | **Type** | **Description** | +|---|---|---| +| transferred_files | File | A list of all of the files now located at the target bucket location (GSURI) | +| transfer_column_content_version | String | The version of the repository the workflow is hosted in | +| transfer_column_content_analysis_date | String | The date the workflow was run | diff --git a/docs/workflows/data_export/zip_column_content.md b/docs/workflows/data_export/zip_column_content.md new file mode 100644 index 000000000..3a48f18ad --- /dev/null +++ b/docs/workflows/data_export/zip_column_content.md @@ -0,0 +1,39 @@ +--- +title: Zip_Column_Content +--- + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** ||||| +|---|---|---|---|---| +| [Exporting Data From Terra](../../workflows_overview/workflows_type.md/#exporting-data-from-terra) | [Any taxa](../../workflows_overview/workflows_kingdom.md/#any-taxa) | PHB v2.1.0 | Yes | Set-level | + +## Zip_Column_Content_PHB + +This workflow will create a zip file that contains all of the items in a column in a Terra Table. + +### Inputs + +This workflow runs on the set level. + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| zip_column_content | **files_to_zip** | Array[File] | The column that has the files you want to zip. | | Required | +| zip_column_content | **zipped_file_name** | String | The name you want your zipped file to have. The .zip file extension will be added to this name. 
| | Required | +| zip_files | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| zip_files | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| zip_files | **docker_image** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/utility:1.1" | Optional | +| zip_files | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Outputs + +!!! info + Please note that if you run this workflow on the same Terra set (the same group of samples can be included in multiple Terra sets), the results will overwrite each other. We recommend either (1) renaming the output variable, or (2) creating a new set every time you run the workflow. + +| **Variable** | **Type** | **Description** | +|---|---|---| +| zipped_files | File | The zipped file containing all of the items from the column you selected. 
| +| zip_column_content_version | String | The version of the repository the workflow is hosted in | +| zip_column_content_analysis_date | String | The date the workflow was run | diff --git a/docs/workflows/data_import/assembly_fetch.md b/docs/workflows/data_import/assembly_fetch.md new file mode 100644 index 000000000..9387d0b2e --- /dev/null +++ b/docs/workflows/data_import/assembly_fetch.md @@ -0,0 +1,113 @@ +# Assembly Fetch + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Data Import](../../workflows_overview/workflows_type.md/#data-import) | [Any taxa](../../workflows_overview/workflows_kingdom.md/#any-taxa) | PHB v1.3.0 | Yes | Sample-level | + +## Assembly_Fetch_PHB + +The `Assembly_Fetch` workflow downloads assemblies from NCBI. This is particularly useful when you need to align reads against a reference genome, for example during a reference-based phylogenetics workflow. This workflow can be run in two ways: + +1. You can provide an accession for the specific assembly that you want to download, and `Assembly_Fetch` will run only the NCBI genome download task to download this assembly, +2. You can provide an assembly, and `Assembly_Fetch` will first use the `ReferenceSeeker` task to first find the closest reference genome in RefSeq to your query assembly and then will run the NCBI genome download task to download that reference assembly. + +!!! tip + + NOTE: If using Assembly_Fetch workflow version 1.3.0 or higher, the call-caching feature of Terra has been DISABLED to ensure that the workflow is run from the beginning and data is downloaded fresh. Call-caching will not be enabled, even if the user checks the box ✅ in the Terra workflow interface. 
+ +### Inputs + +Assembly_Fetch requires the input samplename, and either the accession for a reference genome to download (ncbi_accession) or an assembly that can be used to query RefSeq for the closest reference genome to download (assembly_fasta). + +This workflow runs on the sample level. + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| reference_fetch | **samplename** | String | Your sample's name | | Required | +| reference_fetch | **assembly_fasta** | File | Assembly FASTA file of your sample | | Optional | +| reference_fetch | **ncbi_accession** | String | NCBI accession passed to the NCBI datasets task to be downloaded. Example: GCF_000006945.2 (Salmonella enterica subsp. enterica, serovar Typhimurium str. LT2 reference genome) | | Optional | +| ncbi_datasets_download_genome_accession | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| ncbi_datasets_download_genome_accession | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | +| ncbi_datasets_download_genome_accession | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/staphb/ncbi-datasets:14.13.2" | Optional | +| ncbi_datasets_download_genome_accession | **include_gbff** | Boolean | set to true if you would like the GenBank Flat File (GBFF) file included in the output. It contains nucleotide sequence, metadata, and annotations. | FALSE | Optional | +| ncbi_datasets_download_genome_accession | **include_gff3** | Boolean | set to true if you would like the Genomic Feature File v3 (GFF3) file included in the output. 
It contains nucleotide sequence, metadata, and annotations | FALSE | Optional | +| ncbi_datasets_download_genome_accession | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | +| referenceseeker | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| referenceseeker | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 200 | Optional | +| referenceseeker | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/biocontainers/referenceseeker:1.8.0--pyhdfd78af_0" | Optional | +| referenceseeker | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | +| referenceseeker | **referenceseeker_ani_threshold** | Float | ANI threshold used to exclude ref genomes when ANI value less than this value. | 0.95 | Optional | +| referenceseeker | **referenceseeker_conserved_dna_threshold** | Float | Conserved DNA threshold used to exclude ref genomes when conserved DNA value is less than this value. | 0.69 | Optional | +| referenceseeker | **referenceseeker_db** | File | Database used by the referenceseeker tool that contains bacterial genomes from RefSeq release 205. Downloaded from referenceseeker GitHub repo. | "gs://theiagen-public-files-rp/terra/theiaprok-files/referenceseeker-bacteria-refseq-205.v20210406.tar.gz" | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Analysis Tasks + +??? task "ReferenceSeeker (optional) Details" + + ##### ReferenceSeeker {#referenceseeker} + + `ReferenceSeeker` uses your draft assembly to identify closely related bacterial, viral, fungal, or plasmid genome assemblies in [RefSeq](https://www.ncbi.nlm.nih.gov/refseq/). 
+ + Databases for use with ReferenceSeeker are as follows, and can be used by pasting the gs uri in double quotation marks `" "` into the `referenceseeker_db` optional input: + + - archea: `gs://theiagen-public-files-rp/terra/theiaprok-files/referenceseeker-archaea-refseq-205.v20210406.tar.gz` + - bacterial (**default**): `gs://theiagen-public-files-rp/terra/theiaprok-files/referenceseeker-bacteria-refseq-205.v20210406.tar.gz` + - fungi: `gs://theiagen-public-files-rp/terra/theiaprok-files/referenceseeker-fungi-refseq-205.v20210406.tar.gz` + - plasmids: `gs://theiagen-public-files-rp/terra/theiaprok-files/referenceseeker-plasmids-refseq-205.v20210406.tar.gz` + - viral: `gs://theiagen-public-files-rp/terra/theiaprok-files/referenceseeker-viral-refseq-205.v20210406.tar.gz` + + For ReferenceSeeker to identify a genome, it must meet user-specified thresholds for sequence coverage (`referenceseeker_conserved_dna_threshold`) and identity (`referenceseeker_ani_threshold`). The default values for these are set according to community standards (conserved DNA >= 69 % and ANI >= 95 %). A list of closely related genomes is provided in `referenceseeker_tsv`. The reference genome that ranks highest according to ANI and conserved DNA values is considered the closest match and will be downloaded, with information about this provided in the `assembly_fetch_referenceseeker_top_hit_ncbi_accession` output. + + !!! 
techdetails "ReferenceSeeker Technical Details" + + | | Links | + | --- | --- | + | Task | [task_referenceseeker.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/phylogenetic_inference/utilities/task_referenceseeker.wdl) | + | Software version | 1.8.0 ("us-docker.pkg.dev/general-theiagen/biocontainers/referenceseeker:1.8.0--pyhdfd78af_0") | + | Software Source Code | https://github.com/oschwengers/referenceseeker | + | Software Documentation | https://github.com/oschwengers/referenceseeker | + | Original Publication(s) | [ReferenceSeeker: rapid determination of appropriate reference genomes](https://joss.theoj.org/papers/10.21105/joss.01994) | + +??? task "NCBI Datasets Details" + + ##### NCBI Datasets {#ncbi-datasets} + + The [`NCBI Datasets`](https://www.ncbi.nlm.nih.gov/datasets/) task downloads specified assemblies from NCBI using either the [virus](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/data-packages/virus-genome/) or [genome](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/data-packages/genome/) (for all other genome types) package as appropriate. + + !!! 
techdetails "NCBI Datasets Technical Details" + + | | Links | + | --- | --- | + | Task | [task_ncbi_datasets.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/utilities/data_import/task_ncbi_datasets.wdl) | + | Software version | 14.13.2 (us-docker.pkg.dev/general-theiagen/staphb/ncbi-datasets:14.13.2) | + | Software Source Code | https://github.com/ncbi/datasets | + | Software Documentation | https://github.com/ncbi/datasets | + | Original Publication(s) | Not known to be published | + +### Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| assembly_fetch_analysis_date | String | Date of assembly download | +| assembly_fetch_ncbi_datasets_assembly_data_report_json | File | JSON file containing report about assembly downloaded by Asembly_Fetch | +| assembly_fetch_ncbi_datasets_assembly_fasta | File | FASTA file downloaded by Assembly_Fetch | +| assembly_fetch_ncbi_datasets_docker | String | Docker file used for NCBI datasets | +| assembly_fetch_ncbi_datasets_gff | File | Assembly downloaded by Assembly_Fetch in GFF3 format | +| assembly_fetch_ncbi_datasets_gff3 | File | Assembly downloaded by Assembly_Fetch in GFF format | +| assembly_fetch_ncbi_datasets_version | String | NCBI datasets version used | +| assembly_fetch_referenceseeker_database | String | ReferenceSeeker database used | +| assembly_fetch_referenceseeker_docker | String | Docker file used for ReferenceSeeker | +| assembly_fetch_referenceseeker_top_hit_ncbi_accession | String | NCBI Accession for the top it identified by Assembly_Fetch | +| assembly_fetch_referenceseeker_tsv | File | TSV file of the top hits between the query genome and the Reference Seeker database | +| assembly_fetch_referenceseeker_version | String | ReferenceSeeker version used | +| assembly_fetch_version | String | The version of the repository the Assembly Fetch workflow is in | + +## References + +> **ReferenceSeeker:** Schwengers O, Hain T, Chakraborty T, Goesmann A. 
ReferenceSeeker: rapid determination of appropriate reference genomes. J Open Source Softw. 2020 Feb 4;5(46):1994. + +> **NCBI datasets: datasets:** NCBI Datasets is an experimental resource for finding and building datasets [Internet]. Github; [cited 2023 Apr 19]. Available from: diff --git a/docs/workflows/data_import/basespace_fetch.md b/docs/workflows/data_import/basespace_fetch.md new file mode 100644 index 000000000..4fe4072b9 --- /dev/null +++ b/docs/workflows/data_import/basespace_fetch.md @@ -0,0 +1,180 @@ +# BaseSpace_Fetch + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Data Import](../../workflows_overview/workflows_type.md/#data-import) | [Any taxa](../../workflows_overview/workflows_kingdom.md/#any-taxa) | PHB v1.3.0 | Yes | Sample-level | + +## Setting up BaseSpace_Fetch + +The `BaseSpace_Fetch` workflow facilitates the transfer of Illumina sequencing data from BaseSpace (a cloud location) to a workspace on the [Terra.bio](http://Terra.bio) platform. Rather than downloading the files to a local drive and then re-uploading them to another location, we can perform a cloud-to-cloud transfer with the `BaseSpace_Fetch` workflow. + +Some initial set-up is required to use the workflow. To access one's BaseSpace account from within a workflow on Terra.bio, it is necessary to retrieve an access token and the API server address using the BaseSpace command-line tool. The access token is unique to a BaseSpace account. If it is necessary to transfer data from multiple BaseSpace accounts, multiple access tokens will need to be retrieved. Please see the "Retrieving BaseSpace Access Credentials" section below. + +In this document, we provide instructions for both the retrieval of the BaseSpace access token and running the BaseSpace_Fetch workflow. 
+ +### Retrieving BaseSpace Access Credentials + +This process must be performed on a command-line (ideally on a Linux or MacOS computer) before using the `BaseSpace_Fetch` workflow for the first time. This can be set up in Terra, however it will work in any command-line environment that has access to the internet to install & run the BaseSpace command-line tool: `bs`. + +??? toggle "Click for more information" + If you already have a command-line environment available, you can skip ahead to Step 2. + + #### Step 1: Setup Jupyter Cloud Environment + + ??? toggle "Click for more information" + + 1. Select the "Environment configuration" cloud icon on the right side of the workspace dashboard tab + + ![Step1](../../assets/figures/basespace_fetch/step1.png){width="50%"} + + 2. Select the "Settings" button under Jupyter + + ![Step2](../../assets/figures/basespace_fetch/step2.png){width="50%"} + + 3. Click "CREATE" in the "Use default environment section". There is no need to alter the default environment configuration. + + ![Step3](../../assets/figures/basespace_fetch/step3.png){width="50%"} + + + 4. Undertaking steps 1 and 2 again, you will see options to configure the environment. + 1. The default environment (default = GATK 4.1.4.1, Python 3.7..10, R 4.0.5) should be sufficient for retrieval of BaseSpace credentials, but if performing other tasks in the environment please modify the resource allocations appropriately. + 2. You can use up to 4 CPUs which can help move things faster with multithreaded `gsutil` + + ![Step4](../../assets/figures/basespace_fetch/step4.png){width="50%"} + + #### Step 2: Install the BaseSpace Command-Line Tool To Get The Access Token And API Server Address + ??? toggle "Click for more information" + + 1. Open the "Terminal" app in the right side-bar of the Terra dashboard + + ![Step5](../../assets/figures/basespace_fetch/step5.png){width="20%"} + + + 2. 
Download and setup BaseSpace (BS) CLI (as per Illumina documentation) by following the commands below. The lines beginning with `#` are comments, the following lines are the commands to be copy/pasted into the termina + + ```bash title="BaseSpace Fetch Authentication Instructions" + # create bin dir + mkdir ~/bin + + # download bs cli + wget "https://launch.basespace.illumina.com/CLI/latest/amd64-linux/bs" -O $HOME/bin/bs + + # provide proper permissions to bs cli executable + chmod u+x $HOME/bin/bs + + # add the 'bs' command-line tool to the $PATH variable so that you can call the command-line tool from any directory + export PATH="$PATH:$HOME/bin/" + + # authenticate with BaseSpace credentials + bs auth + + # navigate to the link provided in stdout and accept the authentication request through BaseSpace + + # Print the api server and access token to stdout (replace the path below with the appropriate path returned by the find command above) + cat ~/.basespace/default.cfg + ``` + + 3. Copy and paste the contents (**access_token** & **API server**) of the `default.cfg` file into Terra as workspace data elements. + 1. Navigate to the Terra "DATA" tab, and select "Workspace Data" at the bottom left of the page. + 2. You can use the ➕ icon to add the new workspace data elements as in the examples below. + + ![Step6](../../assets/figures/basespace_fetch/step6.png){width="50%"} + +### Preparing to retrieve a run with BaseSpace_Fetch + +??? toggle "Click for more information" + + !!! tip "Best Practices for Sample Identifiers" + - Avoid the use of underscores and whitespaces in the BaseSpace Project/Run name and/or the sample identifiers + - Underscores in a sample name will lead to BaseSpace_Fetch failure + - Avoid re-using Sample IDs. Make all sample IDs unique! + + #### Prepare the metadata spreadsheet for the BaseSpace_Fetch workflow + + 1. Download the sample sheet from BaseSpace. + 1. 
On the BaseSpace portal, you can navigate to this via: Runs → {run} → Files → SampleSheet.csv + + ![Step7](../../assets/figures/basespace_fetch/step7.png){width="50%"} + + + 2. In Excel, set up a metadata sheet for Terra, with a row for each sample. Please feel free to use our [BaseSpace_Fetch Template](https://storage.googleapis.com/theiagen-public-files/terra/training_resources/bs_fetch_template_20231103.tsv) to help ensure the file is formatted correctly. + 1. In cell A1, enter the data table name with the "**entity:**NAME**_id**" format + 2. Create a column called `basespace_sample_name` and populate this with the data found under the `Sample_Name` column in the BaseSpace sample sheet. + + !!! warning "Watch out" + If the contents of the `Sample_Name` and `Sample_ID` columns in the BaseSpace sample sheet are different, make a `basespace_sample_id` column in your spreadsheet and populate this with the data found under the `Sample_ID` column in the BaseSpace sample sheet. + + 4. Create a `basespace_collection_id` column, and populate it with the BaseSpace Project or Run identifier + 5. Populate column A of the spreadsheet with the sample names + + ![Step8](../../assets/figures/basespace_fetch/step8.png){width="50%"} + + + #### Upload the metadata spreadsheet to the destination workspace in Terra.bio + + 1. In Terra, navigate to the "DATA" tab, click "IMPORT DATA" then "Upload TSV" + + ![Step9](../../assets/figures/basespace_fetch/step9.png){width="25%"} + + + 2. Copy and paste the contents of the whole spreadsheet into the "TEXT IMPORT" tab and click "START IMPORT JOB" + + ![Step10](../../assets/figures/basespace_fetch/step10.png){width="38%"} + +## Using the BaseSpace_Fetch Workflow + +### How to Run BaseSpace_Fetch on Terra + +??? toggle "Click for more information" + + 1. In the Terra "WORKFLOWS" tab, either: + 1. Select the `BaseSpace_Fetch` workflow OR + 2. 
Import the `BaseSpace_Fetch` workflow from [Dockstore via this link](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/BaseSpace_Fetch_PHB:main?tab=versions). + 2. Set up the `BaseSpace_Fetch` workflow by selecting the: + 1. Version (latest numbered version) from the dropdown menu. + 2. Data table to use. + 3. Samples that you wan to import data for. + 3. Set up the `BaseSpace_Fetch` "INPUTS" form as below. **Don't forget to fill out `this.basespace_sample_id` if your basespace sample IDs are different from the basespace sample names in the SampleSheet.csv file.** + + ![Step11](../../assets/figures/basespace_fetch/step11.png){width="50%"} + + 4. In the "OUTPUTS" tab, select "use defaults", then click "SAVE". + 5. You can now run the workflow and import data for all the samples you have selected. + +### **Inputs** + +!!! info "Call Caching Disabled" + If using BaseSpace_Fetch workflow version 1.3.0 or higher, the call-caching feature of Terra has been DISABLED to ensure that the workflow is run from the beginning and data is downloaded fresh. Call-caching will not be enabled, even if the user checks the box ✅ in the Terra workflow interface. + +!!! warning "Sample_Name _and_ Sample_ID" + If the Sample_Name and Sample_ID in the BaseSpace sample sheet are different, set the `basespace_sample_id` input attribute to "`this.basespace_sample_id"`. + +This workflow runs on the sample level. + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| basespace_fetch | **access_token** | String | The access token is used in place of a username and password to allow the workflow to access the user account in BaseSpace from which the data is to be transferred. It is an alphanumeric string that is 32 characters in length. 
Example: 9e08a96471df44579b72abf277e113b7 | | Required | +| basespace_fetch | **api_server** | String | The API server is the web address to which data transfer requests can be sent by the workflow. Use this API server if you are unsure: `"https://api.basespace.illumina.com"` (this is the default set by the command-line tool) | | Required | +| basespace_fetch | **basespace_collection** | String | The collection ID is the BaseSpace Run or Project where the data to be transferred is stored. | | Required | +| basespace_fetch | **basespace_sample_name** | String | The BaseSpace sample name is the sample identifier used in BaseSpace. This identifier is set on the sample sheet at the onset of an Illumina sequencing run. | | Required | +| basespace_fetch | **sample_name** | String | The sample name is the sample identifier used in the Terra.bio data table corresponding to the metadata associated with the sample to be transferred from BaseSpace | | Required | +| basespace_fetch | **basespace_sample_id** | String | The BaseSpace sample ID is an optional additional identifier used in BaseSpace. If a sample has a BaseSpace sample ID it should be available on the sample sheet and must be included in the metadata sheet upload prior to running BaseSpace_Fetch. | | Optional | +| fetch_bs | **cpu** | Int | This input is the number of CPU's used in the data transfer. To facilitate the transfer of many files this runtime parameter may be increased. | 2 | Optional | +| fetch_bs | **disk_size** | Int | The disk size is the amount of storage in GigaBytes (GB) requested for the VM to run the data transfer task. | 100 | Optional | +| fetch_bs | **docker_image** | String | The Docker image used to run BaseSpace_Fetch task. | "us-docker.pkg.dev/general-theiagen/theiagen/basespace_cli:1.2.1" | Optional | +| fetch_bs | **memory** | Int | The memory is the amount of RAM/memory requested for running the data transfer task. 
| 8 | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### **Outputs** + +The outputs of this workflow will be the fastq files imported from BaseSpace into the data table where the sample ID information had originally been uploaded. + +| **Variable** | **Type** | **Description** | +|---|---|---| +| basespace_fetch_analysis_date | String | Date of download | +| basespace_fetch_version | String | Version of the workflow | +| read1 | File | File with forward-facing reads | +| read2 | File | File with reverse-facing read | diff --git a/docs/workflows/data_import/create_terra_table.md b/docs/workflows/data_import/create_terra_table.md new file mode 100644 index 000000000..31c10e024 --- /dev/null +++ b/docs/workflows/data_import/create_terra_table.md @@ -0,0 +1,105 @@ +# Create_Terra_Table + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Data Import](../../workflows_overview/workflows_type.md/#data-import) | [Any taxa](../../workflows_overview/workflows_kingdom.md/#any-taxa) | PHB v2.2.0 | Yes | Sample-level | + +## Create_Terra_Table_PHB + +The manual creation of Terra tables can be tedious and error-prone. This workflow will automatically create your Terra data table when provided with the location of the files. + +### Inputs + +!!! warning "Default Behavior" + Files with underscores and/or decimals in the sample name are not recognized; please use dashes instead. + + For example, `name.banana.hello_yes_please.fastq.gz` will become "name". This means that `se-test_21.fastq.gz` and `se-test_22.fastq.gz` will not be recognized as separate samples. 
+ + **_This can be changed_** by providing information in the `file_ending` optional input parameter. See below for more information. + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| create_terra_table | **assembly_data** | Boolean | Set to true if your data is in FASTA format; set to false if your data is FASTQ format | | Required | +| create_terra_table | **data_location_path** | String | The full path to your data's Google bucket folder location, including the gs://; can be easily copied by right-clicking and copying the link address in the header after navigating to the folder in the "Files" section of the "Data" tab on Terra (see below for example) | | Required | +| create_terra_table | **new_table_name** | String | The name of the new Terra table you want to create | | Required | +| create_terra_table | **paired_end** | Boolean | Set to true if your data is paired-end FASTQ files; set to false if not | | Required | +| create_terra_table | **terra_project** | String | The name of the Terra project where your data table will be created | | Required | +| create_terra_table | **terra_workspace** | String | The name of the Terra workspace where your data table will be created | | Required | +| create_terra_table | **file_ending** | String | Use to provide file ending(s) to determine what should be dropped from the filename to determine the name of the sample (see below for more information) | | Optional | +| make_table | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| make_table | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 25 | Optional | +| make_table | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-06-21" | Optional | +| make_table | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | + +### Finding 
the `data_location_path` + +#### Using the Terra data uploader + +??? toggle "Click for more information" + Once you have named your new collection, you will see the collection name directly above where you can drag-and-drop your data files, or on the same line as the Upload button. Right-click the collection name and select "Copy link address." Paste the copied link into the data_location_path variable, remembering to enclose it in quotes. + + !!! info "Note" + If you click "Next" after uploading your files, it will ask for a metadata TSV. You do not have to provide this, and can instead exit the window. Your data will still be uploaded. + + ![Data uploader](../../assets/figures/Create_Terra_Table_example1.png) + +#### Using the "Files" section in the Data tab + +??? toggle "Click for more information" + Navigate to the folder where your data is ("example_upload" in this example) and right-click on the folder name and select "Copy link address." + + If you uploaded data with the Terra data uploader, your collection will be nested in the "uploads" folder. + + ![Data tab](../../assets/figures/Create_Terra_Table_example2.png) + +### How to determine the appropriate `file_ending` for your data + +The `file_ending` should be a substring of your file names that is held in common. See the following examples: + +!!! example "_One or more_ elements in common" + If you have the following files: + + - sample_01_R1.fastq.gz + - sample_01_R2.fastq.gz + - sample_02_R1.fastq.gz + - sample_02_R2.fastq.gz + + The default behavior would result in a single entry in the table called "sample" which is incorrect. You can rectify this by providing an appropriate `file_ending` for your samples. + + In this group, the desired sample names are "sample_01" and "sample_02". For all the files following the desired names, the text contains `_R`. If we provide "_R" as our `file_ending`, then "sample_01" and "sample_02" will appear in our table with the appropriate read files. + +!!! 
example "_No_ elements in common" + If you have the following files: + + - sample_01_1.fastq.gz + - sample_01_2.fastq.gz + - sample_02_1.fastq.gz + - sample_02_2.fastq.gz + + The default behavior would result in a single entry in the table called "sample" which is incorrect. You can rectify this by providing an appropriate `file_ending` for your samples. + + In this group, the desired sample names are "sample_01" and "sample_02". However, in this example, there is no common text following the sample name. Providing `"_"` would result in the same behavior as default. We can provide _two_ different patterns in the `file_ending` variable: `"_1,_2"` to capture all possible options. By doing this, "sample_01" and "sample_02" will appear in our table with the appropriate read files. + +To include multiple file endings, please separate them with commas, as shown in the "no elements in common" section. + +### Outputs + +Your table will automatically appear in your workspace with the following fields: + +- Sample name (under the `new_table_name`_id column), which will be the section of the file's name before any decimals or underscores (unless `file_ending` is provided) + - By default: + - `sample01.lane2_flowcell3.fastq.gz` will be represented by `sample01` in the table + - `sample02_negativecontrol.fastq.gz` will be represented by `sample02` in the table + - See "How to determine the appropriate `file_ending` for your data" above to learn how to change this default behavior +- Your data in the appropriate columns, dependent on the values of `assembly_data` and `paired_end` + + | table columns | `assembly_data` is true | `paired_end` is true | `assembly_data` **AND** `paired_end` are false | + | --- | --- | --- | --- | + | read1 | ❌ | ✅ | ✅ | + | read2 | ❌ | ✅ | ❌ | + | assembly_fasta | ✅ | ❌ | ❌ | + +- The date of upload under the `upload_date` column +- The name of the workflow under `table_created_by`, to indicate the table was made by the Create_Terra_Table_PHB 
workflow. diff --git a/docs/workflows/data_import/sra_fetch.md b/docs/workflows/data_import/sra_fetch.md new file mode 100644 index 000000000..0b3b407fb --- /dev/null +++ b/docs/workflows/data_import/sra_fetch.md @@ -0,0 +1,56 @@ +# SRA_Fetch + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Data Import](../../workflows_overview/workflows_type.md/#data-import) | [Any taxa](../../workflows_overview/workflows_kingdom.md/#any-taxa) | PHB v2.2.0 | Yes | Sample-level | + +## SRA_Fetch_PHB + +The `SRA_Fetch` workflow downloads sequence data from NCBI's Sequence Read Archive (SRA). It requires an SRA run accession then populates the associated read files to a Terra data table. + +Read files associated with the SRA run accession provided as input are copied to a Terra-accessible Google bucket. Hyperlinks to those files are shown in the "read1" and "read2" columns of the Terra data table. + +### Inputs + +This workflow runs on the sample level. 
+ +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| fetch_sra_to_fastq | **sra_accession** | String | SRA, ENA, or DRA accession number | | Required | +| fetch_sra_to_fastq | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| fetch_sra_to_fastq | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| fetch_sra_to_fastq | **docker_image** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/biocontainers/fastq-dl:2.0.4--pyhdfd78af_0" | Optional | +| fetch_sra_to_fastq | **fastq_dl_options** | String | Additional parameters to pass to fastq_dl from [here](https://github.com/rpetit3/fastq-dl?tab=readme-ov-file#usage) | "--provider sra" | Optional | +| fetch_sra_to_fastq | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | + +The only required input for the SRA_Fetch workflow is an SRA run accession beginning "SRR", an ENA run accession beginning "ERR", or a DRA run accession beginning "DRR". + +Please see the [NCBI Metadata and Submission Overview](https://www.ncbi.nlm.nih.gov/sra/docs/submitmeta/) for assistance with identifying accessions. Briefly, NCBI-accessioned objects have the following naming scheme: + +| STUDY | SRP# | +| --- | --- | +| SAMPLE | SRS# | +| EXPERIMENT | SRX# | +| RUN | SRR# | + +### Outputs + +Read data are available either with full base quality scores (**SRA Normalized Format**) or with simplified quality scores (**SRA Lite**). The **SRA Normalized Format** includes full, per-base quality scores, whereas **base quality scores** **have been simplified in SRA Lite files.** This means that all quality scores have been artificially set to Q-30 or Q3. More information about these files can be found [here](https://www.ncbi.nlm.nih.gov/sra/docs/sra-data-formats/). 
+ +Given the lack of usefulness of SRA Lite formatted FASTQ files, we try to avoid these by selecting SRA directly as the provider (SRA-Lite is more likely to be the file synced to other repositories), but sometimes downloading these files is unavoidable. To make the user aware of this, a warning column is present that is populated when an SRA-Lite file is detected. + +| **Variable** | **Type** | **Description** | **Production Status** | +|---|---|---|---| +| read1 | File | File containing the forward reads | Always produced | +| read2 | File | File containing the reverse reads (not available for single-end or ONT data) | Produced only for paired-end data | +| fastq_dl_date | String | The date of download | Always produced | +| fastq_dl_docker | String | The docker used | Always produced | +| fastq_dl_metadata | File | File containing metadata of the provided accession such as submission_accession, library_selection, instrument_platform, among others | Always produced | +| fastq_dl_version | String | Fastq_dl version used | Always produced | +| fastq_dl_warning | String | This warning field is populated if SRA-Lite files are detected. These files contain all quality encoding as Phred-30 or Phred-3. | Depends on internal workflow logic | + +## References + +> This workflow relies on [fastq-dl](https://github.com/rpetit3/fastq-dl), a very handy bioinformatics tool by Robert A. Petit III diff --git a/docs/workflows/genomic_characterization/freyja.md b/docs/workflows/genomic_characterization/freyja.md new file mode 100644 index 000000000..004013e30 --- /dev/null +++ b/docs/workflows/genomic_characterization/freyja.md @@ -0,0 +1,543 @@ +# Freyja Workflow Series + +!!! dna inline end "Wastewater and more" + The typical use case of Freyja is to **analyze mixed SARS-CoV-2 samples** from a sequencing dataset, most often **wastewater**. + + !!! 
warning "Default Values" + The defaults included in the Freyja workflows reflect this use case but **can be adjusted for other pathogens**. See the [**Running Freyja on other pathogens**](freyja.md#running-freyja-on-other-pathogens) section for more information. + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Genomic Characterization](../../workflows_overview/workflows_type.md/#genomic-characterization) | [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.2.0 | Yes | Sample-level, Set-level | + +## Freyja Overview + +[Freyja](https://github.com/andersen-lab/Freyja) is a tool for analysing viral mixed sample genomic sequencing data. Developed by Joshua Levy from the [Andersen Lab](https://andersen-lab.com/), it performs two main steps: + +1. Single nucleotide variant (SNV) frequency estimation; +2. Depth-weighted demixing using constrained least absolute deviation regression. + +Additional post-processing steps can produce visualizations of aggregated samples. + +!!! caption "Figure 1: Workflow Diagram for Freyja_FASTQ_PHB workflow" + ##### Figure 1 { #figure1 } + ![**Figure 1: Workflow diagram for Freyja_FASTQ_PHB workflow.**](../../assets/figures/Freyja_FASTQ.png){width=25%} + + Depending on the type of data (Illumina or Oxford Nanopore), the Read QC and Filtering steps, as well as the Read Alignment steps use different software. The user can specify if the barcodes and lineages file should be updated with `freyja update` before running Freyja or if bootstrapping is to be performed with `freyja boot`. 
+ +Four workflows have been created that perform different parts of Freyja: + +- [**Freyja_Update_PHB**](freyja.md#freyja_update) +- [**Freyja_FASTQ_PHB**](freyja.md#freyja_fastq) +- [**Freyja_Plot_PHB**](freyja.md#freyja_plot) +- [**Freyja_Dashboard_PHB**](freyja.md#freyja_dashboard) + +The main workflow is **Freyja_FASTQ_PHB** ([Figure 1](freyja.md#figure1)). Depending on the type of input data (Illumina paired-end, Illumina single-end or ONT), it runs various QC modules before aligning the sample with either BWA (Illumina) or minimap2 (ONT) to the provided reference file, followed by iVar for primer trimming. After the preprocessing is completed, Freyja is run to generate relative lineage abundances (demix) from the sample. Optional bootstrapping may be performed. + +!!! dna "Data Compatibility" + + The **Freyja_FASTQ_PHB workflow** is compatible with the following input data types: + + - Illumina Single-End + - Illumina Paired-End + - Oxford Nanopore + +**Freyja_Update_PHB** will copy the SARS-CoV-2 reference files (`curated_lineages.json` and `usher_barcodes.feather`) from [the source repository](https://github.com/andersen-lab/Freyja/tree/main/freyja/data) to a user-specific Google Cloud Storage (GCS) location (often a [Terra.bio](http://Terra.bio) workspace-associated bucket). These files can then be used as input for the Freyja_FASTQ_PHB workflow. + +Two options are available to visualize the Freyja results: **Freyja_Plot_PHB** and **Freyja_Dashboard_PHB.** Freyja_Plot_PHB aggregates multiple samples using output from Freyja_FASTQ_PHB to generate a plot that shows fractional abundance estimates for all samples, including the option to plot sample collection date information. Alternatively, **Freyja_Dashboard_PHB** aggregates multiple samples using output from Freyja_FASTQ to generate an interactive visualization. This workflow requires an additional input field called viral load, which is the number of viral copies per liter. 
+ +## Freyja_Update_PHB {#freyja_update} + +This workflow will copy the Freyja reference files (`usher_barcodes.feather` and `curated_lineages.json`) to a GCP URI of your choice for usage in Freyja_FASTQ_PHB. + +### Inputs + +We recommend running this workflow with **"Run inputs defined by file paths"** selected since no information from a Terra data table is actually being used. We also recommend turning off call caching so new information is retrieved every time. + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| freyja_update | **gcp_uri** | String | The path where you want the Freyja reference files to be stored. Include gs:// at the beginning of the string. Full example with a Terra workspace bucket: "gs://fc-87ddd67a-c674-45a8-9651-f91e3d2f6bb7" | | Required | +| freyja_update_refs | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| freyja_update_refs | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| freyja_update_refs | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/staphb/freyja:1.5.1-07_02_2024-01-27-2024-07-22" | Optional | +| freyja_update_refs | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | +| transfer_files | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| transfer_files | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| transfer_files | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/utility:1.1" | Optional | +| transfer_files | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | + +### Outputs + +This workflow does not produce any outputs that appear in a Terra data table. 
The reference files will appear at the location specified with the `gcp_uri` input variable. + +## Freyja_FASTQ_PHB {#freyja_fastq} + +Freyja measures SNV frequency and sequencing depth at each position in the genome to return an estimate of the true lineage abundances in the sample. The method uses lineage-defining "barcodes" that, for SARS-CoV-2, are derived from the UShER global phylogenetic tree as a base set for demixing. **Freyja_FASTQ_PHB** returns as output a TSV file that includes the lineages present and their corresponding abundances, along with other values. + +The Freyja_FASTQ_PHB workflow is compatible with multiple input data types: Illumina Single-End, Illumina Paired-End and Oxford Nanopore. Depending on the type of input data, different input values are used. + +**Table 1:** Freyja_FASTQ_PHB input configuration for different types of input data. + +| Table Columns | Illumina Paired-End | Illumina Single-End | Oxford Nanopore | +| --- | --- | --- | --- | +| **read1** | ✅ | ✅ | ✅ | +| **read2** | ✅ | ❌ | ❌ | +| **ont** | `false` | `false` | `true` | + +### Freyja_FASTQ Inputs + +This workflow runs on the sample level. 
+ +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| freyja_fastq | **primer_bed** | File | The bed file containing the primers used when sequencing was performed | | Required | +| freyja_fastq | **read1** | File | The raw forward-facing FASTQ file (Illumina or ONT) | | Required | +| freyja_fastq | **reference_genome** | File | The reference genome to use; should match the reference used for alignment (Wuhan-Hu-1) | | Required | +| freyja_fastq | **samplename** | String | The name of the sample | | Required | +| bwa | **cpu** | Int | Number of CPUs to allocate to the task | 6 | Optional | +| bwa | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| bwa | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/ivar:1.3.1-titan | Optional | +| bwa | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | +| freyja | **bootstrap** | Boolean | Perform bootstrapping | FALSE | Optional | +| freyja | **confirmed_only** | Boolean | Include only confirmed SARS-CoV-2 lineages | FALSE | Optional | +| freyja | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| freyja | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| freyja | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/staphb/freyja:1.5.1-07_02_2024-01-27-2024-07-22" | Optional | +| freyja | **eps** | Float | The minimum lineage abundance cut-off value | 0.001 | Optional | +| freyja | **freyja_lineage_metadata** | File | (found in the optional section, but is required) File containing the lineage metadata; the "curated_lineages.json" file found can be used for this variable. Does not need to be provided if update_db is true. 
| None | Optional, Required | +| freyja | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | +| freyja | **number_bootstraps** | Int | The number of bootstraps to perform (only used if bootstrap = true) | 100 | Optional | +| freyja | **update_db** | Boolean | Updates the Freyja reference files (the usher barcodes and lineage metadata files) but will not save them as output (use Freyja_Update for that purpose). If set to true, the `freyja_lineage_metadata` and `freyja_usher_barcodes` files are not required. | FALSE | Optional | +| freyja_fastq | **depth_cutoff** | Int | The minimum coverage depth with which to exclude sites below this value and group identical barcodes | 10 | Optional | +| freyja_fastq | **ont** | Boolean | Indicates if the input data is derived from an ONT instrument. | FALSE | Optional | +| freyja_fastq | **read2** | File | The raw reverse-facing FASTQ file (Illumina only) | | Optional | +| freyja_fastq | **trimmomatic_minlen** | Int | The minimum length cut-off when performing read cleaning | 25 | Optional | +| get_fasta_genome_size | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| get_fasta_genome_size | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 10 | Optional | +| get_fasta_genome_size | **docker** | String | Docker image used for this task. | "us-docker.pkg.dev/general-theiagen/biocontainers/seqkit:2.4.0--h9ee0642_0" | Optional | +| get_fasta_genome_size | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| minimap2 | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| minimap2 | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| minimap2 | **docker** | String | Docker image used for this task. 
| "us-docker.pkg.dev/general-theiagen/staphb/minimap2:2.22" | Optional | +| minimap2 | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| minimap2 | **query2** | File | Internal component. Do not modify | None | Do not modify, Optional | +| nanoplot_clean | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| nanoplot_clean | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| nanoplot_clean | **docker** | String | Docker image used for this task. | "us-docker.pkg.dev/general-theiagen/staphb/nanoplot:1.40.0" | Optional | +| nanoplot_clean | **max_length** | Int | Maximum read length for nanoplot | 100000 | Optional | +| nanoplot_clean | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | +| nanoplot_raw | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| nanoplot_raw | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| nanoplot_raw | **docker** | String | Docker image used for this task. | "us-docker.pkg.dev/general-theiagen/staphb/nanoplot:1.40.0" | Optional | +| nanoplot_raw | **max_length** | Int | Maximum read length for nanoplot | 100000 | Optional | +| nanoplot_raw | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | +| primer_trim | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| primer_trim | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| primer_trim | **docker** | String | Docker image used for this task. 
| "us-docker.pkg.dev/general-theiagen/staphb/ivar:1.3.1-titan" | Optional | +| primer_trim | **keep_noprimer_reads** | Boolean | Include reads with no primers | TRUE | Optional | +| primer_trim | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| read_QC_trim_pe | **adapters** | File | A FASTA file containing adapter sequence | None | Optional | +| read_QC_trim_pe | **bbduk_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| read_QC_trim_pe | **call_kraken** | Boolean | By default this is set to false to skip kraken2; set to true to run kraken2 but a database must be also provided via the kraken_db input parameter for this to run successfully | FALSE | Optional | +| read_QC_trim_pe | **call_midas** | Boolean | By default this is set to true to run MIDAS; set to false to skip MIDAS | FALSE | Optional | +| read_QC_trim_pe | **fastp_args** | String | Additional arguments to use with fastp | "--detect_adapter_for_pe -g -5 20 -3 20" | Optional | +| read_QC_trim_pe | **kraken_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| read_QC_trim_pe | **kraken_db** | File | A kraken2 database to use with the kraken2 optional task. The file must be a .tar.gz kraken2 database. | None | Optional, Sometimes required | +| read_QC_trim_pe | **kraken_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| read_QC_trim_pe | **kraken_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| read_QC_trim_pe | **midas_db** | File | Database to use with MIDAS. Not required as one will be auto-selected when running the MIDAS task. 
| None | Optional, Sometimes required | +| read_QC_trim_pe | **phix** | File | The file containing the phix sequence to be used during bbduk task | None | Optional | +| read_QC_trim_pe | **read_processing** | String | Options: "trimmomatic" or "fastp" to indicate which read trimming module to use | "trimmomatic" | Optional | +| read_QC_trim_pe | **read_qc** | String | Allows the user to decide between fastq_scan (default) and fastqc for the evaluation of read quality. | fastq_scan | Optional | +| read_QC_trim_pe | **target_organism** | String | The organism whose abundance the user wants to check in their reads. This should be a proper taxonomic name recognized by the Kraken database. | None | Optional | +| read_QC_trim_pe | **trim_quality_trim_score** | Int | The minimum quality score to keep during trimming | 30 | Optional | +| read_QC_trim_pe | **trim_window_size** | Int | The window size to use during trimming | 4 | Optional | +| read_QC_trim_pe | **trimmomatic_args** | String | Additional command-line arguments to use with trimmomatic | None | Optional | +| read_QC_trim_ont | **call_kraken** | Boolean | By default this is set to false to skip kraken2; set to true to run kraken2 but a database must be also provided via the kraken_db input parameter for this to run successfully | FALSE | Optional | +| read_QC_trim_ont | **downsampling_coverage** | Float | The depth to downsample to with Rasusa. Internal component. Do not modify. | 150 | Do not modify, Optional | +| read_QC_trim_ont | **genome_length** | Int | Internal component. Do not modify | None | Do not modify, Optional | +| read_QC_trim_ont | **kraken_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| read_QC_trim_ont | **kraken_db** | File | A kraken2 database to use with the kraken2 optional task. The file must be a .tar.gz kraken2 database. 
| None | Optional | +| read_QC_trim_ont | **kraken_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| read_QC_trim_ont | **kraken_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| read_QC_trim_ont | **max_length** | Int | Internal component, do not modify | | Do not modify, Optional | +| read_QC_trim_ont | **min_length** | Int | Internal component, do not modify | | Do not modify, Optional | +| read_QC_trim_ont | **run_prefix** | String | Internal component, do not modify | | Do not modify, Optional | +| read_QC_trim_ont | **target_organism** | String | This string is searched for in the kraken2 outputs to extract the read percentage | | Optional | +| read_QC_trim_se | **adapters** | File | A FASTA file containing adapter sequence | None | Optional | +| read_QC_trim_se | **bbduk_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| read_QC_trim_se | **call_kraken** | Boolean | By default this is set to false to skip kraken2; set to true to run kraken2 but a database must be also provided via the kraken_db input parameter for this to run successfully | FALSE | Optional | +| read_QC_trim_se | **call_midas** | Boolean | By default this is set to true to run MIDAS; set to false to skip MIDAS | FALSE | Optional | +| read_QC_trim_se | **fastp_args** | String | Additional arguments to use with fastp | "--detect_adapter_for_pe -g -5 20 -3 20" | Optional | +| read_QC_trim_se | **kraken_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| read_QC_trim_se | **kraken_db** | File | A kraken2 database to use with the kraken2 optional task. The file must be a .tar.gz kraken2 database. 
| None | Optional | +| read_QC_trim_se | **kraken_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| read_QC_trim_se | **kraken_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| read_QC_trim_se | **midas_db** | File | Database to use with MIDAS. Not required as one will be auto-selected when running the MIDAS task. | None | Optional, Sometimes required | +| read_QC_trim_se | **phix** | File | The file containing the phix sequence to be used during bbduk task | None | Optional | +| read_QC_trim_se | **read_processing** | String | Options: "trimmomatic" or "fastp" to indicate which read trimming module to use | "trimmomatic" | Optional | +| read_QC_trim_se | **read_qc** | String | Allows the user to decide between fastq_scan (default) and fastqc for the evaluation of read quality. | fastq_scan | Optional | +| read_QC_trim_se | **target_organism** | String | The organism whose abundance the user wants to check in their reads. This should be a proper taxonomic name recognized by the Kraken database. | None | Optional | +| read_QC_trim_se | **trim_quality_min_score** | Int | The minimum quality score to keep during trimming | 30 | Optional | +| read_QC_trim_se | **trim_window_size** | Int | The window size to use during trimming | 4 | Optional | +| read_QC_trim_se | **trimmomatic_args** | String | Additional command-line arguments to use with trimmomatic | None | Optional | +| sam_to_sorted_bam | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| sam_to_sorted_bam | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| sam_to_sorted_bam | **docker** | String | Docker image used for this task. 
| us-docker.pkg.dev/general-theiagen/staphb/samtools:1.17 | Optional | +| sam_to_sorted_bam | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Freyja_FASTQ Analysis Tasks + +??? task "`read_QC_trim_pe` Details" + + ##### `read_QC_trim_pe` {#read_QC_trim_pe} + + This task runs a sub-workflow that gathers basic QC information, trimming (either with trimmomatic or fastp), human read scrubbing, and taxonomic identification (Kraken2). Optional parameters do not need to be modified. For information regarding the individual tasks performed during this, please visit the [TheiaCoV documentation](../genomic_characterization/theiacov.md). + + !!! techdetails "Read_QC_Trim_PE Technical Details" + + | | Links | + | --- | --- | + | Task | [wf_read_QC_trim_pe.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim_pe.wdl) | + +??? task "`read_QC_trim_se` Details" + + ##### `read_QC_trim_se` {#read_QC_trim_se} + + This task runs a sub-workflow that gathers basic QC information, trimming (either with trimmomatic or fastp), human read scrubbing, and taxonomic identification (Kraken2). Optional parameters do not need to be modified. For information regarding the individual tasks performed during this, please visit the [TheiaCoV documentation](../genomic_characterization/theiacov.md). + + !!! techdetails "Read_QC_Trim_SE Technical Details" + + | | Links | + | --- | --- | + | Task | [wf_read_QC_trim_se.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim_se.wdl) | + +??? 
task "`read_QC_trim_ont` Details" + + ##### `read_QC_trim_ont` {#read_QC_trim_ont} + + This task runs a sub-workflow that gathers basic QC information, trimming (nanoplot), human read scrubbing, and taxonomic identification (Kraken2). Optional parameters do not need to be modified. For information regarding the individual tasks performed during this, please visit the [TheiaCoV documentation](../genomic_characterization/theiacov.md). + + !!! techdetails "Read_QC_Trim_ONT Technical Details" + + | | Links | + | --- | --- | + | Task | [wf_read_QC_trim_ont.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim_ont.wdl) | + +??? task "`bwa` Details" + + ##### `bwa` {#bwa} + + This task aligns the cleaned short reads (Illumina) to the reference genome provided by the user. + + !!! techdetails "BWA Technical Details" + + | | Links | + | --- | --- | + | Task | [task_bwa.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/alignment/task_bwa.wdl) | + | Software Source Code | https://github.com/lh3/bwa | + | Software Documentation | https://bio-bwa.sourceforge.net/ | + | Original Publication(s) | [Fast and accurate short read alignment with Burrows-Wheeler transform](https://doi.org/10.1093/bioinformatics/btp324) | + +??? task "`minimap2` Details" + + ##### `minimap2` {#minimap2} + + This task aligns the cleaned long reads (Oxford Nanopore) to the reference genome provided by the user. + + !!! techdetails "Minimap2 Technical Details" + + | | Links | + | --- | --- | + | Task | [task_minimap2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/alignment/task_minimap2.wdl) | + | Software Source Code | https://github.com/lh3/minimap2 | + | Software Documentation | https://lh3.github.io/minimap2/ | + | Original Publication(s) | [Minimap2: pairwise alignment for nucleotide sequences](https://doi.org/10.1093/bioinformatics/bty191) | + +??? 
task "`primer_trim` Details" + + ##### `primer_trim` {#primer_trim} + + This task trims the primer sequences from the aligned bam file with iVar. The optional input, `keep_noprimer_reads`, does not have to be modified. + + !!! techdetails "Primer Trim Technical Details" + + | | Links | + | --- | --- | + | Task | [task_ivar_primer_trim.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_ivar_primer_trim.wdl) | + | Software Source Code | https://github.com/andersen-lab/ivar | + | Software Documentation | https://andersen-lab.github.io/ivar/html/manualpage.html | + | Original Publication(s) | [An amplicon-based sequencing framework for accurately measuring intrahost virus diversity using PrimalSeq and iVar](https://doi.org/10.1186/s13059-018-1618-7) | + +??? task "`freyja` Details" + + ##### `freyja` {#freyja} + + The Freyja task will call variants and capture sequencing depth information to identify the relative abundance of lineages present. Optionally, if `bootstrap` is set to true, bootstrapping will be performed. After the optional bootstrapping step, the variants are demixed. + + !!! techdetails "Freyja Technical Details" + + | | Links | + | --- | --- | + | Task | [task_freyja_one_sample.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/freyja/task_freyja.wdl) | + | Software Source Code | https://github.com/andersen-lab/Freyja | + | Software Documentation | https://andersen-lab.github.io/Freyja/index.html# | + +### Freyja_FASTQ Outputs + +The main output file used in subsequent Freyja workflows is found under the `freyja_demixed` column. 
This TSV file takes on the following format: + +| | sample name | +| --- | --- | +| summarized | [('Delta', 0.65), ('Other', 0.25), ('Alpha', 0.1)] | +| lineages | ['B.1.617.2' 'B.1.2' 'AY.6' 'Q.3'] | +| abundances | "[0.5 0.25 0.15 0.1]" | +| resid | 3.14159 | +| coverage | 95.8 | + +- The `summarized` array denotes a sum of all lineage abundances in a particular WHO designation (i.e. B.1.617.2 and AY.6 abundances are summed in the above example), otherwise they are grouped into "Other". +- The `lineages` array lists the identified lineages in descending order +- The `abundances` array contains the corresponding abundance estimates. +- The value of `resid` corresponds to the residual of the weighted least absolute deviation problem used to estimate lineage abundances. +- The `coverage` value provides the 10x coverage estimate (percent of sites with 10 or greater reads) + +!!! tip "Click "Ignore empty outputs"" + When running the Freyja_FASTQ_PHB workflow, it is recommended to select the "Ignore empty outputs" option in the Terra UI. This will hide the output columns that will not be generated for your input data type. 
+ +| **Variable** | **Type** | **Description** | **Input Data Type** | +|---|---|---|---| +| aligned_bai | File | Index companion file to the bam file generated during the consensus assembly process | ONT, PE, SE | +| aligned_bam | File | Primer-trimmed BAM file; generated during consensus assembly process | ONT, PE, SE | +| alignment_method | String | The method used to generate the alignment | ONT, PE, SE | +| bbduk_docker | String | Docker image used to run BBDuk | PE, SE | +| bwa_version | String | Version of BWA used to map read data to the reference genome | PE, SE | +| fastp_html_report | File | The HTML report made with fastp | PE, SE | +| fastp_version | String | Version of fastp software used | PE, SE | +| fastq_scan_num_reads_clean_pairs | String | Number of clean read pairs | PE | +| fastq_scan_num_reads_clean1 | Int | Number of clean forward reads | PE, SE | +| fastq_scan_num_reads_clean2 | Int | Number of clean reverse reads | PE | +| fastq_scan_num_reads_raw_pairs | String | Number of raw read pairs | PE | +| fastq_scan_num_reads_raw1 | Int | Number of raw forward reads | PE, SE | +| fastq_scan_num_reads_raw2 | Int | Number of raw reverse reads | PE | +| fastq_scan_version | String | Version of fastq_scan used for read QC analysis | PE, SE | +| fastqc_clean1_html | File | Graphical visualization of clean forward read quality from fastqc to open in an internet browser | PE, SE | +| fastqc_clean2_html | File | Graphical visualization of clean reverse read quality from fastqc to open in an internet browser | PE | +| fastqc_docker | String | Docker container used for fastqc | PE, SE | +| fastqc_num_reads_clean_pairs | String | Number of read pairs after cleaning by fastqc | PE | +| fastqc_num_reads_clean1 | Int | Number of forward reads after cleaning by fastqc | PE, SE | +| fastqc_num_reads_clean2 | Int | Number of reverse reads after cleaning by fastqc | PE | +| fastqc_num_reads_raw_pairs | String | Number of input read pairs by fastqc | PE | +| 
fastqc_num_reads_raw1 | Int | Number of input forward reads by fastqc | PE, SE | +| fastqc_num_reads_raw2 | Int | Number of input reverse reads by fastqc | PE | +| fastqc_raw1_html | File | Graphical visualization of raw forward read quality from fastqc to open in an internet browser | PE, SE | +| fastqc_raw2_html | File | Graphical visualization of raw reverse read quality from fastqc to open in an internet browser | PE | +| fastqc_version | String | Version of fastqc software used | PE, SE | +| freyja_barcode_file | File | Barcode file used | ONT, PE, SE | +| freyja_barcode_version | String | Name of barcode file used, or the date if update_db is true | ONT, PE, SE | +| freyja_bootstrap_lineages | File | A CSV that contains the 0.025, 0.05, 0.25, 0.5 (median), 0.75, 0.95, and 0.975 percentiles for each lineage | ONT, PE, SE | +| freyja_bootstrap_lineages_pdf | File | A boxplot of the bootstrap lineages CSV file | ONT, PE, SE | +| freyja_bootstrap_summary | File | A CSV that contains the 0.025, 0.05, 0.25, 0.5 (median), 0.75, 0.95, and 0.975 percentiles for each WHO designated VOI/VOC | ONT, PE, SE | +| freyja_bootstrap_summary_pdf | File | A boxplot of the bootstrap summary CSV file | ONT, PE, SE | +| freyja_coverage | Float | Coverage identified by Freyja and parsed from freyja_demixed file | ONT, PE, SE | +| freyja_demixed | File | The main output TSV; see the section directly above this table for an explanation | ONT, PE, SE | +| freyja_depths | File | A TSV listing the depth of every position | ONT, PE, SE | +| freyja_fastq_wf_analysis_date | String | Date of analysis | ONT, PE, SE | +| freyja_fastq_wf_version | String | The version of the Public Health Bioinformatics (PHB) repository used | ONT, PE, SE | +| freyja_lineage_metadata_file | File | Lineage metadata JSON file used. 
Can be the one provided as input or downloaded by Freyja if update_db is true | ONT, PE, SE | +| freyja_metadata_version | String | Name of lineage metadata file used, or the date if update_db is true | ONT, PE, SE | +| freyja_usher_barcode_file | File | USHER barcode feather file used. Can be the one provided as input or downloaded by Freyja if update_db is true | ONT, PE, SE | +| freyja_variants | File | The TSV file containing the variants identified by Freyja | ONT, PE, SE | +| freyja_version | String | version of Freyja used | ONT, PE, SE | +| ivar_version_primtrim | String | Version of iVar for running the iVar trim command | ONT, PE, SE | +| kraken_human | Float | Percent of human read data detected using the Kraken2 software | ONT, PE, SE | +| kraken_human_dehosted | Float | Percent of human read data detected using the Kraken2 software after host removal | ONT, PE, SE | +| kraken_report | File | Full Kraken report | ONT, PE, SE | +| kraken_report_dehosted | File | Full Kraken report after host removal | ONT, PE, SE | +| kraken_sc2 | Float | Percent of SARS-CoV-2 read data detected using the Kraken2 software | ONT, PE, SE | +| kraken_sc2_dehosted | Float | Percent of SARS-CoV-2 read data detected using the Kraken2 software after host removal | ONT, PE, SE | +| kraken_version | String | Version of Kraken software used | ONT, PE, SE | +| minimap2_docker | String | Docker image used to run minimap2 | ONT | +| minimap2_version | String | Version of minimap2 used | ONT | +| nanoplot_html_clean | File | Clean read file | ONT | +| nanoplot_html_raw | File | Raw read file | ONT | +| nanoplot_num_reads_clean1 | Int | Number of clean reads for the forward-facing file | ONT | +| nanoplot_num_reads_raw1 | Int | Number of reads for the forward-facing file | ONT | +| nanoplot_r1_est_coverage_clean | Float | Estimated coverage on the clean reads by nanoplot | ONT | +| nanoplot_r1_est_coverage_raw | Float | Estimated coverage on the raw reads by nanoplot | ONT | +| 
nanoplot_r1_mean_q_clean | Float | Mean quality score of clean forward reads | ONT | +| nanoplot_r1_mean_q_raw | Float | Mean quality score of raw forward reads | ONT | +| nanoplot_r1_mean_readlength_clean | Float | Mean read length of clean forward reads | ONT | +| nanoplot_r1_mean_readlength_raw | Float | Mean read length of raw forward reads | ONT | +| nanoplot_r1_median_q_clean | Float | Median quality score of clean forward reads | ONT | +| nanoplot_r1_median_q_raw | Float | Median quality score of raw forward reads | ONT | +| nanoplot_r1_median_readlength_clean | Float | Median read length of clean forward reads | ONT | +| nanoplot_r1_median_readlength_raw | Float | Median read length of raw forward reads | ONT | +| nanoplot_r1_n50_clean | Float | N50 of clean forward reads | ONT | +| nanoplot_r1_n50_raw | Float | N50 of raw forward reads | ONT | +| nanoplot_r1_stdev_readlength_clean | Float | Standard deviation read length of clean forward reads | ONT | +| nanoplot_r1_stdev_readlength_raw | Float | Standard deviation read length of raw forward reads | ONT | +| nanoplot_tsv_clean | File | Output TSV file created by nanoplot | ONT | +| nanoplot_tsv_raw | File | Output TSV file created by nanoplot | ONT | +| nanoq_version | String | Version of nanoq used in analysis | ONT | +| primer_bed_name | String | Name of the primer bed file used for primer trimming | ONT, PE, SE | +| primer_trimmed_read_percent | Float | Percentage of read data with primers trimmed as determined by iVar trim | ONT, PE, SE | +| read1_clean | File | Forward read file after quality trimming and adapter removal | ONT, PE, SE | +| read1_dehosted | File | Dehosted forward reads | ONT, PE, SE | +| read2_clean | File | Reverse read file after quality trimming and adapter removal | PE | +| read2_dehosted | File | Dehosted reverse reads | PE | +| samtools_version | String | The version of SAMtools used to sort and index the alignment file | ONT, PE, SE | +| samtools_version_primtrim | String | The 
version of SAMtools used to create the pileup before running iVar trim | ONT, PE, SE | +| trimmomatic_docker | String | Docker container for Trimmomatic | PE, SE | +| trimmomatic_version | String | The version of Trimmomatic used | PE, SE | + +## Freyja_Plot_PHB {#freyja_plot} + +This workflow visualizes aggregated freyja_demixed output files produced by Freyja_FASTQ in a single plot (pdf format) which provides fractional abundance estimates for all aggregated samples. + +Options exist to provide lineage-specific breakdowns and/or sample collection time information. + +### Freyja_Plot Inputs + +This workflow runs on the set level. + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| freyja_plot | **freyja_demixed** | Array[File] | An array containing the output files (freyja_demixed) made by Freyja_FASTQ | | Required | +| freyja_plot | **freyja_plot_name** | String | The name of the plot to be produced. 
Example: "my-freyja-plot" | | Required | +| freyja_plot | **samplename** | Array[String] | An array containing the names of the samples | | Required | +| freyja_plot | **collection_date** | Array[String] | An array containing the collection dates for the sample (YYYY-MM-DD format) | | Optional | +| freyja_plot_task | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| freyja_plot_task | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| freyja_plot_task | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/freyja:1.5.1-07_02_2024-01-27-2024-07-22 | Optional | +| freyja_plot_task | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | +| freyja_plot_task | **mincov** | Int | The minimum genome coverage used as a cut-off of data to include in the plot | 60 | Optional | +| freyja_plot_task | **plot_day_window** | Int | The width of the rolling average window; only used if plot_time_interval is "D" | 14 | Optional | +| freyja_plot_task | **plot_lineages** | Boolean | If true, will plot a lineage-specific breakdown | FALSE | Optional | +| freyja_plot_task | **plot_time** | Boolean | If true, will plot sample collection time information (requires the collection_date input variable) | FALSE | Optional | +| freyja_plot_task | **plot_time_interval** | String | Options: "MS" for month, "D" for day | MS | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Analysis Tasks + +??? task "`freyja_plot_task` Details" + + ##### `freyja_plot_task` {#freyja_plot_task} + + This task will aggregate multiple samples together, and then creates a plot. 
Several optional inputs dictate the plot appearance (see each variable's description for more information). + + !!! techdetails "Freyja Plot Technical Details" + + | | Links | + | --- | --- | + | Task | [task_freyja_plot.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/freyja/task_freyja_plot.wdl) | + | Software Source Code | https://github.com/andersen-lab/Freyja | + | Software Documentation | https://github.com/andersen-lab/Freyja | + +### Freyja_Plot Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| **freyja_demixed_aggregate** | File | A TSV file that summarizes the `freyja_demixed` outputs for all samples | +| **freyja_plot** | File | A PDF of the plot produced by the workflow | +| **freyja_plot_metadata** | File | The metadata used to create the plot | +| **freyja_plot_version** | String | The version of Freyja used | +| **freyja_plot_wf_analysis_date** | String | The date of analysis | +| **freyja_plot_wf_version** | String | The version of the Public Health Bioinformatics (PHB) repository used | + +## Freyja_Dashboard_PHB {#freyja_dashboard} + +This workflow creates a group of interactive visualizations based off of the aggregated freyja_demixed output files produced by Freyja_FASTQ called a "dashboard." Creating this dashboard requires knowing the viral load of your samples (viral copies/L). + +This dashboard is not "live" — that is, you must rerun the workflow every time you want new data to be included in the visualizations. + +### Freyja_Dashboard Inputs + +This workflow runs on the set level. + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| freyja_dashboard | **collection_date** | Array[String] | An array containing the collection dates for the sample (YYYY-MM-DD format) | | Required | +| freyja_dashboard | **freyja_dashboard_title** | String | The name of the dashboard to be produced. 
Example: "my-freyja-dashboard" | | Required | +| freyja_dashboard | **freyja_demixed** | Array[File] | An array containing the output files (freyja_demixed) made by Freyja_FASTQ workflow | | Required | +| freyja_dashboard | **samplename** | Array[String] | An array containing the names of the samples | | Required | +| freyja_dashboard | **viral_load** | Array[String] | An array containing the number of viral copies per liter | | Required | +| freyja_dashboard | **dashboard_intro_text** | File | A file containing the text to be contained at the top of the dashboard. | SARS-CoV-2 lineage de-convolution performed by the Freyja workflow (). | Optional | +| freyja_dashboard_task | **config** | File | (found in the optional section, but is required) A yaml file that applies various configurations to the dashboard, such as grouping lineages together, applying colorings, etc. See also . | None | Optional, Required | +| freyja_dashboard_task | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| freyja_dashboard_task | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/freyja:1.5.1-07_02_2024-01-27-2024-07-22 | Optional | +| freyja_dashboard_task | **headerColor** | String | A hex color code to change the color of the header | | Optional | +| freyja_dashboard_task | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | +| freyja_dashboard_task | **mincov** | Float | The minimum genome coverage used as a cut-off of data to include in the dashboard. 
Default is set to 60 by the freyja command-line tool (not a WDL task default, per se) | None | Optional | +| freyja_dashboard_task | **scale_by_viral_load** | Boolean | If set to true, averages samples taken the same day while taking viral load into account | FALSE | Optional | +| freyja_dashboard_task | **thresh** | Float | The minimum lineage abundance cut-off value | None | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Freyja_Dashboard Tasks + +??? task "`freyja_dashboard_task` Details" + + This task will aggregate multiple samples together, and then create an interactive HTML visualization. Several optional inputs dictate the dashboard appearance (see each variable's description for more information). + + !!! techdetails "Freyja Dashboard Technical Details" + + | | Links | + | --- | --- | + | Task | [wf_freyja_dashboard.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/freyja/task_freyja_dashboard.wdl) | + | Software Source Code | https://github.com/andersen-lab/Freyja | + | Software Documentation | https://github.com/andersen-lab/Freyja | + +### Freyja_Dashboard Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| freyja_dashboard | File | The HTML file of the dashboard created | +| freyja_dashboard_metadata | File | The metadata used to create the dashboard | +| freyja_dashboard_version | String | The version of Freyja used | +| freyja_dashboard_wf_analysis_date | String | The date of analysis | +| freyja_dashboard_wf_version | String | The version of the Public Health Bioinformatics (PHB) repository used | +| freyja_demixed_aggregate | File | A TSV file that summarizes the `freyja_demixed` outputs for all samples | + +## Running 
Freyja on other pathogens + +The main requirement to run Freyja on other pathogens is **the existence of a barcode file for your pathogen of interest**. Currently, barcodes exist for the following organisms: + +- MEASLES +- MPXV +- RSVa +- RSVb + +The appropriate barcode file and reference sequence need to be downloaded and uploaded to your [Terra.bio](http://Terra.bio) workspace. + +!!! warning "Freyja barcodes for other pathogens" + + Data for various pathogens can be found in the following repository: [Freyja Barcodes](https://github.com/gp201/Freyja-barcodes) + + Folders are organized by pathogen, with each subfolder named after the date the barcode was generated, using the format YYYY-MM-DD. Barcode files are named `barcode.csv`, and reference genome files are named `reference.fasta`. + +When running **Freyja_FASTQ_PHB**, the appropriate reference and barcodes file need to be passed as inputs. The first is a required input and will show up at the top of the workflow's inputs page on [Terra.bio](http://Terra.bio) ([Figure 2](freyja.md/#figure2)). + +!!! caption "Figure 2: Required input for Freyja_FASTQ_PHB to provide the reference genome to be used by Freyja" + ##### Figure 2 { #figure2 } + ![**Figure 2: Required input for Freyja_FASTQ_PHB to provide the reference genome to be used by Freyja.**](../../assets/figures/Freyja_figure2.png) + +The barcodes file can be passed directly to Freyja by the `freyja_usher_barcodes` optional input ([Figure 3](freyja.md/#figure3)). + +!!! caption "Figure 3: Optional input for Freyja_FASTQ_PHB to provide the barcodes file to be used by Freyja" + ##### Figure 3 {#figure3} + ![**Figure 3: Optional input for Freyja_FASTQ_PHB to provide the barcodes file to be used by Freyja.**](../../assets/figures/Freyja_figure3.png) + +## References + +If you use any of the Freyja workflows, please cite: + +> Karthikeyan, S., Levy, J.I., De Hoff, P. *et al.* Wastewater sequencing reveals early cryptic SARS-CoV-2 variant transmission. 
*Nature* **609**, 101–108 (2022). + +> Freyja source code can be found at + +> Freyja barcodes (non-SARS-CoV-2): diff --git a/docs/workflows/genomic_characterization/pangolin_update.md b/docs/workflows/genomic_characterization/pangolin_update.md new file mode 100644 index 000000000..b258d2411 --- /dev/null +++ b/docs/workflows/genomic_characterization/pangolin_update.md @@ -0,0 +1,60 @@ +# Pangolin_Update + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Genomic Characterization](../../workflows_overview/workflows_type.md/#genomic-characterization) | [Viral](../../workflows_overview/workflows_kingdom.md/#viral), SARS-Cov-2 | PHB v2.0.0 | Yes | Sample-level | + +## Pangolin_Update_PHB + +The Pangolin_Update workflow re-runs Pangolin updating prior lineage calls from one docker image to meet the lineage calls specified in an alternative docker image. The most common use case for this is updating lineage calls to be up-to-date with the latest Pangolin nomenclature by using the latest available Pangolin docker image ([found here](https://www.notion.so/theiagen/Docker-Image-and-Reference-Materials-for-SARS-CoV-2-Genomic-Characterization-98328c61f5cb4f77975f512b55d09108)). + +### Inputs + +This workflow runs on the sample level. + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| pangolin_update | **assembly_fasta** | File | SARS-CoV-2 assembly file in FASTA format | | Required | +| pangolin_update | **old_lineage** | String | The Pangolin lineage previously assigned to the sample | | Required | +| pangolin_update | **old_pangolin_assignment_version** | String | Version of the Pangolin software previously used for lineage assignment. 
| | Required | +| pangolin_update | **old_pangolin_docker** | String | The Pangolin docker image previously used for lineage assignment. | | Required | +| pangolin_update | **old_pangolin_versions** | String | All pangolin software and database versions previously used for lineage assignment. | | Required | +| pangolin_update | **samplename** | String | The name of the sample being analyzed. | | Required | +| pangolin_update | **lineage_log** | File | TSV file detailing previous lineage assignments and software versions for this sample. | | Optional | +| pangolin_update | **new_pangolin_docker** | String | The Pangolin docker image used to update the Pangolin lineage assignments. | | Optional | +| pangolin4 | **analysis_mode** | String | Pangolin inference engine for lineage designations (usher or pangolearn) | None | Optional | +| pangolin4 | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| pangolin4 | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| pangolin4 | **expanded_lineage** | Boolean | True/False that determines if a lineage should be expanded without aliases (e.g., BA.1 → B.1.1.529.1) | TRUE | Optional | +| pangolin4 | **max_ambig** | Float | Maximum proportion of Ns allowed for Pangolin to attempt assignment | 0.5 | Optional | +| pangolin4 | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| pangolin4 | **min_length** | Int | Minimum query length allowed for pangolin to attempt assignment | 10000 | Optional | +| pangolin4 | **pangolin_arguments** | String | Optional arguments for pangolin e.g. "--skip-scorpio" | None | Optional | +| pangolin4 | **skip_designation_cache** | Boolean | True/False that determines if the designation cache should be used | FALSE | Optional | +| pangolin4 | **skip_scorpio** | Boolean | True/False that determines if scorpio should be skipped. 
| FALSE | Optional | +| pangolin_update_log | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| pangolin_update_log | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| pangolin_update_log | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/utility:1.1 | Optional | +| pangolin_update_log | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| pangolin_update_log | **timezone** | String | | | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| **pango_lineage** | String | Pango lineage as determined by Pangolin | +| **pango_lineage_expanded** | String | Pango lineage without use of aliases; e.g., BA.1 → B.1.1.529.1 | +| **pango_lineage_log** | File | TSV file listing Pangolin lineage assignments and software versions for this sample | +| **pango_lineage_report** | File | Full Pango lineage report generated by Pangolin | +| **pangolin_assignment_version** | String | Version of the Pangolin software (e.g. 
PANGO or PUSHER) used for lineage assignment | +| **pangolin_conflict** | String | Number of lineage conflicts as determined by Pangolin | +| **pangolin_docker** | String | The Docker container to use for the task | +| **pangolin_notes** | String | Lineage notes as determined by Pangolin | +| **pangolin_update_analysis_date** | String | Date of analysis | +| **pangolin_update_version** | String | Version of the Public Health Bioinformatics (PHB) repository used | +| **pangolin_updates** | String | Result of Pangolin Update (lineage changed versus unchanged) with lineage assignment and date of analysis | +| **pangolin_versions** | String | All Pangolin software and database versions | diff --git a/docs/workflows/genomic_characterization/theiacov.md b/docs/workflows/genomic_characterization/theiacov.md new file mode 100644 index 000000000..23e512939 --- /dev/null +++ b/docs/workflows/genomic_characterization/theiacov.md @@ -0,0 +1,1149 @@ +# TheiaCoV Workflow Series + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Genomic Characterization](../../workflows_overview/workflows_type.md/#genomic-characterization) | [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.2.0 | Yes, some optional features incompatible | Sample-level | + +## TheiaCoV Workflows + +**The TheiaCoV workflows are for the assembly, quality assessment, and characterization of viral genomes.** There are currently five TheiaCoV workflows designed to accommodate different kinds of input data: + +1. Illumina paired-end sequencing (**TheiaCoV_Illumina_PE**) +2. Illumina single-end sequencing (**TheiaCoV_Illumina_SE**) +3. ONT sequencing (**TheiaCoV_ONT**) +4. Genome assemblies (**TheiaCoV_FASTA**) +5. 
ClearLabs sequencing (**TheiaCoV_ClearLabs**) + +Additionally, the **TheiaCoV_FASTA_Batch** workflow is available to process several hundred SARS-CoV-2 assemblies at the same time. + +--- + +!!! dna inline end "Key Resources" + + [**Reference Materials for SARS-CoV-2**](https://www.notion.so/Docker-Image-and-Reference-Materials-for-SARS-CoV-2-Genomic-Characterization-98328c61f5cb4f77975f512b55d09108?pvs=21) + + [**Reference Materials for Mpox**](https://www.notion.so/Workspace-Reference-Materials-for-MPXV-Genomic-Characterization-a34f355c68c54c0a82e926d4de607bca?pvs=21) + + ??? toggle "HIV Input JSONs" + - [TheiaCoV_Illumina_PE_HIV_v1_2024-04-19.json](https://prod-files-secure.s3.us-west-2.amazonaws.com/be290196-9090-4f3c-a9ab-fe730ad213e0/439f1c74-d91e-4978-b173-3302f878e343/TheiaCoV_Illumina_PE_HIV_v1_2024-04-19.json) + - [TheiaCoV_Illumina_PE_HIV_v2_2024-04-19.json](https://prod-files-secure.s3.us-west-2.amazonaws.com/be290196-9090-4f3c-a9ab-fe730ad213e0/2c7872de-44c8-406d-bbec-fadaacbb0d98/TheiaCoV_Illumina_PE_HIV_v2_2024-04-19.json) + - [TheiaCoV_ONT_HIV_v1_2024-04-19.json](https://prod-files-secure.s3.us-west-2.amazonaws.com/be290196-9090-4f3c-a9ab-fe730ad213e0/9f9a7bd1-2ac4-47fb-967b-4198a45d4a71/TheiaCoV_ONT_HIV_v1_2024-04-19.json) + - [TheiaCoV_ONT_HIV_v2_2024-04-19.json](https://prod-files-secure.s3.us-west-2.amazonaws.com/be290196-9090-4f3c-a9ab-fe730ad213e0/13fdfec0-4a81-460e-948a-be6ad30d022d/TheiaCoV_ONT_HIV_v2_2024-04-19.json) + + ??? 
toggle "WNV Input JSONs" + - [TheiaCoV_Illumina_PE_WNV_2024-04-19.json](https://prod-files-secure.s3.us-west-2.amazonaws.com/be290196-9090-4f3c-a9ab-fe730ad213e0/6af74d02-9985-428d-897e-e04ebacc42a3/TheiaCoV_Illumina_PE_WNV_2024-04-19.json) + - [TheiaCoV_Illumina_SE_WNV_2024-04-19.json](https://prod-files-secure.s3.us-west-2.amazonaws.com/be290196-9090-4f3c-a9ab-fe730ad213e0/cb8dec19-2563-4070-9ae9-031c089f8b3d/TheiaCoV_Illumina_SE_WNV_2024-04-19.json) + - [TheiaCoV_FASTA_WNV_2024-04-19.json](https://prod-files-secure.s3.us-west-2.amazonaws.com/be290196-9090-4f3c-a9ab-fe730ad213e0/f2059069-5ce1-45e1-ab9e-51925158c0eb/TheiaCoV_FASTA_WNV_2024-04-19.json) + + ??? toggle "Flu Input JSONs" + - [TheiaCoV_Illumina_PE_flu_2024-04-19.json](https://prod-files-secure.s3.us-west-2.amazonaws.com/be290196-9090-4f3c-a9ab-fe730ad213e0/ba326b69-8a2a-4af2-a74f-e710e667f82b/TheiaCoV_Illumina_PE_flu_2024-04-19.json) + - [TheiaCoV_ONT_flu_2024-04-19.json](https://prod-files-secure.s3.us-west-2.amazonaws.com/be290196-9090-4f3c-a9ab-fe730ad213e0/c01c98f5-d00e-4ff2-ad09-6cc3ff1ad3a7/TheiaCoV_ONT_flu_2024-04-19.json) + - [TheiaCoV_FASTA_flu_2024-04-19.json](https://prod-files-secure.s3.us-west-2.amazonaws.com/be290196-9090-4f3c-a9ab-fe730ad213e0/4c7d7a16-2c20-4cbc-9618-231afade9940/TheiaCoV_FASTA_flu_2024-04-19.json) + + ??? toggle "RSV-A Input JSONs" + - [TheiaCoV_Illumina_PE_RSV-B_2024-04-19.json](https://prod-files-secure.s3.us-west-2.amazonaws.com/be290196-9090-4f3c-a9ab-fe730ad213e0/2be20bb8-b733-4f02-a27f-b0cf19d015f8/TheiaCoV_Illumina_PE_RSV-B_2024-04-19.json) + - [TheiaCoV_FASTA_RSV-A_2024-04-19.json](https://prod-files-secure.s3.us-west-2.amazonaws.com/be290196-9090-4f3c-a9ab-fe730ad213e0/ba6a4845-14ee-4664-b9f3-808f76c87d15/TheiaCoV_FASTA_RSV-A_2024-04-19.json) + + ??? 
toggle "RSV-B Input JSONs" + - [TheiaCoV_Illumina_PE_RSV-A_2024-04-19.json](https://prod-files-secure.s3.us-west-2.amazonaws.com/be290196-9090-4f3c-a9ab-fe730ad213e0/dd1612ff-20c5-4310-9cb3-c07bf9b7e8a1/TheiaCoV_Illumina_PE_RSV-A_2024-04-19.json) + - [TheiaCoV_FASTA_RSV-B_2024-04-19.json](https://prod-files-secure.s3.us-west-2.amazonaws.com/be290196-9090-4f3c-a9ab-fe730ad213e0/160cdfbc-a556-40bc-aa05-84ae69511400/TheiaCoV_FASTA_RSV-B_2024-04-19.json) + +!!! caption "TheiaCoV Workflow Diagram" + ![TheiaCoV Workflow Diagram](../../assets/figures/TheiaCoV.png) + +### Supported Organisms + +These workflows currently support the following organisms: + +- **SARS-CoV-2** (`"sars-cov-2"`, `"SARS-CoV-2"`) - ==_default organism input_== +- **Monkeypox virus** (`"MPXV"`, `"mpox"`, `"monkeypox"`, `"Monkeypox virus"`, `"Mpox"`) +- **Human Immunodeficiency Virus** (`"HIV"`) +- **West Nile Virus** (`"WNV"`, `"wnv"`, `"West Nile virus"`) +- **Influenza** (`"flu"`, `"influenza"`, `"Flu"`, `"Influenza"`) +- **RSV-A** (`"rsv_a"`, `"rsv-a"`, `"RSV-A"`, `"RSV_A"`) +- **RSV-B** (`"rsv_b"`, `"rsv-b"`, `"RSV-B"`, `"RSV_B"`) + +The compatibility of each workflow with each pathogen is shown below: + +| | SARS-CoV-2 | Mpox | HIV | WNV | Influenza | RSV-A | RSV-B | +| --- | --- | --- | --- | --- | --- | --- | --- | +| Illumina_PE | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| Illumina_SE | ✅ | ✅ | ❌ | ✅ | ❌ | ✅ | ✅ | +| ClearLabs | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| ONT | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | +| FASTA | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ✅ | + +We've provided the following information to help you set up the workflow for each organism in the form of input JSONs. + +### Inputs + +All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) + +!!! dna "" + ??? toggle "TheiaCoV_Illumina_PE Input Read Data" + + The TheiaCoV_Illumina_PE workflow takes in Illumina paired-end read data. Read file names should end with `.fastq` or `.fq`, with the optional addition of `.gz`. 
When possible, Theiagen recommends zipping files with [gzip](https://www.gnu.org/software/gzip/) before Terra uploads to minimize data upload time. + + By default, the workflow anticipates **2 x 150bp** reads (i.e. the input reads were generated using a 300-cycle sequencing kit). Modifications to the optional parameter for `trim_minlen` may be required to accommodate shorter read data, such as the 2 x 75bp reads generated using a 150-cycle sequencing kit. + + ??? toggle "TheiaCoV_Illumina_SE Input Read Data" + + TheiaCoV_Illumina_SE takes in Illumina single-end reads. Read file names should end with `.fastq` or `.fq`, with the optional addition of `.gz`. Theiagen highly recommends zipping files with [gzip](https://www.gnu.org/software/gzip/) before uploading to Terra to minimize data upload time & save on storage costs. + + By default, the workflow anticipates **1 x 35 bp** reads (i.e. the input reads were generated using a 70-cycle sequencing kit). Modifications to the optional parameter for `trim_minlen` may be required to accommodate longer read data. + + ??? toggle "TheiaCoV_ONT Input Read Data" + + The TheiaCoV_ONT workflow takes in base-called ONT read data. Read file names should end with `.fastq` or `.fq`, with the optional addition of `.gz`. When possible, Theiagen recommends zipping files with [gzip](https://www.gnu.org/software/gzip/) before uploading to Terra to minimize data upload time. + + **The ONT sequencing kit and base-calling approach can produce substantial variability in the amount and quality of read data. Genome assemblies produced by the TheiaCoV_ONT workflow must be quality assessed before reporting results.** + + ??? toggle "TheiaCoV_FASTA Input Assembly Data" + + The TheiaCoV_FASTA workflow takes in assembly files in FASTA format. + + ??? toggle "TheiaCoV_ClearLabs Input Read Data" + + The TheiaCoV_ClearLabs workflow takes in read data produced by the Clear Dx platform from ClearLabs. 
However, many users use the TheiaCoV_FASTA workflow instead of this one due to a few known issues when generating assemblies with this pipeline that are not present when using ClearLabs-generated FASTA files. + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | **Workflow** | **Organism** | +|---|---|---|---|---|---|---|---| +| theiacov_clearlabs | **primer_bed** | File | The bed file containing the primers used when sequencing was performed | | Required | CL | sars-cov-2 | +| theiacov_clearlabs | **read1** | File | Read data produced by the Clear Dx platform from ClearLabs | | Required | CL | sars-cov-2 | +| theiacov_fasta | **assembly_fasta** | File | Input assembly FASTA file | | Required | FASTA | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| theiacov_fasta | **input_assembly_method** | File | Method used to generate the assembly file | | Required | FASTA | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| theiacov_illumina_pe | **read1** | File | Forward Illumina read in FASTQ file format (compression optional) | | Required | PE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| theiacov_illumina_pe | **read2** | File | Reverse Illumina read in FASTQ file format (compression optional) | | Required | PE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| theiacov_illumina_se | **read1** | File | Forward Illumina read in FASTQ file format (compression optional) | | Required | SE | MPXV, WNV, sars-cov-2 | +| theiacov_ont | **read1** | File | Demultiplexed ONT read in FASTQ file format (compression optional) | | Required | ONT | HIV, MPXV, WNV, flu, sars-cov-2 | +| workflow name | **samplename** | String | Name of the sample being analyzed | | Required | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **seq_method** | String | The sequencing methodology used to generate the input read data | | Required | FASTA | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| 
clean_check_reads | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| clean_check_reads | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| clean_check_reads | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/bactopia/gather_samples:2.0.2 | Optional | ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| clean_check_reads | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| consensus | **cpu** | Int | Number of CPUs to allocate to the task | 8 | Optional | CL, ONT | sars-cov-2 | +| consensus | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | CL, ONT | sars-cov-2 | +| consensus | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/artic-ncov2019-epi2me | Optional | ONT | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| consensus | **medaka_model** | String | In order to obtain the best results, the appropriate model must be set to match the sequencer's basecaller model; this string takes the format of {pore}_{device}_{caller variant}_{caller_version}. See also https://github.com/nanoporetech/medaka?tab=readme-ov-file#models. 
| r941_min_high_g360 | Optional | CL, ONT | sars-cov-2 | +| consensus | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | CL, ONT | sars-cov-2 | +| consensus_qc | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, rsv_a, rsv_b, sars-cov-2 | +| consensus_qc | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, rsv_a, rsv_b, sars-cov-2 | +| consensus_qc | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/utility:1.1 | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, rsv_a, rsv_b, sars-cov-2 | +| consensus_qc | **genome_length** | Int | Internal component, do not modify | | Do not modify, Optional | CL, SE | HIV, MPXV, WNV, rsv_a, rsv_b, sars-cov-2 | +| consensus_qc | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, rsv_a, rsv_b, sars-cov-2 | +| fastq_scan_clean_reads | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | CL | sars-cov-2 | +| fastq_scan_clean_reads | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | CL | sars-cov-2 | +| fastq_scan_clean_reads | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/utility:1.1 | Optional | CL | sars-cov-2 | +| fastq_scan_clean_reads | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | CL | sars-cov-2 | +| fastq_scan_clean_reads | **read1_name** | String | Internal component, do not modify | | Do not modify, Optional | CL | sars-cov-2 | +| fastq_scan_raw_reads | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | CL | sars-cov-2 | +| fastq_scan_raw_reads | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | CL | sars-cov-2 | +| 
fastq_scan_raw_reads | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/utility:1.1 | Optional | CL | sars-cov-2 | +| fastq_scan_raw_reads | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | CL | sars-cov-2 | +| fastq_scan_raw_reads | **read1_name** | String | Internal component, do not modify | | Do not modify, Optional | CL | sars-cov-2 | +| flu_track | **abricate_flu_cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | FASTA, ONT, PE | flu | +| flu_track | **abricate_flu_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | FASTA, ONT, PE | flu | +| flu_track | **abricate_flu_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/abricate:1.0.1-insaflu-220727 | Optional | FASTA, ONT, PE | flu | +| flu_track | **abricate_flu_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | FASTA, ONT, PE | flu | +| flu_track | **abricate_flu_mincov** | Int | Minimum DNA % coverage | 60 | Optional | FASTA, ONT, PE | flu | +| flu_track | **abricate_flu_minid** | Int | Minimum DNA % identity | 70 | Optional | FASTA, ONT, PE | flu | +| flu_track | **antiviral_aa_subs** | String | Additional list of antiviral resistance associated amino acid substitutions of interest to be searched against those called on the sample segments. They take the format of `GENE:AA-SUBSTITUTION`, e.g. 
NA:A26V | | Optional | ONT, PE | flu | +| flu_track | **assembly_metrics_cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | PE | flu | +| flu_track | **assembly_metrics_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | PE | flu | +| flu_track | **assembly_metrics_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/samtools:1.15 | Optional | PE | flu | +| flu_track | **assembly_metrics_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | PE | flu | +| flu_track | **flu_h1_ha_ref** | File | Internal component, do not modify | | Do not modify, Optional | ONT, PE | flu | +| flu_track | **flu_h1n1_m2_ref** | File | Internal component, do not modify | | Do not modify, Optional | ONT, PE | flu | +| flu_track | **flu_h3_ha_ref** | File | Internal component, do not modify | | Do not modify, Optional | ONT, PE | flu | +| flu_track | **flu_h3n2_m2_ref** | File | Internal component, do not modify | | Do not modify, Optional | ONT, PE | flu | +| flu_track | **flu_n1_na_ref** | File | Internal component, do not modify | | Do not modify, Optional | ONT, PE | flu | +| flu_track | **flu_n2_na_ref** | File | Internal component, do not modify | | Do not modify, Optional | ONT, PE | flu | +| flu_track | **flu_pa_ref** | File | Internal component, do not modify | | Do not modify, Optional | ONT, PE | flu | +| flu_track | **flu_pb1_ref** | File | Internal component, do not modify | | Do not modify, Optional | ONT, PE | flu | +| flu_track | **flu_pb2_ref** | File | Internal component, do not modify | | Do not modify, Optional | ONT, PE | flu | +| flu_track | **flu_subtype** | String | The influenza subtype being analyzed. Used for picking nextclade datasets. Options: "Yamagata", "Victoria", "H1N1", "H3N2". Only use to override the subtype call from IRMA and ABRicate. 
| | Optional | CL, ONT, PE, SE | flu | +| flu_track | **genoflu_cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | FASTA, ONT, PE | flu | +| flu_track | **genoflu_cross_reference** | File | An Excel file to cross-reference BLAST findings; probably useful if novel genotypes are not in the default file used by genoflu.py | | Optional | FASTA, ONT, PE | | +| flu_track | **genoflu_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 25 | Optional | FASTA, ONT, PE | | +| flu_track | **genoflu_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/genoflu:1.03 | Optional | FASTA, ONT, PE | | +| flu_track | **genoflu_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | FASTA, ONT, PE | | +| flu_track | **irma_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT, PE | flu | +| flu_track | **irma_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT, PE | flu | +| flu_track | **irma_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/cdcgov/irma:v1.1.5 | Optional | ONT, PE | flu | +| flu_track | **irma_keep_ref_deletions** | Boolean | True/False variable that determines if sites missed during read gathering should be deleted by ambiguation. 
| TRUE | Optional | ONT, PE | flu | +| flu_track | **irma_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | ONT, PE | flu | +| flu_track | **nextclade_cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | ONT, PE | flu | +| flu_track | **nextclade_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | ONT, PE | flu | +| flu_track | **nextclade_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/nextstrain/nextclade:3.3.1 | Optional | ONT, PE | flu | +| flu_track | **nextclade_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | ONT, PE | flu | +| flu_track | **nextclade_output_parser_cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| flu_track | **nextclade_output_parser_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| flu_track | **nextclade_output_parser_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/python/python:3.8.18-slim | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| flu_track | **nextclade_output_parser_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| flu_track | **read2** | File | Internal component. Do not use. 
| | Optional | ONT | flu | +| gene_coverage | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | CL, ONT, PE, SE | MPXV, sars-cov-2 | +| gene_coverage | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | CL, ONT, PE, SE | MPXV, sars-cov-2 | +| gene_coverage | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/samtools:1.15 | Optional | CL, ONT, PE, SE | MPXV, sars-cov-2 | +| gene_coverage | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | CL, ONT, PE, SE | MPXV, sars-cov-2 | +| gene_coverage | **min_depth** | Int | The minimum depth to determine if a position was covered. | 10 | Optional | ONT, PE, SE | MPXV, sars-cov-2 | +| gene_coverage | **sc2_s_gene_start** | Int | start nucleotide position of the SARS-CoV-2 Spike gene | 21563 | Optional | CL, ONT, PE, SE | MPXV, sars-cov-2 | +| gene_coverage | **sc2_s_gene_stop** | Int | End/Last nucleotide position of the SARS-CoV-2 Spike gene | 25384 | Optional | CL, ONT, PE, SE | MPXV, sars-cov-2 | +| ivar_consensus | **read2** | File | Internal component, do not modify | | Do not modify, Optional | SE | HIV, MPXV, WNV, rsv_a, rsv_b, sars-cov-2 | +| ivar_consensus | **skip_N** | Boolean | True/False variable that determines if regions with depth less than minimum depth should not be added to the consensus sequence | FALSE | Optional | PE, SE | HIV, MPXV, WNV, rsv_a, rsv_b, sars-cov-2 | +| kraken2_dehosted | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | CL | sars-cov-2 | +| kraken2_dehosted | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | CL | sars-cov-2 | +| kraken2_dehosted | **docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.0.8-beta_hv | Optional | CL | sars-cov-2 | +| kraken2_dehosted | **kraken2_db** | String | The database 
used to run Kraken2 | /kraken2-db | Optional | CL | sars-cov-2 | +| kraken2_dehosted | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | CL | sars-cov-2 | +| kraken2_dehosted | **read2** | File | Internal component, do not modify | | Do not modify, Optional | CL | sars-cov-2 | +| kraken2_raw | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | CL | sars-cov-2 | +| kraken2_raw | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | CL | sars-cov-2 | +| kraken2_raw | **docker_image** | String | Docker container used in this task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.0.8-beta_hv | Optional | CL | sars-cov-2 | +| kraken2_raw | **kraken2_db** | String | The database used to run Kraken2 | /kraken2-db | Optional | CL | sars-cov-2 | +| kraken2_raw | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | CL | sars-cov-2 | +| kraken2_raw | **read_processing** | String | The tool used for trimming of primers from reads. Options are trimmomatic and fastp | trimmomatic | Optional | | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| kraken2_raw | **read2** | File | Internal component, do not modify | | Do not modify, Optional | CL | sars-cov-2 | +| nanoplot_clean | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| nanoplot_clean | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| nanoplot_clean | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/nanoplot:1.40.0 | Optional | ONT | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| nanoplot_clean | **max_length** | Int | The maximum length of clean reads, for which reads longer than the length specified will be hidden. 
| 100000 | Optional | ONT | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| nanoplot_clean | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | ONT | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| nanoplot_raw | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| nanoplot_raw | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| nanoplot_raw | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/nanoplot:1.40.0 | Optional | ONT | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| nanoplot_raw | **max_length** | Int | The maximum length of clean reads, for which reads longer than the length specified will be hidden. | 100000 | Optional | ONT | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| nanoplot_raw | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | ONT | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| ncbi_scrub_se | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | CL | sars-cov-2 | +| ncbi_scrub_se | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | CL | sars-cov-2 | +| ncbi_scrub_se | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/ncbi/sra-human-scrubber:2.2.1 | Optional | CL | sars-cov-2 | +| ncbi_scrub_se | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | CL | sars-cov-2 | +| nextclade_output_parser | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | ONT, PE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| nextclade_output_parser | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | ONT, PE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, 
sars-cov-2 | +| nextclade_output_parser | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/python/python:3.8.18-slim | Optional | ONT, PE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| nextclade_output_parser | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | ONT, PE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| nextclade_v3 | **auspice_reference_tree_json** | File | An Auspice JSON phylogenetic reference tree which serves as a target for phylogenetic placement. | Inherited from nextclade dataset | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| nextclade_v3 | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| nextclade_v3 | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| nextclade_v3 | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/nextstrain/nextclade:3.3.1 | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| nextclade_v3 | **gene_annotations_gff** | File | A genome annotation to specify how to translate the nucleotide sequence to proteins (genome_annotation.gff3). specifying this enables codon-informed alignment and protein alignments. See here for more info: https://docs.nextstrain.org/projects/nextclade/en/latest/user/input-files/03-genome-annotation.html | Inherited from nextclade dataset | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| nextclade_v3 | **input_ref** | File | A nucleotide sequence which serves as a reference for the pairwise alignment of all input sequences. This is also the sequence which defines the coordinate system of the genome annotation. 
See here for more info: https://docs.nextstrain.org/projects/nextclade/en/latest/user/input-files/02-reference-sequence.html | Inherited from nextclade dataset | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| nextclade_v3 | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| nextclade_v3 | **nextclade_pathogen_json** | File | General dataset configuration file. See here for more info: https://docs.nextstrain.org/projects/nextclade/en/latest/user/input-files/05-pathogen-config.html | Inherited from nextclade dataset | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| nextclade_v3 | **verbosity** | String | other options are: "off" , "error" , "info" , "debug" , and "trace" (highest level of verbosity) | warn | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| organism_parameters | **auspice_config** | File | Auspice config file used in Augur_PHB workflow.
Defaults set for various organisms & Flu segments. A minimal auspice config file is set in cases where organism is not specified and user does not provide an optional input config file. | | Optional | Augur, CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| organism_parameters | **flu_segment** | String | Influenza genome segment being analyzed. Options: "HA" or "NA". Automatically determined. This input is ignored if provided for TheiaCoV_Illumina_SE and TheiaCoV_ClearLabs | N/A | Optional | CL, ONT, PE, SE | flu | +| organism_parameters | **flu_subtype** | String | The influenza subtype being analyzed. Options: "Yamagata", "Victoria", "H1N1", "H3N2". Automatically determined. This input is ignored if provided for TheiaCoV_Illumina_SE and TheiaCoV_ClearLabs | N/A | Optional | CL, ONT, PE, SE | flu | +| organism_parameters | **gene_locations_bed_file** | File | Use to provide locations of interest where average coverage will be calculated | Default provided for SARS-CoV-2 ("gs://theiagen-public-files-rp/terra/sars-cov-2-files/sc2_gene_locations.bed") and mpox ("gs://theiagen-public-files/terra/mpxv-files/mpox_gene_locations.bed") | Optional | CL, FASTA | | +| organism_parameters | **genome_length_input** | Int | Use to specify the expected genome length; provided by default for all supported organisms | Default provided for SARS-CoV-2 (29903), mpox (197200), WNV (11000), flu (13000), RSV-A (16000), RSV-B (16000), HIV (primer versions 1 [9181] and 2 [9840]) | Optional | CL | | +| organism_parameters | **hiv_primer_version** | String | The version of HIV primers used. Options are "https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_organism_parameters.wdl#L156" and "https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_organism_parameters.wdl#L164". 
This input is ignored if provided for TheiaCoV_Illumina_SE and TheiaCoV_ClearLabs | v1 | Optional | CL, FASTA, ONT, PE, SE | HIV | +| organism_parameters | **kraken_target_organism_input** | String | The organism whose abundance the user wants to check in their reads. This should be a proper taxonomic name recognized by the Kraken database. | Default provided for mpox (Monkeypox virus), WNV (West Nile virus), and HIV (Human immunodeficiency virus 1) | Optional | FASTA, ONT, SE | HIV, MPXV, WNV, rsv_a, rsv_b, sars-cov-2 | +| organism_parameters | **pangolin_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/pangolin:4.3.1-pdata-1.29 | Optional | CL, FASTA | | +| organism_parameters | **primer_bed_file** | File | The bed file containing the primers used when sequencing was performed | REQUIRED FOR SARS-CoV-2, MPOX, WNV, RSV-A & RSV-B. Provided by default only for HIV primer versions 1 ("gs://theiagen-public-files/terra/hivgc-files/HIV-1_v1.0.primer.hyphen.bed") and 2 ("gs://theiagen-public-files/terra/hivgc-files/HIV-1_v2.0.primer.hyphen400.1.bed") | Optional, Sometimes required | CL, FASTA | | +| organism_parameters | **reference_gff_file** | File | Reference GFF file for the organism being analyzed | Default provided for mpox ("gs://theiagen-public-files/terra/mpxv-files/Mpox-MT903345.1.reference.gff3") and HIV (primer versions 1 ["gs://theiagen-public-files/terra/hivgc-files/NC_001802.1.gff3"] and 2 ["gs://theiagen-public-files/terra/hivgc-files/AY228557.1.gff3"]) | Optional | CL, FASTA, ONT | | +| organism_parameters | **vadr_max_length** | Int | Maximum length for the fasta-trim-terminal-ambigs.pl VADR script | Default provided for SARS-CoV-2 (30000), mpox (210000), WNV (11000), flu (0), RSV-A (15500) and RSV-B (15500). 
| Optional | CL | | +| organism_parameters | **vadr_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 (RSV-A and RSV-B) and 8 (all other TheiaCoV organisms) | Optional | CL, ONT, PE, SE | MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| organism_parameters | **vadr_options** | String | Options for the v-annotate.pl VADR script | Default provided for SARS-CoV-2 ("--noseqnamemax --glsearch -s -r --nomisc --mkey sarscov2 --lowsim5seq 6 --lowsim3seq 6 --alt_fail lowscore,insertnn,deletinn --out_allfasta"), mpox ("--glsearch -s -r --nomisc --mkey mpxv --r_lowsimok --r_lowsimxd 100 --r_lowsimxl 2000 --alt_pass discontn,dupregin --out_allfasta --minimap2 --s_overhang 150"), WNV ("--mkey flavi --mdir /opt/vadr/vadr-models-flavi/ --nomisc --noprotid --out_allfasta"), flu (""), RSV-A ("-r --mkey rsv --xnocomp"), and RSV-B ("-r --mkey rsv --xnocomp") | Optional | CL | | +| organism_parameters | **vadr_skip_length** | Int | Minimum assembly length (unambiguous) to run VADR | 10000 | Optional | CL | MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| pangolin4 | **analysis_mode** | String | Pangolin inference engine for lineage designations (usher or pangolearn). Default is Usher. 
| | Optional | CL, FASTA, ONT, PE, SE | sars-cov-2 | +| pangolin4 | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | CL, FASTA, ONT, PE, SE | sars-cov-2 | +| pangolin4 | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | CL, FASTA, ONT, PE, SE | sars-cov-2 | +| pangolin4 | **expanded_lineage** | Boolean | True/False that determines if a lineage should be expanded without aliases (e.g., BA.1 → B.1.1.529.1) | TRUE | Optional | CL, FASTA, ONT, PE, SE | sars-cov-2 | +| pangolin4 | **max_ambig** | Float | The maximum proportion of Ns allowed for pangolin to attempt an assignment | 0.5 | Optional | CL, FASTA, ONT, PE, SE | sars-cov-2 | +| pangolin4 | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | CL, FASTA, ONT, PE, SE | sars-cov-2 | +| pangolin4 | **min_length** | Int | Minimum query length allowed for pangolin to attempt an assignment | 10000 | Optional | CL, FASTA, ONT, PE, SE | sars-cov-2 | +| pangolin4 | **pangolin_arguments** | String | Optional arguments for pangolin e.g. ''--skip-scorpio'' | | Optional | CL, FASTA, ONT, PE, SE | sars-cov-2 | +| pangolin4 | **skip_designation_cache** | Boolean | A True/False option that determines if the designation cache should be used | FALSE | Optional | CL, FASTA, ONT, PE, SE | sars-cov-2 | +| pangolin4 | **skip_scorpio** | Boolean | A True/False option that determines if scorpio should be skipped. 
| FALSE | Optional | CL, FASTA, ONT, PE, SE | sars-cov-2 | +| qc_check_task | **ani_highest_percent** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **ani_highest_percent_bases_aligned** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **assembly_length** | Int | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **assembly_mean_coverage** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **busco_results** | String | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **combined_mean_q_clean** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **combined_mean_q_raw** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **combined_mean_readlength_clean** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **combined_mean_readlength_raw** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **disk_size** | Int | Amount of 
storage (in GB) to allocate to the task | 100 | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-03-16 | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **est_coverage_clean** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **est_coverage_raw** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **gambit_predicted_taxon** | String | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **kraken_human** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | | +| qc_check_task | **kraken_human_dehosted** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | | +| qc_check_task | **kraken_sc2** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **kraken_sc2_dehosted** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **kraken_target_organism** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **kraken_target_organism_dehosted** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | 
**memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **midas_secondary_genus_abundance** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **midas_secondary_genus_coverage** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **minbaseq_trim** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **n50_value** | Int | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **num_reads_clean2** | Int | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, SE | | +| qc_check_task | **num_reads_raw2** | Int | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, SE | | +| qc_check_task | **number_contigs** | Int | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **quast_gc_percent** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **r1_mean_q_clean** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **r1_mean_q_raw** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **r1_mean_readlength_clean** | Float | Internal component, do 
not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **r1_mean_readlength_raw** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **r2_mean_q_clean** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **r2_mean_q_raw** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **r2_mean_readlength_clean** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **r2_mean_readlength_raw** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **sc2_s_gene_mean_coverage** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **sc2_s_gene_percent_coverage** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| quasitools_illumina_pe | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | PE | HIV | +| quasitools_illumina_pe | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | PE | HIV | +| quasitools_illumina_pe | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/quasitools:0.7.0--pyh864c0ab_1 | Optional | PE | HIV | +| quasitools_illumina_pe | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 
| Optional | PE | HIV | +| quasitools_ont | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | ONT | HIV | +| quasitools_ont | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | ONT | HIV | +| quasitools_ont | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/quasitools:0.7.0--pyh864c0ab_1 | Optional | ONT | HIV | +| quasitools_ont | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | ONT | HIV | +| quasitools_ont | **read2** | File | Internal component. Do not use. | | Do not modify, Optional | ONT | HIV | +| raw_check_reads | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| raw_check_reads | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| raw_check_reads | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/bactopia/gather_samples:2.0.2 | Optional | ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| raw_check_reads | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| read_QC_trim | **bbduk_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| read_QC_trim | **call_kraken** | Boolean | True/False variable that determines if the Kraken2 task should be called. | FALSE | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| read_QC_trim | **call_midas** | Boolean | True/False variable that determines if the MIDAS task should be called. 
| TRUE | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| read_QC_trim | **downsampling_coverage** | Float | The desired coverage to sub-sample the reads to with RASUSA | 150 | Optional | ONT | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| read_QC_trim | **fastp_args** | String | Additional fastp task arguments | --detect_adapter_for_pe -g -5 20 -3 20 | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| read_QC_trim | **kraken_db** | File | The database used to run Kraken2 | /kraken2-db | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| read_QC_trim | **kraken_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| read_QC_trim | **kraken_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| read_QC_trim | **midas_db** | File | The database used by the MIDAS task | gs://theiagen-public-files-rp/terra/theiaprok-files/midas/midas_db_v1.2.tar.gz | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| read_QC_trim | **read_processing** | String | The name of the tool to perform basic read processing; options: "trimmomatic" or "fastp" | trimmomatic | Optional | PE, SE | | +| read_QC_trim | **read_qc** | String | The tool used for quality control (QC) of reads. 
Options are fastq_scan and fastqc | fastq_scan | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| read_QC_trim | **target_organism** | String | Organism to search for in Kraken | | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| read_QC_trim | **trimmomatic_args** | String | Additional arguments to pass to trimmomatic | -phred33 | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| set_flu_ha_nextclade_values | **reference_gff_file** | File | Reference GFF file for flu HA | | Do not modify, Optional | ONT | flu | +| set_flu_na_nextclade_values | **reference_gff_file** | Int | Reference GFF file for flu NA | | Do not modify, Optional | ONT | flu | +| set_flu_na_nextclade_values | **vadr_mem** | Int | Memory, in GB, allocated to this task | 8 | Do not modify, Optional | ONT | flu | +| stats_n_coverage | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | CL, ONT | | +| stats_n_coverage | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | CL, ONT | | +| stats_n_coverage | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/samtools:1.15 | Optional | CL, ONT | | +| stats_n_coverage | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | CL, ONT | | +| stats_n_coverage_primtrim | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | CL, ONT | | +| stats_n_coverage_primtrim | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | CL, ONT | | +| stats_n_coverage_primtrim | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/samtools:1.15 | Optional | CL, ONT | | +| stats_n_coverage_primtrim | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | CL, ONT | | +| vadr | **cpu** | Int | Number of CPUs to allocate to the task 
| 2 | Optional | CL, FASTA, ONT, PE, SE | MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| vadr | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | CL, FASTA, ONT, PE, SE | MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| vadr | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/vadr:1.5.1 | Optional | CL, FASTA, ONT, PE, SE | MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| vadr | **max_length** | Int | Maximum length of contig allowed to run VADR | | Optional | CL | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| vadr | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 (RSV-A and RSV-B) and 8 (all other TheiaCoV organisms) | Optional | CL | MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| vadr | **min_length** | Int | Minimum length subsequence to possibly replace Ns for the fasta-trim-terminal-ambigs.pl VADR script | 50 | Optional | CL, FASTA, ONT, PE, SE | MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| vadr | **skip_length** | Int | Minimum assembly length (unambiguous) to run VADR | 10000 | Optional | CL | MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| vadr | **vadr_opts** | String | Additional options to provide to VADR | | Optional | CL | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | ONT, PE, SE, FASTA, CL | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | ONT, PE, SE, FASTA, CL | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **adapters** | File | File that contains the adapters used | /bbmap/resources/adapters.fa | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | 
**consensus_min_freq** | Float | The minimum frequency for a variant to be called a SNP in consensus genome | 0.6 | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **flu_segment** | String | Influenza genome segment being analyzed. Options: "HA" or "NA". | HA | Optional, Required | FASTA | | +| workflow name | **flu_subtype** | String | The influenza subtype being analyzed. Options: "Yamagata", "Victoria", "H1N1", "H3N2". Automatically determined. | | Optional | FASTA | | +| workflow name | **genome_length** | Int | Use to specify the expected genome length | | Optional | FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **max_genome_length** | Int | Maximum genome length able to pass read screening | 2673870 | Optional | ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **max_length** | Int | Maximum length for a read based on the SARS-CoV-2 primer scheme | 700 | Optional | ONT | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **medaka_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/artic-ncov2019:1.3.0-medaka-1.4.3 | Optional | CL | | +| workflow name | **min_basepairs** | Int | Minimum base pairs to pass read screening | 34000 | Optional | ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **min_coverage** | Int | Minimum coverage to pass read screening | 10 | Optional | ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **min_depth** | Int | Minimum depth of reads required to call variants and generate a consensus genome. This value is passed to the iVar software. 
| 100 | Optional | ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **min_genome_length** | Int | Minimum genome length to pass read screening | 1700 | Optional | ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **min_length** | Int | Minimum length of a read based on the SARS-CoV-2 primer scheme | 400 | Optional | ONT | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **min_proportion** | Int | Minimum read proportion to pass read screening | 40 | Optional | PE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **min_reads** | Int | Minimum reads to pass read screening | 113 | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **nextclade_dataset_name** | String | Nextclade organism dataset names. However, if organism input is set correctly, this input will be automatically assigned the corresponding dataset name. See [organism defaults](./theiacov.md#org-specific) for more information | Defaults are organism-specific. Please find default values for all organisms (and for Flu - their respective genome segments) here: https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_organism_parameters.wdl | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **nextclade_dataset_tag** | String | Nextclade dataset tag. Used for pulling up-to-date reference genomes and associated information specific to nextclade datasets (QC thresholds, organism-specific information like SARS-CoV-2 clade & lineage information, etc.) that is required for running the Nextclade tool. | Defaults are organism-specific. 
Please find default values for all organisms (and for Flu - their respective genome segments) here: https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_organism_parameters.wdl | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **normalise** | Int | Used to normalize the amount of reads to the indicated level before variant calling | 20000 for CL, 200 for ONT | Optional | CL, ONT | | +| workflow name | **organism** | String | The organism that is being analyzed. Options: "sars-cov-2", "MPXV", "WNV", "HIV", "flu", "rsv_a", "rsv_b". However, "flu" is not available for TheiaCoV_Illumina_SE | sars-cov-2 | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **pangolin_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/pangolin:4.3.1-pdata-1.29 | Do not modify, Optional | ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **phix** | File | File that contains the phix used | /bbmap/resources/phix174_ill.ref.fa.gz | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **primer_bed** | File | The bed file containing the primers used when sequencing was performed | | Optional | ONT, PE, SE | HIV, MPXV, WNV, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **qc_check_table** | File | A TSV file with optional user input QC values to be compared against the default workflow value | | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **reference_gene_locations_bed** | File | Use to provide locations of interest where average coverage will be calculated | | Optional | ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **reference_genome** | File | An optional reference genome used for consensus assembly and QC | | Optional | CL, FASTA, ONT, 
PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **reference_gff** | File | The general feature format (gff) of the reference genome. | | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **seq_method** | String | The sequencing methodology used to generate the input read data | ILLUMINA | Optional | CL, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **skip_mash** | Boolean | A True/False option that determines if mash should be skipped in the screen task. | FALSE | Optional | ONT, SE | HIV, MPXV, WNV, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **skip_screen** | Boolean | A True/False option that determines if the screen task should be skipped. | FALSE | Optional | ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **target_organism** | String | The organism whose abundance the user wants to check in their reads. This should be a proper taxonomic name recognized by the Kraken database. | | Optional | CL, ONT, PE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **trim_min_length** | Int | The minimum length of each read after trimming | 75 | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **trim_primers** | Boolean | A True/False option that determines if primers should be trimmed. 
| TRUE | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **trim_quality_min_score** | Int | The minimum quality score to keep during trimming | 30 | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **trim_window_size** | Int | Specifies window size for trimming (the number of bases to average the quality across) | 4 | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **vadr_max_length** | Int | Maximum length of contig allowed to run VADR | | Optional | FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **vadr_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 (RSV-A and RSV-B) and 8 (all other TheiaCoV organisms) | Optional | FASTA, ONT, PE, SE | MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **vadr_options** | String | Additional options to provide to VADR | | Optional | ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **vadr_opts** | String | Additional options to provide to VADR | | Optional | FASTA | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **vadr_skip_length** | Int | Minimum assembly length (unambiguous) to run VADR | 10000 | Optional | FASTA, ONT, PE, SE | MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **variant_min_freq** | Float | Minimum frequency for a variant to be reported in ivar outputs | 0.6 | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | + +??? toggle "TheiaCoV_FASTA_Batch_PHB Inputs" + + ##### TheiaCoV_FASTA_Batch Inputs {#theiacov-fasta-batch-inputs} + !!! dna "" + ??? toggle "Input Data" + + The TheiaCoV_FASTA_Batch workflow takes in a set of assembly files in FASTA format. 
+ + | **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | + |---|---|---|---|---|---| + | theiacov_fasta_batch | **assembly_fastas** | Array[File] | Genome assembly files in fasta format. Example: this.sars-cov-2-samples.assembly_fasta | | Required | + | theiacov_fasta_batch | **bucket_name** | String | The GCP bucket for the workspace where the TheiaCoV_FASTA_Batch output files are saved. We recommend using a unique GSURI for the bucket associated with your Terra workspace. The root GSURI is accessible in the Dashboard page of your workspace in the "Cloud Information" section.
Do not include the prefix "gs://" in the string.
Example: "fc-c526190d-4332-409b-8086-be7e1af9a0b6/theiacov_fasta_batch-2024-04-15-seq-run-1/" | | Required | + | theiacov_fasta_batch | **project_name** | String | The name of the Terra project where the data can be found. Example: "my-terra-project" | | Required | + | theiacov_fasta_batch | **samplenames** | Array[String] | The names of the samples to be analyzed. Example: this.sars-cov-2-samples.sars-cov-2-sample_id | | Required | + | theiacov_fasta_batch | **table_name** | String | The name of the Terra table where the data can be found. Example: "sars-cov-2-sample" | | Required | + | theiacov_fasta_batch | **workspace_name** | String | The name of the Terra workspace where the data can be found. Example: "my-terra-workspace" | | Required | + | cat_files_fasta | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | + | cat_files_fasta | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | + | cat_files_fasta | **docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/utility:1.1 | Optional | + | cat_files_fasta | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | + | nextclade_v3 | **auspice_reference_tree_json** | File | The phylogenetic reference tree which serves as a target for phylogenetic placement | default is inherited from Nextclade dataset | Optional | + | nextclade_v3 | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | + | nextclade_v3 | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | + | nextclade_v3 | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/nextstrain/nextclade:3.3.1 | Optional | + | nextclade_v3 | **gene_annotations_gff** | File | A genome annotation to specify how to translate the nucleotide sequence to proteins (genome_annotation.gff3). 
specifying this enables codon-informed alignment and protein alignments. See here for more info: https://docs.nextstrain.org/projects/nextclade/en/latest/user/input-files/03-genome-annotation.html | None | Optional | + | nextclade_v3 | **input_ref** | File | A nucleotide sequence which serves as a reference for the pairwise alignment of all input sequences. This is also the sequence which defines the coordinate system of the genome annotation. See here for more info: https://docs.nextstrain.org/projects/nextclade/en/latest/user/input-files/02-reference-sequence.html | None | Optional | + | nextclade_v3 | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | + | nextclade_v3 | **nextclade_pathogen_json** | File | General dataset configuration file. See here for more info: https://docs.nextstrain.org/projects/nextclade/en/latest/user/input-files/05-pathogen-config.html | None | Optional | + | nextclade_v3 | **verbosity** | String | other options are: "off" , "error" , "info" , "debug" , and "trace" (highest level of verbosity) | warn | Optional | + | organism_parameters | **flu_segment** | String | | | Optional | + | organism_parameters | **flu_subtype** | String | | | Optional | + | organism_parameters | **gene_locations_bed_file** | File | | | Optional | + | organism_parameters | **genome_length_input** | Int | | | Optional | + | organism_parameters | **hiv_primer_version** | String | | | Optional | + | organism_parameters | **kraken_target_organism_input** | String | | | Optional | + | organism_parameters | **primer_bed_file** | File | | | Optional | + | organism_parameters | **reference_genome** | File | | | Optional | + | organism_parameters | **reference_gff_file** | File | | | Optional | + | organism_parameters | **vadr_max_length** | Int | | | Optional | + | organism_parameters | **vadr_mem** | Int | | | Optional | + | organism_parameters | **vadr_options** | String | | | Optional | + | pangolin4 | **analysis_mode** | String 
| Used to switch between usher and pangolearn analysis modes. Only use usher because pangolearn is no longer supported as of Pangolin v4.3 and higher versions. | None | Optional | + | pangolin4 | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | + | pangolin4 | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | + | pangolin4 | **expanded_lineage** | Boolean | True/False that determines if a lineage should be expanded without aliases (e.g., BA.1 → B.1.1.529.1) | TRUE | Optional | + | pangolin4 | **max_ambig** | Float | The maximum proportion of Ns allowed for pangolin to attempt an assignment | 0.5 | Optional | + | pangolin4 | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | + | pangolin4 | **skip_designation_cache** | Boolean | True/False that determines if the designation cache should be used | FALSE | Optional | + | pangolin4 | **skip_scorpio** | Boolean | True/False that determines if scorpio should be skipped. | FALSE | Optional | + | sm_theiacov_fasta_wrangling | **cpu** | Int | Number of CPUs to allocate to the task | 8 | Optional | + | sm_theiacov_fasta_wrangling | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | + | sm_theiacov_fasta_wrangling | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-08-28-v4 | Optional | + | sm_theiacov_fasta_wrangling | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | + | theiacov_fasta_batch | **nextclade_dataset_name** | String | Nextclade organism dataset name. Options: "nextstrain/sars-cov-2/wuhan-hu-1/orfs" However, if organism input is set correctly, this input will be automatically assigned the corresponding dataset name. | sars-cov-2 | Optional | + | theiacov_fasta_batch | **nextclade_dataset_tag** | String | Nextclade dataset tag. 
Used for pulling up-to-date reference genomes and associated information specific to nextclade datasets (QC thresholds, organism-specific information like SARS-CoV-2 clade & lineage information, etc.) that is required for running the Nextclade tool. | 2024-06-13--23-42-47Z | Optional | + | theiacov_fasta_batch | **organism** | String | The organism that is being analyzed. Options: "sars-cov-2" | sars-cov-2 | Optional | + | theiacov_fasta_batch | **pangolin_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/pangolin:4.3.1-pdata-1.27 | Optional | + | version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | + | version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Organism-specific parameters and logic {#org-specific} + +The `organism_parameters` sub-workflow is the first step in all TheiaCoV workflows. This step automatically sets the different parameters needed for each downstream tool to the appropriate value for the user-designated organism (`"sars-cov-2"` is the default organism). + +!!! dna "" + The following tables include the relevant organism-specific parameters; **all of these default values can be overwritten by providing a value for the "Overwrite Variable Name" field**. + + ??? 
toggle "SARS-CoV-2 Defaults" + | **Overwrite Variable Name** | **Organism** | **Default Value** | + |---|---|---| + | gene_locations_bed_file | sars-cov-2 | `"gs://theiagen-public-files-rp/terra/sars-cov-2-files/sc2_gene_locations.bed"` | + | genome_length_input | sars-cov-2 | `29903` | + | nextclade_dataset_name_input | sars-cov-2 | `"nextstrain/sars-cov-2/wuhan-hu-1/orfs"` | + | nextclade_dataset_tag_input | sars-cov-2 | `"2024-07-17--12-57-03Z"` | + | pangolin_docker_image | sars-cov-2 | `"us-docker.pkg.dev/general-theiagen/staphb/pangolin:4.3.1-pdata-1.29"` | + | reference_genome | sars-cov-2 | `"gs://theiagen-public-files-rp/terra/augur-sars-cov-2-references/MN908947.fasta"` | + | vadr_max_length | sars-cov-2 | `30000` | + | vadr_mem | sars-cov-2 | `8` | + | vadr_options | sars-cov-2 | `"--noseqnamemax --glsearch -s -r --nomisc --mkey sarscov2 --lowsim5seq 6 --lowsim3seq 6 --alt_fail lowscore,insertnn,deletinn --out_allfasta"` | + + ??? toggle "Mpox Defaults" + | **Overwrite Variable Name** | **Organism** | **Default Value** | + |---|---|---| + | gene_locations_bed_file | MPXV | `"gs://theiagen-public-files/terra/mpxv-files/mpox_gene_locations.bed"` | + | genome_length_input | MPXV | `197200` | + | kraken_target_organism_input | MPXV | `"Monkeypox virus"` | + | nextclade_dataset_name_input | MPXV | `"nextstrain/mpox/lineage-b.1"` | + | nextclade_dataset_tag_input | MPXV | `"2024-04-19--07-50-39Z"` | + | primer_bed_file | MPXV | `"gs://theiagen-public-files/terra/mpxv-files/MPXV.primer.bed"` | + | reference_genome | MPXV | `"gs://theiagen-public-files/terra/mpxv-files/MPXV.MT903345.reference.fasta"` | + | reference_gff_file | MPXV | `"gs://theiagen-public-files/terra/mpxv-files/Mpox-MT903345.1.reference.gff3"` | + | vadr_max_length | MPXV | `210000` | + | vadr_mem | MPXV | `8` | + | vadr_options | MPXV | `"--glsearch -s -r --nomisc --mkey mpxv --r_lowsimok --r_lowsimxd 100 --r_lowsimxl 2000 --alt_pass discontn,dupregin --out_allfasta --minimap2 --s_overhang 
150"` | + + ??? toggle "WNV Defaults" + | **Overwrite Variable Name** | **Organism** | **Default Value** | **Notes** | + |---|---|---|---| + | genome_length_input | WNV | `11000` | | + | kraken_target_organism_input | WNV | `"West Nile virus"` | | + | nextclade_dataset_name_input | WNV | `"NA"` | TheiaCoV's Nextclade currently does not support WNV | + | nextclade_dataset_tag_input | WNV | `"NA"` | TheiaCoV's Nextclade currently does not support WNV | + | primer_bed_file | WNV | `"gs://theiagen-public-files/terra/theiacov-files/WNV/WNV-L1_primer.bed"` | | + | reference_genome | WNV | `"gs://theiagen-public-files/terra/theiacov-files/WNV/NC_009942.1_wnv_L1.fasta"` | | + | vadr_max_length | WNV | `11000` | | + | vadr_mem | WNV | `8` | | + | vadr_options | WNV | `"--mkey flavi --mdir /opt/vadr/vadr-models-flavi/ --nomisc --noprotid --out_allfasta"` | | + + ??? toggle "Flu Defaults" + | **Overwrite Variable Name** | **Organism** | **Flu Segment** | **Flu Subtype** | **Default Value** | **Notes** | + |---|---|---|---|---|---| + | flu_segment | flu | all | all | N/A | TheiaCoV will attempt to automatically assign a flu segment | + | flu_subtype | flu | all | all | N/A | TheiaCoV will attempt to automatically assign a flu subtype | + | genome_length_input | flu | all | all | `13500` | | + | vadr_max_length | flu | all | all | `13500` | | + | vadr_mem | flu | all | all | `8` | | + | vadr_options | flu | all | all | `"--atgonly --xnocomp --nomisc --alt_fail extrant5,extrant3 --mkey flu"` | | + | nextclade_dataset_name_input | flu | ha | h1n1 | `"nextstrain/flu/h1n1pdm/ha/MW626062"` | | + | nextclade_dataset_tag_input | flu | ha | h1n1 | `"2024-07-03--08-29-55Z"` | | + | reference_genome | flu | ha | h1n1 | `"gs://theiagen-public-files-rp/terra/flu-references/reference_h1n1pdm_ha.fasta"` | | + | nextclade_dataset_name_input | flu | ha | h3n2 | `"nextstrain/flu/h3n2/ha/EPI1857216"` | | + | nextclade_dataset_tag_input | flu | ha | h3n2 | `"2024-08-08--05-08-21Z"` | | + | 
reference_genome | flu | ha | h3n2 | `"gs://theiagen-public-files-rp/terra/flu-references/reference_h3n2_ha.fasta"` | | + | nextclade_dataset_name_input | flu | ha | victoria | `"nextstrain/flu/vic/ha/KX058884"` | | + | nextclade_dataset_tag_input | flu | ha | victoria | `"2024-07-03--08-29-55Z"` | | + | reference_genome | flu | ha | victoria | `"gs://theiagen-public-files-rp/terra/flu-references/reference_vic_ha.fasta"` | | + | nextclade_dataset_name_input | flu | ha | yamagata | `"nextstrain/flu/yam/ha/JN993010"` | | + | nextclade_dataset_tag_input | flu | ha | yamagata | `"2024-01-30--16-34-55Z"` | | + | reference_genome | flu | ha | yamagata | `"gs://theiagen-public-files-rp/terra/flu-references/reference_yam_ha.fasta"` | | + | nextclade_dataset_name_input | flu | na | h1n1 | `"nextstrain/flu/h1n1pdm/na/MW626056"` | | + | nextclade_dataset_tag_input | flu | na | h1n1 | `"2024-07-03--08-29-55Z"` | | + | reference_genome | flu | na | h1n1 | `"gs://theiagen-public-files-rp/terra/flu-references/reference_h1n1pdm_na.fasta"` | | + | nextclade_dataset_name_input | flu | na | h3n2 | `"nextstrain/flu/h3n2/na/EPI1857215"` | | + | nextclade_dataset_tag_input | flu | na | h3n2 | `"2024-04-19--07-50-39Z"` | | + | reference_genome | flu | na | h3n2 | `"gs://theiagen-public-files-rp/terra/flu-references/reference_h3n2_na.fasta"` | | + | nextclade_dataset_name_input | flu | na | victoria | `"nextstrain/flu/vic/na/CY073894"` | | + | nextclade_dataset_tag_input | flu | na | victoria | `"2024-04-19--07-50-39Z"` | | + | reference_genome | flu | na | victoria | `"gs://theiagen-public-files-rp/terra/flu-references/reference_vic_na.fasta"` | | + | nextclade_dataset_name_input | flu | na | yamagata | `"NA"` | | + | nextclade_dataset_tag_input | flu | na | yamagata | `"NA"` | | + | reference_genome | flu | na | yamagata | `"gs://theiagen-public-files-rp/terra/flu-references/reference_yam_na.fasta"` | | + + ??? 
toggle "RSV-A Defaults" + | **Overwrite Variable Name** | **Organism** | **Default Value** | + |---|---|---| + | genome_length_input | rsv_a | 16000 | + | kraken_target_organism | rsv_a | Respiratory syncytial virus | + | nextclade_dataset_name_input | rsv_a | nextstrain/rsv/a/EPI_ISL_412866 | + | nextclade_dataset_tag_input | rsv_a | 2024-08-01--22-31-31Z | + | reference_genome | rsv_a | gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_a.fasta | + | vadr_max_length | rsv_a | 15500 | + | vadr_mem | rsv_a | 32 | + | vadr_options | rsv_a | -r --mkey rsv --xnocomp | + + ??? toggle "RSV-B Defaults" + | **Overwrite Variable Name** | **Organism** | **Default Value** | + |---|---|---| + | genome_length_input | rsv_b | 16000 | + | kraken_target_organism | rsv_b | "Human orthopneumovirus" | + | nextclade_dataset_name_input | rsv_b | nextstrain/rsv/b/EPI_ISL_1653999 | + | nextclade_dataset_tag_input | rsv_b | "2024-08-01--22-31-31Z" | + | reference_genome | rsv_b | gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_b.fasta | + | vadr_max_length | rsv_b | 15500 | + | vadr_mem | rsv_b | 32 | + | vadr_options | rsv_b | -r --mkey rsv --xnocomp | + + ??? 
toggle "HIV Defaults" + | **Overwrite Variable Name** | **Organism** | **Default Value** | **Notes** | + |---|---|---|---| + | kraken_target_organism_input | HIV | Human immunodeficiency virus 1 | | + | genome_length_input | HIV-v1 | 9181 | This version of HIV originates from Oregon | + | primer_bed_file | HIV-v1 | gs://theiagen-public-files/terra/hivgc-files/HIV-1_v1.0.primer.hyphen.bed | This version of HIV originates from Oregon | + | reference_genome | HIV-v1 | gs://theiagen-public-files/terra/hivgc-files/NC_001802.1.fasta | This version of HIV originates from Oregon | + | reference_gff_file | HIV-v1 | gs://theiagen-public-files/terra/hivgc-files/NC_001802.1.gff3 | This version of HIV originates from Oregon | + | genome_length_input | HIV-v2 | 9840 | This version of HIV originates from Southern Africa | + | primer_bed_file | HIV-v2 | gs://theiagen-public-files/terra/hivgc-files/HIV-1_v2.0.primer.hyphen400.1.bed | This version of HIV originates from Southern Africa | + | reference_genome | HIV-v2 | gs://theiagen-public-files/terra/hivgc-files/AY228557.1.headerchanged.fasta | This version of HIV originates from Southern Africa | + | reference_gff_file | HIV-v2 | gs://theiagen-public-files/terra/hivgc-files/AY228557.1.gff3 | This version of HIV originates from Southern Africa | + +### Workflow Tasks + +All input reads are processed through "core tasks" in the TheiaCoV Illumina, ONT, and ClearLabs workflows. These undertake read trimming and assembly appropriate to the input data type. TheiaCoV workflows subsequently launch default genome characterization modules for quality assessment, and additional taxa-specific characterization steps. When setting up the workflow, users may choose to use "optional tasks" as additions or alternatives to tasks run in the workflow by default. + +#### Core tasks + +!!! tip "" + These tasks are performed regardless of organism, and perform read trimming and various quality control steps. + +??? 
task "`versioning`: Version capture for TheiaCoV" + + The `versioning` task captures the workflow version from the GitHub (code repository) version. + + !!! techdetails "Version Capture Technical details" + + | | Links | + | --- | --- | + | Task | [task_versioning.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/task_versioning.wdl) | + +??? task "`screen`: Total Raw Read Quantification and Genome Size Estimation" + + The [`screen`](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_screen.wdl) task ensures the quantity of sequence data is sufficient to undertake genomic analysis. It uses bash commands for quantification of reads and base pairs, and [mash](https://mash.readthedocs.io/en/latest/index.html) sketching to estimate the genome size and its coverage. At each step, the results are assessed relative to pass/fail criteria and thresholds that may be defined by optional user inputs. Samples that do not meet these criteria will not be processed further by the workflow: + + 1. Total number of reads: A sample will fail the read screening task if its total number of reads is less than or equal to `min_reads`. + 2. The proportion of basepairs in the forward and reverse read files: A sample will fail the read screening if fewer than `min_proportion` basepairs are in either the read1 or read2 files. + 3. Number of basepairs: A sample will fail the read screening if there are fewer than `min_basepairs` basepairs + 4. Estimated genome size: A sample will fail the read screening if the estimated genome size is smaller than `min_genome_size` or bigger than `max_genome_size`. + 5. Estimated genome coverage: A sample will fail the read screening if the estimated genome coverage is less than the `min_coverage`. + + Read screening is undertaken on both the raw and cleaned reads. The task may be skipped by setting the `skip_screen` variable to true. 
+ + Default values vary between the PE and SE workflow. The rationale for these default values can be found below. + + | Variable | Rationale | + | --- | --- | + | `skip_screen` | Prevent the read screen from running, saving wasted compute resources on insufficient data | + | `min_reads` | Minimum number of base pairs for 10x coverage of the Hepatitis delta (of the *Deltavirus* genus) virus divided by 300 (longest Illumina read length) | + | `min_basepairs` | Greater than 10x coverage of the Hepatitis delta (of the *Deltavirus* genus) virus | + | `min_genome_size` | Based on the Hepatitis delta (of the *Deltavirus* genus) genome- the smallest viral genome as of 2024-04-11 (1,700 bp) | + | `max_genome_size` | Based on the *Pandoravirus salinus* genome, the biggest viral genome, (2,673,870 bp) with 2 Mbp added | + | `min_coverage` | A bare-minimum coverage for genome characterization. Higher coverage would be required for high-quality phylogenetics. | + | `min_proportion` | Greater than 50% reads are in the read1 file; others are in the read2 file | + + !!! techdetails "Screen Technical Details" + + There is a single WDL task for read screening. The `screen` task is run twice, once for raw reads and once for clean reads. + + | | Links | + | --- | --- | + | Task | [task_screen.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_screen.wdl) | + +??? task "`read_QC_trim_pe` and `read_QC_trim_se`: Read Quality Trimming, Host and Adapter Removal, Quantification, and Identification ==_for Illumina workflows_==" + + `read_QC_trim` is a sub-workflow within TheiaCoV that removes low-quality reads, low-quality regions of reads, and sequencing adapters to improve data quality. It uses a number of tasks, described below. The differences between TheiaCoV PE and SE in the `read_QC_trim` sub-workflow lie in the default parameters, the use of two or one input read file(s), and the different output files. + + ??? 
toggle "Host removal" + + All reads of human origin **are removed**, including their mates, by using NCBI's [**human read removal tool (HRRT)**](https://github.com/ncbi/sra-human-scrubber). + + HRRT is based on the [SRA Taxonomy Analysis Tool](https://doi.org/10.1186/s13059-021-02490-0) and employs a k-mer database constructed of k-mers from Eukaryota derived from all human RefSeq records with any k-mers found in non-Eukaryota RefSeq records subtracted from the database. + + !!! techdetails "NCBI-Scrub Technical Details" + + | | Links | + | --- | --- | + | Task | [task_ncbi_scrub.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_ncbi_scrub.wdl) | + | Software Source Code | [NCBI Scrub on GitHub](https://github.com/ncbi/sra-human-scrubber) | + | Software Documentation | | + + ??? toggle "Read quality trimming" + + Either `trimmomatic` or `fastp` can be used for read-quality trimming. Trimmomatic is used by default. Both tools trim low-quality regions of reads with a sliding window (with a window size of `trim_window_size`), cutting once the average quality within the window falls below `trim_quality_trim_score`. They will both discard the read if it is trimmed below `trim_minlen`. + + If fastp is selected for analysis, fastp also implements the additional read-trimming steps indicated below: + + | **Parameter** | **Explanation** | + | --- | --- | + | -g | enables polyG tail trimming | + | -5 20 | enables read end-trimming | + | -3 20 | enables read end-trimming | + | --detect_adapter_for_pe | enables adapter-trimming **only for paired-end reads** | + + ??? toggle "Adapter removal" + + The `BBDuk` task removes adapters from sequence reads. 
To do this: + + - [Repair](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/repair-guide/) from the [BBTools](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/) package reorders reads in paired fastq files to ensure the forward and reverse reads of a pair are in the same position in the two fastq files. + - [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) (*"Bestus Bioinformaticus" Decontamination Using Kmers*) is then used to trim the adapters and filter out all reads that have a 31-mer match to [PhiX](https://emea.illumina.com/products/by-type/sequencing-kits/cluster-gen-sequencing-reagents/phix-control-v3.html), which is commonly added to Illumina sequencing runs to monitor and/or improve overall run quality. + + ??? toggle "What are adapters and why do they need to be removed?" + Adapters are manufactured oligonucleotide sequences attached to DNA fragments during the library preparation process. In Illumina sequencing, these adapter sequences are required for attaching reads to flow cells. You can read more about Illumina adapters [here](https://emea.support.illumina.com/bulletins/2020/06/illumina-adapter-portfolio.html). For genome analysis, it's important to remove these sequences since they're not actually from your sample. If you don't remove them, the downstream analysis may be affected. + + ??? toggle "Read Quantification" + + There are two methods for read quantification to choose from: [`fastq-scan`](https://github.com/rpetit3/fastq-scan) (default) or [`fastqc`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). Both quantify the forward and reverse reads in FASTQ files. In TheiaProk_Illumina_PE, they also provide the total number of read pairs. This task is run once with raw reads as input and once with clean reads as input. If QC has been performed correctly, you should expect **fewer** clean reads than raw reads. 
`fastqc` also provides a graphical visualization of the read quality. + + ??? toggle "Read Identification" + + Kraken2 is a bioinformatics tool originally designed for metagenomic applications. It has additionally proven valuable for validating taxonomic assignments and checking contamination of single-species (e.g. bacterial isolate, eukaryotic isolate, viral isolate, etc.) whole genome sequence data. + + Kraken2 is run on the set of raw reads, provided as input, as well as the set of clean reads that are resulted from the `read_QC_trim` workflow + + !!! info "Database-dependent" + TheiaCoV automatically uses a viral-specific Kraken2 database. + + !!! techdetails "Kraken2 Technical Details" + + | | Links | + | --- | --- | + | Task | [task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/task_kraken2.wdl) | + | Software Source Code | [Kraken2 on GitHub](https://github.com/DerrickWood/kraken2/) | + | Software Documentation | | + | Original Publication(s) | [Improved metagenomic analysis with Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | + + !!! techdetails "read_QC_trim Technical Details" + + | | Links | + | --- | --- | + | Sub-workflow | [wf_read_QC_trim.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/workflows/wf_read_QC_trim.wdl) | + | Tasks | [task_fastp.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_fastp.wdl)
[task_trimmomatic.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_trimmomatic.wdl)
[task_bbduk.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_bbduk.wdl)
[task_fastq_scan.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_fastq_scan.wdl)
[task_midas.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/taxon_id/task_midas.wdl)
[task_kraken2.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/taxon_id/task_kraken2.wdl) | + | Software Source Code | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](https://github.com/usadellab/Trimmomatic); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2)| + | Software Documentation | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic); [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2/wiki) | + | Original Publication(s) | *[Trimmomatic: a flexible trimmer for Illumina sequence data](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4103590/)
*[fastp: an ultra-fast all-in-one FASTQ preprocessor](https://academic.oup.com/bioinformatics/article/34/17/i884/5093234?login=false)
*[An integrated metagenomics pipeline for strain profiling reveals novel patterns of bacterial transmission and biogeography](https://pubmed.ncbi.nlm.nih.gov/27803195/)
*[Improved metagenomic analysis with Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | + +??? task "`read_QC_trim_ONT`: Read Quality Trimming, Host Removal, and Identification ==_for ONT data_==" + + `read_QC_trim` is a sub-workflow within TheiaCoV that removes low-quality reads, low-quality regions of reads, and sequencing adapters to improve data quality. It uses a number of tasks, described below. + + ??? toggle "Host removal" + + All reads of human origin **are removed**, including their mates, by using NCBI's [**human read removal tool (HRRT)**](https://github.com/ncbi/sra-human-scrubber). + + HRRT is based on the [SRA Taxonomy Analysis Tool](https://doi.org/10.1186/s13059-021-02490-0) and employs a k-mer database constructed of k-mers from Eukaryota derived from all human RefSeq records with any k-mers found in non-Eukaryota RefSeq records subtracted from the database. + + !!! techdetails "NCBI-Scrub Technical Details" + + | | Links | + | --- | --- | + | Task | [task_ncbi_scrub.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_ncbi_scrub.wdl) | + | Software Source Code | [NCBI Scrub on GitHub](https://github.com/ncbi/sra-human-scrubber) | + | Software Documentation | | + + ??? toggle "Read quality filtering" + + Read filtering is performed using `artic guppyplex` which performs a quality check by filtering the reads by length to remove chimeric reads. + + ??? toggle "Read Identification" + + Kraken2 is a bioinformatics tool originally designed for metagenomic applications. It has additionally proven valuable for validating taxonomic assignments and checking contamination of single-species (e.g. bacterial isolate, eukaryotic isolate, viral isolate, etc.) whole genome sequence data. + + Kraken2 is run on the set of raw reads, provided as input, as well as the set of clean reads that are resulted from the `read_QC_trim` workflow + + !!! 
info "Database-dependent" + TheiaCoV automatically uses a viral-specific Kraken2 database. + + !!! techdetails "Kraken2 Technical Details" + + | | Links | + | --- | --- | + | Task | [task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/task_kraken2.wdl) | + | Software Source Code | [Kraken2 on GitHub](https://github.com/DerrickWood/kraken2/) | + | Software Documentation | | + | Original Publication(s) | [Improved metagenomic analysis with Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | + + !!! techdetails "read_QC_trim Technical Details" + + Each TheiaCoV workflow calls a sub-workflow listed below, which then calls the individual tasks: + + | Workflow | TheiaCoV_ONT | + | --- | --- | + | Sub-workflow | [wf_read_QC_trim_ont.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim_ont.wdl) | + | Tasks | [task_ncbi_scrub.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_ncbi_scrub.wdl#L68) (SE subtask)
[task_artic_guppyplex.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_artic_guppyplex.wdl)
[task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_kraken2.wdl#L3)| + | Software Source Code | [NCBI Scrub on GitHub](https://github.com/ncbi/sra-human-scrubber)
[Artic on GitHub](https://github.com/artic-network/fieldbioinformatics)
[Kraken2 on GitHub](https://github.com/DerrickWood/kraken2/) | + | Software Documentation | [NCBI Scrub]()
[Artic pipeline](https://artic.readthedocs.io/en/latest/?badge=latest)
[Kraken2](https://github.com/DerrickWood/kraken2/wiki) | + | Original Publication(s) | [*STAT: a fast, scalable, MinHash-based *k*-mer tool to assess Sequence Read Archive next-generation sequence submissions](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02490-0)
*[Improved metagenomic analysis with Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | + +#### Assembly tasks + +!!! tip "" + Either one of these tasks is run depending on the organism and workflow type. + +??? toggle "`ivar_consensus`: Alignment, Consensus, Variant Detection, and Assembly Statistics ==_for non-flu organisms in Illumina workflows_==" + + `ivar_consensus` is a sub-workflow within TheiaCoV that performs reference-based consensus assembly using the [iVar](https://andersen-lab.github.io/ivar/html/index.html) tool by Nathan Grubaugh from the Andersen lab. + + The following steps are performed as part of this sub-workflow: + + 1. Cleaned reads are aligned to the appropriate reference genome (see also the [*organism-specific parameters and logic*](./theiacov.md#org-specific) section above) using [BWA](http://bio-bwa.sourceforge.net/) to generate a Binary Alignment Mapping (BAM) file. + 2. If `trim_primers` is set to true, primers will be removed using `ivar trim`. + 1. General statistics about the remaining reads are calculated. + 3. The `ivar consensus` command is run to generate a consensus assembly. + 4. General statistics about the assembly are calculated.. + + !!! techdetails "iVar Consensus Technical Details" + | Workflow | TheiaCoV_Illumina_PE & TheiaCoV_Illumina_SE | + | --- | --- | + | Sub-workflow | [wf_ivar_consensus.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_ivar_consensus.wdl) | + | Tasks | [task_bwa.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/alignment/task_bwa.wdl)
[task_ivar_primer_trim.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_ivar_primer_trim.wdl)
[task_assembly_metrics.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_assembly_metrics.wdl)
[task_ivar_variant_call.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_ivar_variant_call.wdl)
[task_ivar_consensus.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_ivar_consensus.wdl) | + | Software Source Code | [BWA on GitHub](https://github.com/lh3/bwa), [iVar on GitHub](https://andersen-lab.github.io/ivar/html/) | + | Software Documentation | [BWA on SourceForge](https://bio-bwa.sourceforge.net/), [iVar on GitHub](https://andersen-lab.github.io/ivar/html/) | + | Original Publication(s) | [*Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM](https://doi.org/10.48550/arXiv.1303.3997)
[*An amplicon-based sequencing framework for accurately measuring intrahost virus diversity using PrimalSeq and iVar](http://dx.doi.org/10.1186/s13059-018-1618-7) | + +??? toggle "`artic_consensus`: Alignment, Primer Trimming, Variant Detection, and Consensus ==_for non-flu organisms in ONT & ClearLabs workflows_==" + + Briefly, input reads are aligned to the appropriate reference with [minimap2](https://github.com/lh3/minimap2) to generate a Binary Alignment Mapping ([BAM](https://en.wikipedia.org/wiki/Binary_Alignment_Map)) file. Primer sequences are then removed from the BAM file and a consensus assembly file is generated using the [Artic minion](https://artic.readthedocs.io/en/latest/commands/#basecaller) Medaka argument. + + !!! info "" + Read-trimming is performed on raw read data generated on the ClearLabs instrument and thus not a required step in the TheiaCoV_ClearLabs workflow. + + General statistics about the assembly are generated with the `consensus_qc` task ([task_assembly_metrics.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_assembly_metrics.wdl)). + + !!! techdetails "Artic Consensus Technical Details" + | | Links | + | --- | --- | + | Task | [task_artic_consensus.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_artic_consensus.wdl) | + | Software Source Code | [Artic on GitHub](https://github.com/artic-network/fieldbioinformatics) | + | Software Documentation | [Artic pipeline](https://artic.readthedocs.io/en/latest/?badge=latest) | + +??? toggle "`irma`: Assembly and Characterization ==_for flu in TheiaCoV_Illumina_PE & TheiaCoV_ONT_==" + + Cleaned reads are assembled using `irma` which does not use a reference due to the rapid evolution and high variability of influenza. `irma` also performs typing and subtyping as part of the assembly process. 
+ + General statistics about the assembly are generated with the `consensus_qc` task ([task_assembly_metrics.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_assembly_metrics.wdl)). + + !!! techdetails "IRMA Technical Details" + | | Links | + | --- | --- | + | Task | [task_irma.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_irma.wdl) | + | Software Documentation | [IRMA website](https://wonder.cdc.gov/amd/flu/irma/) | + | Original Publication(s) | [*Viral deep sequencing needs an adaptive approach: IRMA, the iterative refinement meta-assembler](https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-016-3030-6) | + +#### Organism-specific characterization tasks {#org-specific-tasks} + +!!! dna "" + + The following tasks only run for the appropriate organism designation. The following table illustrates which characterization tools are run for the indicated organism. + + | | SARS-CoV-2 | MPXV | HIV | WNV | Influenza | RSV-A | RSV-B | + | --- | --- | --- | --- | --- | --- | --- | --- | + | Pangolin | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | + | Nextclade | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | + | VADR | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ✅ | + | Quasitools HyDRA | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | + | IRMA | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | + | Abricate | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | + | % Gene Coverage | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | + | Antiviral Detection | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | + | GenoFLU | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | + +??? task "`pangolin`" + + Pangolin designates SARS-CoV-2 lineage assignments. + + !!! 
techdetails "Pangolin Technical Details" + + | | Links | + | --- | --- | + | Task | [task_pangolin.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/betacoronavirus/task_pangolin.wdl) | + | Software Source Code | [Pangolin on GitHub](https://github.com/cov-lineages/pangolin) | + | Software Documentation | [Pangolin website](https://cov-lineages.org/resources/pangolin.html) | + +??? task "`nextclade`" + + ["Nextclade is an open-source project for viral genome alignment, mutation calling, clade assignment, quality checks and phylogenetic placement."](https://docs.nextstrain.org/projects/nextclade/en/stable/) + + !!! techdetails "Nextclade Technical Details" + + | | Links | + | --- | --- | + | Task | [task_nextclade.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/task_nextclade.wdl#L63) | + | Software Source Code | | + | Software Documentation | [Nextclade](https://docs.nextstrain.org/projects/nextclade/en/stable/) | + | Original Publication(s) | [Nextclade: clade assignment, mutation calling and quality control for viral genomes.](https://doi.org/10.21105/joss.03773) | + +??? task "`vadr`" + + VADR annotates and validates completed assembly files. + + !!! techdetails "VADR Technical Details" + + | | Links | + | --- | --- | + | Task | [task_vadr.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/advanced_metrics/task_vadr.wdl) | + | Software Source Code | | + | Software Documentation | | + | Original Publication(s) | For SARS-CoV-2: *[Faster SARS-CoV-2 sequence validation and annotation for GenBank using VADR](https://doi.org/10.1093/nargab/lqad002)*
For non-SARS-CoV-2: [*VADR: validation and annotation of virus sequence submissions to GenBank*](https://doi.org/10.1186/s12859-020-3537-3) | + +??? task "`quasitools`" + + `quasitools` performs genome characterization for HIV. + + !!! techdetails "Quasitools Technical Details" + + | | Links | + | --- | --- | + | Task | [task_quasitools.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/lentivirus/task_quasitools.wdl) | + | Software Source Code | | + | Software Documentation | [Quasitools HyDRA](https://phac-nml.github.io/quasitools/hydra/) | + +??? task "`irma`" + + IRMA assigns types and subtype/lineages in addition to performing assembly of flu genomes. Please see the section above under "Assembly tasks" to find more information regarding this tool. + + !!! techdetails "IRMA Technical Details" + | | Links | + | --- | --- | + | Task | [task_irma.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_irma.wdl) | + | Software Documentation | [IRMA website](https://wonder.cdc.gov/amd/flu/irma/) | + | Original Publication(s) | [*Viral deep sequencing needs an adaptive approach: IRMA, the iterative refinement meta-assembler*](https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-016-3030-6) | + +??? task "`abricate`" + + Abricate assigns types and subtype/lineages for flu samples. + + !!! techdetails "Abricate Technical Details" + | | Links | + | --- | --- | + | Task | [task_abricate.wdl (abricate_flu subtask)](https://github.com/theiagen/public_health_bioinformatics/blob/2dff853defc6ea540a058873f6fe6a78cc2350c7/tasks/gene_typing/drug_resistance/task_abricate.wdl#L59) | + | Software Source Code | [ABRicate on GitHub](https://github.com/tseemann/abricate) | + | Software Documentation | [ABRicate on GitHub](https://github.com/tseemann/abricate) | + +??? task "`gene_coverage`" + + This task calculates the percent of the gene covered above a minimum depth. 
By default, it runs for SARS-CoV-2 and MPXV, but if a bed file is provided with regions of interest, this task will be run for other organisms as well. + + !!! techdetails "Gene Coverage Technical Details" + | | Links | + | --- | --- | + | Task | [task_gene_coverage.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_gene_coverage.wdl) | + +??? task "`flu_antiviral_substitutions`" + + This sub-workflow determines which, if any, antiviral mutations are present in the sample. + + The assembled HA, NA, PA, PB1 and PB2 segments are compared against [a list of known amino-acid substitutions associated with resistance](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/task_flu_antiviral_subs.wdl) to the antivirals A_315675, compound_367, Favipiravir, Fludase, L_742_001, Laninamivir, Oseltamivir (tamiflu), Peramivir, Pimodivir, Xofluza, and Zanamivir. The list of known antiviral amino acid substitutions can be expanded via optional user input `antiviral_aa_subs` in the format "`NA:V95A,HA:I97V`", i.e. `Protein:AAPositionAA`. + + !!! techdetails "Antiviral Substitutions Technical Details" + | | Links | + | --- | --- | + | Workflow | [wf_influenza_antiviral_substitutions.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_influenza_antiviral_substitutions.wdl) | + +??? task "`genoflu`" + + This sub-workflow determines the whole-genome genotype of an H5N1 flu sample. + + !!! 
techdetails "GenoFLU Technical Details" + + | | Links | + | --- | --- | + | Task | [task_genoflu.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/orthomyxoviridae/task_genoflu.wdl) | + | Software Source Code | [GenoFLU on GitHub](https://github.com/USDA-VS/GenoFLU) | + +### Outputs + +All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) + +| **Variable** | **Type** | **Description** | **Workflow** | +|---|---|---|---| +| abricate_flu_database | String | ABRicate database used for analysis | FASTA, ONT, PE | +| abricate_flu_results | File | File containing all results from ABRicate | FASTA, ONT, PE | +| abricate_flu_subtype | String | Flu subtype as determined by ABRicate | FASTA, ONT, PE | +| abricate_flu_type | String | Flu type as determined by ABRicate | FASTA, ONT, PE | +| abricate_flu_version | String | Version of ABRicate | FASTA, ONT, PE | +| aligned_bai | File | Index companion file to the bam file generated during the consensus assembly process | CL, ONT, PE, SE | +| aligned_bam | File | Primer-trimmed BAM file; generated during consensus assembly process | CL, ONT, PE, SE | +| artic_docker | String | Docker image utilized for read trimming and consensus genome assembly | CL, ONT | +| artic_version | String | Version of the Artic software utilized for read trimming and consensus genome assembly | CL, ONT | +| assembly_fasta | File | Consensus genome assembly; for lower quality flu samples, the output may state "Assembly could not be generated" when there is too little and/or too low quality data for IRMA to produce an assembly | CL, ONT, PE, SE | +| assembly_length_unambiguous | Int | Number of unambiguous basecalls within the consensus assembly | CL, FASTA, ONT, PE, SE | +| assembly_mean_coverage | Float | Mean sequencing depth throughout the consensus assembly. 
Generated after performing primer trimming and calculated using the SAMtools coverage command | CL, ONT, PE, SE | +| assembly_method | String | Method employed to generate consensus assembly | CL, FASTA, ONT, PE, SE | +| auspice_json | File | Auspice-compatible JSON output generated from Nextclade analysis that includes the Nextclade default samples for clade-typing and the single sample placed on this tree | CL, FASTA, ONT, PE, SE | +| auspice_json_flu_ha | File | Auspice-compatible JSON output generated from Nextclade analysis on Influenza HA segment that includes the Nextclade default samples for clade-typing and the single sample placed on this tree | ONT, PE | +| auspice_json_flu_na | File | Auspice-compatible JSON output generated from Nextclade analysis on Influenza NA segment that includes the Nextclade default samples for clade-typing and the single sample placed on this tree | ONT, PE | +| bbduk_docker | String | Docker image used to run BBDuk | PE, SE | +| bwa_version | String | Version of BWA used to map read data to the reference genome | PE, SE | +| consensus_flagstat | File | Output from the SAMtools flagstat command to assess quality of the alignment file (BAM) | CL, ONT, PE, SE | +| consensus_n_variant_min_depth | Int | Minimum read depth to call variants for iVar consensus and iVar variants | PE, SE | +| consensus_stats | File | Output from the SAMtools stats command to assess quality of the alignment file (BAM) | CL, ONT, PE, SE | +| est_coverage_clean | Float | Estimated coverage of the clean reads | ONT | +| est_coverage_raw | Float | Estimated coverage of the raw reads | ONT | +| est_percent_gene_coverage_tsv | File | Percent coverage for each gene in the organism being analyzed (depending on the organism input) | CL, ONT, PE, SE | +| fastp_html_report | File | HTML report for fastp | PE, SE | +| fastp_version | String | Fastp version used | PE, SE | +| fastq_scan_num_reads_clean_pairs | String | Number of paired reads after filtering as 
determined by fastq_scan | PE | +| fastq_scan_num_reads_clean1 | Int | Number of forward reads after filtering as determined by fastq_scan | CL, PE, SE | +| fastq_scan_num_reads_clean2 | Int | Number of reverse reads after filtering as determined by fastq_scan | PE | +| fastq_scan_num_reads_raw_pairs | String | Number of paired reads identified in the input fastq files as determined by fastq_scan | PE | +| fastq_scan_num_reads_raw1 | Int | Number of forward reads identified in the input fastq files as determined by fastq_scan | CL, PE, SE | +| fastq_scan_num_reads_raw2 | Int | Number of reverse reads identified in the input fastq files as determined by fastq_scan | PE | +| fastq_scan_r1_mean_q_clean | Float | Forward read mean quality value after quality trimming and adapter removal | | +| fastq_scan_r1_mean_q_raw | Float | Forward read mean quality value before quality trimming and adapter removal | | +| fastq_scan_r1_mean_readlength_clean | Float | Forward read mean read length value after quality trimming and adapter removal | | +| fastq_scan_r1_mean_readlength_raw | Float | Forward read mean read length value before quality trimming and adapter removal | | +| fastq_scan_version | String | Version of fastq_scan used for read QC analysis | CL, PE, SE | +| fastqc_clean1_html | File | Graphical visualization of clean forward read quality from fastqc to open in an internet browser | PE, SE | +| fastqc_clean2_html | File | Graphical visualization of clean reverse read quality from fastqc to open in an internet browser | PE | +| fastqc_docker | String | Docker container used for fastqc | PE, SE | +| fastqc_num_reads_clean_pairs | String | Number of read pairs after cleaning by fastqc | PE | +| fastqc_num_reads_clean1 | Int | Number of forward reads after cleaning by fastqc | PE, SE | +| fastqc_num_reads_clean2 | Int | Number of reverse reads after cleaning by fastqc | PE | +| fastqc_num_reads_raw_pairs | Int | Number of raw read pairs as computed by fastqc | PE | +| 
fastqc_num_reads_raw1 | Int | Number of raw forward/facing reads as computed by fastqc | PE, SE | +| fastqc_num_reads_raw2 | Int | Number of raw reverse-facing reads as computed by fastqc | PE | +| fastqc_raw1_html | File | Graphical visualization of raw forward read quality from fastqc to open in an internet browser | PE, SE | +| fastqc_raw2_html | File | Graphical visualization of raw reverse read quality from fastqc to open in an internet browser | PE | +| fastqc_version | String | Version of fastqc software used | PE, SE | +| flu_A_315675_resistance | String | resistance mutations to A_315675 | ONT, PE | +| flu_amantadine_resistance | String | resistance mutations to amantadine | ONT, PE | +| flu_compound_367_resistance | String | resistance mutations to compound_367 | ONT, PE | +| flu_favipiravir_resistance | String | resistance mutations to favipiravir | ONT, PE | +| flu_fludase_resistance | String | resistance mutations to fludase | ONT, PE | +| flu_L_742_001_resistance | String | resistance mutations to L_742_001 | ONT, PE | +| flu_laninamivir_resistance | String | resistance mutations to laninamivir | ONT, PE | +| flu_oseltamivir_resistance | String | resistance mutations to oseltamivir (Tamiflu®) | ONT, PE | +| flu_peramivir_resistance | String | resistance mutations to peramivir (Rapivab®) | ONT, PE | +| flu_pimodivir_resistance | String | resistance mutations to pimodivir | ONT, PE | +| flu_rimantadine_resistance | String | resistance mutations to rimantadine | ONT, PE | +| flu_xofluza_resistance | String | resistance mutations to xofluza (Baloxavir marboxil) | ONT, PE | +| flu_zanamivir_resistance | String | resistance mutations to zanamivir (Relenza®) | ONT, PE | +| genoflu_all_segments | String | The genotypes for each individual flu segment | FASTA, ONT, PE | +| genoflu_genotype | String | The genotype of the whole genome, based off of the individual segments types | FASTA, ONT, PE | +| genoflu_output_tsv | File | The output file from GenoFLU | 
FASTA, ONT, PE | +| genoflu_version | String | The version of GenoFLU used | FASTA, ONT, PE | +| irma_docker | String | Docker image used to run IRMA | ONT, PE | +| irma_ha_segment_fasta | File | HA (Haemagglutinin) assembly fasta file | ONT, PE | +| irma_mp_segment_fasta | File | MP (Matrix Protein) assembly fasta file | ONT, PE | +| irma_na_segment_fasta | File | NA (Neuraminidase) assembly fasta file | ONT, PE | +| irma_np_segment_fasta | File | NP (Nucleoprotein) assembly fasta file | ONT, PE | +| irma_ns_segment_fasta | File | NS (Nonstructural) assembly fasta file | ONT, PE | +| irma_pa_segment_fasta | File | PA (Polymerase acidic) assembly fasta file | ONT, PE | +| irma_pb1_segment_fasta | File | PB1 (Polymerase basic 1) assembly fasta file | ONT, PE | +| irma_pb2_segment_fasta | File | PB2 (Polymerase basic 2) assembly fasta file | ONT, PE | +| irma_subtype | String | Flu subtype as determined by IRMA | ONT, PE | +| irma_subtype_notes | String | Helpful note to user about Flu B subtypes. Output will be blank for Flu A samples. For Flu B samples it will state: "IRMA does not differentiate Victoria and Yamagata Flu B lineages. 
See abricate_flu_subtype output column" | ONT, PE | +| irma_type | String | Flu type as determined by IRMA | ONT, PE | +| irma_version | String | Version of IRMA used | ONT, PE | +| ivar_tsv | File | Variant descriptor file generated by iVar variants | PE, SE | +| ivar_variant_proportion_intermediate | String | The proportion of variants of intermediate frequency | PE, SE | +| ivar_variant_version | String | Version of iVar for running the iVar variants command | PE, SE | +| ivar_vcf | File | iVar tsv output converted to VCF format | PE, SE | +| ivar_version_consensus | String | Version of iVar for running the iVar consensus command | PE, SE | +| ivar_version_primtrim | String | Version of iVar for running the iVar trim command | PE, SE | +| kraken_human | Float | Percent of human read data detected using the Kraken2 software | CL, ONT, PE, SE | +| kraken_human_dehosted | Float | Percent of human read data detected using the Kraken2 software after host removal | CL, ONT, PE | +| kraken_report | File | Full Kraken report | CL, ONT, PE, SE | +| kraken_report_dehosted | File | Full Kraken report after host removal | CL, ONT, PE | +| kraken_sc2 | Float | Percent of SARS-CoV-2 read data detected using the Kraken2 software | CL, ONT, PE, SE | +| kraken_sc2_dehosted | Float | Percent of SARS-CoV-2 read data detected using the Kraken2 software after host removal | CL, ONT, PE | +| kraken_target_organism | String | Percent of target organism read data detected using the Kraken2 software | CL, ONT, PE, SE | +| kraken_target_organism_dehosted | String | Percent of target organism read data detected using the Kraken2 software after host removal | CL, ONT, PE | +| kraken_target_organism_name | String | The name of the target organism; e.g., "Monkeypox" or "Human immunodeficiency virus" | CL, ONT, PE, SE | +| kraken_version | String | Version of Kraken software used | CL, ONT, PE, SE | +| meanbaseq_trim | Float | Mean quality of the nucleotide basecalls aligned to the reference 
genome after primer trimming | CL, ONT, PE, SE | +| meanmapq_trim | Float | Mean quality of the mapped reads to the reference genome after primer trimming | CL, ONT, PE, SE | +| medaka_reference | String | Reference sequence used in medaka task | CL, ONT | +| medaka_vcf | File | A VCF file containing the identified variants | ONT | +| nanoplot_docker | String | Docker image used to run Nanoplot | ONT | +| nanoplot_html_clean | File | An HTML report describing the clean reads | ONT | +| nanoplot_html_raw | File | An HTML report describing the raw reads | ONT | +| nanoplot_num_reads_clean1 | Float | Number of clean reads | ONT | +| nanoplot_num_reads_raw1 | Float | Number of raw reads | ONT | +| nanoplot_r1_est_coverage_clean | Float | Estimated coverage on the clean reads by nanoplot | ONT | +| nanoplot_r1_est_coverage_raw | Float | Estimated coverage on the raw reads by nanoplot | ONT | +| nanoplot_r1_mean_q_clean | Float | Mean quality score of clean forward reads | ONT | +| nanoplot_r1_mean_q_raw | Float | Mean quality score of raw forward reads | ONT | +| nanoplot_r1_mean_readlength_clean | Float | Mean read length of clean forward reads | ONT | +| nanoplot_r1_mean_readlength_raw | Float | Mean read length of raw forward reads | ONT | +| nanoplot_r1_median_q_clean | Float | Median quality score of clean forward reads | ONT | +| nanoplot_r1_median_q_raw | Float | Median quality score of raw forward reads | ONT | +| nanoplot_r1_median_readlength_clean | Float | Median read length of clean forward reads | ONT | +| nanoplot_r1_median_readlength_raw | Float | Median read length of raw forward reads | ONT | +| nanoplot_r1_n50_clean | Float | N50 of clean forward reads | ONT | +| nanoplot_r1_n50_raw | Float | N50 of raw forward reads | ONT | +| nanoplot_r1_stdev_readlength_clean | Float | Standard deviation read length of clean forward reads | ONT | +| nanoplot_r1_stdev_readlength_raw | Float | Standard deviation read length of raw forward reads | ONT | +| 
nanoplot_tsv_clean | File | A TSV report describing the clean reads | ONT | +| nanoplot_tsv_raw | File | A TSV report describing the raw reads | ONT | +| nanoplot_version | String | Version of nanoplot tool used | ONT | +| nextclade_aa_dels | String | Amino-acid deletions as detected by Nextclade. Will be blank for Flu | CL, FASTA, ONT, PE, SE | +| nextclade_aa_dels_flu_ha | String | Amino-acid deletions as detected by Nextclade. Specific to Flu; it includes deletions for HA segment | ONT, PE | +| nextclade_aa_dels_flu_na | String | Amino-acid deletions as detected by Nextclade. Specific to Flu; it includes deletions for NA segment | ONT, PE | +| nextclade_aa_subs | String | Amino-acid substitutions as detected by Nextclade. Will be blank for Flu | CL, FASTA, ONT, PE, SE | +| nextclade_aa_subs_flu_ha | String | Amino-acid substitutions as detected by Nextclade. Specific to Flu; it includes substitutions for HA segment | ONT, PE | +| nextclade_aa_subs_flu_na | String | Amino-acid substitutions as detected by Nextclade. Specific to Flu; it includes substitutions for NA segment | ONT, PE | +| nextclade_clade | String | Nextclade clade designation, will be blank for Flu. | CL, FASTA, ONT, PE, SE | +| nextclade_clade_flu_ha | String | Nextclade clade designation, specific to Flu HA segment | ONT, PE | +| nextclade_clade_flu_na | String | Nextclade clade designation, specific to Flu NA segment | ONT, PE | +| nextclade_docker | String | Docker image used to run Nextclade | CL, FASTA, ONT, PE, SE | +| nextclade_ds_tag | String | Dataset tag used to run Nextclade. Will be blank for Flu | CL, FASTA, ONT, PE, SE | +| nextclade_ds_tag_flu_ha | String | Dataset tag used to run Nextclade, specific to Flu HA segment | ONT, PE | +| nextclade_ds_tag_flu_na | String | Dataset tag used to run Nextclade, specific to Flu NA segment | ONT, PE | +| nextclade_json | File | Nextclade output in JSON file format. 
Will be blank for Flu | CL, FASTA, ONT, PE, SE | +| nextclade_json_flu_ha | File | Nextclade output in JSON file format, specific to Flu HA segment | ONT, PE | +| nextclade_json_flu_na | File | Nextclade output in JSON file format, specific to Flu NA segment | ONT, PE | +| nextclade_lineage | String | Nextclade lineage designation | CL, FASTA, ONT, PE, SE | +| nextclade_qc | String | QC metric as determined by Nextclade. (For Flu, this output will be specific to HA segment) | CL, FASTA, ONT, PE, SE | +| nextclade_qc_flu_ha | String | QC metric as determined by Nextclade, specific to Flu HA segment | ONT, PE | +| nextclade_qc_flu_na | String | QC metric as determined by Nextclade, specific to Flu NA segment | ONT, PE | +| nextclade_tsv | File | Nextclade output in TSV file format. (For Flu, this output will be specific to HA segment) | CL, FASTA, ONT, PE, SE | +| nextclade_tsv_flu_ha | File | Nextclade output in TSV file format, specific to Flu HA segment | ONT, PE | +| nextclade_tsv_flu_na | File | Nextclade output in TSV file format, specific to Flu NA segment | ONT, PE | +| nextclade_version | String | The version of Nextclade software used | CL, FASTA, ONT, PE, SE | +| number_Degenerate | Int | Number of degenerate basecalls within the consensus assembly | CL, FASTA, ONT, PE, SE | +| number_N | Int | Number of fully ambiguous basecalls within the consensus assembly | CL, FASTA, ONT, PE, SE | +| number_Total | Int | Total number of nucleotides within the consensus assembly | CL, FASTA, ONT, PE, SE | +| pango_lineage | String | Pango lineage as determined by Pangolin | CL, FASTA, ONT, PE, SE | +| pango_lineage_expanded | String | Pango lineage without use of aliases; e.g., "BA.1" → "B.1.1.529.1" | CL, FASTA, ONT, PE, SE | +| pango_lineage_report | File | Full Pango lineage report generated by Pangolin | CL, FASTA, ONT, PE, SE | +| pangolin_assignment_version | String | The version of the pangolin software (e.g. 
PANGO or PUSHER) used for lineage assignment | CL, FASTA, ONT, PE, SE | +| pangolin_conflicts | String | Number of lineage conflicts as determined by Pangolin | CL, FASTA, ONT, PE, SE | +| pangolin_docker | String | Docker image used to run Pangolin | CL, FASTA, ONT, PE, SE | +| pangolin_notes | String | Lineage notes as determined by Pangolin | CL, FASTA, ONT, PE, SE | +| pangolin_versions | String | All Pangolin software and database versions | CL, FASTA, ONT, PE, SE | +| percent_reference_coverage | Float | Percent coverage of the reference genome after performing primer trimming; calculated as assembly_length_unambiguous / length of the reference genome (SC2: 29903) x 100 | CL, FASTA, ONT, PE, SE | +| primer_bed_name | String | Name of the primer bed files used for primer trimming | CL, ONT, PE, SE | +| primer_trimmed_read_percent | Float | Percentage of read data with primers trimmed as determined by iVar trim | PE, SE | +| qc_check | String | The results of the QC Check task | CL, FASTA, ONT, PE, SE | +| qc_standard | File | The file used in the QC Check task containing the QC thresholds. 
| CL, FASTA, ONT, PE, SE | +| quasitools_coverage_file | File | The coverage report created by Quasitools HyDRA | ONT, PE | +| quasitools_date | String | Date of Quasitools analysis | ONT, PE | +| quasitools_dr_report | File | Drug resistance report created by Quasitools HyDRA | ONT, PE | +| quasitools_hydra_vcf | File | The VCF created by Quasitools HyDRA | ONT, PE | +| quasitools_mutations_report | File | The mutation report created by Quasitools HyDRA | ONT, PE | +| quasitools_version | String | Version of Quasitools used | ONT, PE | +| read_screen_clean | String | A PASS or FAIL flag for input reads after cleaning | ONT, PE, SE | +| read_screen_raw | String | A PASS or FAIL flag for input reads | ONT, PE, SE | +| read1_aligned | File | Forward read file of only aligned reads | CL, ONT, PE, SE | +| read1_clean | File | Forward read file after quality trimming and adapter removal | PE, SE | +| read1_dehosted | File | Dehosted forward reads; suggested read file for SRA submission | CL, ONT, PE | +| read1_trimmed | File | Forward read file after quality trimming and adapter removal | ONT | +| read1_unaligned | File | Forward read file of unaligned reads | PE, SE | +| read2_aligned | File | Reverse read file of only aligned reads | PE | +| read2_clean | File | Reverse read file after quality trimming and adapter removal | PE | +| read2_dehosted | File | Dehosted reverse reads; suggested read file for SRA submission | PE | +| read2_unaligned | File | Reverse read file of unaligned reads | PE | +| samtools_version | String | The version of SAMtools used to sort and index the alignment file | ONT, PE, SE | +| samtools_version_consensus | String | The version of SAMtools used to create the pileup before running iVar consensus | PE, SE | +| samtools_version_primtrim | String | The version of SAMtools used to create the pileup before running iVar trim | PE, SE | +| samtools_version_stats | String | The version of SAMtools used to assess the quality of read mapping | CL, 
PE, SE | +| sc2_s_gene_mean_coverage | Float | Mean read depth for the S gene in SARS-CoV-2 | CL, ONT, PE, SE | +| sc2_s_gene_percent_coverage | Float | Percent coverage of the S gene in SARS-CoV-2 | CL, ONT, PE, SE | +| seq_platform | String | Description of the sequencing methodology used to generate the input read data | CL, FASTA, ONT, PE, SE | +| sorted_bam_unaligned | File | A BAM file that only contains reads that did not align to the reference | PE, SE | +| sorted_bam_unaligned_bai | File | Index companion file to a BAM file that only contains reads that did not align to the reference | PE, SE | +| theiacov_clearlabs_analysis_date | String | Date of analysis | CL | +| theiacov_clearlabs_version | String | Version of PHB used for running the workflow | CL | +| theiacov_fasta_analysis_date | String | Date of analysis | FASTA | +| theiacov_fasta_version | String | Version of PHB used for running the workflow | FASTA | +| theiacov_illumina_pe_analysis_date | String | Date of analysis | PE | +| theiacov_illumina_pe_version | String | Version of PHB used for running the workflow | PE | +| theiacov_illumina_se_analysis_date | String | Date of analysis | SE | +| theiacov_illumina_se_version | String | Version of PHB used for running the workflow | SE | +| theiacov_ont_analysis_date | String | Date of analysis | ONT | +| theiacov_ont_version | String | Version of PHB used for running the workflow | ONT | +| trimmomatic_docker | String | Docker container used with trimmomatic | PE, SE | +| trimmomatic_version | String | The version of Trimmomatic used | PE, SE | +| vadr_alerts_list | File | A file containing all of the fatal alerts as determined by VADR | CL, FASTA, ONT, PE, SE | +| vadr_all_outputs_tar_gz | File | A .tar.gz file (gzip-compressed tar archive file) containing all outputs from the VADR command v-annotate.pl. This file must be uncompressed & extracted to see the many files within. 
See https://github.com/ncbi/vadr/blob/master/documentation/formats.md#format-of-v-annotatepl-output-files for more complete description of all files present within the archive. Useful when deeply investigating a sample's genome & annotations. | CL, FASTA, ONT, PE, SE | +| vadr_classification_summary_file | File | Per-sequence tabular classification file. See https://github.com/ncbi/vadr/blob/master/documentation/formats.md#explanation-of-sqc-suffixed-output-files for more complete description. | CL, FASTA, ONT, PE, SE | +| vadr_docker | String | Docker image used to run VADR | CL, FASTA, ONT, PE, SE | +| vadr_fastas_zip_archive | File | Zip archive containing all fasta files created during VADR analysis | CL, FASTA, ONT, PE, SE | +| vadr_feature_tbl_fail | File | 5 column feature table output for failing sequences. See https://github.com/ncbi/vadr/blob/master/documentation/formats.md#format-of-v-annotatepl-output-files for more complete description. | CL, FASTA, ONT, PE, SE | +| vadr_feature_tbl_pass | File | 5 column feature table output for passing sequences. See https://github.com/ncbi/vadr/blob/master/documentation/formats.md#format-of-v-annotatepl-output-files for more complete description. | CL, FASTA, ONT, PE, SE | +| vadr_num_alerts | String | Number of fatal alerts as determined by VADR | CL, FASTA, ONT, PE, SE | +| variants_from_ref_vcf | File | Number of variants relative to the reference genome | CL | + +??? toggle "TheiaCoV_FASTA_Batch_PHB Outputs" + + ##### TheiaCoV_FASTA_Batch Outputs {#theiacov-fasta-batch-outputs} + + !!! warning "Overwrite Warning" + **TheiaCoV_FASTA_Batch_PHB** workflow will **output results to the set-level data table in addition to overwriting the Pangolin & Nextclade output columns in the sample-level data table**. Users can view the set-level workflow output TSV file called `"Datatable"` to view exactly which columns were overwritten in the sample-level data table. 
+ + | **Variable** | **Type** | **Description** | + |---|---|---| + | datatable | File | Sample-level data table TSV file that was used to update the original sample-level data table in the last step of the TheiaCoV_FASTA_Batch workflow. | + | nextclade_json | File | Output Nextclade JSON file that contains results for all samples included in the workflow | + | nextclade_tsv | File | Output Nextclade TSV file that contains results for all samples included in the workflow | + | pango_lineage_report | File | Output Pangolin CSV file that contains results for all samples included in the workflow | + | theiacov_fasta_batch_analysis_date | String | Date that the workflow was run. | + | theiacov_fasta_batch_version | String | Version of the workflow that was used. | diff --git a/docs/workflows/genomic_characterization/theiaeuk.md b/docs/workflows/genomic_characterization/theiaeuk.md new file mode 100644 index 000000000..265479ad6 --- /dev/null +++ b/docs/workflows/genomic_characterization/theiaeuk.md @@ -0,0 +1,510 @@ +# TheiaEuk + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Genomic Characterization](../../workflows_overview/workflows_type.md/#genomic-characterization) | [Mycotics](../../workflows_overview/workflows_kingdom.md/#mycotics) | PHB v2.2.0 | Yes | Sample-level | + +## TheiaEuk Workflows + +**The TheiaEuk_PE workflow is for the assembly, quality assessment, and characterization of fungal genomes.** It is designed to accept Illumina paired-end sequencing data as the primary input. **It is currently intended only for haploid fungal genomes like _Candida auris_.** Analyzing diploid genomes using TheiaEuk should be attempted only with expert attention to the resulting genome quality. + +All input reads are processed through "core tasks" in each workflow. 
The core tasks include raw-read quality assessment, read cleaning (quality trimming and adapter removal), de novo assembly, assembly quality assessment, and species taxon identification. For some taxa identified, "taxa-specific sub-workflows" will be automatically activated, undertaking additional taxa-specific characterization steps, including clade-typing and/or antifungal resistance detection. + +!!! caption "TheiaEuk Workflow Diagram" + ![TheiaEuk Workflow Diagram](../../assets/figures/TheiaEuk_Illumina_PE.png){width=75%} + +### Inputs + +!!! info "Input read data" + + The TheiaEuk_PE workflow takes in Illumina paired-end read data. Read file names should end with `.fastq` or `.fq`, with the optional addition of `.gz`. When possible, Theiagen recommends zipping files with [gzip](https://www.gnu.org/software/gzip/) prior to Terra upload to minimize data upload time. + + By default, the workflow anticipates 2 x 150bp reads (i.e. the input reads were generated using a 300-cycle sequencing kit). Modifications to the optional parameter for `trim_minlen` may be required to accommodate shorter read data, such as the 2 x 75bp reads generated using a 150-cycle sequencing kit. 
+ +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| theiaeuk_pe | **read1** | File | Unprocessed Illumina forward read file | | Required | +| theiaeuk_pe | **read2** | File | Unprocessed Illumina reverse read file | | Required | +| theiaeuk_pe | **samplename** | String | Name of Terra datatable | | Required | +| busco | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| busco | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| busco | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/ezlabgva/busco:v5.3.2_cv1 | Optional | +| cg_pipeline_clean | **cg_pipe_opts** | String | Options to pass to CG-Pipeline for clean read assessment | --fast | Optional | +| cg_pipeline_clean | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| cg_pipeline_clean | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/lyveset:1.1.4f | Optional | +| cg_pipeline_raw | **cg_pipe_opts** | String | Options to pass to CG-Pipeline for clean read assessment | --fast | Optional | +| cg_pipeline_raw | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| cg_pipeline_raw | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/lyveset:1.1.4f | Optional | +| clean_check_reads | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| clean_check_reads | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| clean_check_reads | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/bactopia/gather_samples:2.0.2 | Optional | +| clean_check_reads | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the 
task | 2 | Optional | +| clean_check_reads | **organism** | String | Internal component, do not modify | | Do Not Modify, Optional | +| clean_check_reads | **workflow_series** | String | Internal component, do not modify | | Do Not Modify, Optional | +| gambit | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| gambit | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/gambit:0.5.0 | Optional | +| merlin_magic | **agrvate_docker_image** | String | Internal component, do not modify | "us-docker.pkg.dev/general-theiagen/biocontainers/agrvate:1.0.2--hdfd78af_0" | Do Not Modify, Optional | +| merlin_magic | **assembly_only** | Boolean | Internal component, do not modify | | Do Not Modify, Optional | +| merlin_magic | **call_poppunk** | Boolean | Internal component, do not modify | TRUE | Do Not Modify, Optional | +| merlin_magic | **call_shigeifinder_reads_input** | Boolean | Internal component, do not modify | FALSE | Do Not Modify, Optional | +| merlin_magic | **emmtypingtool_docker_image** | String | Internal component, do not modify | us-docker.pkg.dev/general-theiagen/staphb/emmtypingtool:0.0.1 | Do Not Modify, Optional | +| merlin_magic | **hicap_docker_image** | String | Internal component, do not modify | us-docker.pkg.dev/general-theiagen/biocontainers/hicap:1.0.3--py_0 | Do Not Modify, Optional | +| merlin_magic | **ont_data** | Boolean | Internal component, do not modify | | Do Not Modify, Optional | +| merlin_magic | **paired_end** | Boolean | Internal component, do not modify | | Do Not Modify, Optional | +| merlin_magic | **pasty_docker_image** | String | Internal component, do not modify | us-docker.pkg.dev/general-theiagen/staphb/pasty:1.0.3 | Do Not Modify, Optional | +| merlin_magic | **pasty_min_coverage** | Int | Internal component, do not modify | 95 | Do Not Modify, Optional | +| merlin_magic | **pasty_min_pident** | Int | Internal component, do 
not modify | 95 | Do Not Modify, Optional | +| merlin_magic | **shigatyper_docker_image** | String | Internal component, do not modify | us-docker.pkg.dev/general-theiagen/staphb/shigatyper:2.0.5 | Do Not Modify, Optional | +| merlin_magic | **shigeifinder_docker_image** | String | Internal component, do not modify | us-docker.pkg.dev/general-theiagen/staphb/shigeifinder:1.3.5 | Do Not Modify, Optional | +| merlin_magic | **snippy_query_gene** | String | Internal component, do not modify | | Do Not Modify, Optional | +| merlin_magic | **srst2_gene_max_mismatch** | Int | Internal component, do not modify | 2000 | Do Not Modify, Optional | +| merlin_magic | **srst2_max_divergence** | Int | Internal component, do not modify | 20 | Do Not Modify, Optional | +| merlin_magic | **srst2_min_cov** | Int | Internal component, do not modify | 80 | Do Not Modify, Optional | +| merlin_magic | **srst2_min_depth** | Int | Internal component, do not modify | 5 | Do Not Modify, Optional | +| merlin_magic | **srst2_min_edge_depth** | Int | Internal component, do not modify | 2 | Do Not Modify, Optional | +| merlin_magic | **staphopia_sccmec_docker_image** | String | Internal component, do not modify | us-docker.pkg.dev/general-theiagen/biocontainers/staphopia-sccmec:1.0.0--hdfd78af_0 | Do Not Modify, Optional | +| merlin_magic | **tbp_parser_coverage_threshold** | Int | Internal component, do not modify | 100 | Do Not Modify, Optional | +| merlin_magic | **tbp_parser_debug** | Boolean | Internal component, do not modify | FALSE | Do Not Modify, Optional | +| merlin_magic | **tbp_parser_docker_image** | String | Internal component, do not modify | us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:1.3.6 | Do Not Modify, Optional | +| merlin_magic | **tbp_parser_min_depth** | Int | Internal component, do not modify | 10 | Do Not Modify, Optional | +| merlin_magic | **tbp_parser_operator** | String | Internal component, do not modify | "Operator not provided" | Do Not Modify, 
Optional | +| merlin_magic | **tbp_parser_output_seq_method_type** | String | Internal component, do not modify | "WGS" | Do Not Modify, Optional | +| merlin_magic | **tbp_parser_output_seq_method_type** | String | Internal component, do not modify | "Sequencing method not provided" | Do Not Modify, Optional | +| merlin_magic | **tbprofiler_additional_outputs** | Boolean | Internal component, do not modify | FALSE | Do Not Modify, Optional | +| merlin_magic | **tbprofiler_cov_frac_threshold** | Int | Internal component, do not modify | 1 | Do Not Modify, Optional | +| merlin_magic | **tbprofiler_custom_db** | File | Internal component, do not modify | | Do Not Modify, Optional | +| merlin_magic | **tbprofiler_mapper** | String | Internal component, do not modify | bwa | Do Not Modify, Optional | +| merlin_magic | **tbprofiler_min_af** | Float | Internal component, do not modify | 0.1 | Do Not Modify, Optional | +| merlin_magic | **tbprofiler_min_af_pred** | Float | Internal component, do not modify | 0.1 | Do Not Modify, Optional | +| merlin_magic | **tbprofiler_min_depth** | Int | Internal component, do not modify | 10 | Do Not Modify, Optional | +| merlin_magic | **tbprofiler_run_custom_db** | Boolean | Internal component, do not modify | FALSE | Do Not Modify, Optional | +| merlin_magic | **tbprofiler_variant_caller** | String | Internal component, do not modify | freebayes | Do Not Modify, Optional | +| merlin_magic | **tbprofiler_variant_calling_params** | String | Internal component, do not modify | None | Do Not Modify, Optional | +| merlin_magic | **virulencefinder_coverage_threshold** | Float | Internal component, do not modify | | Do Not Modify, Optional | +| merlin_magic | **virulencefinder_database** | String | Internal component, do not modify | "virulence_ecoli" | Do Not Modify, Optional | +| merlin_magic | **virulencefinder_docker_image** | String | Internal component, do not modify | us-docker.pkg.dev/general-theiagen/staphb/virulencefinder:2.0.4 | 
Do Not Modify, Optional | +| merlin_magic | **virulencefinder_identity_threshold** | Float | Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **ani_highest_percent** | Float | Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **ani_highest_percent_bases_aligned** | Float | Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **assembly_length_unambiguous** | Int | Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **assembly_mean_coverage** | Float | Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| qc_check_task | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| qc_check_task | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-03-16" | Optional | +| qc_check_task | **kraken_human** | Float | Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **kraken_human_dehosted** | Float | Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **kraken_sc2** | Float | Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **kraken_sc2_dehosted** | Float | Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **kraken_target_organism** | Float | Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **kraken_target_organism_dehosted** | Float | Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **meanbaseq_trim** | String | Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| qc_check_task | **midas_secondary_genus_abundance** | Int 
| Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **midas_secondary_genus_coverage** | Float | Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **number_Degenerate** | Int | Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **number_N** | Int | Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **percent_reference_coverage** | Float | Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **sc2_s_gene_mean_coverage** | Float | Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **sc2_s_gene_percent_coverage** | Float | Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **vadr_num_alerts** | String | Internal component, do not modify | | Do Not Modify, Optional | +| quast | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| quast | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/quast:5.0.2 | Optional | +| quast | **min_contig_length** | Int | Minimum length of contig for QUAST | 500 | Optional | +| rasusa_task | **bases** | String | Explicitly set the number of bases required e.g., 4.3kb, 7Tb, 9000, 4.1MB | | Optional | +| rasusa_task | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| rasusa_task | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| rasusa_task | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/rasusa:0.7.0 | Optional | +| rasusa_task | **frac** | Float | Subsample to a fraction of the reads | | Optional | +| rasusa_task | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| rasusa_task | **num** | Int | Subsample to a specific number of reads | | Optional | +| 
rasusa_task | **seed** | Int | Random seed to use | | Optional | +| raw_check_reads | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| raw_check_reads | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| raw_check_reads | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/bactopia/gather_samples:2.0.2 | Optional | +| raw_check_reads | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| raw_check_reads | **organism** | String | Internal component, do not modify | | Do Not Modify, Optional | +| raw_check_reads | **workflow_series** | String | Internal component, do not modify | | Do Not Modify, Optional | +| read_QC_trim | **adapters** | File | File with adapter sequences to be removed | | Optional | +| read_QC_trim | **bbduk_mem** | Int | Memory allocated to the BBDuk VM | 8 | Optional | +| read_QC_trim | **call_kraken** | Boolean | If true, Kraken2 is executed on the dataset | FALSE | Optional | +| read_QC_trim | **call_midas** | Boolean | Internal component, do not modify | FALSE | Do Not Modify, Optional | +| read_QC_trim | **fastp_args** | String | Additional arguments to pass to fastp | --detect_adapter_for_pe -g -5 20 -3 20 | Optional | +| read_QC_trim | **kraken_db** | File | Database to use with kraken2 | | Optional | +| read_QC_trim | **kraken_disk_size** | Int | Amount of storage (in GB) to allocate to the task | | Optional | +| read_QC_trim | **kraken_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | | Optional | +| read_QC_trim | **midas_db** | File | Internal component, do not modify | | Do Not Modify, Optional | +| read_QC_trim | **phix** | File | A file containing the phix used during Illumina sequencing; used in the BBDuk task | | Optional | +| read_QC_trim | **read_processing** | String | Read trimming software to use, either "trimmomatic" or "fastp" | trimmomatic | 
Optional | +| read_QC_trim | **read_qc** | String | Allows the user to decide between fastq_scan (default) and fastqc for the evaluation of read quality. | "fastq_scan" | Optional | +| read_QC_trim | **target_organism** | String | This string is searched for in the kraken2 outputs to extract the read percentage | | Optional | +| read_QC_trim | **trim_minlength** | Int | Specifies minimum length of each read after trimming to be kept | 75 | Optional | +| read_QC_trim | **trim_quality_trim_score** | Int | Specifies the average quality of bases in a sliding window to be kept | 20 | Optional | +| read_QC_trim | **trim_window_size** | Int | Specifies window size for trimming (the number of bases to average the quality across) | 10 | Optional | +| read_QC_trim | **trimmomatic_args** | String | Additional arguments for trimmomatic | | Optional | +| read_QC_trim | **workflow_series** | String | Internal component, do not modify | | Do Not Modify, Optional | +| shovill_pe | **assembler** | String | Assembler to use (spades, skesa, velvet or megahit) | "skesa" | Optional | +| shovill_pe | **assembler_options** | String | Assembler-specific options that you might choose | | Optional | +| shovill_pe | **depth** | Int | User-specified depth of coverage for downsampling | 150 | Optional | +| shovill_pe | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| shovill_pe | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/shovill:1.1.0 | Optional | +| shovill_pe | **genome_length** | String | Internal component, do not modify | | Do Not Modify, Optional | +| shovill_pe | **kmers** | String | User-specified Kmer length to override choice made by Shovill | auto | Optional | +| shovill_pe | **min_contig_length** | Int | Minimum contig length to keep in final assembly | 200 | Optional | +| shovill_pe | **min_coverage** | Float | Minimum contig coverage to keep in final 
assembly | 2 | Optional | +| shovill_pe | **nocorr** | Boolean | Disable correction of minor assembly errors by Shovill | FALSE | Optional | +| shovill_pe | **noreadcorr** | Boolean | Disable correction of sequencing errors in reads by Shovill | FALSE | Optional | +| shovill_pe | **nostitch** | Boolean | Disable read stitching by Shovill | FALSE | Optional | +| shovill_pe | **trim** | Boolean | Enable adaptor trimming | FALSE | Optional | +| theiaeuk_pe | **busco_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| theiaeuk_pe | **call_rasusa** | Boolean | If true, launch rasusa task to subsample raw reads to read depth of 150X | TRUE | Optional | +| theiaeuk_pe | **gambit_db_genomes** | File | User-provided database of assembled query genomes; requires complementary signatures file. If not provided, uses default database, "/gambit-db" | gs://gambit-databases-rp/1.3.0/gambit-metadata-1.3-231016.gdb | Optional | +| theiaeuk_pe | **gambit_db_signatures** | File | User-provided signatures file; requires complementary genomes file. If not specified, the file from the docker container will be used. 
| gs://gambit-databases-rp/1.3.0/gambit-signatures-1.3-231016.gs | Optional | +| theiaeuk_pe | **genome_length** | Int | User-specified expected genome size to be used in genome statistics calculations | | Optional | +| theiaeuk_pe | **max_genome_size** | Int | Maximum genome size able to pass read screening | 50000000 | Optional | +| theiaeuk_pe | **min_basepairs** | Int | Minimum number of base pairs able to pass read screening | 2241820 | Optional | +| theiaeuk_pe | **min_coverage** | Int | Minimum genome coverage able to pass read screening | 10 | Optional | +| theiaeuk_pe | **min_genome_size** | Int | Minimum genome size able to pass read screening | 100000 | Optional | +| theiaeuk_pe | **min_proportion** | Int | Minimum proportion of total reads in each read file to pass read screening | 50 | Optional | +| theiaeuk_pe | **min_reads** | Int | Minimum number of reads to pass read screening | 10000 | Optional | +| theiaeuk_pe | **skip_screen** | Boolean | Option to skip the read screening prior to analysis | FALSE | Optional | +| theiaeuk_pe | **subsample_coverage** | Float | Read depth for RASUSA task to subsample reads to | 150 | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Workflow tasks (performed for all taxa) + +??? task "`versioning`: Version capture for TheiaEuk" + + The `versioning` task captures the workflow version from the GitHub (code repository) version. + + !!! techdetails "Version Capture Technical details" + + | | Links | + | --- | --- | + | Task | [task_versioning.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/task_versioning.wdl) | + +??? 
task "`screen`: Total Raw Read Quantification and Genome Size Estimation" + + The [`screen`](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_screen.wdl) task ensures the quantity of sequence data is sufficient to undertake genomic analysis. It uses bash commands for quantification of reads and base pairs, and [mash](https://mash.readthedocs.io/en/latest/index.html) sketching to estimate the genome size and its coverage. At each step, the results are assessed relative to pass/fail criteria and thresholds that may be defined by optional user inputs. Samples that do not meet these criteria will not be processed further by the workflow: + + 1. Total number of reads: A sample will fail the read screening task if its total number of reads is less than or equal to `min_reads`. + 2. The proportion of basepairs in the forward and reverse read files: A sample will fail the read screening if fewer than `min_proportion` basepairs are in either the read1 or read2 files. + 3. Number of basepairs: A sample will fail the read screening if there are fewer than `min_basepairs` basepairs. + 4. Estimated genome size: A sample will fail the read screening if the estimated genome size is smaller than `min_genome_size` or bigger than `max_genome_size`. + 5. Estimated genome coverage: A sample will fail the read screening if the estimated genome coverage is less than the `min_coverage`. + + Read screening is undertaken on both the raw and cleaned reads. The task may be skipped by setting the `skip_screen` variable to true. + + Default values vary between the PE and SE workflow. The rationale for these default values can be found below. 
+ + | Variable | Rationale | + | --- | --- | + | `skip_screen` | Prevent the read screen from running | + | `min_reads` | Minimum number of base pairs for 20x coverage of _Hansenula polymorpha_ divided by 300 (longest Illumina read length) | + | `min_basepairs` | Greater than 10x coverage of _Hansenula polymorpha_ | + | `min_genome_size` | Based on the _Hansenula polymorpha_ genome - the smallest fungal genome as of 2015-04-02 (8.97 Mbp) | + | `max_genome_size` | Based on the _Cenococcum geophilum_ genome, the biggest pathogenic fungal genome, (177.57 Mbp) | + | `min_coverage` | A bare-minimum coverage for genome characterization. Higher coverage would be required for high-quality phylogenetics. | + | `min_proportion` | Greater than 50% reads are in the read1 file; others are in the read2 file | + + !!! techdetails "Screen Technical Details" + + There is a single WDL task for read screening. The `screen` task is run twice, once for raw reads and once for clean reads. + + | | Links | + | --- | --- | + | Task | [task_screen.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_screen.wdl) | + +??? task "`rasusa`: Read subsampling" + + The RASUSA task performs subsampling of the raw reads. By default, this task will subsample reads to a depth of 150X using the estimated genome length produced during the preceding raw read screen. The user can prevent the task from being launched by setting the `call_rasusa` variable to false. + + The user can also provide an estimated genome length for the task to use for subsampling using the `genome_length` variable. In addition, the read depth can be modified using the `subsample_coverage` variable. + + !!! techdetails "RASUSA Technical Details" + + | | TheiaEuk_Illumina_PE_PHB | + | --- | --- | + | Task | [task_rasusa.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/utilities/task_rasusa.wdl) | + +??? 
task "`read_QC_trim`: Read Quality Trimming, Adapter Removal, Quantification, and Identification" + + `read_QC_trim` is a sub-workflow within TheiaEuk that removes low-quality reads, low-quality regions of reads, and sequencing adapters to improve data quality. It uses a number of tasks, described below. + + **Read quality trimming** + + Either `trimmomatic` or `fastp` can be used for read-quality trimming. Trimmomatic is used by default. Both tools trim low-quality regions of reads with a sliding window (with a window size of `trim_window_size`), cutting once the average quality within the window falls below `trim_quality_trim_score`. They will both discard the read if it is trimmed below `trim_minlen`. + + If fastp is selected for analysis, fastp also implements the additional read-trimming steps indicated below: + + | **Parameter** | **Explanation** | + | --- | --- | + | -g | enables polyG tail trimming | + | -5 20 | enables read end-trimming | + | -3 20 | enables read end-trimming | + | --detect_adapter_for_pe | enables adapter-trimming **only for paired-end reads** | + + **Adapter removal** + + The `BBDuk` task removes adapters from sequence reads. To do this: + + - [Repair](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/repair-guide/) from the [BBTools](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/) package reorders reads in paired fastq files to ensure the forward and reverse reads of a pair are in the same position in the two fastq files. + - [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) (*"Bestus Bioinformaticus" Decontamination Using Kmers*) is then used to trim the adapters and filter out all reads that have a 31-mer match to [PhiX](https://emea.illumina.com/products/by-type/sequencing-kits/cluster-gen-sequencing-reagents/phix-control-v3.html), which is commonly added to Illumina sequencing runs to monitor and/or improve overall run quality. + + ??? 
toggle "What are adapters and why do they need to be removed?" + Adapters are manufactured oligonucleotide sequences attached to DNA fragments during the library preparation process. In Illumina sequencing, these adapter sequences are required for attaching reads to flow cells. You can read more about Illumina adapters [here](https://emea.support.illumina.com/bulletins/2020/06/illumina-adapter-portfolio.html). For genome analysis, it's important to remove these sequences since they're not actually from your sample. If you don't remove them, the downstream analysis may be affected. + + **Read Quantification** + + There are two methods for read quantification to choose from: [`fastq-scan`](https://github.com/rpetit3/fastq-scan) (default) or [`fastqc`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). Both quantify the forward and reverse reads in FASTQ files. In TheiaProk_Illumina_PE, they also provide the total number of read pairs. This task is run once with raw reads as input and once with clean reads as input. If QC has been performed correctly, you should expect **fewer** clean reads than raw reads. `fastqc` also provides a graphical visualization of the read quality. + + **Read Identification (optional)** + + The `MIDAS` task is for the identification of reads to detect contamination with non-target taxa. This task is optional and turned off by default. It can be used by setting the `call_midas` input variable to `true`. + + The MIDAS tool was originally designed for metagenomic sequencing data but has been co-opted for use with bacterial isolate WGS methods. It can be used to detect contamination present in raw sequencing data by estimating bacterial species abundance in bacterial isolate WGS data. If a secondary genus is detected above a relative frequency of 0.01 (1%), then the sample should fail QC and be investigated further for potential contamination. 
+ + This task is similar to those used in commercial software, BioNumerics, for estimating secondary species abundance. + + ??? toggle "How are the MIDAS output columns determined?" + + Example MIDAS report in the `midas_report` column: + + | species_id | count_reads | coverage | relative_abundance | + | --- | --- | --- | --- | + | Salmonella_enterica_58156 | 3309 | 89.88006645 | 0.855888033 | + | Salmonella_enterica_58266 | 501 | 11.60606061 | 0.110519371 | + | Salmonella_enterica_53987 | 99 | 2.232896237 | 0.021262881 | + | Citrobacter_youngae_61659 | 46 | 0.995216227 | 0.009477003 | + | Escherichia_coli_58110 | 5 | 0.123668877 | 0.001177644 | + + MIDAS report column descriptions: + + - species_id: species identifier + - count_reads: number of reads mapped to marker genes + - coverage: estimated genome-coverage (i.e. read-depth) of species in metagenome + - relative_abundance: estimated relative abundance of species in metagenome + + The value in the `midas_primary_genus` column is derived by ordering the rows in order of "relative_abundance" and identifying the genus of top species in the "species_id" column (Salmonella). The value in the `midas_secondary_genus` column is derived from the genus of the second-most prevalent genus in the "species_id" column (Citrobacter). The `midas_secondary_genus_abundance` column is the "relative_abundance" of the second-most prevalent genus (0.009477003). The `midas_secondary_genus_coverage` is the "coverage" of the second-most prevalent genus (0.995216227). + + !!! techdetails "read_QC_trim Technical Details" + + | | Links | + | --- | --- | + | Sub-workflow | [wf_read_QC_trim.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/workflows/wf_read_QC_trim.wdl) | + | Tasks | [task_fastp.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_fastp.wdl)
[task_trimmomatic.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_trimmomatic.wdl#L3) (PE subtask)
[task_bbduk.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_bbduk.wdl)
[task_fastq_scan.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_fastq_scan.wdl#L3) (PE subtask)
[task_midas.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/taxon_id/task_midas.wdl)
[task_kraken2.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/taxon_id/task_kraken2.wdl) | + | Software Source Code | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](https://github.com/usadellab/Trimmomatic); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2)| + | Software Documentation | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic); [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2/wiki) | + | Original Publication(s) | *[Trimmomatic: a flexible trimmer for Illumina sequence data](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4103590/)
*[fastp: an ultra-fast all-in-one FASTQ preprocessor](https://academic.oup.com/bioinformatics/article/34/17/i884/5093234?login=false)
*[An integrated metagenomics pipeline for strain profiling reveals novel patterns of bacterial transmission and biogeography](https://pubmed.ncbi.nlm.nih.gov/27803195/)
*[Improved metagenomic analysis with Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | + +??? task "`shovill`: _De novo_ Assembly" + + De Novo assembly will be undertaken only for samples that have sufficient read quantity and quality, as determined by the `screen` task assessment of clean reads. + + In TheiaEuk, assembly is performed using the [Shovill](https://github.com/tseemann/shovill) pipeline. This undertakes the assembly with one of four assemblers ([SKESA](https://github.com/ncbi/SKESA) (default), [SPAdes](https://github.com/ablab/spades), [Velvet](https://github.com/dzerbino/velvet/), [Megahit](https://github.com/voutcn/megahit)), but also performs [a number of pre- and post-processing steps](https://github.com/tseemann/shovill#main-steps) to improve the resulting genome assembly. Shovill uses an estimated genome size (see [here](https://github.com/tseemann/shovill#--gsize)). If this is not provided by the user as an optional input, Shovill will estimate the genome size using [mash](https://mash.readthedocs.io/en/latest/index.html). Adaptor trimming can be undertaken with Shovill by setting the `trim` option to "true", but this is set to "false" by default as [alternative adapter trimming](https://www.notion.so/TheiaProk-Workflow-Series-89b9c08406094ec78d08a578fe861626?pvs=21) is undertaken in the TheiaEuk workflow. + + ??? toggle "What is _de novo_ assembly?" + _De novo_ assembly is the process or product of attempting to reconstruct a genome from scratch (without prior knowledge of the genome) using sequence reads. Assembly of fungal genomes from short-reads will produce multiple contigs per chromosome rather than a single contiguous sequence for each chromosome. + + !!! 
techdetails "Shovill Technical Details" + | | Links | + | --- | --- | + | TheiaEuk WDL Task | [task_shovill.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/assembly/task_shovill.wdl#L3) | + | Software code repository and documentation | [Shovill on GitHub](https://github.com/tseemann/shovill) | + +??? task "`QUAST`: Assembly Quality Assessment" + + [`QUAST`](https://github.com/ablab/quast) (**QU**ality **AS**sessment **T**ool) evaluates genome assemblies by computing several metrics that describe the assembly quality, including the total number of bases in the assembly, the length of the largest contig in the assembly, and the assembly percentage GC content. + + !!! techdetails "QUAST Technical Details" + + | | Links | + | --- | --- | + | Task | [task_quast.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_quast.wdl) | + | Software Source Code | [QUAST on GitHub](https://github.com/ablab/quast) | + | Software Documentation | https://quast.sourceforge.net/docs/manual.html | + | Original publication | [QUAST: quality assessment tool for genome assemblies](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3624806/) | + +??? task "`CG-Pipeline`: Assessment of Read Quality, and Estimation of Genome Coverage" + + The `cg_pipeline` task generates metrics about read quality and estimates the coverage of the genome using the "run_assembly_readMetrics.pl" script from [CG-Pipeline](https://github.com/lskatz/CG-Pipeline/). The genome coverage estimates are calculated using both raw and cleaned reads, using either a user-provided `genome_size` or the estimated genome length generated by QUAST. + + !!! techdetails "CG-Pipeline Technical Details" + The `cg_pipeline` task is run twice in TheiaEuk, once with raw reads, and once with clean reads. 
+ + | | Links | + | --- | --- | + | Task | [task_cg_pipeline.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_cg_pipeline.wdl) | + | Software Source Code | [CG-Pipeline on GitHub](https://github.com/lskatz/CG-Pipeline/) | + | Software Documentation | [CG-Pipeline on GitHub](https://github.com/lskatz/CG-Pipeline/) | + | Original Publication(s) | [A computational genomics pipeline for prokaryotic sequencing projects](https://academic.oup.com/bioinformatics/article/26/15/1819/188418) | + +??? task "`GAMBIT`: **Taxon Assignment**" + + [`GAMBIT`](https://github.com/jlumpe/gambit) determines the taxon of the genome assembly using a k-mer based approach to match the assembly sequence to the closest complete genome in a database, thereby predicting its identity. Sometimes, GAMBIT can confidently designate the organism to the species level. Other times, it is more conservative and assigns it to a higher taxonomic rank. + + For additional details regarding the GAMBIT tool and a list of available GAMBIT databases for analysis, please consult the [GAMBIT](https://www.notion.so/GAMBIT-7c1376b861d0486abfbc316480046bdc?pvs=21) tool documentation. + + !!! techdetails "GAMBIT Technical Details" + + | | Links | + | --- | --- | + | Task | [task_gambit.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/taxon_id/task_gambit.wdl) | + | Software Source Code | [GAMBIT on GitHub](https://github.com/jlumpe/gambit) | + | Software Documentation | [GAMBIT ReadTheDocs](https://gambit-genomics.readthedocs.io/en/latest/) | + | Original Publication(s) | [GAMBIT (Genomic Approximation Method for Bacterial Identification and Tracking): A methodology to rapidly leverage whole genome sequencing of bacterial isolates for clinical identification](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0277575) | + +??? 
task "**`QC_check`: Check QC Metrics Against User-Defined Thresholds (optional)**" + + The `qc_check` task compares generated QC metrics against user-defined thresholds for each metric. This task will run if the user provides a `qc_check_table` .tsv file. If all QC metrics meet the threshold, the `qc_check` output variable will read `QC_PASS`. Otherwise, the output will read `QC_NA` if the task could not proceed or `QC_ALERT` followed by a string indicating what metric failed. + + The `qc_check` task applies quality thresholds according to the sample taxa. The sample taxa is taken from the `gambit_predicted_taxon` value inferred by the GAMBIT module OR can be manually provided by the user using the `expected_taxon` workflow input. + + ??? toggle "Formatting the _qc_check_table.tsv_" + + - The first column of the qc_check_table lists the taxa that the task will assess and the header of this column must be "taxon". + - Any genus or species can be included as a row of the qc_check_table. However, these taxa must **uniquely** match the sample taxa, meaning that the file can include multiple species from the same genus (Vibrio_cholerae and Vibrio_vulnificus), but not both a genus row and species within that genus (Vibrio and Vibrio cholerae). **The taxa should be formatted with the first letter capitalized and underscores in lieu of spaces.** + - Each subsequent column indicates a QC metric and lists a threshold for each taxa that will be checked. **The column names must exactly match expected values, so we highly recommend copy and pasting from the template files below.** + + ??? toggle "Template _qc_check_table.tsv_ files" + + TheiaEuk_Illumina_PE_PHB: [theiaeuk_qc_check_template.tsv](../../assets/files/TheiaEuk_qc_check_template.tsv) + + !!! warning "Example Purposes Only" + QC threshold values shown are for example purposes only and should not be presumed to be sufficient for every dataset. + + !!! 
techdetails "QC_Check Technical Details" + + | | Links | + | --- | --- | + | Task | [task_qc_check.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_qc_check.wdl) | + +### Organism-specific Characterization + +The TheiaEuk workflow automatically activates taxa-specific tasks after identification of relevant taxa using `GAMBIT`. Many of these taxa-specific tasks do not require any additional workflow tasks from the user. + +??? toggle "_Candida auris_" + + Two tools are deployed when _Candida auris_ is identified. First, the Cladetyping tool is launched to determine the clade of the specimen by comparing the sequence to five clade-specific reference files. The output of the clade typing task will be used to specify the reference genome for the antifungal resistance detection tool. To detect mutations that may confer antifungal resistance, `Snippy` is used to find all variants relative to the clade-specific reference, then these variants are queried for product names associated with resistance according to the MARDy database (<http://mardy.dide.ic.ac.uk/index.php>). + + **Default reference genomes used for clade typing and antimicrobial resistance gene detection of C. 
auris** + + | Clade | Genome Accession | Assembly Name | Strain | NCBI Submitter | Included mutations in AMR genes (not comprehensive) | + | --- | --- | --- | --- | --- | --- | + | Candida auris Clade I | GCA_002759435.2 | Cand_auris_B8441_V2 | B8441 | Centers for Disease Control and Prevention | | + | Candida auris Clade II | GCA_003013715.2 | ASM301371v2 | B11220 | Centers for Disease Control and Prevention | | + | Candida auris Clade III | GCA_002775015.1 | Cand_auris_B11221_V1 | B11221 | Centers for Disease Control and Prevention | _ERG11_ V125A/F126L | + | Candida auris Clade IV | GCA_003014415.1 | Cand_auris_B11243 | B11243 | Centers for Disease Control and Prevention | _ERG11_ Y132F | + | Candida auris Clade V | GCA_016809505.1 | ASM1680950v1 | IFRC2087 | Centers for Disease Control and Prevention | | + + The genes in which there are known resistance-conferring mutations for this pathogen are: + + - FKS1 + - ERG11 (lanosterol 14-alpha demethylase) + - FUR1 (uracil phosphoribosyltransferase) + + Mutations in these genes that are known to confer resistance are shown below (source: MARDy database http://mardy.dide.ic.ac.uk/index.php) + + | **Organism** | **Found in** | **Gene name** | **Gene locus** | **AA mutation** | **Drug** | **Tandem repeat name** | **Tandem repeat sequence** | **Reference** | + | --- | --- | --- | --- | --- | --- | --- | --- | --- | + | **Candida auris** | **Human** | **ERG11** | | **Y132F** | **Fluconazole** | | | [**10.1093/cid/ciw691**](https://academic.oup.com/cid/article/64/2/134/2706620/Simultaneous-Emergence-of-Multidrug-Resistant) | + | **Candida auris** | **Human** | **ERG11** | | **K143R** | **Fluconazole** | | | [**10.1093/cid/ciw691**](https://academic.oup.com/cid/article/64/2/134/2706620/Simultaneous-Emergence-of-Multidrug-Resistant) | + | **Candida auris** | **Human** | **ERG11** | | **F126T** | **Fluconazole** | | | 
[**10.1093/cid/ciw691**](https://academic.oup.com/cid/article/64/2/134/2706620/Simultaneous-Emergence-of-Multidrug-Resistant) | + | **Candida auris** | **Human** | **FKS1** | | **S639P** | **Micafungin** | | | [**10.1016/j.diagmicrobio.2017.10.021**](https://www.sciencedirect.com/science/article/pii/S0732889317303498) | + | **Candida auris** | **Human** | **FKS1** | | **S639P** | **Caspofungin** | | | [**10.1016/j.diagmicrobio.2017.10.021**](https://www.sciencedirect.com/science/article/pii/S0732889317303498) | + | **Candida auris** | **Human** | **FKS1** | | **S639P** | **Anidulafungin** | | | [**10.1016/j.diagmicrobio.2017.10.021**](https://www.sciencedirect.com/science/article/pii/S0732889317303498) | + | **Candida auris** | **Human** | **FKS1** | | **S639F** | **Micafungin** | | | [**10.1093/jac/dkx480**](https://academic.oup.com/jac/advance-article/doi/10.1093/jac/dkx480/4794718) | + | **Candida auris** | **Human** | **FKS1** | | **S639F** | **Caspofungin** | | | [**10.1093/jac/dkx480**](https://academic.oup.com/jac/advance-article/doi/10.1093/jac/dkx480/4794718) | + | **Candida auris** | **Human** | **FKS1** | | **S639F** | **Anidulafungin** | | | [**10.1093/jac/dkx480**](https://academic.oup.com/jac/advance-article/doi/10.1093/jac/dkx480/4794718) | + | **Candida auris** | **Human** | **FUR1** | **CAMJ_004922** | **F211I** | **5-flucytosine** | | | [**https://doi.org/10.1038/s41426-018-0045-x**](https://www.nature.com/articles/s41426-018-0045-x) | + +??? toggle "_Candida albicans_" + + When this species is detected by the taxon ID tool, an antifungal resistance detection task is deployed. To detect mutations that may confer antifungal resistance, `Snippy` is used to find all variants relative to the clade-specific reference, and these variants are queried for product names associated with resistance according to the MARDy database (). 
+ + The genes in which there are known resistance-conferring mutations for this pathogen are: + + - ERG11 + - GCS1 (FKS1) + - FUR1 + - RTA2 + +??? toggle "_Aspergillus fumigatus_" + + When this species is detected by the taxon ID tool, an antifungal resistance detection task is deployed. To detect mutations that may confer antifungal resistance, `Snippy` is used to find all variants relative to the clade-specific reference, and these variants are queried for product names associated with resistance according to the MARDy database (<http://mardy.dide.ic.ac.uk/index.php>). + + The genes in which there are known resistance-conferring mutations for this pathogen are: + + - Cyp51A + - HapE + - COX10 (AFUA_4G08340) + +??? toggle "_Cryptococcus neoformans_" + + When this species is detected by the taxon ID tool, an antifungal resistance detection task is deployed. To detect mutations that may confer antifungal resistance, `Snippy` is used to find all variants relative to the clade-specific reference, and these variants are queried for product names associated with resistance according to the MARDy database (<http://mardy.dide.ic.ac.uk/index.php>). 
+ + The gene in which there are known resistance-conferring mutations for this pathogen is: + + - ERG11 (CNA00300) + +### Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| cg_pipeline_docker | String | Docker file used for running CG-Pipeline on cleaned reads | +| cg_pipeline_report | File | TSV file of read metrics from raw reads, including average read length, number of reads, and estimated genome coverage | +| est_coverage_clean | Float | Estimated coverage calculated from clean reads and genome length | +| est_coverage_raw | Float | Estimated coverage calculated from raw reads and genome length | +| r1_mean_q_clean | Float | Mean quality score of clean forward reads | +| r1_mean_q_raw | Float | Mean quality score of raw forward reads | +| r2_mean_q_clean | Float | Mean quality score of clean reverse reads | +| r2_mean_q_raw | Float | Mean quality score of raw reverse reads | +| fastq_scan_version | String | Version of fastq-scan software used | +| gambit_closest_genomes | File | CSV file listing genomes in the GAMBIT database that are most similar to the query assembly | +| gambit_db_version | String | Version of GAMBIT used | +| gambit_docker | String | GAMBIT docker file used | +| gambit_predicted_taxon | String | Taxon predicted by GAMBIT | +| gambit_predicted_taxon_rank | String | Taxon rank of GAMBIT taxon prediction | +| gambit_report | File | GAMBIT report in a machine-readable format | +| gambit_version | String | Version of GAMBIT software used | +| assembly_length | Int | Length of assembly (total contig length) as determined by QUAST | +| n50_value | Int | N50 of assembly calculated by QUAST | +| number_contigs | Int | Total number of contigs in assembly | +| quast_report | File | TSV report from QUAST | +| quast_version | String | Software version of QUAST used | +| rasusa_version | String | Version of rasusa used | +| read1_subsampled | File | Subsampled read1 file | +| read2_subsampled | File | Subsampled read2 file | +| 
bbduk_docker | String | BBDuk docker image used | +| fastp_version | String | Version of fastp software used | +| read1_clean | File | Clean forward reads file | +| read2_clean | File | Clean reverse reads file | +| num_reads_clean_pairs | String | Number of read pairs after cleaning | +| num_reads_clean1 | Int | Number of forward reads after cleaning | +| num_reads_clean2 | Int | Number of reverse reads after cleaning | +| num_reads_raw_pairs | String | Number of input read pairs | +| num_reads_raw1 | Int | Number of input forward reads | +| num_reads_raw2 | Int | Number of input reverse reads | +| trimmomatic_version | String | Version of trimmomatic used | +| clean_read_screen | String | PASS or FAIL result from clean read screening; FAIL accompanied by the reason for failure | +| raw_read_screen | String | PASS or FAIL result from raw read screening; FAIL accompanied by the reason for failure | +| assembly_fasta | File | | +| contigs_fastg | File | Assembly graph if megahit used for genome assembly | +| contigs_gfa | File | Assembly graph if spades used for genome assembly | +| contigs_lastgraph | File | Assembly graph if velvet used for genome assembly | +| shovill_pe_version | String | Shovill version used | +| theiaeuk_snippy_variants_bam | File | BAM file produced by the snippy module | +| theiaeuk_snippy_variants_gene_query_results | File | File containing all lines from variants file matching gene query terms | +| theiaeuk_snippy_variants_hits | String | String of all variant file entries matching gene query term | +| theiaeuk_snippy_variants_outdir_tarball | File | Tar compressed file containing full snippy output directory | +| theiaeuk_snippy_variants_query | String | The gene query term(s) used to search variant | +| theiaeuk_snippy_variants_query_check | String | Were the gene query terms present in the reference annotated genome file | +| theiaeuk_snippy_variants_reference_genome | File | The reference genome used in the alignment and variant calling | 
+| theiaeuk_snippy_variants_results | File | The variants file produced by snippy | +| theiaeuk_snippy_variants_summary | File | A file summarizing the variants detected by snippy | +| theiaeuk_snippy_variants_version | String | The version of the snippy_variants module being used | +| seq_platform | String | Sequencing platform input by the user | +| theiaeuk_illumina_pe_analysis_date | String | Date of TheiaEuk workflow execution | +| theiaeuk_illumina_pe_version | String | TheiaEuk workflow version used | diff --git a/docs/workflows/genomic_characterization/theiameta.md b/docs/workflows/genomic_characterization/theiameta.md new file mode 100644 index 000000000..8d0d5c301 --- /dev/null +++ b/docs/workflows/genomic_characterization/theiameta.md @@ -0,0 +1,374 @@ +# TheiaMeta + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Genomic Characterization](../../workflows_overview/workflows_type.md/#genomic-characterization) | [Any Taxa](../../workflows_overview/workflows_kingdom.md/#any-taxa) | PHB v2.2.0 | Yes | Sample-level | + +## TheiaMeta Workflows + +Genomic characterization of pathogens is an increasing priority for public health laboratories globally. The workflows in the TheiaMeta Genomic Characterization Series make the analysis of pathogens from metagenomic samples easy by taking raw next-generation sequencing (NGS) data and generating metagenome-assembled genomes (MAGs), either using a reference-genome or not. + +TheiaMeta can use one of two distinct methods for generating and processing the final assembly: + +- **If a reference genome is not provided**, the _de novo_ assembly will be the final assembly. Additionally, the assembly will go through a binning process where the contigs are separated into distinct files ("bins") according to composition and coverage such that each bin hopefully contains a single taxon. 
+- **If a reference genome is provided by the user**, the _de novo_ metagenomic assembly is filtered by mapping the contigs to the reference and those constitute the final assembly. No binning is necessary as the mapping will filter contigs that are likely the same taxon as the reference. + +!!! caption "TheiaMeta Workflow Diagram" + + ![TheiaMeta Workflow Diagram](../../assets/figures/TheiaMeta_Illumina_PE.png) + +### Inputs + +The TheiaMeta_Illumina_PE workflow processes Illumina paired-end (PE) reads generated for metagenomic characterization (typically by shotgun). By default, this workflow will assume that input reads were generated using a 300-cycle sequencing kit (i.e. 2 x 150 bp reads). Modifications to the optional parameter for `trim_minlen` may be required to accommodate shorter read data, such as 2 x 75bp reads generated using a 150-cycle sequencing kit. + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| theiameta_illumina_pe | **read1** | File | Forward Illumina read in FASTQ file format | | Required | +| theiameta_illumina_pe | **read2** | File | Reverse Illumina read in FASTQ file format | | Required | +| theiameta_illumina_pe | **samplename** | String | Name of the sample being analyzed | | Required | +| assembled_reads_percent | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| assembled_reads_percent | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| assembled_reads_percent | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/samtools:1.17 | Optional | +| assembled_reads_percent | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| bwa | **cpu** | Int | Number of CPUs to allocate to the task | 6 | Optional | +| bwa | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | 
Optional | +| bwa | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/ivar:1.3.1-titan | Optional | +| bwa | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | +| calculate_coverage | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| calculate_coverage | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| calculate_coverage | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/bedtools:2.31.0 | Optional | +| calculate_coverage | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| calculate_coverage_paf | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| calculate_coverage_paf | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| calculate_coverage_paf | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/quay/ubuntu:latest | Optional | +| calculate_coverage_paf | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| compare_assemblies | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| kraken2_clean | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| kraken2_clean | **disk_size** | Int | GB of storage to request for VM used to run the kraken2 task. 
Increase this when using large (>30GB kraken2 databases such as the "k2_standard" database) | 100 | Optional | +| kraken2_clean | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.1.2-no-db | Optional | +| kraken2_clean | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | +| kraken2_raw | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| kraken2_raw | **disk_size** | Int | GB of storage to request for VM used to run the kraken2 task. Increase this when using large (>30GB kraken2 databases such as the "k2_standard" database) | 100 | Optional | +| kraken2_raw | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.1.2-no-db | Optional | +| kraken2_raw | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | +| krona_clean | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| krona_clean | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| krona_clean | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/krona:2.7.1--pl526_5 | Optional | +| krona_clean | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| krona_raw | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| krona_raw | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| krona_raw | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/krona:2.7.1--pl526_5 | Optional | +| krona_raw | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| metaspades | **kmers** | String | Kmer list to use with metaspades. 
If not provided metaspades automatically sets this value | | Optional | +| metaspades | **metaspades_opts** | String | Additional arguments to pass to metaspades task | | Optional | +| minimap2_assembly | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| minimap2_assembly | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| minimap2_assembly | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/minimap2:2.22 | Optional | +| minimap2_assembly | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| minimap2_assembly | **query2** | File | Internal component. Do not modify. | | Optional | +| minimap2_reads | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| minimap2_reads | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| minimap2_reads | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/minimap2:2.22 | Optional | +| minimap2_reads | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| quast | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| quast | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| quast | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/quast:5.0.2 | Optional | +| quast | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| read_QC_trim | **adapters** | File | Adapter file to be trimmed by trimmomatic | | Optional | +| read_QC_trim | **bbduck_mem** | Int | Memory to use with bbduck | 8 | Optional | +| read_QC_trim | **call_midas** | Boolean | Optional to run Midas on input data | FALSE | Optional | +| read_QC_trim | **fastp_args** | String | Fastp-specific options 
that you might choose, see | | Optional | +| read_QC_trim | **midas_db** | File | A Midas database in .tar.gz format | gs://theiagen-public-files-rp/terra/theiaprok-files/midas/midas_db_v1.2.tar.gz | Optional | +| read_QC_trim | **phix** | File | | | Optional | +| read_QC_trim | **read_processing** | String | | | Optional | +| read_QC_trim | **read_qc** | String | Allows the user to decide between fastq_scan (default) and fastqc for the evaluation of read quality. | fastq_scan | Optional | +| read_QC_trim | **target_organism** | String | Internal component. Do not modify. | | Optional | +| read_QC_trim | **trim_min_length** | Int | | | Optional | +| read_QC_trim | **trim_window_size** | Int | | | Optional | +| read_QC_trim | **trimmomatic_args** | String | | | Optional | +| retrieve_aligned_contig_paf | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| retrieve_aligned_contig_paf | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| retrieve_aligned_contig_paf | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/seqkit:2.3.1 | Optional | +| retrieve_aligned_contig_paf | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| retrieve_aligned_pe_reads_sam | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| retrieve_aligned_pe_reads_sam | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| retrieve_aligned_pe_reads_sam | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/samtools:1.17 | Optional | +| retrieve_aligned_pe_reads_sam | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| retrieve_unaligned_pe_reads_sam | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| retrieve_unaligned_pe_reads_sam | **disk_size** | Int | Amount 
of storage (in GB) to allocate to the task | 100 | Optional | +| retrieve_unaligned_pe_reads_sam | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/samtools:1.17 | Optional | +| retrieve_unaligned_pe_reads_sam | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| sam_to_sorted_bam | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| sam_to_sorted_bam | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| sam_to_sorted_bam | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/samtools:1.17 | Optional | +| sam_to_sorted_bam | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| semibin | **cpu** | Int | Number of CPUs to allocate to the task | 6 | Optional | +| semibin | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| semibin | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/semibin:2.0.2--pyhdfd78af_0 | Optional | +| semibin | **environment** | String | Environment model to use. Options: • human_gut
• dog_gut
• ocean
• soil
• cat_gut
• human_oral
• mouse_gut
• pig_gut
• built_environment
• wastewater
• chicken_caecum
- global | global | Optional | +| semibin | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| semibin | **min_length** | Int | Minimum contig length for binning | 1000 | Optional | +| semibin | **ratio** | Float | If the ratio of the number of base pairs of contigs between 1000-2500 bp smaller than this value, the minimal length will be set as 1000bp, otherwise 2500bp. | 0.05 | Optional | +| sort_bam_assembly_correction | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| sort_bam_assembly_correction | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| sort_bam_assembly_correction | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/samtools:1.17 | Optional | +| sort_bam_assembly_correction | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| theiameta_illumina_pe | **kraken2_db** | File | A Kraken2 database in .tar.gz format | gs://theiagen-public-files-rp/terra/theiaprok-files/k2_standard_08gb_20230605.tar.gz | Optional | +| theiameta_illumina_pe | **output_additional_files** | Boolean | Output additional files such as aligned and unaligned reads to reference | FALSE | Optional | +| theiameta_illumina_pe | **reference** | File | Reference file for consensus calling, in FASTA format | | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Workflow Tasks + +??? task "`versioning`: Version Capture for TheiaMeta" + + The `versioning` task captures the workflow version from the GitHub (code repository) version. + + !!! 
techdetails "Version Capture Technical details" + + | | Links | + | --- | --- | + | Task | [task_versioning.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/task_versioning.wdl) | + +#### Read Cleaning and QC + +??? task "`HRRT`: Human Host Sequence Removal" + + All reads of human origin **are removed**, including their mates, by using NCBI's [**human read removal tool (HRRT)**](https://github.com/ncbi/sra-human-scrubber). + + HRRT is based on the [SRA Taxonomy Analysis Tool](https://doi.org/10.1186/s13059-021-02490-0) and employs a k-mer database constructed of k-mers from Eukaryota derived from all human RefSeq records with any k-mers found in non-Eukaryota RefSeq records subtracted from the database. + + !!! techdetails "NCBI-Scrub Technical Details" + + | | Links | + | --- | --- | + | Task | [task_ncbi_scrub.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_ncbi_scrub.wdl) | + | Software Source Code | [NCBI Scrub on GitHub](https://github.com/ncbi/sra-human-scrubber) | + | Software Documentation | | + +??? task "`read_QC_trim`: Read Quality Trimming, Adapter Removal, Quantification, and Identification" + + `read_QC_trim` is a sub-workflow within TheiaMeta that removes low-quality reads, low-quality regions of reads, and sequencing adapters to improve data quality. It uses a number of tasks, described below. + + **Read quality trimming** + + Either `trimmomatic` or `fastp` can be used for read-quality trimming. Trimmomatic is used by default. Both tools trim low-quality regions of reads with a sliding window (with a window size of `trim_window_size`), cutting once the average quality within the window falls below `trim_quality_trim_score`. They will both discard the read if it is trimmed below `trim_minlen`. 
+ + If fastp is selected for analysis, fastp also implements the additional read-trimming steps indicated below: + + | **Parameter** | **Explanation** | + | --- | --- | + | -g | enables polyG tail trimming | + | -5 20 | enables read end-trimming | + | -3 20 | enables read end-trimming | + | --detect_adapter_for_pe | enables adapter-trimming **only for paired-end reads** | + + **Adapter removal** + + The `BBDuk` task removes adapters from sequence reads. To do this: + + - [Repair](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/repair-guide/) from the [BBTools](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/) package reorders reads in paired fastq files to ensure the forward and reverse reads of a pair are in the same position in the two fastq files. + - [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) (*"Bestus Bioinformaticus" Decontamination Using Kmers*) is then used to trim the adapters and filter out all reads that have a 31-mer match to [PhiX](https://emea.illumina.com/products/by-type/sequencing-kits/cluster-gen-sequencing-reagents/phix-control-v3.html), which is commonly added to Illumina sequencing runs to monitor and/or improve overall run quality. + + ??? toggle "What are adapters and why do they need to be removed?" + Adapters are manufactured oligonucleotide sequences attached to DNA fragments during the library preparation process. In Illumina sequencing, these adapter sequences are required for attaching reads to flow cells. You can read more about Illumina adapters [here](https://emea.support.illumina.com/bulletins/2020/06/illumina-adapter-portfolio.html). For genome analysis, it's important to remove these sequences since they're not actually from your sample. If you don't remove them, the downstream analysis may be affected. 
+ + **Read Quantification** + + There are two methods for read quantification to choose from: [`fastq-scan`](https://github.com/rpetit3/fastq-scan) (default) or [`fastqc`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). Both quantify the forward and reverse reads in FASTQ files. In TheiaProk_Illumina_PE, they also provide the total number of read pairs. This task is run once with raw reads as input and once with clean reads as input. If QC has been performed correctly, you should expect **fewer** clean reads than raw reads. `fastqc` also provides a graphical visualization of the read quality. + + **Read Identification (optional)** + + The `MIDAS` task is for the identification of reads to detect contamination with non-target taxa. This task is optional and turned off by default. It can be used by setting the `call_midas` input variable to `true`. + + The MIDAS tool was originally designed for metagenomic sequencing data but has been co-opted for use with bacterial isolate WGS methods. It can be used to detect contamination present in raw sequencing data by estimating bacterial species abundance in bacterial isolate WGS data. If a secondary genus is detected above a relative frequency of 0.01 (1%), then the sample should fail QC and be investigated further for potential contamination. + + This task is similar to those used in commercial software, BioNumerics, for estimating secondary species abundance. + + ??? toggle "How are the MIDAS output columns determined?" 
+ + Example MIDAS report in the ****`midas_report` column: + + | species_id | count_reads | coverage | relative_abundance | + | --- | --- | --- | --- | + | Salmonella_enterica_58156 | 3309 | 89.88006645 | 0.855888033 | + | Salmonella_enterica_58266 | 501 | 11.60606061 | 0.110519371 | + | Salmonella_enterica_53987 | 99 | 2.232896237 | 0.021262881 | + | Citrobacter_youngae_61659 | 46 | 0.995216227 | 0.009477003 | + | Escherichia_coli_58110 | 5 | 0.123668877 | 0.001177644 | + + MIDAS report column descriptions: + + - species_id: species identifier + - count_reads: number of reads mapped to marker genes + - coverage: estimated genome-coverage (i.e. read-depth) of species in metagenome + - relative_abundance: estimated relative abundance of species in metagenome + + The value in the `midas_primary_genus` column is derived by ordering the rows in order of "relative_abundance" and identifying the genus of top species in the "species_id" column (Salmonella). The value in the `midas_secondary_genus` column is derived from the genus of the second-most prevalent genus in the "species_id" column (Citrobacter). The `midas_secondary_genus_abundance` column is the "relative_abundance" of the second-most prevalent genus (0.009477003). The `midas_secondary_genus_coverage` is the "coverage" of the second-most prevalent genus (0.995216227). + + !!! techdetails "read_QC_trim Technical Details" + + | | Links | + | --- | --- | + | Sub-workflow | [wf_read_QC_trim.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/workflows/wf_read_QC_trim.wdl) | + | Tasks | [task_fastp.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_fastp.wdl)
[task_trimmomatic.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_trimmomatic.wdl#L3) (PE subtask)
[task_bbduk.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_bbduk.wdl)
[task_fastq_scan.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_fastq_scan.wdl#L3) (PE subtask)
[task_midas.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/taxon_id/task_midas.wdl)
[task_kraken2.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/taxon_id/task_kraken2.wdl) | + | Software Source Code | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](https://github.com/usadellab/Trimmomatic); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2)| + | Software Documentation | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic); [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2/wiki) | + | Original Publication(s) | *[Trimmomatic: a flexible trimmer for Illumina sequence data](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4103590/)
*[fastp: an ultra-fast all-in-one FASTQ preprocessor](https://academic.oup.com/bioinformatics/article/34/17/i884/5093234?login=false)
*[An integrated metagenomics pipeline for strain profiling reveals novel patterns of bacterial transmission and biogeography](https://pubmed.ncbi.nlm.nih.gov/27803195/)
*[Improved metagenomic analysis with Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | + +??? task "`kraken`: Taxonomic Classification" + + Kraken2 is a bioinformatics tool originally designed for metagenomic applications. It has additionally proven valuable for validating taxonomic assignments and checking contamination of single-species (e.g. bacterial isolate, eukaryotic isolate, viral isolate, etc.) whole genome sequence data. + + Kraken2 is run on the set of raw reads, provided as input, as well as the set of clean reads that are resulted from the `read_QC_trim` workflow + + !!! info "Database-dependent" + The Kraken2 software is database-dependent and **taxonomic assignments are highly sensitive to the database used**. An appropriate database should contain the expected organism(s) (e.g. _Escherichia coli_) and other taxa that may be present in the reads (e.g. _Citrobacter freundii_, a common contaminant). + + !!! techdetails "Kraken2 Technical Details" + + | | Links | + | --- | --- | + | Task | [task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/task_kraken2.wdl) | + | Software Source Code | [Kraken2 on GitHub](https://github.com/DerrickWood/kraken2/) | + | Software Documentation | | + | Original Publication(s) | [Improved metagenomic analysis with Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | + +#### Assembly + +??? task "`metaspades`: _De Novo_ Metagenomic Assembly" + + While metagenomics has emerged as a technology of choice for analyzing bacterial populations, the assembly of metagenomic data remains challenging. A dedicated metagenomic assembly algorithm is necessary to circumvent the challenge of interpreting variation. metaSPAdes addresses various challenges of metagenomic assembly by capitalizing on computational ideas that proved to be useful in assemblies of single cells and highly polymorphic diploid genomes. + + !!! 
techdetails "MetaSPAdes Technical Details" + + | | Links | + | --- | --- | + | Task | [task_metaspades.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_metaspades.wdl) | + | Software Source Code | [SPAdes on GitHub](https://github.com/ablab/spades) | + | Software Documentation | | + | Original Publication(s) | [metaSPAdes: a new versatile metagenomic assembler](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5411777/) | + +??? task "`minimap2`: Assembly Alignment and Contig Filtering (if a reference is provided)" + + If a reference genome is provided through the **`reference`** optional input, the assembly produced with `metaspades` will be mapped to the reference genome with `minimap2`. The contigs which align to the reference are retrieved and returned in the **`assembly_fasta`** output. + +#### Assembly QC + +??? task "`quast`: Assembly Quality Assessment" + + QUAST stands for QUality ASsessment Tool. It evaluates genome/metagenome assemblies by computing various metrics without a reference being necessary. It includes useful metrics such as number of contigs, length of the largest contig and N50. + + !!! techdetails "QUAST Technical Details" + + | | Links | + | --- | --- | + | Task | [task_quast.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_quast.wdl) | + | Software Source Code | [QUAST on GitHub](https://github.com/ablab/quast) | + | Software Documentation | | + | Original Publication(s) | [QUAST: quality assessment tool for genome assemblies](https://academic.oup.com/bioinformatics/article/29/8/1072/228832) | + +#### Binning + +??? task "`semibin2`: Metagenomic binning (if a reference is NOT provided)" + + If no reference genome is provided through the **`reference`** optional input, the assembly produced with `metaspades` will be binned with `semibin2`, a command-line tool for metagenomic binning with deep learning. 
+ +### Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| assembly_fasta | File | Final assembly (MAG) | +| assembly_length | Int | Length of final assembly in basepairs | +| assembly_mean_coverage | Float | Mean depth of coverage of the final assembly | +| average_read_length | Float | Average read length of the clean reads | +| bbduk_docker | String | Docker image for bbduk | +| bedtools_docker | String | Docker image for bedtools | +| bedtools_version | String | Version of bedtools | +| contig_number | Int | Number of contigs in final assembly | +| fastp_html_report | File | Report file for fastp in HTML format | +| fastp_version | String | Version of fastp used | +| fastq_scan_docker | String | Docker image of fastq_scan | +| fastq_scan_num_reads_clean_pairs | String | Number of read pairs after cleaning as calculated by fastq_scan | +| fastq_scan_num_reads_clean1 | Int | Number of forward reads after cleaning as calculated by fastq_scan | +| fastq_scan_num_reads_clean2 | Int | Number of reverse reads after cleaning as calculated by fastq_scan | +| fastq_scan_num_reads_raw_pairs | String | Number of input read pairs as calculated by fastq_scan | +| fastq_scan_num_reads_raw1 | Int | Number of input forward reads as calculated by fastq_scan | +| fastq_scan_num_reads_raw2 | Int | Number of input reverse reads as calculated by fastq_scan | +| fastq_scan_version | String | fastq_scan version | +| fastqc_clean1_html | File | Graphical visualization of clean forward read quality from fastqc to open in an internet browser | +| fastqc_clean2_html | File | Graphical visualization of clean reverse read quality from fastqc to open in an internet browser | +| fastqc_docker | String | Docker container used for fastqc | +| fastqc_num_reads_clean_pairs | String | Number of read pairs after cleaning by fastqc | +| fastqc_num_reads_clean1 | Int | Number of forward reads after cleaning by fastqc | +| fastqc_num_reads_clean2 | Int | Number of reverse reads 
after cleaning by fastqc | +| fastqc_num_reads_raw_pairs | String | Number of input read pairs by fastqc | +| fastqc_num_reads_raw1 | Int | Number of input forward reads by fastqc | +| fastqc_num_reads_raw2 | Int | Number of input reverse reads by fastqc | +| fastqc_raw1_html | File | Graphical visualization of raw forward read quality from fastqc to open in an internet browser | +| fastqc_raw2_html | File | Graphical visualization of raw reverse read quality from fastqc to open in an internet browser | +| fastqc_version | String | Version of fastqc software used | +| kraken2_docker | String | Docker image of kraken2 | +| kraken2_percent_human_clean | Float | Percentage of human-classified reads in the sample's clean reads | +| kraken2_percent_human_raw | Float | Percentage of human-classified reads in the sample's raw reads | +| kraken2_report_clean | File | Full Kraken report for the sample's clean reads | +| kraken2_report_raw | File | Full Kraken report for the sample's raw reads | +| kraken2_version | String | Version of kraken | +| krona_docker | String | Docker image of Krona | +| krona_html_clean | File | The KronaPlot after reads are cleaned | +| krona_html_raw | File | The KronaPlot before reads are cleaned | +| krona_version | String | Version of Krona | +| largest_contig | Int | Largest contig size | +| metaspades_docker | String | Docker image of metaspades | +| metaspades_version | String | Version of metaspades | +| minimap2_docker | String | Docker image of minimap2 | +| minimap2_version | String | Version of minimap2 | +| ncbi_scrub_docker | String | Docker image for NCBI's HRRT | +| percent_coverage | Float | Percentage coverage of the reference genome provided | +| percentage_mapped_reads | Float | Percentage of mapped reads to the assembly | +| pilon_docker | String | Docker image for pilon | +| pilon_version | String | Version of pilon | +| quast_docker | String | Docker image of QUAST | +| quast_version | String | Version of QUAST | +| 
read1_clean | File | Clean forward reads file | +| read1_dehosted | File | Dehosted forward reads file | +| read1_mapped | File | Mapped forward reads to the assembly | +| read1_unmapped | File | Unmapped forwards reads to the assembly | +| read2_clean | File | Clean reverse reads file | +| read2_dehosted | File | Dehosted reverse reads file | +| read2_mapped | File | Mapped reverse reads to the assembly | +| read2_unmapped | File | Unmapped reverse reads to the assembly | +| samtools_docker | String | Docker image of samtools | +| samtools_version | String | Version of samtools | +| semibin_bins | Array[File] | Array of binned metagenomic assembled genome files | +| semibin_docker | String | Docker image of semibin | +| semibin_version | String | Semibin version used | +| theiameta_illumina_pe_analysis_date | String | Date of analysis | +| theiameta_illumina_pe_version | String | Version of workflow | +| trimmomatic_docker | String | Docker image of trimmomatic | +| trimmomatic_version | String | Version of trimmomatic used | + +## References + +> **Human read removal tool (HRRT)**: + +>**Trimmomatic:** Anthony M. Bolger and others, Trimmomatic: a flexible trimmer for Illumina sequence data, *Bioinformatics*, Volume 30, Issue 15, August 2014, Pages 2114–2120,  + +>**Fastq-Scan:** + +>**metaSPAdes:** Sergey Nurk and others, metaSPAdes: a new versatile metagenomic assembler, *Genome Res.* 2017 May; 27(5): 824–834.,  + +>**Pilon:** Bruce J. Walker and others. Pilon: An Integrated Tool for Comprehensive Microbial Variant Detection and Genome Assembly Improvement. *Plos One.* November 19, 2014. 
+ +>**Minimap2:** Heng Li, Minimap2: pairwise alignment for nucleotide sequences, *Bioinformatics*, Volume 34, Issue 18, September 2018, Pages 3094–3100,  + +>**QUAST:** Alexey Gurevich and others, QUAST: quality assessment tool for genome assemblies, *Bioinformatics*, Volume 29, Issue 8, April 2013, Pages 1072–1075,  + +>**Samtools:** Li, Heng, Bob Handsaker, Alec Wysoker, Tim Fennell, Jue Ruan, Nils Homer, Gabor Marth, Goncalo Abecasis, Richard Durbin, and 1000 Genome Project Data Processing Subgroup. 2009. The Sequence Alignment/Map format and SAMtools. Bioinformatics 25(16): 2078-2079. + +>**Bcftools:** Petr Danecek, James K Bonfield, Jennifer Liddle, John Marshall, Valeriu Ohan, Martin O Pollard, Andrew Whitwham, Thomas Keane, Shane A McCarthy, Robert M Davies, Heng Li. Twelve years of SAMtools and BCFtools. GigaScience, Volume 10, Issue 2, February 2021, giab008, + +>**Semibin2:** Shaojun Pan, Xing-Ming Zhao, Luis Pedro Coelho, SemiBin2: self-supervised contrastive learning leads to better MAGs for short- and long-read sequencing, *Bioinformatics*, Volume 39, Issue Supplement_1, June 2023, Pages i21–i29,  diff --git a/docs/workflows/genomic_characterization/theiaprok.md b/docs/workflows/genomic_characterization/theiaprok.md new file mode 100644 index 000000000..5ed33a0f1 --- /dev/null +++ b/docs/workflows/genomic_characterization/theiaprok.md @@ -0,0 +1,1976 @@ +# TheiaProk Workflow Series + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Genomic Characterization](../../workflows_overview/workflows_type.md/#genomic-characterization) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria) | PHB v2.2.0 | Yes, some optional features incompatible | Sample-level | + +## TheiaProk Workflows + +**The TheiaProk workflows are for the assembly, quality assessment, and characterization of bacterial genomes.** There are currently four 
TheiaProk workflows designed to accommodate different kinds of input data: + +1. Illumina paired-end sequencing **(TheiaProk_Illumina_PE**) +2. Illumina single-end sequencing (**TheiaProk_Illumina_SE)** +3. ONT sequencing (**TheiaProk_ONT**) +4. Genome assemblies (**TheiaProk_FASTA**) + +!!! caption "TheiaProk Workflow Diagram" + ![TheiaProk Workflow Diagram](../../assets/figures/TheiaProk.png) + +All input reads are processed through "[core tasks](#core-tasks-performed-for-all-taxa)" in the TheiaProk Illumina and ONT workflows. These undertake read trimming and assembly appropriate to the input data type. TheiaProk workflows subsequently launch default genome characterization modules for quality assessment, species identification, antimicrobial resistance gene detection, sequence typing, and more. **For some taxa identified, "taxa-specific sub-workflows" will be automatically activated, undertaking additional taxa-specific characterization steps.** When setting up each workflow, users may choose to use "optional tasks" as additions or alternatives to tasks run in the workflow by default. + +### Inputs + +!!! dna "" + ??? toggle "TheiaProk_Illumina_PE Input Read Data" + + The TheiaProk_Illumina_PE workflow takes in Illumina paired-end read data. Read file names should end with `.fastq` or `.fq`, with the optional addition of `.gz`. When possible, Theiagen recommends zipping files with [gzip](https://www.gnu.org/software/gzip/) before Terra uploads to minimize data upload time. + + By default, the workflow anticipates **2 x 150bp** reads (i.e. the input reads were generated using a 300-cycle sequencing kit). Modifications to the optional parameter for `trim_minlen` may be required to accommodate shorter read data, such as the 2 x 75bp reads generated using a 150-cycle sequencing kit. + + ??? toggle "TheiaProk_Illumina_SE Input Read Data" + + TheiaProk_Illumina_SE takes in Illumina single-end reads. 
Read file names should end with `.fastq` or `.fq`, with the optional addition of `.gz`. Theiagen highly recommends zipping files with [gzip](https://www.gnu.org/software/gzip/) before uploading to Terra to minimize data upload time & save on storage costs. + + By default, the workflow anticipates **1 x 35 bp** reads (i.e. the input reads were generated using a 70-cycle sequencing kit). Modifications to the optional parameter for `trim_minlen` may be required to accommodate longer read data. + + ??? toggle "TheiaProk_ONT Input Read Data" + + The TheiaProk_ONT workflow takes in base-called ONT read data. Read file names should end with `.fastq` or `.fq`, with the optional addition of `.gz`. When possible, Theiagen recommends zipping files with [gzip](https://www.gnu.org/software/gzip/) before uploading to Terra to minimize data upload time. + + **The ONT sequencing kit and base-calling approach can produce substantial variability in the amount and quality of read data. Genome assemblies produced by the TheiaProk_ONT workflow must be quality assessed before reporting results.** + + ??? toggle "TheiaProk_FASTA Input Assembly Data" + + The TheiaProk_FASTA workflow takes in assembly files in FASTA format. 
+ +| **Terra Task name** | **Variable** | **Type** | **Description** | **Default value** | **Terra Status** | **Workflow** | +|---|---|---|---|---|---|---| +| *workflow name | **samplename** | String | Name of sample to be analyzed | | Required | FASTA, ONT, PE, SE | +| theiaprok_fasta | **assembly_fasta** | File | Assembly file in fasta format | | Required | FASTA | +| theiaprok_illumina_pe | **read1** | File | Illumina forward read file in FASTQ file format (compression optional) | | Required | PE | +| theiaprok_illumina_pe | **read2** | File | Illumina reverse read file in FASTQ file format (compression optional) | | Required | PE | +| theiaprok_illumina_se | **read1** | File | Illumina forward read file in FASTQ file format (compression optional) | | Required | SE | +| theiaprok_ont | **read1** | File | Base-called ONT read file in FASTQ file format (compression optional) | | Required | ONT | +| *workflow name | **abricate_db** | String | Database to use with the Abricate tool. Options: NCBI, CARD, ARG-ANNOT, Resfinder, MEGARES, EcOH, PlasmidFinder, Ecoli_VF and VFDB | vfdb | Optional | FASTA, ONT, PE, SE | +| *workflow name | **call_abricate** | Boolean | Set to true to enable the Abricate task | FALSE | Optional | FASTA, ONT, PE, SE | +| *workflow name | **call_ani** | Boolean | Set to true to enable the ANI task | FALSE | Optional | FASTA, ONT, PE, SE | +| *workflow name | **call_kmerfinder** | Boolean | Set to true to enable the kmerfinder task | FALSE | Optional | FASTA, ONT, PE, SE | +| *workflow name | **call_plasmidfinder** | Boolean | Set to true to enable the plasmidfinder task | TRUE | Optional | FASTA, ONT, PE, SE | +| *workflow name | **call_resfinder** | Boolean | Set to true to enable the ResFinder task | FALSE | Optional | FASTA, ONT, PE, SE | +| *workflow name | **city** | String | Will be used in the "city" column in any taxon-specific tables created in the Export Taxon Tables task | | Optional | FASTA, ONT, PE, SE | +| *workflow name | 
**collection_date** | String | Will be used in the "collection_date" column in any taxon-specific tables created in the Export Taxon Tables task | | Optional | FASTA, ONT, PE, SE | +| *workflow name | **county** | String | Will be used in the "county" column in any taxon-specific tables created in the Export Taxon Tables task | | Optional | FASTA, ONT, PE, SE | +| *workflow name | **expected_taxon** | String | If provided, this input will override the taxonomic assignment made by GAMBIT and launch the relevant taxon-specific submodules. It will also modify the organism flag used by AMRFinderPlus. Example format: "Salmonella enterica" | | Optional | FASTA, ONT, PE, SE | +| *workflow name | **genome_annotation** | String | If set to "bakta", TheiaProk will use Bakta rather than Prokka to annotate the genome | prokka | Optional | FASTA, ONT, PE, SE | +| *workflow name | **genome_length** | Int | User-specified expected genome length to be used in genome statistics calculations | | Optional | ONT, PE, SE | +| *workflow name | **max_genome_length** | Int | Maximum genome length able to pass read screening. For TheiaProk_ONT, screening using max_genome_length is skipped by default. | 18040666 | Optional | ONT, PE, SE | +| *workflow name | **min_basepairs** | Int | Minimum number of base pairs able to pass read screening | 2241820 | Optional | ONT, PE, SE | +| *workflow name | **min_coverage** | Int | Minimum genome coverage able to pass read screening. Screening using min_coverage is skipped by default. | 5 | Optional | ONT | +| *workflow name | **min_coverage** | Int | Minimum genome coverage able to pass read screening | 10 | Optional | PE, SE | +| *workflow name | **min_genome_length** | Int | Minimum genome length able to pass read screening. For TheiaProk_ONT, screening using min_genome_length is skipped by default. 
| 100000 | Optional | ONT, PE, SE | +| *workflow name | **min_proportion** | Int | Minimum proportion of total reads in each read file to pass read screening | 40 | Optional | PE | +| *workflow name | **min_reads** | Int | Minimum number of reads to pass read screening | 5000 | Optional | ONT | +| *workflow name | **min_reads** | Int | Minimum number of reads to pass read screening | 7472 | Optional | PE, SE | +| *workflow name | **originating_lab** | String | Will be used in the "originating_lab" column in any taxon-specific tables created in the Export Taxon Tables task | | Optional | FASTA, ONT, PE, SE | +| *workflow name | **perform_characterization** | Boolean | Set to "false" if you want to only generate an assembly and relevant QC metrics and skip all characterization tasks | TRUE | Optional | FASTA, ONT, PE, SE | +| *workflow name | **qc_check_table** | File | TSV value with taxons for rows and QC values for columns; internal cells represent user-determined QC thresholds; if provided, turns on the QC Check task.
Click on the variable name for an example QC Check table | | Optional | FASTA, ONT, PE, SE | +| *workflow name | **run_id** | String | Will be used in the "run_id" column in any taxon-specific tables created in the Export Taxon Tables task | | Optional | FASTA, ONT, PE, SE | +| *workflow name | **seq_method** | String | Will be used in the "seq_id" column in any taxon-specific tables created in the Export Taxon Tables task | | Optional | FASTA, ONT, PE, SE | +| *workflow name | **skip_mash** | Boolean | If true, skips estimation of genome size and coverage in read screening steps. As a result, providing true also prevents screening using these parameters. | TRUE | Optional | ONT, SE | +| *workflow name | **skip_screen** | Boolean | Option to skip the read screening prior to analysis | FALSE | Optional | ONT, PE, SE | +| *workflow name | **taxon_tables** | File | File indicating data table names to copy samples of a particular taxon to | | Optional | FASTA, ONT, PE, SE | +| *workflow name | **terra_project** | String | The name of the Terra Project where you want the taxon tables written to | | Optional | FASTA, ONT, PE, SE | +| *workflow name | **terra_workspace** | String | The name of the Terra Workspace where you want the taxon tables written to | | Optional | FASTA, ONT, PE, SE | +| *workflow name | **trim_min_length** | Int | Specifies minimum length of each read after trimming to be kept | 25 | Optional | SE | +| *workflow name | **trim_min_length** | Int | Specifies minimum length of each read after trimming to be kept | 75 | Optional | PE | +| *workflow name | **trim_quality_min_score** | Int | Specifies the minimum average quality of bases in a sliding window to be kept | 20 | Optional | PE | +| *workflow name | **trim_quality_trim_score** | Int | Specifies the average quality of bases in a sliding window to be kept | 30 | Optional | SE | +| *workflow name | **trim_window_size** | Int | Specifies window size for trimming (the number of bases to average the 
quality across) | 4 | Optional | SE | +| *workflow name | **trim_window_size** | Int | Specifies window size for trimming (the number of bases to average the quality across) | 4 | Optional | PE | +| *workflow name | **zip** | String | Will be used in the "zip" column in any taxon-specific tables created in the Export Taxon Tables task | | Optional | FASTA, ONT, PE, SE | +| abricate | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | FASTA, ONT, PE, SE | +| abricate | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | FASTA, ONT, PE, SE | +| abricate | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/abricate:1.0.1-abaum-plasmid | Optional | FASTA, ONT, PE, SE | +| abricate | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | FASTA, ONT, PE, SE | +| abricate | **mincov** | Int | Minimum DNA %coverage for the Abricate task | 80 | Optional | FASTA, ONT, PE, SE | +| abricate | **minid** | Int | Minimum DNA %identity for the Abricate task | 80 | Optional | FASTA, ONT, PE, SE | +| amrfinderplus_task | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | FASTA, ONT, PE, SE | +| amrfinderplus_task | **detailed_drug_class** | Boolean | If set to true, amrfinderplus_amr_classes and amrfinderplus_amr_subclasses outputs will be created | FALSE | Optional | FASTA, ONT, PE, SE | +| amrfinderplus_task | **disk_size** | Boolean | Amount of storage (in GB) to allocate to the AMRFinderPlus task | 50 | Optional | FASTA, ONT, PE, SE | +| amrfinderplus_task | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/ncbi-amrfinderplus:3.12.8-2024-07-22.1 | Optional | FASTA, ONT, PE, SE | +| amrfinderplus_task | **hide_point_mutations** | Boolean | If set to true, point mutations are not reported | FALSE | Optional | FASTA, ONT, PE, SE | +| amrfinderplus_task | 
**memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | FASTA, ONT, PE, SE | +| amrfinderplus_task | **mincov** | Float | Minimum proportion of reference gene covered for a BLAST-based hit (Methods BLAST or PARTIAL)." Attribute should be a float ranging from 0-1, such as 0.6 (equal to 60% coverage) | 0.5 | Optional| FASTA, ONT, PE, SE | +| amrfinderplus_task | **minid** | Float | "Minimum identity for a blast-based hit hit (Methods BLAST or PARTIAL). -1 means use a curated threshold if it exists and 0.9 otherwise. Setting this value to something other than -1 will override any curated similarity cutoffs." Attribute should be a float ranging from 0-1, such as 0.95 (equal to 95% identity) | 0.9 | Optional | FASTA, ONT, PE, SE | +| amrfinderplus_task | **separate_betalactam_genes** | Boolean | Report beta-Lactam AMR genes separated out by all beta-lactam and the respective beta-lactam subclasses | FALSE | Optional | FASTA, ONT, PE, SE | +| ani | **ani_threshold** | Float | ANI value threshold must be surpassed in order to output the ani_top_species_match. If a genome does not surpass this threshold (and the percent_bases_aligned_threshold) then the ani_top_species_match output String will show a warning instead of a genus & species. 
| 80 | Optional | FASTA, ONT, PE, SE | +| ani | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | FASTA, ONT, PE, SE | +| ani | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | FASTA, ONT, PE, SE | +| ani | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/mummer:4.0.0-rgdv2 | Optional | FASTA, ONT, PE, SE | +| ani | **mash_filter** | Float | Mash distance threshold over which ANI is not calculated | 0.9 | Optional | FASTA, ONT, PE, SE | +| ani | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | FASTA, ONT, PE, SE | +| ani | **percent_bases_aligned_threshold** | Float | Threshold regarding the proportion of bases aligned between the query genome and reference genome. If a genome does not surpass this threshold (and the ani_threshold) then the ani_top_species_match output String will show a warning instead of a genus & species. | 70 | Optional | FASTA, ONT, PE, SE | +| ani | **ref_genome** | File | If not set, uses all 43 genomes in RGDv2 | | Optional | FASTA, ONT, PE, SE | +| bakta | **bakta_db** | File | Database of reference annotations (see https://github.com/oschwengers/bakta#database) | gs://theiagen-public-files-rp/terra/theiaprok-files/bakta_db_2022-08-29.tar.gz | Optional | FASTA, ONT, PE, SE | +| bakta | **bakta_opts** | String | Parameters to pass to bakta from https://github.com/oschwengers/bakta#usage | | Optional | FASTA, ONT, PE, SE | +| bakta | **compliant** | Boolean | If true, forces Genbank/ENA/DDBJ compliance | FALSE | Optional | FASTA, ONT, PE, SE | +| bakta | **cpu** | Int | Number of CPUs to allocate to the task | 8 | Optional | FASTA, ONT, PE, SE | +| bakta | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | FASTA, ONT, PE, SE | +| bakta | **docker** | String | The Docker container to use for the task | 
us-docker.pkg.dev/general-theiagen/biocontainers/bakta:1.5.1--pyhdfd78af_0 | Optional | FASTA, ONT, PE, SE | +| bakta | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | FASTA, ONT, PE, SE | +| bakta | **prodigal_tf** | File | Prodigal training file to use for CDS prediction by bakta | | Optional | FASTA, ONT, PE, SE | +| bakta | **proteins** | Boolean | | FALSE | Optional | FASTA, ONT, PE, SE | +| busco | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | FASTA, ONT, PE, SE | +| busco | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | FASTA, ONT, PE, SE | +| busco | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/ezlabgva/busco:v5.7.1_cv1 | Optional | FASTA, ONT, PE, SE | +| busco | **eukaryote** | Boolean | Assesses eukaryotic organisms, rather than prokaryotic organisms | FALSE | Optional | FASTA, ONT, PE, SE | +| busco | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | FASTA, ONT, PE, SE | +| cg_pipeline_clean | **cg_pipe_opts** | String | Options to pass to CG-Pipeline for clean read assessment | --fast | Optional | PE, SE | +| cg_pipeline_clean | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | PE, SE | +| cg_pipeline_clean | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | PE, SE | +| cg_pipeline_clean | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/lyveset:1.1.4f | Optional | PE, SE | +| cg_pipeline_clean | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | PE, SE | +| cg_pipeline_clean | **read2** | File | Internal component, do not modify | | Do not modify, Optional | SE | +| cg_pipeline_raw | **cg_pipe_opts** | String | Options to pass to CG-Pipeline for raw read assessment | --fast | Optional | PE, SE 
| +| cg_pipeline_raw | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | PE, SE | +| cg_pipeline_raw | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | PE, SE | +| cg_pipeline_raw | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/lyveset:1.1.4f | Optional | PE, SE | +| cg_pipeline_raw | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | PE, SE | +| cg_pipeline_raw | **read2** | File | Internal component, do not modify | | Do not modify, Optional | SE | +| clean_check_reads | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | ONT, PE, SE | +| clean_check_reads | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT, PE, SE | +| clean_check_reads | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/bactopia/gather_samples:2.0.2 | Optional | ONT, PE, SE | +| clean_check_reads | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | ONT, PE, SE | +| clean_check_reads | **organism** | String | Internal component, do not modify | | Do not modify, Optional | ONT, PE, SE | +| clean_check_reads | **workflow_series** | String | Internal component, do not modify | | Do not modify, Optional | ONT, PE, SE | +| dragonflye | **assembler** | String | The assembler to use in dragonflye. 
Three options: raven, miniasm, flye | flye | Optional | ONT | +| dragonflye | **assembler_options** | String | Enables extra assembler options in quote | | Optional | ONT | +| dragonflye | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT | +| dragonflye | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT | +| dragonflye | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/dragonflye:1.0.14--hdfd78af_0 | Optional | ONT | +| dragonflye | **illumina_polishing_rounds** | Int | Number of polishing rounds to conduct with Illumina data | 1 | Optional | ONT | +| dragonflye | **illumina_read1** | File | If Illumina reads are provided, Dragonflye will perform Illumina polishing | | Optional | ONT | +| dragonflye | **illumina_read2** | File | If Illumina reads are provided, Dragonflye will perform Illumina polishing | | Optional | ONT | +| dragonflye | **medaka_model** | String | The model of medaka to use for assembly | r941_min_hac_g507 | Optional | ONT | +| dragonflye | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | ONT | +| dragonflye | **polishing_rounds** | Int | The number of polishing rounds to conduct (without Illumina) | 1 | Optional | ONT | +| dragonflye | **use_pilon_illumina_polisher** | Boolean | Set to true to use Pilon to polish Illumina reads | FALSE | Optional | ONT | +| dragonflye | **use_racon** | Boolean | Set to true to use Racon to polish instead of Medaka | FALSE | Optional | ONT | +| export_taxon_tables | **asembly_fasta** | File | Internal component, do not modify | | Do not modify, Optional | FASTA | +| export_taxon_tables | **bbduk_docker** | String | The Docker container to use for the task | | Do not modify, Optional | FASTA, ONT | +| export_taxon_tables | **cg_pipeline_docker** | String | The Docker container to use for the task | | Do not modify, Optional | FASTA, ONT | 
+| export_taxon_tables | **cg_pipeline_report_clean** | File | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | +| export_taxon_tables | **cg_pipeline_report_raw** | File | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | +| export_taxon_tables | **combined_mean_q_clean** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| export_taxon_tables | **combined_mean_q_raw** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| export_taxon_tables | **combined_mean_readlength_clean** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| export_taxon_tables | **combined_mean_readlength_raw** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| export_taxon_tables | **contigs_gfa** | File | Internal component, do not modify | | Do not modify, Optional | FASTA | +| export_taxon_tables | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | FASTA, ONT, PE, SE | +| export_taxon_tables | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | FASTA, ONT, PE, SE | +| export_taxon_tables | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-03-16 | Optional | FASTA, ONT, PE, SE | +| export_taxon_tables | **dragonflye_version** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **emmtypingtool_docker** | String | The Docker container to use for the task | | Do not modify, Optional | FASTA, ONT, SE | +| export_taxon_tables | **emmtypingtool_emm_type** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| export_taxon_tables | **emmtypingtool_results_xml** | File | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, 
SE | +| export_taxon_tables | **emmtypingtool_version** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| export_taxon_tables | **est_coverage_clean** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA | +| export_taxon_tables | **est_coverage_raw** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA | +| export_taxon_tables | **fastp_version** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | +| export_taxon_tables | **fastq_scan_version** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | +| export_taxon_tables | **hicap_docker** | String | The Docker container to use for the task | | Do not modify, Optional | FASTA, ONT | +| export_taxon_tables | **hicap_genes** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | +| export_taxon_tables | **hicap_results_tsv** | File | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | +| export_taxon_tables | **hicap_serotype** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | +| export_taxon_tables | **hicap_version** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | +| export_taxon_tables | **kmc_est_genome_length** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **kmc_kmer_stats** | File | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **kmc_version** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **kraken2_docker** | String | The Docker container to use for the task | | Do not modify, Optional | FASTA, ONT, PE | +| export_taxon_tables | **kraken2_report** | String | Internal component, do not modify | | Do not modify, 
Optional | FASTA, ONT | +| export_taxon_tables | **kraken2_version** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | +| export_taxon_tables | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | FASTA, ONT, PE, SE | +| export_taxon_tables | **midas_docker** | String | The Docker container to use for the task | | Do not modify, Optional | FASTA, ONT | +| export_taxon_tables | **midas_primary_genus** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | +| export_taxon_tables | **midas_report** | File | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | +| export_taxon_tables | **midas_secondary_genus** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | +| export_taxon_tables | **midas_secondary_genus_abundance** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | +| export_taxon_tables | **midas_secondary_genus_coverage** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | +| export_taxon_tables | **nanoplot_docker** | String | The Docker container to use for the task | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **nanoplot_html_clean** | File | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **nanoplot_html_raw** | File | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **nanoplot_num_reads_clean1** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **nanoplot_num_reads_raw1** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **nanoplot_r1_est_coverage_clean** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| 
export_taxon_tables | **nanoplot_r1_est_coverage_raw** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **nanoplot_r1_mean_q_clean** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **nanoplot_r1_mean_q_raw** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **nanoplot_r1_mean_readlength_clean** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **nanoplot_r1_mean_readlength_raw** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **nanoplot_r1_median_q_clean** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **nanoplot_r1_median_q_raw** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **nanoplot_r1_median_readlength_clean** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **nanoplot_r1_median_readlength_raw** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **nanoplot_r1_n50_clean** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **nanoplot_r1_n50_raw** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **nanoplot_r1_stdev_readlength_clean** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **nanoplot_r1_stdev_readlength_raw** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **nanoplot_tsv_clean** | File | Internal component, do 
not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **nanoplot_tsv_raw** | File | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **nanoplot_version** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **nanoq_version** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **num_reads_clean_pairs** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| export_taxon_tables | **num_reads_clean1** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA | +| export_taxon_tables | **num_reads_clean2** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| export_taxon_tables | **num_reads_raw_pairs** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| export_taxon_tables | **num_reads_raw1** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA | +| export_taxon_tables | **num_reads_raw2** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| export_taxon_tables | **r1_mean_q_clean** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE | +| export_taxon_tables | **r1_mean_q_raw** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA | +| export_taxon_tables | **r1_mean_readlength_clean** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE | +| export_taxon_tables | **r1_mean_readlength_raw** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA | +| export_taxon_tables | **r2_mean_q_raw** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| export_taxon_tables | **r2_mean_readlength_raw** | Float 
| Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| export_taxon_tables | **rasusa_version** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **read1** | File | Internal component, do not modify | | Do not modify, Optional | FASTA | +| export_taxon_tables | **read1_clean** | File | Internal component, do not modify | | Do not modify, Optional | FASTA | +| export_taxon_tables | **read2** | File | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| export_taxon_tables | **read2_clean** | File | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| export_taxon_tables | **seroba_ariba_identity** | String | Internal component, do not modify | | Do not modify, Optional | ONT, SE | +| export_taxon_tables | **seroba_ariba_serotype** | String | Internal component, do not modify | | Do not modify, Optional | ONT, SE | +| export_taxon_tables | **seroba_details** | File | Internal component, do not modify | | Do not modify, Optional | ONT, SE | +| export_taxon_tables | **seroba_docker** | String | The Docker container to use for the task | | Do not modify, Optional | ONT, SE | +| export_taxon_tables | **seroba_serotype** | String | Internal component, do not modify | | Do not modify, Optional | ONT, SE | +| export_taxon_tables | **seroba_version** | String | Internal component, do not modify | | Do not modify, Optional | ONT, SE | +| export_taxon_tables | **shigeifinder_cluster_reads** | String | Internal component, do not modify | | Do not modify, Optional | ONT | +| export_taxon_tables | **shigeifinder_docker_reads** | String | Internal component, do not modify | | Do not modify, Optional | ONT | +| export_taxon_tables | **shigeifinder_H_antigen_reads** | String | Internal component, do not modify | | Do not modify, Optional | ONT | +| export_taxon_tables | **shigeifinder_ipaH_presence_absence_reads** | String | 
Internal component, do not modify | | Do not modify, Optional | ONT | +| export_taxon_tables | **shigeifinder_notes_reads** | String | Internal component, do not modify | | Do not modify, Optional | ONT | +| export_taxon_tables | **shigeifinder_num_virulence_plasmid_genes** | String | Internal component, do not modify | | Do not modify, Optional | ONT | +| export_taxon_tables | **shigeifinder_O_antigen_reads** | String | Internal component, do not modify | | Do not modify, Optional | ONT | +| export_taxon_tables | **shigeifinder_report_reads** | String | Internal component, do not modify | | Do not modify, Optional | ONT | +| export_taxon_tables | **shigeifinder_serotype_reads** | String | Internal component, do not modify | | Do not modify, Optional | ONT | +| export_taxon_tables | **shigeifinder_version_reads** | String | Internal component, do not modify | | Do not modify, Optional | ONT | +| export_taxon_tables | **shovill_pe_version** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| export_taxon_tables | **shovill_se_version** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE | +| export_taxon_tables | **srst2_vibrio_biotype** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | +| export_taxon_tables | **srst2_vibrio_ctxA** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | +| export_taxon_tables | **srst2_vibrio_detailed_tsv** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | +| export_taxon_tables | **srst2_vibrio_ompW** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | +| export_taxon_tables | **srst2_vibrio_serogroup** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | +| export_taxon_tables | **srst2_vibrio_toxR** | String | Internal component, do not modify | | Do not modify, Optional | 
FASTA, ONT | +| export_taxon_tables | **srst2_vibrio_version** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | +| export_taxon_tables | **theiaprok_fasta_analysis_date** | String | Internal component, do not modify | | Do not modify, Optional | ONT, PE, SE | +| export_taxon_tables | **theiaprok_fasta_version** | String | Internal component, do not modify | | Do not modify, Optional | ONT, PE, SE | +| export_taxon_tables | **theiaprok_illumina_pe_analysis_date** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| export_taxon_tables | **theiaprok_illumina_pe_version** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| export_taxon_tables | **theiaprok_illumina_se_analysis_date** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE | +| export_taxon_tables | **theiaprok_illumina_se_version** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE | +| export_taxon_tables | **theiaprok_ont_analysis_date** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **theiaprok_ont_version** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **tiptoft_plasmid_replicon_fastq** | File | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **tiptoft_plasmid_replicon_genes** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **tiptoft_version** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| export_taxon_tables | **trimmomatic_version** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | +| gambit | **cpu** | Int | Number of CPUs to allocate to the task | 8 | 
Optional | FASTA, ONT, PE, SE | +| gambit | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | FASTA, ONT, PE, SE | +| gambit | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/gambit:1.0.0 | Optional | FASTA, ONT, PE, SE | +| gambit | **gambit_db_genomes** | File | User-provided database of assembled query genomes; requires complementary signatures file. If not provided, uses default database, "/gambit-db" | gs://gambit-databases-rp/2.0.0/gambit-metadata-2.0.0-20240628.gdb | Optional | FASTA, ONT, PE, SE | +| gambit | **gambit_db_signatures** | File | User-provided signatures file; requires complementary genomes file. If not specified, the file from the docker container will be used. | gs://gambit-databases-rp/2.0.0/gambit-signatures-2.0.0-20240628.gs | Optional | FASTA, ONT, PE, SE | +| gambit | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | FASTA, ONT, PE, SE | +| kmerfinder | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | FASTA, ONT, PE, SE | +| kmerfinder | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | FASTA, ONT, PE, SE | +| kmerfinder | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/kmerfinder:3.0.2--hdfd78af_0 | Optional | FASTA, ONT, PE, SE | +| kmerfinder | **kmerfinder_args** | String | Kmerfinder additional arguments | | Optional | FASTA, ONT, PE, SE | +| kmerfinder | **kmerfinder_db** | String | Bacterial database for KmerFinder | gs://theiagen-public-files-rp/terra/theiaprok-files/kmerfinder_bacteria_20230911.tar.gz | Optional | FASTA, ONT, PE, SE | +| kmerfinder | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **abricate_abaum_docker_image** | String | The Docker container to use for the task | 
us-docker.pkg.dev/general-theiagen/staphb/abricate:1.0.1-abaum-plasmid | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **abricate_abaum_mincov** | Int | Minimum DNA percent coverage | | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **abricate_abaum_minid** | Int | Minimum DNA percent identity; set to 95 because there is a strict threshold of 95% identity for typing purposes | 95 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **abricate_vibrio_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/abricate:1.0.1-abaum-plasmid | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **abricate_vibrio_mincov** | Int | Minimum DNA percent coverage | 80 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **abricate_vibrio_minid** | Int | Minimum DNA percent identity | 80 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **agrvate_agr_typing_only** | Boolean | Set to true to skip agr operon extraction and frameshift detection | False | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **agrvate_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/agrvate:1.0.2--hdfd78af_0 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **assembly_only** | Boolean | Internal component, do not modify | | Do not modify, Optional | ONT, PE, SE | +| merlin_magic | **call_poppunk** | Boolean | If "true", runs PopPUNK for GPSC cluster designation for S. pneumoniae | TRUE | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **call_shigeifinder_reads_input** | Boolean | If set to "true", the ShigEiFinder task will run again but using read files as input instead of the assembly file. Input is shown but not used for TheiaProk_FASTA. 
| FALSE | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **cauris_cladetyper_docker_image** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **cladetyper_kmer_size** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **cladetyper_ref_clade1** | File | *Provide an empty file if running TheiaProk on the command-line | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **cladetyper_ref_clade1_annotated** | File | *Provide an empty file if running TheiaProk on the command-line | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **cladetyper_ref_clade2** | File | *Provide an empty file if running TheiaProk on the command-line | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **cladetyper_ref_clade2_annotated** | File | *Provide an empty file if running TheiaProk on the command-line | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **cladetyper_ref_clade3** | File | *Provide an empty file if running TheiaProk on the command-line | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **cladetyper_ref_clade3_annotated** | File | *Provide an empty file if running TheiaProk on the command-line | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **cladetyper_ref_clade4** | File | *Provide an empty file if running TheiaProk on the command-line | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **cladetyper_ref_clade4_annotated** | File | *Provide an empty file if running TheiaProk on the command-line | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **cladetyper_ref_clade5** | File | *Provide an empty file if running TheiaProk on the command-line | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **cladetyper_ref_clade5_annotated** | File | *Provide an empty file if running TheiaProk on the 
command-line | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **clockwork_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/cdcgov/varpipe_wgs_with_refs:2bc7234074bd53d9e92a1048b0485763cd9bbf6f4d12d5a1cc82bfec8ca7d75e | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **ectyper_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/ectyper:1.0.0--pyhdfd78af_1 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **ectyper_hpcov** | Int | Minimum percent coverage required for an H antigen allele match | 50 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **ectyper_hpid** | Int | Percent identity required for an H antigen allele match | 95 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **ectyper_opcov** | Int | Minimum percent coverage required for an O antigen allele match | 90 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **ectyper_opid** | Int | Percent identity required for an O antigen allele match | 90 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **ectyper_print_alleles** | Boolean | Set to true to print the allele sequences as the final column | False | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **ectyper_verify** | Boolean | Set to true to enable E.
coli species verification | False | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **emmtypingtool_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/emmtypingtool:0.0.1 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **genotyphi_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/mykrobe:0.11.0 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **hicap_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/hicap:1.0.3--py_0 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **kaptive_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/kaptive:2.0.3 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **kaptive_low_gene_id** | Float | Percent identity threshold for what counts as a low identity match in the gene BLAST search | 95 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **kaptive_min_coverage** | Float | Minimum required percent coverage for the gene BLAST search via tBLASTn | 80 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **kaptive_min_identity** | Float | Minimum required percent identity for the gene BLAST search via tBLASTn | 90 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **kaptive_start_end_margin** | Int | Determines flexibility in identifying the start and end of a locus - if this value is 10, a locus match that is missing the first 8 base pairs will still count as capturing the start of the locus | 10 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **kleborate_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/kleborate:2.2.0 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **kleborate_min_coverage** | Float | Minimum alignment percent coverage for main results | 80 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | 
**kleborate_min_identity** | Float | Minimum alignment percent identity for main results | 90 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **kleborate_min_kaptive_confidence** | String | {None,Low,Good,High,Very_high,Perfect} Minimum Kaptive confidence to call K/O loci - confidence levels below this will be reported as unknown | Good | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **kleborate_min_spurious_coverage** | Float | Minimum alignment percent coverage for spurious results | 40 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **kleborate_min_spurious_identity** | Float | Minimum alignment percent identity for spurious results | 80 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **kleborate_skip_kaptive** | Boolean | Equivalent to --kaptive_k --kaptive_o | False | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **kleborate_skip_resistance** | Boolean | Set to true to turn on resistance genes screening (default: no resistance gene screening) | False | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **legsta_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/legsta:0.5.1--hdfd78af_2 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **lissero_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/lissero:0.4.9--py_0 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **lissero_min_cov** | Float | Minimum coverage of the gene to accept a match | 95 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **lissero_min_id** | Float | Minimum percent identity to accept a match | 95 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **meningotype_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/meningotype:0.8.5--pyhdfd78af_0 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **ngmaster_docker_image** | String | The Docker container to use for the 
task | us-docker.pkg.dev/general-theiagen/staphb/ngmaster:1.0.0 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **ont_data** | Boolean | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| merlin_magic | **paired_end** | Boolean | Internal component, do not modify | | Do not modify, Optional | ONT, PE | +| merlin_magic | **pasty_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/pasty:1.0.3 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **pasty_min_coverage** | Int | Minimum coverage of a O-antigen to be considered for serogrouping by pasty | 95 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **pasty_min_pident** | Int | Minimum percent identity for a blast hit to be considered for serogrouping | 95 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **pbptyper_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/pbptyper:1.0.4 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **pbptyper_min_coverage** | Int | Minimum percent coverage to count a hit | 90 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **pbptyper_min_pident** | Int | Minimum percent identity to count a hit | 90 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **poppunk_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/poppunk:2.4.0 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **poppunk_gps_clusters_csv** | File | Poppunk database file *Provide an empty or local file if running TheiaProk on the command-line | gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_clusters.csv | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **poppunk_gps_dists_npy** | File | Poppunk database file *Provide an empty or local file if running TheiaProk on the command-line | gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.dists.npy | Optional | FASTA, ONT, 
PE, SE | +| merlin_magic | **poppunk_gps_dists_pkl** | File | Poppunk database file *Provide an empty or local file if running TheiaProk on the command-line | gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.dists.pkl | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **poppunk_gps_external_clusters_csv** | File | Poppunk database file *Provide an empty or local file if running TheiaProk on the command-line | gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_external_clusters.csv | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **poppunk_gps_fit_npz** | File | Poppunk database file *Provide an empty or local file if running TheiaProk on the command-line | gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_fit.npz | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **poppunk_gps_fit_pkl** | File | Poppunk database file *Provide an empty or local file if running TheiaProk on the command-line | gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_fit.pkl | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **poppunk_gps_graph_gt** | File | Poppunk database file *Provide an empty or local file if running TheiaProk on the command-line | gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_graph.gt | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **poppunk_gps_h5** | File | Poppunk database file *Provide an empty or local file if running TheiaProk on the command-line | gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.h5 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **poppunk_gps_qcreport_txt** | File | Poppunk database file *Provide an empty or local file if running TheiaProk on the command-line | gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_qcreport.txt | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **poppunk_gps_refs** | File | Poppunk database file *Provide an empty or local file if running TheiaProk on the command-line | 
gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.refs | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **poppunk_gps_refs_dists_npy** | File | Poppunk database file *Provide an empty or local file if running TheiaProk on the command-line | gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.refs.dists.npy | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **poppunk_gps_refs_dists_pkl** | File | Poppunk database file *Provide an empty or local file if running TheiaProk on the command-line | gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.refs.dists.pkl | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **poppunk_gps_refs_graph_gt** | File | Poppunk database file *Provide an empty or local file if running TheiaProk on the command-line | gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6refs_graph.gt | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **poppunk_gps_refs_h5** | File | Poppunk database file *Provide an empty or local file if running TheiaProk on the command-line | gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.refs.h5 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **poppunk_gps_unword_clusters_csv** | File | Poppunk database file *Provide an empty or local file if running TheiaProk on the command-line | gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_unword_clusters.csv | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **read1** | File | Internal component, do not modify | | Do not modify, Optional | FASTA | +| merlin_magic | **read2** | File | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| merlin_magic | **seqsero2_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/seqsero2:1.2.1 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **seroba_docker_image** | String | The Docker container to use for the task | 
us-docker.pkg.dev/general-theiagen/staphb/seroba:1.0.2 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **serotypefinder_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/serotypefinder:2.0.1 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **shigatyper_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/shigatyper:2.0.5 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **shigeifinder_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/shigeifinder:1.3.5 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **sistr_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/sistr_cmd:1.1.1--pyh864c0ab_2 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **sistr_use_full_cgmlst_db** | Boolean | Set to true to use the full set of cgMLST alleles which can include highly similar alleles. 
By default the smaller "centroid" alleles or representative alleles are used for each marker | False | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **snippy_base_quality** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **snippy_gene_query_docker_image** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **snippy_map_qual** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **snippy_maxsoft** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **snippy_min_coverage** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **snippy_min_frac** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **snippy_min_quality** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **snippy_query_gene** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | +| merlin_magic | **snippy_reference_afumigatus** | File | *Provide an empty file if running TheiaProk on the command-line | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **snippy_reference_calbicans** | File | *Provide an empty file if running TheiaProk on the command-line | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **snippy_reference_cryptoneo** | File | *Provide an empty file if running TheiaProk on the command-line | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **snippy_variants_docker_image** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **sonneityping_docker_image** | String | The Docker container to use for the task | 
us-docker.pkg.dev/general-theiagen/staphb/mykrobe:0.12.1 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **sonneityping_mykrobe_opts** | String | Additional options for mykrobe in sonneityping | | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **spatyper_do_enrich** | Boolean | Set to true to enable PCR product enrichment | False | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **spatyper_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/spatyper:0.3.3--pyhdfd78af_3 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **srst2_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/srst2:0.2.0-vcholerae | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **srst2_gene_max_mismatch** | Int | Maximum number of mismatches for SRST2 to call a gene as present | 2000 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **srst2_max_divergence** | Int | Maximum divergence, in percentage, for SRST2 to call a gene as present | 20 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **srst2_min_cov** | Int | Minimum breadth of coverage for SRST2 to call a gene as present | 80 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **srst2_min_depth** | Int | Minimum depth of coverage for SRST2 to call a gene as present | 5 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **srst2_min_edge_depth** | Int | Minimum edge depth for SRST2 to call a gene as present | 2 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **staphopia_sccmec_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/staphopia-sccmec:1.0.0--hdfd78af_0 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbp_parser_coverage_regions_bed** | File | A bed file that lists the regions to be considered for QC | | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbp_parser_coverage_threshold** | Int | The minimum coverage for 
a region to pass QC in tbp_parser | 100 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbp_parser_debug** | Boolean | Activate the debug mode on tbp_parser; increases logging outputs | FALSE | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbp_parser_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:1.6.0 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbp_parser_min_depth** | Int | Minimum depth for a variant to pass QC in tbp_parser | 10 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbp_parser_min_frequency** | Float | The minimum frequency for a mutation to pass QC | 0.1 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbp_parser_min_read_support** | Int | The minimum read support for a mutation to pass QC | 10 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbp_parser_operator** | String | Fills the "operator" field in the tbp_parser output files | Operator not provided | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbp_parser_output_seq_method_type** | String | Fills out the "seq_method" field in the tbp_parser output files | Sequencing method not provided | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbprofiler_additional_outputs** | Boolean | If set to "true", activates the tbp_parser module and results in more outputs, including tbp_parser_looker_report_csv, tbp_parser_laboratorian_report_csv, tbp_parser_lims_report_csv, tbp_parser_coverage_report, and tbp_parser_genome_percent_coverage | FALSE | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbprofiler_cov_frac_threshold** | Int | A cutoff used to calculate the fraction of the region covered by ≤ this value | 1 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbprofiler_custom_db** | 
File | TBProfiler uses by default the TBDB database; if you have a custom database you wish to use, you must provide a custom database in this field and set tbprofiler_run_custom_db to true | | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbprofiler_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/tbprofiler:4.4.2 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbprofiler_mapper** | String | The mapping tool used in TBProfiler to align the reads to the reference genome; see TBProfiler’s original documentation for available options. | bwa | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbprofiler_min_af** | Float | The minimum allele frequency to call a variant | 0.1 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbprofiler_min_af_pred** | Float | The minimum allele frequency to use a variant for resistance prediction | 0.1 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbprofiler_min_depth** | Int | The minimum depth for a variant to be called. | 10 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbprofiler_run_custom_db** | Boolean | TBProfiler uses by default the TBDB database; if you have a custom database you wish to use, you must set this value to true and provide a custom database in the tbprofiler_custom_db field | FALSE | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbprofiler_variant_caller** | String | Select a different variant caller for TBProfiler to use by writing it in this block; see TBProfiler’s original documentation for available options. 
| freebayes | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbprofiler_variant_calling_params** | String | Enter additional variant calling parameters in this free text input to customize how the variant caller works in TBProfiler | None | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **theiaeuk** | Boolean | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| merlin_magic | **virulencefinder_coverage_threshold** | Float | The threshold for minimum coverage | | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **virulencefinder_database** | String | The specific database to use | virulence_ecoli | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **virulencefinder_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/virulencefinder:2.0.4 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **virulencefinder_identity_threshold** | Float | The threshold for minimum blast identity | | Optional | FASTA, ONT, PE, SE | +| nanoplot_clean | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT | +| nanoplot_clean | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT | +| nanoplot_clean | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/nanoplot:1.40.0 | Optional | ONT | +| nanoplot_clean | **max_length** | Int | Maximum read length for nanoplot | 100000 | Optional | ONT | +| nanoplot_clean | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | ONT | +| nanoplot_raw | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT | +| nanoplot_raw | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT | +| nanoplot_raw | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/nanoplot:1.40.0 | Optional | ONT | 
+| nanoplot_raw | **max_length** | Int | Maximum read length for nanoplot | 100000 | Optional | ONT | +| nanoplot_raw | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | ONT | +| plasmidfinder | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | FASTA, ONT, PE, SE | +| plasmidfinder | **database** | String | User-specified database | | Optional | FASTA, ONT, PE, SE | +| plasmidfinder | **database_path** | String | Path to user-specified database | | Optional | FASTA, ONT, PE, SE | +| plasmidfinder | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | FASTA, ONT, PE, SE | +| plasmidfinder | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/plasmidfinder:2.1.6 | Optional | FASTA, ONT, PE, SE | +| plasmidfinder | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | FASTA, ONT, PE, SE | +| plasmidfinder | **method_path** | String | Path to files for a user-specified method to use (blast or kma) | | Optional | FASTA, ONT, PE, SE | +| plasmidfinder | **min_cov** | Float | Threshold for minimum coverage, default threshold from PlasmidFinder CLI tool is used (0.60) | 0.6 | Optional | FASTA, ONT, PE, SE | +| plasmidfinder | **threshold** | Float | Threshold for mininum blast identity, default threshold from PlasmidFinder CLI tool is used (0.90). 
This default differs from the default of the PlasmidFinder webtool (0.95) | 0.9 | Optional | FASTA, ONT, PE, SE | +| prokka | **compliant** | Boolean | Forces Genbank/ENA/DDJB compliant headers in Prokka output files | TRUE | Optional | FASTA, ONT, PE, SE | +| prokka | **cpu** | Int | Number of CPUs to allocate to the task | 8 | Optional | FASTA, ONT, PE, SE | +| prokka | **disk_size** | String | Amount of storage (in GB) to allocate to the PlasmidFinder task | 100 | Optional | FASTA, ONT, PE, SE | +| prokka | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/prokka:1.14.5 | Optional | FASTA, ONT, PE, SE | +| prokka | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | FASTA, ONT, PE, SE | +| prokka | **prodigal_tf** | File | https://github.com/tseemann/prokka#option---prodigaltf | | Optional | FASTA, ONT, PE, SE | +| prokka | **prokka_arguments** | String | Any additional https://github.com/tseemann/prokka#command-line-options | | Optional | FASTA, ONT, PE, SE | +| prokka | **proteins** | Boolean | FASTA file of trusted proteins for Prokka to first use for annotations | FALSE | Optional | FASTA, ONT, PE, SE | +| qc_check_task | **assembly_length_unambiguous** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| qc_check_task | **assembly_mean_coverage** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| qc_check_task | **combined_mean_q_clean** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| qc_check_task | **combined_mean_q_raw** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| qc_check_task | **combined_mean_readlength_clean** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| qc_check_task | **combined_mean_readlength_raw** | Float | Internal 
component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| qc_check_task | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | FASTA, ONT, PE, SE | +| qc_check_task | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | FASTA, ONT, PE, SE | +| qc_check_task | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-03-16" | Optional | FASTA, ONT, PE, SE | +| qc_check_task | **est_coverage_clean** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA | +| qc_check_task | **est_coverage_raw** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA | +| qc_check_task | **kraken_human** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| qc_check_task | **kraken_human_dehosted** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| qc_check_task | **kraken_sc2** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| qc_check_task | **kraken_sc2_dehosted** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| qc_check_task | **kraken_target_organism** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| qc_check_task | **kraken_target_organism_dehosted** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| qc_check_task | **meanbaseq_trim** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| qc_check_task | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | FASTA, ONT, PE, SE | +| qc_check_task | **midas_secondary_genus_abundance** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | +| qc_check_task | 
**midas_secondary_genus_coverage** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | +| qc_check_task | **num_reads_clean1** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA | +| qc_check_task | **num_reads_clean2** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| qc_check_task | **num_reads_raw1** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA | +| qc_check_task | **num_reads_raw2** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| qc_check_task | **number_Degenerate** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| qc_check_task | **number_N** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| qc_check_task | **percent_reference_coverage** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| qc_check_task | **r1_mean_q_clean** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA | +| qc_check_task | **r1_mean_q_raw** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA | +| qc_check_task | **r1_mean_readlength_clean** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA | +| qc_check_task | **r1_mean_readlength_raw** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA | +| qc_check_task | **r2_mean_q_clean** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| qc_check_task | **r2_mean_q_raw** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| qc_check_task | **r2_mean_readlength_clean** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | +| qc_check_task | **r2_mean_readlength_raw** | Float | Internal component, do not 
modify | | Do not modify, Optional | FASTA, ONT, SE | +| qc_check_task | **sc2_s_gene_mean_coverage** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| qc_check_task | **sc2_s_gene_percent_coverage** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| qc_check_task | **vadr_num_alerts** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| quast | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | FASTA, ONT, PE, SE | +| quast | **disk_size** | String | Amount of storage (in GB) to allocate to the Quast task | 100 | Optional | FASTA, ONT, PE, SE | +| quast | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/quast:5.0.2 | Optional | FASTA, ONT, PE, SE | +| quast | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | FASTA, ONT, PE, SE | +| quast | **min_contig_length** | Int | Lower threshold for a contig length in bp. 
Shorter contigs won’t be taken into account | 500 | Optional | FASTA, ONT, PE, SE | +| raw_check_reads | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | ONT, PE, SE | +| raw_check_reads | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT, PE, SE | +| raw_check_reads | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/bactopia/gather_samples:2.0.2 | Optional | ONT, PE, SE | +| raw_check_reads | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | ONT, PE, SE | +| raw_check_reads | **organism** | String | Internal component, do not modify | | Do not modify, Optional | ONT, PE, SE | +| raw_check_reads | **workflow_series** | String | Internal component, do not modify | | Do not modify, Optional | ONT, PE, SE | +| read_QC_trim | **adapters** | File | A file containing the sequence of the adapters used during library preparation, used in the BBDuk task | | Optional | PE, SE | +| read_QC_trim | **bbduk_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | PE, SE | +| read_QC_trim | **call_kraken** | Boolean | Set to true to launch Kraken2; if true, you must provide a kraken_db | FALSE | Optional | ONT, PE, SE | +| read_QC_trim | **call_midas** | Boolean | Set to true to launch Midas | TRUE | Optional | PE, SE | +| read_QC_trim | **downsampling_coverage** | Float | The depth to downsample to with Rasusa | 150 | Optional | ONT | +| read_QC_trim | **fastp_args** | String | Additional arguments to pass to fastp | -g -5 20 -3 20 | Optional | SE | +| read_QC_trim | **fastp_args** | String | Additional arguments to pass to fastp | --detect_adapter_for_pe -g -5 20 -3 20 | Optional | PE | +| read_QC_trim | **kraken_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT, PE, SE | +| read_QC_trim | **kraken_db** | File | Kraken2 database file; must be provided if call_kraken is 
true | | Optional | ONT, PE, SE | +| read_QC_trim | **kraken_disk_size** | Int | GB of storage to request for VM used to run the kraken2 task. Increase this when using large (>30GB kraken2 databases such as the "k2_standard" database) | 100 | Optional | ONT, PE, SE | +| read_QC_trim | **kraken_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | ONT, PE, SE | +| read_QC_trim | **max_length** | Int | Internal component, do not modify | | Do not modify, Optional | ONT | +| read_QC_trim | **midas_db** | File | Midas database file | gs://theiagen-large-public-files-rp/terra/theiaprok-files/midas/midas_db_v1.2.tar.gz | Optional | PE, SE | +| read_QC_trim | **min_length** | Int | Internal component, do not modify | | Do not modify, Optional | ONT | +| read_QC_trim | **phix** | File | A file containing the phix used during Illumina sequencing; used in the BBDuk task | | Optional | PE, SE | +| read_QC_trim | **read_processing** | String | Read trimming software to use, either "trimmomatic" or "fastp" | trimmomatic | Optional | PE, SE | +| read_QC_trim | **read_qc** | String | Allows the user to decide between fastq_scan (default) and fastqc for the evaluation of read quality. | fastq_scan | Optional | PE, SE | +| read_QC_trim | **run_prefix** | String | Internal component, do not modify | | Do not modify, Optional | ONT | +| read_QC_trim | **target_organism** | String | This string is searched for in the kraken2 outputs to extract the read percentage | | Optional | ONT, PE, SE | +| read_QC_trim | **trimmomatic_args** | String | Additional arguments to pass to trimmomatic. "-phred33" specifies the Phred Q score encoding which is almost always phred33 with modern sequence data. 
| -phred33 | Optional | PE, SE | +| resfinder_task | **acquired** | Boolean | Set to true to tell ResFinder to identify acquired resistance genes | TRUE | Optional | FASTA, ONT, PE, SE | +| resfinder_task | **call_pointfinder** | Boolean | Set to true to enable detection of point mutations. | FALSE | Optional | FASTA, ONT, PE, SE | +| resfinder_task | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | FASTA, ONT, PE, SE | +| resfinder_task | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | FASTA, ONT, PE, SE | +| resfinder_task | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/resfinder:4.1.11 | Optional | FASTA, ONT, PE, SE | +| resfinder_task | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | FASTA, ONT, PE, SE | +| resfinder_task | **min_cov** | Float | Minimum coverage breadth of a gene for it to be identified | 0.5 | Optional | FASTA, ONT, PE, SE | +| resfinder_task | **min_id** | Float | Minimum identity for ResFinder to identify a gene | 0.9 | Optional | FASTA, ONT, PE, SE | +| shovill_pe | **assembler** | String | Assembler to use (spades, skesa, velvet or megahit), see https://github.com/tseemann/shovill#--assembler | skesa | Optional | PE | +| shovill_pe | **assembler_options** | String | Assembler-specific options that you might choose, see https://github.com/tseemann/shovill#--opts | | Optional | PE | +| shovill_pe | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | PE | +| shovill_pe | **depth** | Int | User specified depth of coverage for downsampling (see https://github.com/tseemann/shovill#--depth and https://github.com/tseemann/shovill#main-steps) | 150 | Optional | PE | +| shovill_pe | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | PE | +| shovill_pe | **docker** | String | The Docker container to use for the task | 
us-docker.pkg.dev/general-theiagen/staphb/shovill:1.1.0 | Optional | PE | +| shovill_pe | **kmers** | String | User-specified Kmer length to override choice made by Shovill, see https://github.com/tseemann/shovill#--kmers | Auto | Optional | PE | +| shovill_pe | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | PE | +| shovill_pe | **min_contig_length** | Int | Minimum contig length to keep in final assembly | 200 | Optional | PE | +| shovill_pe | **min_coverage** | Float | Minimum contig coverage to keep in final assembly | 2 | Optional | PE | +| shovill_pe | **nocorr** | Boolean | Disable correction of minor assembly errors by Shovill (see https://github.com/tseemann/shovill#main-steps) | FALSE | Optional | PE | +| shovill_pe | **noreadcorr** | Boolean | Disable correction of sequencing errors in reads by Shovill (seehttps://github.com/tseemann/shovill#main-steps) | FALSE | Optional | PE | +| shovill_pe | **nostitch** | Boolean | Disable read stitching by Shovill (see https://github.com/tseemann/shovill#main-steps) | FALSE | Optional | PE | +| shovill_pe | **trim** | Boolean | Enable adaptor trimming (see https://github.com/tseemann/shovill#main-steps) | FALSE | Optional | PE | +| shovill_se | **assembler** | String | Assembler to use (spades, skesa, velvet or megahit), see https://github.com/tseemann/shovill#--assembler | skesa | Optional | SE | +| shovill_se | **assembler_options** | String | Assembler-specific options that you might choose, see https://github.com/tseemann/shovill#--opts | | Optional | SE | +| shovill_se | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | SE | +| shovill_se | **depth** | Int | User specified depth of coverage for downsampling (see https://github.com/tseemann/shovill#--depth and https://github.com/tseemann/shovill#main-steps) | 150 | Optional | SE | +| shovill_se | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | SE | +| 
shovill_se | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/shovill:1.1.0 | Optional | SE | +| shovill_se | **kmers** | String | User-specified Kmer length to override choice made by Shovill, see https://github.com/tseemann/shovill#--kmers | auto | Optional | SE | +| shovill_se | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | SE | +| shovill_se | **min_contig_length** | Int | Minimum contig length to keep in final assembly | 200 | Optional | SE | +| shovill_se | **min_coverage** | Float | Minimum contig coverage to keep in final assembly | 2 | Optional | SE | +| shovill_se | **nocorr** | Boolean | Disable correction of minor assembly errors by Shovill (see https://github.com/tseemann/shovill#main-steps) | FALSE | Optional | SE | +| shovill_se | **noreadcorr** | Boolean | Disable correction of sequencing errors in reads by Shovill (seehttps://github.com/tseemann/shovill#main-steps) | FALSE | Optional | SE | +| shovill_se | **trim** | Boolean | Enable adaptor trimming (see https://github.com/tseemann/shovill#main-steps) | FALSE | Optional | SE | +| ts_mlst | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | FASTA, ONT, PE, SE | +| ts_mlst | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | FASTA, ONT, PE, SE | +| ts_mlst | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/mlst:2.23.0-2024-08-01 | Optional | FASTA, ONT, PE, SE | +| ts_mlst | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | FASTA, ONT, PE, SE | +| ts_mlst | **mincov** | Float | Minimum % breadth of coverage to report an MLST allele | 10 | Optional | FASTA, ONT, PE, SE | +| ts_mlst | **minid** | Float | Minimum % identity to known MLST gene to report an MLST allele | 95 | Optional | FASTA, ONT, PE, SE | +| ts_mlst | **minscore** | Float | Minimum 
https://github.com/tseemann/mlst#scoring-system to assign an MLST profile | 50 | Optional | FASTA, ONT, PE, SE | +| ts_mlst | **nopath** | Boolean | true = use mlst --nopath. If set to false, filename paths are not stripped from FILE column in output TSV | TRUE | Optional | FASTA, ONT, PE, SE | +| ts_mlst | **scheme** | String | Don’t autodetect the MLST scheme; force this scheme on all inputs (see https://www.notion.so/TheiaProk-Workflow-Series-68c34aca2a0240ef94fef0acd33651b9?pvs=21 for accepted strings) | None | Optional | FASTA, ONT, PE, SE | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | FASTA, ONT, PE, SE | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | FASTA, ONT, PE, SE | + +!!! tip "Skip Characterization" + Ever wanted to skip characterization? Now you can! Set the optional input `perform_characterization` to **`false`** to only generate an assembly and run assembly QC. + +### Core Tasks (performed for all taxa) + +??? task "`versioning`: Version Capture for TheiaProk" + + The `versioning` task captures the workflow version from the GitHub (code repository) version. + + !!! techdetails "Version Capture Technical details" + + | | Links | + | --- | --- | + | Task | [task_versioning.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/task_versioning.wdl) | + +??? task "`screen`: Total Raw Read Quantification and Genome Size Estimation" + + The [`screen`](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_screen.wdl) task ensures the quantity of sequence data is sufficient to undertake genomic analysis. It uses bash commands for quantification of reads and base pairs, and [mash](https://mash.readthedocs.io/en/latest/index.html) sketching to estimate the genome size and its coverage. 
At each step, the results are assessed relative to pass/fail criteria and thresholds that may be defined by optional user inputs. Samples that do not meet these criteria will not be processed further by the workflow: + + 1. Total number of reads: A sample will fail the read screening task if its total number of reads is less than or equal to `min_reads`. + 2. The proportion of basepairs reads in the forward and reverse read files: A sample will fail the read screening if fewer than `min_proportion` basepairs are in either the reads1 or read2 files. + 3. Number of basepairs: A sample will fail the read screening if there are fewer than `min_basepairs` basepairs + 4. Estimated genome size: A sample will fail the read screening if the estimated genome size is smaller than `min_genome_size` or bigger than `max_genome_size`. + 5. Estimated genome coverage: A sample will fail the read screening if the estimated genome coverage is less than the `min_coverage`. + + Read screening is undertaken on both the raw and cleaned reads. The task may be skipped by setting the `skip_screen` variable to true. + + Default values vary between the PE and SE workflow. The rationale for these default values can be found below. If two default values are shown, the first is for Illumina workflows and the second is for ONT. 
+ + | Variable | Default Value | Rationale | + | --- | --- | --- | + | `skip_screen` | false | Set to false to avoid waste of compute resources processing insufficient data | + | `min_reads` | 7472 or 5000 | Calculated from the minimum number of base pairs required for 20x coverage of Nasuia deltocephalinicola genome, the smallest known bacterial genome as of 2019-08-07 (112,091 bp), divided by 300 (the longest Illumina read length) or 5000 (estimate of ONT read length) | + | `min_basepairs` | 2241820 | Should be greater than 20x coverage of Nasuia deltocephalinicola, the smallest known bacterial genome (112,091 bp) | + | `min_genome_length` | 100000 | Based on the Nasuia deltocephalinicola genome - the smallest known bacterial genome (112,091 bp) | + | `max_genome_length` | 18040666 | Based on the Minicystis rosea genome, the biggest known bacterial genome (16,040,666 bp), plus an additional 2 Mbp to cater for potential extra genomic material | + | `min_coverage` | 10 or 5 | A bare-minimum average per base coverage across the genome required for genome characterization. Note, a higher per base coverage would be required for high-quality phylogenetics. | + | `min_proportion` | 40 | Neither read1 nor read2 files should have less than 40% of the total number of reads. For paired-end data only | + + !!! techdetails "Screen Technical Details" + There is a single WDL task for read screening that contains two separate sub-tasks, one used for PE data and the other for SE data. The `screen` task is run twice, once for raw reads and once for clean reads. 
+ + | | TheiaProk_Illumina_PE | TheiaProk_Illumina_SE and TheiaProk_ONT | + | --- | --- | --- | + | Task | [task_screen.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_screen.wdl#L3) (PE sub-task) | [task_screen.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_screen.wdl#L147) (SE sub-task) | + +#### Illumina Data Core Tasks + +??? task "`read_QC_trim`: Read Quality Trimming, Adapter Removal, Quantification, and Identification" + + `read_QC_trim` is a sub-workflow within TheiaProk that removes low-quality reads, low-quality regions of reads, and sequencing adapters to improve data quality. It uses a number of tasks, described below. + + **Read quality trimming** + + Either `trimmomatic` or `fastp` can be used for read-quality trimming. Trimmomatic is used by default. Both tools trim low-quality regions of reads with a sliding window (with a window size of `trim_window_size`), cutting once the average quality within the window falls below `trim_quality_trim_score`. They will both discard the read if it is trimmed below `trim_minlen`. + + If fastp is selected for analysis, fastp also implements the additional read-trimming steps indicated below: + + | **Parameter** | **Explanation** | + | --- | --- | + | -g | enables polyG tail trimming | + | -5 20 | enables read end-trimming | + | -3 20 | enables read end-trimming | + | --detect_adapter_for_pe | enables adapter-trimming **only for paired-end reads** | + + **Adapter removal** + + The `BBDuk` task removes adapters from sequence reads. To do this: + + - [Repair](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/repair-guide/) from the [BBTools](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/) package reorders reads in paired fastq files to ensure the forward and reverse reads of a pair are in the same position in the two fastq files. 
+ - [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) (*"Bestus Bioinformaticus" Decontamination Using Kmers*) is then used to trim the adapters and filter out all reads that have a 31-mer match to [PhiX](https://emea.illumina.com/products/by-type/sequencing-kits/cluster-gen-sequencing-reagents/phix-control-v3.html), which is commonly added to Illumina sequencing runs to monitor and/or improve overall run quality. + + ??? toggle "What are adapters and why do they need to be removed?" + Adapters are manufactured oligonucleotide sequences attached to DNA fragments during the library preparation process. In Illumina sequencing, these adapter sequences are required for attaching reads to flow cells. You can read more about Illumina adapters [here](https://emea.support.illumina.com/bulletins/2020/06/illumina-adapter-portfolio.html). For genome analysis, it's important to remove these sequences since they're not actually from your sample. If you don't remove them, the downstream analysis may be affected. + + **Read Quantification** + + There are two methods for read quantification to choose from: [`fastq-scan`](https://github.com/rpetit3/fastq-scan) (default) or [`fastqc`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). Both quantify the forward and reverse reads in FASTQ files. In TheiaProk_Illumina_PE, they also provide the total number of read pairs. This task is run once with raw reads as input and once with clean reads as input. If QC has been performed correctly, you should expect **fewer** clean reads than raw reads. `fastqc` also provides a graphical visualization of the read quality. + + **Read Identification (optional)** + + The `MIDAS` task is for the identification of reads to detect contamination with non-target taxa. This task is optional and turned off by default. It can be used by setting the `call_midas` input variable to `true`. 
+ + The MIDAS tool was originally designed for metagenomic sequencing data but has been co-opted for use with bacterial isolate WGS methods. It can be used to detect contamination present in raw sequencing data by estimating bacterial species abundance in bacterial isolate WGS data. If a secondary genus is detected above a relative frequency of 0.01 (1%), then the sample should fail QC and be investigated further for potential contamination. + + This task is similar to those used in commercial software, BioNumerics, for estimating secondary species abundance. + + ??? toggle "How are the MIDAS output columns determined?" + + Example MIDAS report in the `midas_report` column: + + | species_id | count_reads | coverage | relative_abundance | + | --- | --- | --- | --- | + | Salmonella_enterica_58156 | 3309 | 89.88006645 | 0.855888033 | + | Salmonella_enterica_58266 | 501 | 11.60606061 | 0.110519371 | + | Salmonella_enterica_53987 | 99 | 2.232896237 | 0.021262881 | + | Citrobacter_youngae_61659 | 46 | 0.995216227 | 0.009477003 | + | Escherichia_coli_58110 | 5 | 0.123668877 | 0.001177644 | + + MIDAS report column descriptions: + + - species_id: species identifier + - count_reads: number of reads mapped to marker genes + - coverage: estimated genome-coverage (i.e. read-depth) of species in metagenome + - relative_abundance: estimated relative abundance of species in metagenome + + The value in the `midas_primary_genus` column is derived by ordering the rows in order of "relative_abundance" and identifying the genus of top species in the "species_id" column (Salmonella). The value in the `midas_secondary_genus` column is derived from the genus of the second-most prevalent genus in the "species_id" column (Citrobacter). The `midas_secondary_genus_abundance` column is the "relative_abundance" of the second-most prevalent genus (0.009477003). The `midas_secondary_genus_coverage` is the "coverage" of the second-most prevalent genus (0.995216227). 
+ + Alternatively to `MIDAS`, the `Kraken2` task can also be turned on through setting the `call_kraken` input variable as `true` for the identification of reads to detect contamination with non-target taxa. + + Kraken2 is a bioinformatics tool originally designed for metagenomic applications. It has additionally proven valuable for validating taxonomic assignments and checking contamination of single-species (e.g. bacterial isolate) whole genome sequence data. A database must be provided if this optional module is activated, through the kraken_db optional input. A list of suggested databases can be found on [Kraken2 standalone documentation](../standalone/kraken2.md). + + !!! techdetails "read_QC_trim Technical Details" + + | | Links | + | --- | --- | + | Sub-workflow | [wf_read_QC_trim.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/workflows/wf_read_QC_trim.wdl) | + | Tasks | [task_fastp.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_fastp.wdl)
[task_trimmomatic.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_trimmomatic.wdl)
[task_bbduk.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_bbduk.wdl)
[task_fastq_scan.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_fastq_scan.wdl)
[task_midas.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/taxon_id/task_midas.wdl)
[task_kraken2.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/taxon_id/task_kraken2.wdl) | + | Software Source Code | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](https://github.com/usadellab/Trimmomatic); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2)| + | Software Documentation | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic); [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2/wiki) | + | Original Publication(s) | *[Trimmomatic: a flexible trimmer for Illumina sequence data](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4103590/)
*[fastp: an ultra-fast all-in-one FASTQ preprocessor](https://academic.oup.com/bioinformatics/article/34/17/i884/5093234?login=false)
*[An integrated metagenomics pipeline for strain profiling reveals novel patterns of bacterial transmission and biogeography](https://pubmed.ncbi.nlm.nih.gov/27803195/)
*[Improved metagenomic analysis with Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | + +??? task "`CG-Pipeline`: Assessment of Read Quality, and Estimation of Genome Coverage" + + The `cg_pipeline` task generates metrics about read quality and estimates the coverage of the genome using the "run_assembly_readMetrics.pl" script from [CG-Pipeline](https://github.com/lskatz/CG-Pipeline/). The genome coverage estimates are calculated using both raw and cleaned reads, using either a user-provided `genome_size` or the estimated genome length generated by QUAST. + + !!! techdetails "CG-Pipeline Technical Details" + The `cg_pipeline` task is run twice in TheiaProk, once with raw reads, and once with clean reads. + + | | Links | + | --- | --- | + | Task | [task_cg_pipeline.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_cg_pipeline.wdl) | + | Software Source Code | [CG-Pipeline on GitHub](https://github.com/lskatz/CG-Pipeline/) | + | Software Documentation | [CG-Pipeline on GitHub](https://github.com/lskatz/CG-Pipeline/) | + | Original Publication(s) | [A computational genomics pipeline for prokaryotic sequencing projects](https://academic.oup.com/bioinformatics/article/26/15/1819/188418) | + +??? task "`shovill`: _De novo_ Assembly" + + _De novo_ assembly will be undertaken only for samples that have sufficient read quantity and quality, as determined by the `screen` task assessment of clean reads. + + In TheiaProk, assembly is performed using the [Shovill](https://github.com/tseemann/shovill) pipeline. 
This undertakes the assembly with one of four assemblers ([SKESA](https://github.com/ncbi/SKESA) (default), [SPAdes](https://github.com/ablab/spades), [Velvet](https://github.com/dzerbino/velvet/), [Megahit](https://github.com/voutcn/megahit)), but also performs [a number of pre- and post-processing steps](https://github.com/tseemann/shovill#main-steps) to improve the resulting genome assembly. Shovill uses an estimated genome size (see [here](https://github.com/tseemann/shovill#--gsize)). If this is not provided by the user as an optional input, Shovill will estimate the genome size using [mash](https://mash.readthedocs.io/en/latest/index.html). Adaptor trimming can be undertaken with Shovill by setting the `trim` option to "true", but this is set to "false" by default as [alternative adapter trimming](https://www.notion.so/TheiaProk-Workflow-Series-89b9c08406094ec78d08a578fe861626?pvs=21) is undertaken in the TheiaProk workflow. + + ??? toggle "What is _de novo_ assembly?" + _De novo_ assembly is the process or product of attempting to reconstruct a genome from scratch (without prior knowledge of the genome) using sequence reads. Assembly of bacterial genomes from short-reads will produce multiple contigs per chromosome rather than a single contiguous sequence for each chromosome. + + !!! techdetails "Shovill Technical Details" + | | Links | + | --- | --- | + | TheiaProk WDL Task | [task_shovill.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/assembly/task_shovill.wdl#L3) | + | Software code repository and documentation | [Shovill on GitHub](https://github.com/tseemann/shovill) | + +#### ONT Data Core Tasks + +??? task "`read_QC_trim_ont`: Read Quality Trimming, Quantification, and Identification" + + `read_QC_trim_ont` is a sub-workflow within TheiaProk_ONT that filters low-quality reads and trims low-quality regions of reads. It uses several tasks, described below. 
+ + **Estimated genome length**: + + By default, an estimated genome length is set to 5 Mb, which is around 0.7 Mb higher than the average bacterial genome length, according to the information collated [here](https://github.com/CDCgov/phoenix/blob/717d19c19338373fc0f89eba30757fe5cfb3e18a/assets/databases/NCBI_Assembly_stats_20240124.txt). This estimate can be overwritten by the user, and is used by `RASUSA` and `dragonflye`. + + **Plotting and quantifying long-read sequencing data:** `nanoplot` + + Nanoplot is used for the determination of mean quality scores, read lengths, and number of reads. This task is run once with raw reads as input and once with clean reads as input. If QC has been performed correctly, you should expect **fewer** clean reads than raw reads. + + **Read subsampling:** Samples are automatically randomly subsampled to 150X coverage using `RASUSA`. + + **Plasmid prediction:** `tiptoft` is used to predict plasmid sequences directly from uncorrected long-read data. Plasmids are identified using replicon sequences used for typing from [PlasmidFinder](https://cge.food.dtu.dk/services/PlasmidFinder/). + + **Read filtering:** Reads are filtered by length and quality using `nanoq`. By default, sequences with less than 500 basepairs and quality score lower than 10 are filtered out to improve assembly accuracy. + + !!! 
techdetails "read_QC_trim_ont Technical Details" + + TheiaProk_ONT calls a sub-workflow listed below, which then calls the individual tasks: + + | Workflow | **TheiaProk_ONT** | + | --- | --- | + | Sub-workflow | [wf_read_QC_trim_ont.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim_ont.wdl) | + | Tasks | [task_nanoplot.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_nanoplot.wdl) [task_fastq_scan.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/b481ce48f3d527ab8f31e4ad8171769212cc091a/tasks/quality_control/task_fastq_scan.wdl#L3) [task_rasusa.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/utilities/task_rasusa.wdl) [task_nanoq.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_nanoq.wdl) [task_tiptoft.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/task_tiptoft.wdl) | + | Software Source Code | [fastq-scan](https://github.com/rpetit3/fastq-scan), [NanoPlot](https://github.com/wdecoster/NanoPlot), [RASUSA](https://github.com/mbhall88/rasusa), [tiptoft](https://github.com/andrewjpage/tiptoft), [nanoq](https://github.com/esteinig/nanoq) | + | Original Publication(s) | [NanoPlot paper](https://academic.oup.com/bioinformatics/article/39/5/btad311/7160911)
[RASUSA paper](https://doi.org/10.21105/joss.03941)
[Nanoq Paper](https://doi.org/10.21105/joss.02991)
[Tiptoft paper](https://doi.org/10.21105/joss.01021) | + +??? task "`dragonflye`: _De novo_ Assembly" + !!! techdetails "dragonflye Technical Details" + | | Links | + | --- | --- | + | Task | [task_dragonflye.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/assembly/task_dragonflye.wdl) | + | Software Source Code | [dragonflye on GitHub](https://github.com/rpetit3/dragonflye) | + | Software Documentation | [dragonflye on GitHub](https://github.com/rpetit3/dragonflye) | + +#### Post-Assembly Tasks (performed for all taxa) + +??? task "`quast`: Assembly Quality Assessment" + + QUAST stands for QUality ASsessment Tool. It evaluates genome/metagenome assemblies by computing various metrics without a reference being necessary. It includes useful metrics such as number of contigs, length of the largest contig and N50. + + !!! techdetails "QUAST Technical Details" + + | | Links | + | --- | --- | + | Task | [task_quast.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_quast.wdl) | + | Software Source Code | [QUAST on GitHub](https://github.com/ablab/quast) | + | Software Documentation | | + | Original Publication(s) | [QUAST: quality assessment tool for genome assemblies](https://academic.oup.com/bioinformatics/article/29/8/1072/228832) | + +??? task "`BUSCO`: Assembly Quality Assessment" + + BUSCO (**B**enchmarking **U**niversal **S**ingle-**C**opy **O**rthologue) attempts to quantify the completeness and contamination of an assembly to generate quality assessment metrics. It uses taxa-specific databases containing genes that are all expected to occur in the given taxa, each in a single copy. BUSCO examines the presence or absence of these genes, whether they are fragmented, and whether they are duplicated (suggestive that additional copies came from contaminants). + + **BUSCO notation** + + Here is an example of BUSCO notation: `C:99.1%[S:98.9%,D:0.2%],F:0.0%,M:0.9%,n:440`. 
There are several abbreviations used in this output: + + - Complete (C) - genes are considered "complete" when their lengths are within two standard deviations of the BUSCO group mean length. + - Single-copy (S) - genes that are complete and have only one copy. + - Duplicated (D) - genes that are complete and have more than one copy. + - Fragmented (F) - genes that are only partially recovered. + - Missing (M) - genes that were not recovered at all. + - Number of genes examined (n) - the number of genes examined. + + A high quality assembly will use the appropriate database for the taxa, have high complete (C) and single-copy (S) percentages, and low duplicated (D), fragmented (F) and missing (M) percentages. + + !!! techdetails "BUSCO Technical Details" + + | | Links | + | --- | --- | + | Task | [task_busco.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/advanced_metrics/task_busco.wdl) | + | Software Source Code | [BUSCO on GitLab](https://gitlab.com/ezlab/busco) | + | Software Documentation | https://busco.ezlab.org/ | + | Original publication | [BUSCO: assessing genome assembly and annotation completeness with single-copy orthologs](https://academic.oup.com/bioinformatics/article/31/19/3210/211866) | + +??? task "`MUMmer_ANI`: Average Nucleotide Identity (optional)" + + Average Nucleotide Identity (ANI) is a useful approach for taxonomic identification. The higher the percentage ANI of a query sequence to a given reference genome, the more likely the sequence is the same taxa as the reference. + + ANI is calculated in TheiaProk using [a perl script written by Lee Katz](https://github.com/lskatz/ani-m) ([ani-m.pl](http://ani-m.pl)). This uses [MUMmer](http://mummer.sourceforge.net/) to rapidly align entire query assemblies to one or more reference genomes. 
By default, TheiaProk uses a set of 43 reference genomes in [RGDv2](https://github.com/StaPH-B/docker-builds/blob/master/fastani/1.34-RGDV2/RGDv2-metadata.tsv), a database containing genomes of enteric pathogens commonly sequenced by CDC EDLB & PulseNet participating laboratories. The user may also provide their own reference genome. After genome alignment with MUMmer, ani-m.pl calculates the average nucleotide identity and percent bases aligned between 2 genomes (query and reference genomes) + + The default database of reference genomes used is called "Reference Genome Database version 2" AKA "RGDv2". This database is composed of 43 enteric bacteria representing 32 species and is intended for identification of enteric pathogens and common contaminants. It contains six Campylobacter spp., three Escherichia/Shigella spp., one *Grimontia hollisae*, six *Listeria spp.*, one *Photobacterium damselae*, two *Salmonella spp.*, and thirteen *Vibrio spp.* + + 2 Thresholds are utilized to prevent false positive hits. The `ani_top_species_match` will only report a genus & species match if both thresholds are surpassed. Both of these thresholds are set to match those used in BioNumerics for PulseNet organisms. + + 1. `ani_threshold` default value of 80.0 + 2. `percent_bases_aligned_threshold` default value of 70.0 + + For more information on RGDv2 database of reference genomes, please see [the publication here.](https://www.frontiersin.org/articles/10.3389/fmicb.2023.1225207/full) + + !!! 
techdetails "MUMmer_ANI Technical Details" + + | | Links | + | --- | --- | + | Task | [task_mummer_ani.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_mummer_ani.wdl) | + | Software Source Code | [ani-m](https://github.com/lskatz/ani-m), [MUMmer](https://github.com/mummer4/mummer) | + | Software Documentation | [ani-m](https://github.com/lskatz/ani-m), [MUMmer](https://mummer.sourceforge.net/) | + | Original Publication(s) | [MUMmer4: A fast and versatile genome alignment system](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1005944) | + | Publication about RGDv2 database | https://www.frontiersin.org/articles/10.3389/fmicb.2023.1225207/full | + +??? task "`GAMBIT`: **Taxon Assignment**" + + [`GAMBIT`](https://github.com/jlumpe/gambit) determines the taxon of the genome assembly using a k-mer based approach to match the assembly sequence to the closest complete genome in a database, thereby predicting its identity. Sometimes, GAMBIT can confidently designate the organism to the species level. Other times, it is more conservative and assigns it to a higher taxonomic rank. + + For additional details regarding the GAMBIT tool and a list of available GAMBIT databases for analysis, please consult the [GAMBIT](https://www.notion.so/GAMBIT-7c1376b861d0486abfbc316480046bdc?pvs=21) tool documentation. + + !!! 
techdetails "GAMBIT Technical Details" + + | | Links | + | --- | --- | + | Task | [task_gambit.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/taxon_id/task_gambit.wdl) | + | Software Source Code | [GAMBIT on GitHub](https://github.com/jlumpe/gambit) | + | Software Documentation | [GAMBIT ReadTheDocs](https://gambit-genomics.readthedocs.io/en/latest/) | + | Original Publication(s) | [GAMBIT (Genomic Approximation Method for Bacterial Identification and Tracking): A methodology to rapidly leverage whole genome sequencing of bacterial isolates for clinical identification](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0277575) | + +??? task "`KmerFinder`: Taxon Assignment (optional)" + + The `KmerFinder` method predicts prokaryotic species based on the number of overlapping (co-occurring) *k*-mers, i.e., 16-mers, between the query genome and genomes in a reference database. + + !!! techdetails "KmerFinder Technical Details" + + | | Links | + | --- | --- | + | Task | [task_kmerfinder.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/task_kmerfinder.wdl) | + | Software Source Code | https://bitbucket.org/genomicepidemiology/kmerfinder | + | Software Documentation | https://cge.food.dtu.dk/services/KmerFinder/instructions.php | + | Original Publication(s) | [**Benchmarking of Methods for Genomic Taxonomy**](https://journals.asm.org/doi/full/10.1128/jcm.02981-13?rfr_dat=cr_pub++0pubmed&url_ver=Z39.88-2003&rfr_id=ori%3Arid%3Acrossref.org) | + +??? task "`AMRFinderPlus`: AMR Genotyping (default)" + + NCBI's [AMRFinderPlus](https://github.com/ncbi/amr/wiki) is the default antimicrobial resistance (AMR) detection tool used in TheiaProk. [ResFinder](https://www.notion.so/TheiaProk-Workflow-Series-68c34aca2a0240ef94fef0acd33651b9?pvs=21) may be used alternatively and if so, AMRFinderPlus is not run. 
+ + AMRFinderPlus identifies acquired antimicrobial resistance (AMR) genes, virulence genes, and stress genes. Such AMR genes confer resistance to antibiotics, metals, biocides, heat, or acid. For some taxa (see [here](https://github.com/ncbi/amr/wiki/Running-AMRFinderPlus#--organism-option)), AMRFinderPlus will provide taxa-specific results including filtering out genes that are almost ubiquitous in the taxa (intrinsic genes) and identifying resistance-associated point mutations. In TheiaProk, the taxon used by AMRFinderPlus is specified based on the `gambit_predicted_taxon` or a user-provided `expected_taxon`. + + You can check if a gene or point mutation is in the AMRFinderPlus database [here](https://www.ncbi.nlm.nih.gov/pathogens/refgene/#), find the sequences of reference genes [here](https://www.ncbi.nlm.nih.gov/bioproject/PRJNA313047), and search the query Hidden Markov Models (HMMs) used by AMRFinderPlus to identify AMR genes and some stress and virulence proteins ([here](https://www.ncbi.nlm.nih.gov/pathogens/hmm/)). The AMRFinderPlus database is updated frequently. You can ensure you are using the most up-to-date version by specifying the docker image as a workflow input. You might like to save this docker image as a workspace data element to make this easier. + + !!! techdetails "AMRFinderPlus Technical Details" + + | | Links | + | --- | --- | + | Task | [task_amrfinderplus.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/gene_typing/task_amrfinderplus.wdl) | + | Software Source Code | [amr on GitHub](https://github.com/ncbi/amr) | + | Software Documentation | https://github.com/ncbi/amr/wiki | + | Original Publication(s) | [AMRFinderPlus and the Reference Gene Catalog facilitate examination of the genomic links among antimicrobial resistance, stress response, and virulence](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8208984/) | + +??? 
task "`ResFinder`: AMR Genotyping & Shigella XDR phenotype prediction (alternative)" + + The `ResFinder` task is an alternative to using AMRFinderPlus for detection and identification of AMR genes and resistance-associated mutations. + + This task runs the Centre for Genomic Epidemiology (CGE) ResFinder tool to identify acquired antimicrobial resistance. It can also run the CGE PointFinder tool if the `call_pointfinder` variable is set to `true`. The databases underlying the task are different to those used by AMRFinderPlus. + + The default thresholds for calling AMR genes are 90% identity and 50% coverage of the reference genes (expressed as a fraction in workflow inputs: 0.9 & 0.5). These are the same thresholds utilized in BioNumerics for calling AMR genes. + + Organisms currently supported by PointFinder for mutational-based predicted resistance: + + - Campylobacter coli & C. jejuni + - Enterococcus faecalis + - Enterococcus faecium + - Escherichia coli & Shigella spp. + - Helicobacter pylori + - Neisseria gonorrhoeae + - Klebsiella + - Mycobacterium tuberculosis + - Salmonella spp. + - Staphylococcus aureus + + **XDR Shigella prediction** + + The `ResFinder` Task also has the ability to predict whether or not a sample meets the CDC's definition for extensively drug-resistant (XDR) Shigella. + + > *CDC defines XDR Shigella bacteria as strains that are resistant to all commonly recommended empiric and alternative antibiotics — azithromycin, ciprofloxacin, ceftriaxone, trimethoprim-sulfamethoxazole (TMP-SMX), and ampicillin. [Link to CDC HAN](https://emergency.cdc.gov/han/2023/han00486.asp) where this definition is found.* + + A sample is required to meet **all 7 criteria** in order to be predicted as `XDR Shigella` + + 1. The GAMBIT task in the workflow must identify the sample as `Shigella` OR the user must input the word `Shigella` somewhere within the input String variable called `expected_taxon`. 
This requirement serves as the identification of a sample to be of the Shigella genus. + 2. Resfinder or PointFinder predicted resistance to **Ampicillin** + 3. Resfinder or PointFinder predicted resistance to **Azithromycin** + 4. Resfinder or PointFinder predicted resistance to **Ciprofloxacin** + 5. Resfinder or PointFinder predicted resistance to **Ceftriaxone** + 6. Resfinder or PointFinder predicted resistance to **Trimethoprim** + 7. Resfinder or PointFinder predicted resistance to **Sulfamethoxazole** + + There are 3 potential outputs for the **`resfinder_predicted_xdr_shigella`** output string**:** + + - **`Not Shigella based on gambit_predicted_taxon or user input`** + - **`Not XDR Shigella`** for samples identified as Shigella by GAMBIT or user input BUT ResFinder did not predict resistance to **all 6 drugs in XDR definition** + - **`XDR Shigella`** meaning the sample was identified as Shigella and ResFinder/PointFinder did predict resistance to ceftriaxone, azithromycin, ciprofloxacin, trimethoprim, sulfamethoxazole, and ampicillin. + + !!! techdetails "ResFinder Technical Details" + + | | Links | + | --- | --- | + | Task | [task_resfinder.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/eld-docs-dev/tasks/gene_typing/task_resfinder.wdl) | + | Software Source Code | https://bitbucket.org/genomicepidemiology/resfinder/src/master/ | + | Software Documentation | https://bitbucket.org/genomicepidemiology/resfinder/src/master/ | + | ResFinder database | https://bitbucket.org/genomicepidemiology/resfinder_db/src/master/ | + | PointFinder database | https://bitbucket.org/genomicepidemiology/pointfinder_db/src/master/ | + | Web-server | https://cge.food.dtu.dk/services/ResFinder/ | + | Original Publication(s) | [ResFinder 4.0 for predictions of phenotypes from genotypes](https://academic.oup.com/jac/article/75/12/3491/5890997) | + +??? 
task "`TS_MLST`: MLST Profiling" + + [Multilocus sequence typing (MLST)](https://www.pnas.org/doi/10.1073/pnas.95.6.3140?url_ver=Z39.88-2003&rfr_id=ori%3Arid%3Acrossref.org&rfr_dat=cr_pub++0pubmed) is a typing method reflecting population structure. It was developed as a portable, unambiguous method for global epidemiology using PCR, but can be applied to whole-genome sequences *in silico*. MLST is commonly used for pathogen surveillance, ruling out transmission, and grouping related genomes for comparative analysis. + + MLST schemes are taxa-specific. Each scheme uses fragments of typically 7 housekeeping genes ("loci") and has a database associating an arbitrary number with each distinct allele of each locus. Each unique combination of alleles ("allelic profile") is assigned a numbered sequence type (ST). Significant diversification of genomes is captured by changes to the MLST loci via mutational events creating new alleles and STs, or recombinational events replacing the allele and changing the ST. Relationships between STs are based on the number of alleles they share. Clonal complexes share a scheme-specific number of alleles (usually for five of the seven loci). + + !!! tip "MLST Limitations" + Some taxa have multiple MLST schemes, and some MLST schemes are insufficiently robust. + + TheiaProk uses [the MLST tool developed by Torsten Seeman](https://github.com/tseemann/mlst) to assess MLST using traditional [PubMLST](https://pubmlst.org/) typing schemes. + + ??? toggle "Interpretation of MLST results" + + Each MLST results file returns the ST and allele results for one sample. If the alleles and ST are correctly assigned, only a single integer value will be present for each. If an ST cannot be assigned, multiple integers or additional characters will be shown, representing the issues with assignment as described [here](https://github.com/tseemann/mlst/tree/v2.22.0#missing-data). + + ??? 
toggle "Identifying novel alleles and STs" + + The MLST schemes used in TheiaProk are curated on the PubMLST website.If you identify novel alleles or allelic profiles in your data using TheiaProk's MLST task, you can get these assigned via PubMLST: + + 1. Check that the novel allele or ST has not already been assigned a type on PubMLST. + 1. Download the assembly file from Terra for your sample with the novel allele or ST + 2. Go to the [PubMLST webpage for the organism of interest](https://pubmlst.org/organisms) + 3. Navigate to the organism "Typing" page + 4. Under "Query a sequence" choose "Single sequence" (e.g. [this](https://pubmlst.org/bigsdb?db=pubmlst_hinfluenzae_seqdef&page=sequenceQuery) is the page for *H. influenzae*), select the MLST scheme under "Please select locus/scheme", upload the assembly fasta file, and click submit. + 5. Results will be returned lower on the page. + 2. If the allele or ST has not been typed previously on the PubMLST website (step 1), new allele or ST numbers can be assigned using instructions [here](https://pubmlst.org/submit-data). + + ??? toggle "Taxa with multiple MLST schemes" + + As default, the MLST tool automatically detects the genome's taxa to select the MLST scheme. + + Some taxa have multiple MLST schemes, e.g. the *Escherichia* and Leptospira genera, *Acinetobacter baumannii, Clostridium difficile* and *Streptococcus thermophilus.* Only one scheme will be used by default. + + Users may specify the scheme as an optional workflow input using the `scheme` variable of the "ts_mlst" task. Available schemes are listed [here](https://www.notion.so/TheiaProk-Workflow-Series-89b9c08406094ec78d08a578fe861626?pvs=21) and the scheme name should be provided in quotation marks ("…."). + + If results from multiple MLST schemes are required for the same sample, TheiaProk can be run multiple times specifying non-default schemes. After the first run, output attributes for the workflow (i.e. 
output column names) must be amended to prevent results from being overwritten. Despite re-running the whole workflow, unmodified tasks will return cached outputs, preventing redundant computation. + + !!! techdetails "TS_MLST Technical Details" + | | Links | + | --- | --- | + | Task | [task_ts_mlst.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/species_typing/task_ts_mlst.wdl) | + | Software Source Code | [mlst](https://github.com/tseemann/mlst) | + | Software Documentation | [mlst](https://github.com/tseemann/mlst) | + +??? task "`Prokka`: Assembly Annotation (default)" + + Assembly annotation is available via `Prokka` as default, or alternatively via `Bakta`. When Prokka annotation is used, Bakta is not. + + [`Prokka`](https://github.com/tseemann/prokka) is a prokaryotic genome annotation tool used to identify and describe features of interest within the genome sequence. Prokka annotates there genome by querying databases described [here](https://github.com/tseemann/prokka#databases). + + !!! techdetails "Prokka Technical Details" + | | Links | + | --- | --- | + | Task | [task_prokka.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/eld-docs-dev/tasks/gene_typing/task_prokka.wdl) | + | Software Source Code | [prokka](https://github.com/tseemann/prokka) | + | Software Documentation | [prokka](https://github.com/tseemann/prokka) | + | Original Publication(s) | [Prokka: rapid prokaryotic genome annotation](https://academic.oup.com/bioinformatics/article/30/14/2068/2390517?login=false) | + +??? task "`Bakta`: Assembly Annotation (alternative)" + + Assembly annotation is available via Bakta as an alternative to Prokka. When Bakta annotation is used, Prokka is not. + + Bakta is intended for annotation of Bacteria and plasmids only, and is best described [here](https://github.com/oschwengers/bakta#description)! + + !!! 
techdetails "Bakta Technical Details" + + | | Links | + | --- | --- | + | Task | [task_bakta.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/gene_typing/task_bakta.wdl) | + | Software Source Code | [bakta](https://github.com/oschwengers/bakta) | + | Software Documentation | | + | Original Publication(s) | [Bakta: rapid and standardized annotation of bacterial genomes via alignment-free sequence identification](https://www.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000685) | + +??? task "`PlasmidFinder`: Plasmid Identification" + + [`PlasmidFinder`](https://github.com/kcri-tz/plasmidfinder) detects plasmids in totally- or partially-sequenced genomes, and identifies the closest plasmid type in the database for typing purposes. + + ??? toggle "What are plasmids?" + + Plasmids are double-stranded circular or linear DNA molecules that are capable of replication independently of the chromosome and may be transferred between different species and clones. Many plasmids contain resistance or virulence genes, though some do not clearly confer an advantage to their host bacterium. + + !!! techdetails "PlasmidFinder Technical Details" + + | | Links | + | --- | --- | + | Task | [task_plasmidfinder.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/eld-docs-dev/tasks/gene_typing/task_plasmidfinder.wdl) | + | Software Source Code | https://bitbucket.org/genomicepidemiology/plasmidfinder/src/master/ | + | Software Documentation | https://bitbucket.org/genomicepidemiology/plasmidfinder/src/master/ | + | Original Publication(s) | [In Silico Detection and Typing of Plasmids using PlasmidFinder and Plasmid Multilocus Sequence Typing](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4068535/) | + +??? task "**`QC_check`: Check QC Metrics Against User-Defined Thresholds (optional)**" + + The `qc_check` task compares generated QC metrics against user-defined thresholds for each metric. 
This task will run if the user provides a `qc_check_table` .tsv file. If all QC metrics meet the threshold, the `qc_check` output variable will read `QC_PASS`. Otherwise, the output will read `QC_NA` if the task could not proceed or `QC_ALERT` followed by a string indicating what metric failed. + + The `qc_check` task applies quality thresholds according to the sample taxa. The sample taxa is taken from the `gambit_predicted_taxon` value inferred by the GAMBIT module OR can be manually provided by the user using the `expected_taxon` workflow input. + + ??? toggle "Formatting the _qc_check_table.tsv_" + + - The first column of the qc_check_table lists the taxa that the task will assess and the header of this column must be "taxon". + - Any genus or species can be included as a row of the qc_check_table. However, these taxa must ^^**uniquely**^^ match the sample taxa, meaning that the file can include multiple species from the same genus (Vibrio_cholerae and Vibrio_vulnificus), but not both a genus row and species within that genus (Vibrio and Vibrio cholerae). **The taxa should be formatted with the first letter capitalized and underscores in lieu of spaces.** + - Each subsequent column indicates a QC metric and lists a threshold for each taxa that will be checked. **The column names must exactly match expected values, so we highly recommend copy and pasting from the template files below.** + + ??? toggle "Template _qc_check_table.tsv_ files" + + - TheiaProk_Illumina_PE: [theiaprok_illumina_pe_qc_check_template.tsv](../../assets/files/TheiaProk_Illumina_PE_qc_check_template.tsv) + - TheiaProk_FASTA: [theiaprok_fasta_qc_check_template.tsv](../../assets/files/TheiaProk_FASTA_qc_check_template.tsv) + + !!! warning "Example Purposes Only" + QC threshold values shown are for example purposes only and should not be presumed to be sufficient for every dataset. + + !!! 
techdetails "QC_Check Technical Details" + + | | Links | + | --- | --- | + | Task | [task_qc_check.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_qc_check.wdl) | + +??? task "`Taxon Tables`: Copy outputs to new data tables based on taxonomic assignment (optional)" + + The `taxon_tables` module, if enabled, will copy sample data to a different data table based on the taxonomic assignment. For example, if an *E. coli* sample is analyzed, the module will copy the sample data to a new table for *E. coli* samples or add the sample data to an existing table. + + !!! tip "" + To implement the `taxon_tables` module, provide a file indicating data table names to copy samples of each taxa to in the `taxon_tables` input variable. No other input variables are needed. + + **Formatting the `taxon_tables` file** + + The `taxon_tables` file must be uploaded a Google storage bucket that is accessible by Terra and should be in the format below. Briefly, the bacterial genera or species should be listed in the leftmost column with the name of the data table to copy samples of that taxon to in the rightmost column. + + | taxon | taxon_table | + | --- | --- | + | Listeria_monocytogenes | lmonocytogenes_specimen | + | Salmonella | salmonella_specimen | + | Escherichia | ecoli_specimen | + | Shigella | shigella_specimen | + | Streptococcus | strep_pneumo_specimen | + | Legionella | legionella_specimen | + | Klebsiella | klebsiella_specimen | + | Mycobacterium | mycobacterium_specimen | + | Acinetobacter | acinetobacter_specimen | + | Pseudomonas | pseudomonas_specimen | + | Staphylococcus | staphyloccus_specimen | + | Neisseria | neisseria_specimen | + + !!! tip "" + There are no output columns for the taxon table task. The only output of the task is that additional data tables will appear for in the Terra workspace for samples matching a taxa in the `taxon_tables` file. + +??? 
task "`Abricate`: Mass screening of contigs for antimicrobial and virulence genes (optional)" + + The `abricate` module, if enabled, will run abricate with the database defined in `abricate_db` to perform mass screening of contigs for antimicrobial resistance or virulence genes. It comes bundled with multiple databases: NCBI, CARD, ARG-ANNOT, Resfinder, MEGARES, EcOH, PlasmidFinder, Ecoli_VF and VFDB. It only detects acquired resistance genes, **NOT** point mutations + +### Taxa-Specific Tasks + +The TheiaProk workflows automatically activate taxa-specific sub-workflows after the identification of relevant taxa using `GAMBIT`. Alternatively, the user can provide the expected taxa in the `expected_taxon` workflow input to override the taxonomic assignment made by GAMBIT. Modules are launched for all TheiaProk workflows unless otherwise indicated. + +??? toggle "_Acinetobacter baumannii_" + ##### _Acinetobacter baumannii_ {#acinetobacter-baumannii} + + A number of approaches are available in TheiaProk for *A. baumannii* characterization. + + ??? task "`Kaptive`: Capsule and lipooligosaccharide outer core typing" + + The cell-surface capsular polysaccharide (CPS) of *Acinetobacter baumannii* can be used as an epidemiological marker. CPS varies in its composition and structure and is a key determinant in virulence and a target for non-antibiotic therapeutics. Specificity for non-antibiotic therapeutics (e.g. phage therapy) bear particular significance given the extent of antibiotic resistance found in this [ESKAPE](https://journals.asm.org/doi/10.1128/CMR.00181-19) pathogen. + + Biosynthesis and export of CPS is encoded by genes clustering at the K locus (KL). Additional genes associated with CPS biosynthesis and export are sometimes found in other chromosomal locations. The full combination of these genes is summarized as a "[K type](https://www.biorxiv.org/content/10.1101/2022.05.19.492579v1)", described as a "predicted serotype associated with the best match locus". 
You can read more about this [here](https://github.com/katholt/Kaptive/wiki/Databases-distributed-with-Kaptive#acinetobacter-baunannii-k-and-oc-locus-databases). + + Previously, s[erotyping of *A. baumannii*](https://journals.asm.org/doi/10.1128/jcm.27.12.2713-2716.1989) focused on a major immunogenic polysaccharide which was considered the O antigen for the species. This serotyping approach appears to no longer be used and the serotyping [scheme has not been updated in over 20 years](https://www.karger.com/Article/Abstract/7300). Nonetheless, the O-antigen polysaccharide is attached to lipooligosaccharide, and the outer core (OC) of this lipooligosaccharide varies. Biosynthesis of the outer core lipooligosaccharide is encoded by a cluster of genes at the outer core (OC) locus. + + Variation in the KL and OCL can be characterized with the **Kaptive** tool and its associated [databases](https://github.com/katholt/Kaptive/wiki/Databases-distributed-with-Kaptive#acinetobacter-baunannii-k-and-oc-locus-databases) of numbered *A. baumannii* [K](https://github.com/katholt/Kaptive/blob/master/extras/Acinetobacter_baumannii_KL_reference_information.pdf) and [OC](https://github.com/katholt/Kaptive/blob/master/extras/Acinetobacter_baumannii_OCL_reference_information.pdf) locus variants. Kaptive takes in a genome assembly file (fasta), and assigns the K and OC locus to their numbered variants, provides K type and a description of genes in the K or OC loci and elsewhere in the chromosome, alongside metrics for quality of locus match. 
A description of [how Kaptive works](https://github.com/katholt/Kaptive/wiki/How-does-Kaptive-work%3F), [explanations of the full output reports](https://github.com/katholt/Kaptive/wiki/How-to-run#summary-table) which are provided in the Terra data table by TheiaProk and [resources for interpreting outputs](https://github.com/katholt/Kaptive/wiki/Interpreting-the-results) are available on the [Kaptive Wiki page](https://github.com/katholt/Kaptive/wiki/How-to-run#summary-table). + + !!! techdetails "Kaptive Technical Details" + + | | Links | + | --- | --- | + | Task | [task_kaptive.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/species_typing/task_kaptive.wdl) | + | Software Source Code | [Kaptive on GitHub](https://github.com/katholt/Kaptive/wiki) | + | Software Documentation | https://github.com/katholt/Kaptive/wiki | + | Orginal publications | [Identification of Acinetobacter baumannii loci for capsular polysaccharide (KL) and lipooligosaccharide outer core (OCL) synthesis in genome assemblies using curated reference databases compatible with Kaptive](https://www.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000339)
[An update to the database for Acinetobacter baumannii capsular polysaccharide locus typing extends the extensive and diverse repertoire of genes found at and outside the K locus](https://www.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000878) | + + ??? task "`AcinetobacterPlasmidTyping`: Acinetobacter plasmid detection" + + *Acinetobacter* plasmids are not included in the [PlasmidFinder](https://www.notion.so/TheiaProk-Workflow-Series-68c34aca2a0240ef94fef0acd33651b9?pvs=21) database. Instead, the [AcinetobacterPlasmidTyping](https://github.com/MehradHamidian/AcinetobacterPlasmidTyping) database contains variants of the plasmid *rep* gene for *A. baumannii* plasmid identification. When matched with >/= 95 % identity, this represents a typing scheme for *Acinetobacter baumannii* plasmids. In TheiaProk, we use the tool [abricate](https://github.com/tseemann/abricate) to query our assemblies against this database. + + The bioinformatics software for querying sample assemblies against the AcinetobacterPlasmidTyping database is [Abricate](https://github.com/tseemann/abricate). The WDL task simply runs abricate, and the Acinetobacter Plasmid database and default setting of 95% minimum identity are set in the [merlin magic sub-workflow](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/workflows/wf_merlin_magic.wdl#L44). + + !!! 
techdetails "AcinetobacterPlasmidTyping Technical Details" + + | | Links | + | --- | --- | + | Task | [task_abricate.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/gene_typing/task_abricate.wdl) | + | Database and documentation | [https://github.com/MehradHamidian/AcinetobacterPlasmidTyping](https://github.com/MehradHamidian/AcinetobacterPlasmidTyping/tree/v1.0.0) | + | Software Source Code and documentation | [abricate on GitHub](https://github.com/tseemann/abricate) | + | Original Publication(s) | [Detection and Typing of Plasmids in *Acinetobacter baumannii* Using *rep* Genes Encoding Replication Initiation Proteins](https://journals.asm.org/doi/10.1128/spectrum.02478-22?url_ver=Z39.88-2003&rfr_id=ori:rid:crossref.org&rfr_dat=cr_pub%20%200pubmed) | + + ??? task "Acinetobacter MLST" + + Two MLST schemes are available for *Acinetobacter*. The Pasteur scheme is run by default, given [significant problems with the Oxford scheme have been described](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6510311/). Should users with to alternatively or additionally use the Oxford MLST scheme, see the section above on MLST. The Oxford scheme is activated in TheiaProk with the MLST `scheme` input as "abaumannii". + + ??? task "*bla*OXA-51-like gene detection" + + The *bla*OXA-51-like genes, also known as _oxaAB_, are considered intrinsic to _Acinetobacter baumannii_ but are not found in other *Acinetobacter* species. **Identification of a *bla*OXA-51-like gene is therefore considered to confirm the species' identity as _A. baumannii_.** + + NCBI's AMRFinderPlus, which is implemented as a core module in TheiaProk, detects the *bla*OXA-51-like genes. This may be used to confirm the species, in addition to the GAMBIT taxon identification. The *bla*OXA-51-like genes act as carbapenemases when an IS*Aba1* is found 7 bp upstream of the gene. Detection of this IS is not currently undertaken in TheiaProk. + +??? 
toggle "_Escherichia_ or _Shigella_ spp" + ##### _Escherichia_ or _Shigella_ spp {#escherichia-or-shigella} + + The *Escherichia* and *Shigella* genera are [difficult to differentiate as they do not comply with genomic definitions of genera and species](https://www.sciencedirect.com/science/article/abs/pii/S1286457902016374). Consequently, when either _Escherichia_ or _Shigella_ are identified by GAMBIT, all tools intended for these taxa are used. + + `SerotypeFinder` and `ECTyper` are intended for analysis of *E. coli*. Both tools are used as there are occasional discrepancies between the serotypes predicted. This primarily arises due to differences in the databases used by each tool. + + ??? task "`SerotypeFinder`: Serotyping" + + [SerotypeFinder](https://bitbucket.org/genomicepidemiology/serotypefinder/src/master/), from the Centre for Genomic Epidemiology (CGE), identifies the serotype of total or partially-sequenced isolates of *E. coli*. + + !!! techdetails "SerotypeFinder Technical Details" + + | | Links | + | --- | --- | + | Task | [task_serotypefinder.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/species_typing/task_serotypefinder.wdl) | + | Software Source Code | https://bitbucket.org/genomicepidemiology/serotypefinder/src/master/ | + | Software Documentation | https://bitbucket.org/genomicepidemiology/serotypefinder/src/master/ | + | Original Publication(s) | [Rapid and Easy In Silico Serotyping of Escherichia coli Isolates by Use of Whole-Genome Sequencing Data](https://journals.asm.org/doi/10.1128/JCM.00008-15) | + + ??? task "`ECTyper`: Serotyping" + + [ECTyper](https://github.com/phac-nml/ecoli_serotyping) is a serotyping module for *E. coli*. In TheiaProk, we are using assembly files as input. + + !!! 
techdetails "ECTyper Technical Details" + + | | Links | + | --- | --- | + | Task | [task_ectyper.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/species_typing/task_ectyper.wdl) | + | Software Source Code | [ECTyper on GitHub](https://github.com/phac-nml/ecoli_serotyping) | + | Software Documentation | [ECTyper on GitHub](https://github.com/phac-nml/ecoli_serotyping) | + | Orginal publication | [ECTyper: in silico Escherichia coli serotype and species prediction from raw and assembled whole-genome sequence data](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8767331/) | + + `VirulenceFinder` identifies virulence genes in total or partial sequenced isolates of bacteria. Currently, only *E. coli* is supported in TheiaProk workflows. + + ??? task "`VirulenceFinder`: Virulence gene identification" + + VirulenceFinder in TheiaProk is only run on assembly files due to issues regarding discordant results when using read files on the web application versus the command-line. + + !!! techdetails "VirulenceFinder Technical Details" + + | | Links | + | --- | --- | + | Task | [task_virulencefinder.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/task_virulencefinder.wdl) | + | Software Source Code | [**VirulenceFinder**](https://bitbucket.org/genomicepidemiology/virulencefinder/src/master/) | + | Software Documentation | [**VirulenceFinder**](https://bitbucket.org/genomicepidemiology/virulencefinder/src/master/) | + | Original Publication(s) | [Real-time whole-genome sequencing for routine typing, surveillance, and outbreak detection of verotoxigenic Escherichia co](https://pubmed.ncbi.nlm.nih.gov/24574290/) | + + `ShigaTyper` and `ShigEiFinder` are intended for differentiation and serotype prediction for any *Shigella* species and Enteroinvasive *Escherichia coli* (EIEC). 
You can read about differences between these [here](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC517479/) and [here](https://www.microbiologyresearch.org/content/journal/micro/10.1099/00221287-144-9-2667). ShigEiFinder can be run using either the assembly (default) or reads. These tasks will report if the samples are neither *Shigella* nor EIEC. + + ??? task "`ShigaTyper`: *Shigella*/EIEC differentiation and serotyping ==_for Illumina and ONT only_==" + + ShigaTyper predicts *Shigella* spp serotypes from Illumina or ONT read data. If the genome is not *Shigella* or EIEC, the results from this tool will state this. In the notes it provides, it also reports on the presence of *ipaB* which is suggestive of the presence of the "virulent invasion plasmid". + + !!! techdetails "ShigaTyper Technical Details" + + | | Links | + | --- | --- | + | Task | [task_shigatyper.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/species_typing/task_shigatyper.wdl) | + | Software Source Code | [ShigaTyper on GitHub](https://github.com/CFSAN-Biostatistics/shigatyper) | + | Software Documentation | https://github.com/CFSAN-Biostatistics/shigatyper | + | Origin publication | [In Silico Serotyping Based on Whole-Genome Sequencing Improves the Accuracy of Shigella Identification](https://doi.org/10.1128/AEM.00165-19) | + + ??? task "`ShigEiFinder`: *Shigella*/EIEC differentiation and serotyping ==_using the assembly file as input_==" + + ShigEiFinder differentiates *Shigella* and enteroinvasive *E. coli* (EIEC) using cluster-specific genes, identifies some serotypes based on the presence of O-antigen and H-antigen genes, and predicts the number of virulence plasmids. The `shigeifinder` task operates on assembly files. + + !!! 
techdetails "ShigEiFinder Technical Details" + + | | Links | + | --- | --- | + | Task | [task_shigeifinder.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/42659de535202cc37ab509b38987406ad0360dd4/tasks/species_typing/task_shigeifinder.wdl#L3) | + | Software Source Code | [ShigEiFinder on GitHub](https://github.com/LanLab/ShigEiFinder) | + | Software Documentation | [ShigEiFinder on GitHub](https://github.com/LanLab/ShigEiFinder) | + | Origin publication | [Cluster-specific gene markers enhance Shigella and enteroinvasive Escherichia coli in silico serotyping](https://pubmed.ncbi.nlm.nih.gov/34889728/) | + + ??? task "`ShigEiFinder_reads`: *Shigella*/EIEC differentiation and serotyping using Illumina read files as input (optional) ==_ for Illumina data only_==" + + ShigEiFinder differentiates *Shigella* and enteroinvasive *E. coli* (EIEC) using cluster-specific genes, identifies some serotypes based on the presence of O-antigen and H-antigen genes, and predicts the number of virulence plasmids. The `shigeifinder_reads` task performs on read files. + + !!! techdetails "ShigEiFinder_reads Technical Details" + | | Links | + | --- | --- | + | Task | [task_shigeifinder.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/42659de535202cc37ab509b38987406ad0360dd4/tasks/species_typing/task_shigeifinder.wdl#L81) | + | Software Source Code | [ShigEiFinder on GitHub](https://github.com/LanLab/ShigEiFinder) | + | Software Documentation | [ShigEiFinder on GitHub](https://github.com/LanLab/ShigEiFinder) | + | Origin publication | [Cluster-specific gene markers enhance Shigella and enteroinvasive Escherichia coli in silico serotyping](https://pubmed.ncbi.nlm.nih.gov/34889728/) | + + `SonneiTyper` is run only when GAMBIT predicts the *S. sonnei* species. This is the most common *Shigella* species in the United States. + + ??? 
task "`SonneiTyper`: *Shigella sonnei* identification
Genes within `region I` (`bexABCD`) and `region III` (`hcsAB`) are associated with transport and post-translation modification. The `region II` genes encode serotype-specific proteins, with each serotype (a-f) having a distinct set of genes. *cap* loci are often subject to structural changes (e.g. duplication, deletion) making the process of *in silico* typing and characterisation of loci difficult. + + `hicap` automates the identification of the *cap* locus, describes the structural layout, and performs *in silico* serotyping. + + !!! techdetails "hicap Technical Details" + + | | Links | + | --- | --- | + | Task | [task_hicap.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/task_hicap.wdl) | + | Software Source Code | [hicap on GitHub](https://github.com/scwatts/hicap) | + | Software Documentation | [hicap on GitHub](https://github.com/scwatts/hicap) | + | Original Publication(s) | [hicap: In Silico Serotyping of the Haemophilus influenzae Capsule Locus](https://doi.org/10.7717/peerj.5261) | + +??? toggle "_Klebsiella_ spp" + ##### _Klebsiella_ spp {#klebsiella} + ??? task "`Kleborate`: Species identification, MLST, serotyping, AMR and virulence characterization" + + [Kleborate](https://github.com/katholt/Kleborate) is a tool to identify the *Klebsiella* species, MLST sequence type, serotype, virulence factors (ICE*Kp* and plasmid associated), and AMR genes and mutations. Serotyping is based on the capsular (K antigen) and lipopolysaccharide (LPS) (O antigen) genes. The resistance genes identified by Kleborate are described [here](https://github.com/katholt/Kleborate/wiki/Antimicrobial-resistance). + + !!! 
techdetails "Kleborate Technical Details" + + | | Links | + | --- | --- | + | Task | [task_kleborate.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/species_typing/task_kleborate.wdl) | + | Software Source Code | [kleborate on GitHub](https://github.com/katholt/Kleborate) | + | Software Documentation | https://github.com/katholt/Kleborate/wiki | + | Orginal publication | [A genomic surveillance framework and genotyping tool for Klebsiella pneumoniae and its related species complex](https://www.nature.com/articles/s41467-021-24448-3)
[Identification of Klebsiella capsule synthesis loci from whole genome data](https://www.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000102) | + +??? toggle "_Legionella pneumophila_" + ##### _Legionella pneumophila_ {#legionella-pneumophila} + ??? task "`Legsta`: Sequence-based typing" + + [Legsta](https://github.com/tseemann/legsta) performs a sequence-based typing of *Legionella pneumophila*, with the intention of being used for outbreak investigations. + + !!! techdetails "Legsta Technical Details" + + | | Links | + | --- | --- | + | Task | [task_legsta.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/species_typing/task_legsta.wdl) | + | Software Source Code | [Legsta](https://github.com/tseemann/legsta) | + | Software Documentation | [Legsta](https://github.com/tseemann/legsta) | + +??? toggle "_Listeria monocytogenes_" + ##### _Listeria monocytogenes_ {#listeria-monocytogenes} + ??? task "`LisSero`: Serogroup prediction" + + [LisSero](https://github.com/MDU-PHL/LisSero) performs serogroup prediction (1/2a, 1/2b, 1/2c, or 4b) for *Listeria monocytogenes* based on the presence or absence of five genes, *lmo1118*, *lmo0737*, ORF2110, ORF2819, and *prs*. These do not predict somatic (O) or flagellar (H) biosynthesis. + + !!! techdetails "LisSero Technical Details" + + | | Links | + | --- | --- | + | Task | [task_lissero.wd](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/species_typing/task_lissero.wdl) | + | Software Source Code | [LisSero](https://github.com/MDU-PHL/LisSero) | + | Software Documentation | [LisSero](https://github.com/MDU-PHL/LisSero) | + +??? toggle "_Mycobacterium tuberculosis_" + ##### _Mycobacterium tuberculosis_ {#mycobacterium-tuberculosis} + ??? 
task "`TBProfiler`: Lineage and drug susceptibility prediction ==_for Illumina and ONT only_==" + + [TBProfiler](https://github.com/jodyphelan/TBProfiler) identifies *Mycobacterium tuberculosis* complex species, lineages, sub-lineages and drug resistance-associated mutations. + + !!! techdetails "TBProfiler Technical Details" + + | | Links | + | --- | --- | + | Task | [task_tbprofiler.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/mycobacterium/task_tbprofiler.wdl) | + | Software Source Code | [TBProfiler on GitHub](https://github.com/jodyphelan/TBProfiler) | + | Software Documentation | https://jodyphelan.gitbook.io/tb-profiler/ | + | Original Publication(s) | [Integrating informatics tools and portable sequencing technology for rapid detection of resistance to anti-tuberculous drugs](https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-019-0650-x) | + + ??? task "`tbp-parser`: Interpretation and Parsing of TBProfiler JSON outputs; ==_requires TBProfiler and `tbprofiler_additonal_outputs = true`_==" + + [tbp-parser](https://github.com/theiagen/tbp-parser/) adds useful drug resistance interpretation by applying expert rules and organizing the outputs from TBProfiler. Please note that this tool has **not** been tested on ONT data and although it is available, result accuracy should be considered carefully. To understand this module and its functions, [please examine the README found with the source code here](https://github.com/theiagen/tbp-parser/). + + !!! techdetails "tbp-parser Technical Details" + + | | Links | + | --- | --- | + | Task | [task_tbp_parser.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/mycobacterium/task_tbp_parser.wdl) | + | Software Source Code | [tbp-parser](https://github.com/theiagen/tbp-parser/) | + | Software Documentation | [tbp-parser](https://theiagen.github.io/tbp-parser) | + + ??? 
task "`Clockwork`: Decontamination of input read files ==_for Illumina PE only_==" + + [Clockwork](https://github.com/iqbal-lab-org/clockwork/wiki) decontaminates paired-end data by removing all reads that do not match the H37Rv genome or are unmapped. + + !!! techdetails "Clockwork Technical Details" + + | | Links | + | --- | --- | + | Task | [task_clockwork.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/mycobacterium/task_clockwork.wdl) | + | Software Source Code | [clockwork](https://github.com/iqbal-lab-org/clockwork) | + | Software Documentation | | + +??? toggle "_Neisseria_ spp" + ##### _Neisseria_ spp {#neisseria} + ??? task "`ngmaster`: _Neisseria gonorrhoeae_ sequence typing" + + NG-MAST is currently the most widely used method for epidemiological surveillance of *Neisseria gonorrhoea.* This tool is targeted at clinical and research microbiology laboratories that have performed WGS of *N. gonorrhoeae* isolates and wish to understand the molecular context of their data in comparison to previously published epidemiological studies. As WGS becomes more routinely performed, *NGMASTER* +  has been developed to completely replace PCR-based NG-MAST, reducing time and labour costs. + + The NG-STAR offers a standardized method of classifying seven well-characterized genes associated antimicrobial resistance in *N. gonorrhoeae* (*penA, mtrR, porB, ponA, gyrA, parC* and 23S rRNA) to three classes of antibiotics (cephalosporins, macrolides and fluoroquinolones). + + ngmaster combines two tools: NG-MAST (*in silico* multi-antigen sequencing typing) and NG-STAR (sequencing typing for antimicrobial resistance). + + !!! 
techdetails "ngmaster Technical Details" + + | | Links | + | --- | --- | + | Task | [task_ngmaster.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/species_typing/task_ngmaster.wdl) | + | Software Source Code | [ngmaster](https://github.com/MDU-PHL/ngmaster) | + | Software Documentation | [ngmaster](https://github.com/MDU-PHL/ngmaster) | + | Original Publication(s) | [NGMASTER: *in silico* multi-antigen sequence typing for *Neisseria gonorrhoeae*](https://www.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000076) | + + ??? task "`meningotype`: _Neisseria meningitidis_ serotyping" + + This tool performs serotyping, MLST, finetyping (of *porA*, *fetA*, and *porB*), and Bexsero Antigen Sequencing Typing (BAST). + + !!! techdetails "meningotype Technical Details" + + | | Links | + | --- | --- | + | Task | [task_meningotype.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/species_typing/task_meningotype.wdl) | + | Software Source Code | [meningotype](https://github.com/MDU-PHL/meningotype) | + | Software Documentation | [meningotype](https://github.com/MDU-PHL/meningotype) | + +??? toggle "_Pseudomonas aeruginosa_" + ##### _Pseudomonas aeruginosa_ {#pseudomonas-aeruginosa} + + ??? task "`pasty`: Serotyping" + + `pasty` is a tool for *in silico* serogrouping of *Pseudomonas aeruginosa* isolates. pasty was developed by Robert Petit, based on the [PAst](https://github.com/Sandramses/PAst) tool from the Centre for Genomic Epidemiology. + + !!! 
techdetails "pasty Technical Details" + + | | Links | + | --- | --- | + | Task | [task_pasty.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/species_typing/task_pasty.wdl) | + | Software Source Code | [pasty](https://github.com/rpetit3/pasty) | + | Software Documentation | [pasty](https://github.com/rpetit3/pasty) | + | Original Publication(s) | [Application of Whole-Genome Sequencing Data for O-Specific Antigen Analysis and In Silico Serotyping of Pseudomonas aeruginosa Isolates.](https://journals.asm.org/doi/10.1128/JCM.00349-16) | + +??? toggle "_Salmonella_ spp" + ##### _Salmonella_ spp {#salmonella} + + Both SISTR and SeqSero2 are used for serotyping all *Salmonella* spp. Occasionally, the predicted serotypes may differ between SISTR and SeqSero2. When this occurs, differences are typically small and analogous, and are likely as a result of differing source databases. More information about Salmonella serovar nomenclature can be found [here](https://www.happykhan.com/posts/binfie-guide-serovar/). For *Salmonella* Typhi, genotyphi is additionally run for further typing. + + ??? task "`SISTR`: Salmonella serovar prediction" + + [SISTR](https://github.com/phac-nml/sistr_cmd) performs *Salmonella spp* serotype prediction using antigen gene and cgMLST gene alleles. In TheiaProk. SISTR is run on genome assemblies, and uses the default database setting (smaller "centroid" alleles or representative alleles instead of the full set of cgMLST alleles). It also runs a QC mode to determine the level of confidence in the serovar prediction (see [here](https://github.com/phac-nml/sistr_cmd#qc-by-sistr_cmd---qc)). + + !!! 
techdetails "SISTR Technical Details" + + | | Links | + | --- | --- | + | Task | [task_sistr.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/species_typing/task_sistr.wdl) | + | Software Source Code | [SISTR](https://github.com/phac-nml/sistr_cmd) | + | Software Documentation | [SISTR](https://github.com/phac-nml/sistr_cmd) | + | Original Publication(s) | [The Salmonella In Silico Typing Resource (SISTR): an open web-accessible tool for rapidly typing and subtyping draft Salmonella genome assemblies.](http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0147101) | + + ??? task "`SeqSero2`: Serotyping" + + [SeqSero2](https://github.com/denglab/SeqSero2) is a tool for *Salmonella* serotype prediction. In the TheiaProk Illumina and ONT workflows, SeqSero2 takes in raw sequencing reads and performs targeted assembly of serotype determinant alleles, which can be used to predict serotypes including contamination between serotypes. Optionally, SeqSero2 can take the genome assembly as input. + + !!! techdetails "SeqSero2 Technical Details" + | | Links | + | --- | --- | + | Task | [task_seqsero2.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/species_typing/task_seqsero2.wdl) | + | Software Source Code | [SeqSero2](https://github.com/denglab/SeqSero2) | + | Software Documentation | [SeqSero2](https://github.com/denglab/SeqSero2) | + | Original Publication(s) | [Salmonella serotype determination utilizing high-throughput genome sequencing data.](https://journals.asm.org/doi/10.1128/JCM.00323-15)
[SeqSero2: rapid and improved Salmonella serotype determination using whole genome sequencing data.](https://journals.asm.org/doi/10.1128/AEM.01746-19) | + + ??? task "`genotyphi`: _Salmonella_ Typhi lineage, clade, subclade and plasmid typing, AMR prediction ==_for Illumina and ONT only_==" + + [`genotyphi`](https://github.com/katholt/genotyphi) is activated upon identification of the "Typhi" serotype by SISTR or SeqSero2. `genotyphi` divides the *Salmonella enterica* serovar Typhi population into detailed lineages, clades, and subclades. It also detects mutations in the quinolone-resistance determining regions, acquired antimicrobial resistance genes, plasmid replicons, and subtypes of the IncHI1 plasmid which is associated with multidrug resistance. + + TheiaProk uses the [Mykrobe implementation](https://github.com/katholt/genotyphi/blob/main/README.md#mykrobe-implementation) of genotyphi that takes raw sequencing reads as input. + + !!! techdetails "genotyphi Technical Details" + + | | Links | + | --- | --- | + | Task | [task_genotyphi.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/species_typing/task_genotyphi.wdl) | + | Software Source Code | [genotyphi](https://github.com/katholt/genotyphi) | + | Software Documentation | https://github.com/katholt/genotyphi/blob/main/README.md#mykrobe-implementation | + | Original publication | [An extended genotyping framework for Salmonella enterica serovar Typhi, the cause of human typhoid](https://www.nature.com/articles/ncomms12827/)
[Five Years of GenoTyphi: Updates to the Global Salmonella Typhi Genotyping Framework](https://academic.oup.com/jid/article/224/Supplement_7/S775/6358992?login=false) | + +??? toggle "_Staphylococcus aureus_" + ##### _Staphylococcus aureus_ {#staphyloccocus-aureus} + + ??? task "`spatyper`: Sequence typing" + + Given a fasta file or multiple fasta files, this script identifies the repeats and the order and generates a *spa* type. The repeat sequences and repeat orders found on http://spaserver2.ridom.de/ are used to identify the spa type of each enriched sequence. Ridom *spa* type and the genomic repeat sequence are then reported back to the user. + + !!! techdetails "spatyper Technical Details" + + | | Links | + | --- | --- | + | Task | [task_spatyper.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/species_typing/task_spatyper.wdl) | + | Software Source Code | [spatyper](https://github.com/HCGB-IGTP/spaTyper) | + | Software Documentation | [spatyper](https://github.com/HCGB-IGTP/spaTyper) | + + ??? task "`staphopia-sccmec`: Sequence typing" + + This tool assigns a SCCmec type by BLASTing the SCCmec primers against an assembly. `staphopia-sccmec` reports `True` for exact primer matches and `False` for at least 1 base pair difference. The [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) is also reported. + + !!! techdetails "staphopia-sccmec Technical Details" + + | | Links | + | --- | --- | + | Task | [task_staphopiasccmec.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/species_typing/task_staphopiasccmec.wdl) | + | Software Source Code | [staphopia-sccmec](https://github.com/staphopia/staphopia-sccmec) | + | Software Documentation | [staphopia-sccmec](https://github.com/staphopia/staphopia-sccmec) | + | Original Publication(s) | [*Staphylococcus aureus* viewed from the perspective of 40,000+ genomes](https://doi.org/10.7717/peerj.5261) | + + ??? 
task "`agrvate`: Sequence typing" + + This tool identifies the *agr* locus type and reports possible variants in the *agr* operon. AgrVATE accepts a *S. aureus* genome assembly as input and performs a kmer search using an Agr-group specific kmer database to assign the Agr-group. The *agr* operon is then extracted using *in-silico* PCR and variants are called using an Agr-group specific reference operon. + + !!! techdetails "agrvate Technical Details" + + | | Links | + | --- | --- | + | Task | [task_agrvate.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/species_typing/task_agrvate.wdl) | + | Software Source Code | [agrVATE](https://github.com/VishnuRaghuram94/AgrVATE) | + | Software Documentation | [agrVATE](https://github.com/VishnuRaghuram94/AgrVATE) | + | Original Publication(s) | [Species-Wide Phylogenomics of the *Staphylococcus aureus Agr* Operon Revealed Convergent Evolution of Frameshift Mutations](https://doi.org/10.1128/spectrum.01334-21) | + +??? toggle "_Streptococcus pneumoniae_" + ##### _Streptococcus pneumoniae_ {#streptococcus-pneumoniae} + + ??? task "`PopPUNK`: Global Pneumococcal Sequence Cluster typing" + + Global Pneumococcal Sequence Clusters (GPSC) define and name pneumococcal strains. GPSC designation is undertaken using the PopPUNK software and GPSC database as described in the file below, obtained from [here](https://www.pneumogen.net/gps/training_command_line.html). + + :file: [GPSC_README_PopPUNK2.txt](../../assets/files/GPSC_README_PopPUNK2.txt) + + !!! tip "Interpreting GPSC results" + - In the `*_external_clusters.csv` novel clusters are assigned NA. For isolates that are assigned a novel cluster and pass QC, you can email [globalpneumoseq@gmail.com](mailto:globalpneumoseq@gmail.com) to have these novel clusters added to the database. + - Unsampled diversity in the pneumococcal population may represent missing variation that links two GPS clusters. 
When this is discovered, GPSCs are merged and the merge history is indicated. For example, if GPSC23 and GPSC362 merged, the GPSC would be reported as GPSC23, with a merge history of GPSC23;362. + + !!! techdetails "PopPUNK Technical Details" + + | | Links | + | --- | --- | + | Task | [task_poppunk_streppneumo.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/species_typing/task_poppunk_streppneumo.wdl) | + | GPSC database | https://www.pneumogen.net/gps/training_command_line.html | + | Software Source Code | [PopPunk](https://github.com/bacpop/PopPUNK) | + | Software Documentation | https://poppunk.readthedocs.io/en/latest/ | + | Original Publication(s) | [Fast and flexible bacterial genomic epidemiology with PopPUNK](https://genome.cshlp.org/content/29/2/304) | + + ??? task "`SeroBA`: Serotyping ==_for Illumina_PE only_==" + + Streptococcus pneumoniae serotyping is performed with SeroBA. + + !!! techdetails "SeroBA Technical Details" + + | | Links | + | --- | --- | + | Task | [task_seroba.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/species_typing/task_seroba.wdl) | + | Software Source Code | [SeroBA](https://github.com/sanger-pathogens/seroba) | + | Software Documentation | https://sanger-pathogens.github.io/seroba/ | + | Original Publication(s) | [SeroBA: rapid high-throughput serotyping of Streptococcus pneumoniae from whole genome sequence data](https://www.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000186) | + + ??? task "`pbptyper`: Penicillin-binding protein genotyping" + + The Penicillin-binding proteins (PBP) are responsible for the minimum inhibitory concentration phenotype for beta-lactam antibiotic. In *Streptococcus pneumoniae*, these PBP genes can be identified and typed with PBPTyper. + + !!! 
techdetails "pbptyper Technical Details" + + | | Links | + | --- | --- | + | Task | [task_pbptyper.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/species_typing/task_pbptyper.wdl) | + | Software Source Code | [pbptyper](https://github.com/rpetit3/pbptyper) | + | Software Documentation | [pbptyper](https://github.com/rpetit3/pbptyper) | + | Original Publication(s) | [Penicillin-binding protein transpeptidase signatures for tracking and predicting β-lactam resistance levels in Streptococcus pneumoniae](https://journals.asm.org/doi/full/10.1128/mBio.00756-16) | + +??? toggle "_Streptococcus pyogenes_" + ##### _Streptococcus pyogenes_ {#streptococcus-pyogenes} + ??? task "`emm-typing-tool`: Sequence typing ==_for Illumina_PE only_==" + + emm-typing of *Streptococcus pyogenes* raw reads. Assign emm type and subtype by querying the CDC M-type specific database. + + !!! techdetails "emm-typing-tool Technical Details" + | | Links | + | --- | --- | + | Task | [task_emmtypingtool.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/task_emmtypingtool.wdl) | + | Software Source Code | [emm-typing-tool](https://github.com/ukhsa-collaboration/emm-typing-tool) | + | Software Documentation | [emm-typing-tool](https://github.com/ukhsa-collaboration/emm-typing-tool) | + +??? toggle "_Vibrio_ spp" + ##### _Vibrio_ spp {#vibrio} + ??? task "`SRST2`: Vibrio characterization ==_for Illumina only_==" + + The `SRST2 Vibrio characterization` task detects sequences for *Vibrio* spp characterization using Illumina sequence reads and a database of target sequence that are traditionally used in PCR methods. The sequences included in the database are as follows: + + | Sequence name | Sequence role | Purpose in database | + | --- | --- | --- | + | *toxR* | Transcriptional activator | Species marker where presence identifies *V. cholerae* | + | *ompW* | Outer Membrane Protein | Species marker where presence identifies *V. 
cholerae* | + | *ctxA* | Cholera toxin | Indicates cholera toxin production | + | *tcpA*_classical | Toxin co-pilus A allele associated with the Classical biotype | Used to infer identity as Classical biotype | + | tcpA_ElTor | Toxin co-pilus A allele associated with the El Tor biotype | Used to infer identity as El Tor biotype | + | *wbeN* | O antigen encoding region | Used to infer identity as O1 serogroup | + | *wbfR* | O antigen encoding region | Used to infer identity as O139 serogroup | + + !!! techdetails "SRST2 Technical Details" + + | | Links | + | --- | --- | + | Task | [task_srst2_vibrio.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/species_typing/task_srst2_vibrio.wdl) | + | Software Source Code | [srst2](https://github.com/katholt/srst2) | + | Software Documentation | [srst2](https://github.com/katholt/srst2) | + | Database Description | [Docker container](https://github.com/StaPH-B/docker-builds/tree/master/srst2/0.2.0-vibrio-230224) | + + ??? task "`Abricate`: Vibrio characterization" + + The `Abricate` Vibrio characterization task detects sequences for *Vibrio* spp characterization using genome assemblies and the abricate "vibrio" database. The sequences included in the database are as follows: + + | Sequence name | Sequence role | Purpose in database | + | --- | --- | --- | + | *toxR* | Transcriptional activator | Species marker where presence identifies *V. cholerae* | + | *ompW* | Outer Membrane Protein | Species marker where presence identifies *V. 
cholerae* | + | *ctxA* | Cholera toxin | Indicates cholera toxin production | + | *tcpA*_classical | Toxin co-pilus A allele associated with the Classical biotype | Used to infer identity as Classical biotype | + | tcpA_ElTor | Toxin co-pilus A allele associated with the El Tor biotype | Used to infer identity as El Tor biotype | + | *wbeN* | O antigen encoding region | Used to infer identity as O1 serogroup | + | *wbfR* | O antigen encoding region | Used to infer identity as O139 serogroup | + + !!! techdetails "Abricate Technical Details" + | | Links | + | --- | --- | + | Task | [task_abricate_vibrio.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/species_typing/task_abricate_vibrio.wdl) | + | Software Source Code | [abricate](https://github.com/tseemann/abricate) | + | Software Documentation | [abricate](https://github.com/tseemann/abricate) | + | Database Description | [Docker container](https://github.com/StaPH-B/docker-builds/tree/master/abricate/1.0.1-vibrio-cholera) | + +### Outputs + +| **Variable** | **Type** | **Description** | **Workflow** | +|---|---|---|---| +| abricate_abaum_database | String | Database of reference A. baumannii plasmid typing genes used for plasmid typing | FASTA, ONT, PE, SE | +| abricate_abaum_docker | String | Docker file used for running abricate | FASTA, ONT, PE, SE | +| abricate_abaum_plasmid_tsv | File | https://github.com/tseemann/abricate#output containing a row for each A. baumannii plasmid type gene found in the sample | FASTA, ONT, PE, SE | +| abricate_abaum_plasmid_type_genes | String | A. baumannii Plasmid typing genes found in the sample; from GENE column in https://github.com/tseemann/abricate#output | FASTA, ONT, PE, SE | +| abricate_abaum_version | String | Version of abricate used for A. 
baumannii plasmid typing | FASTA, ONT, PE, SE | +| abricate_database | String | Database of reference used with Abricate | FASTA, ONT, PE, SE | +| abricate_docker | String | Docker file used for running abricate | FASTA, ONT, PE, SE | +| abricate_genes | String | Genes found in the sample; from GENE column in https://github.com/tseemann/abricate#output | FASTA, ONT, PE, SE | +| abricate_results_tsv | File | https://github.com/tseemann/abricate#output containing a row for each gene found in the sample | FASTA, ONT, PE, SE | +| abricate_version | String | Version of abricate used for A. baumannii plasmid typing | FASTA, ONT, PE, SE | +| abricate_vibrio_biotype | String | Biotype classification according to tcpA gene sequence (Classical or ElTor) | FASTA, ONT, PE, SE | +| abricate_vibrio_ctxA | String | Presence or absence of the ctxA gene | FASTA, ONT, PE, SE | +| abricate_vibrio_detailed_tsv | File | Detailed ABRicate output file | FASTA, ONT, PE, SE | +| abricate_vibrio_ompW | String | Presence or absence of the ompW gene | FASTA, ONT, PE, SE | +| abricate_vibrio_serogroup | String | Serotype classification as O1 (wbeN gene), O139 (wbfR gene) or not detected. 
| FASTA, ONT, PE, SE | +| abricate_vibrio_toxR | String | Presence or absence of the toxR gene | FASTA, ONT, PE, SE | +| abricate_vibrio_version | String | The abricate version run | FASTA, ONT, PE, SE | +| agrvate_agr_canonical | String | Canonical or non-canonical agrD | FASTA, ONT, PE, SE | +| agrvate_agr_group | String | Agr group | FASTA, ONT, PE, SE | +| agrvate_agr_match_score | String | Match score for agr group | FASTA, ONT, PE, SE | +| agrvate_agr_multiple | String | If multiple agr groups were found | FASTA, ONT, PE, SE | +| agrvate_agr_num_frameshifts | String | Number of frameshifts found in CDS of extracted agr operon | FASTA, ONT, PE, SE | +| agrvate_docker | String | The docker used for AgrVATE | FASTA, ONT, PE, SE | +| agrvate_results | File | A gzipped tarball of all results | FASTA, ONT, PE, SE | +| agrvate_summary | File | The summary file produced | FASTA, ONT, PE, SE | +| agrvate_version | String | The version of AgrVATE used | FASTA, ONT, PE, SE | +| amrfinderplus_all_report | File | Output TSV file from AMRFinderPlus (described https://github.com/ncbi/amr/wiki/Running-AMRFinderPlus#fields) | FASTA, ONT, PE, SE | +| amrfinderplus_amr_betalactam_betalactam_genes | String | Beta-lactam AMR genes identified by AMRFinderPlus that are known to confer resistance to beta-lactams | FASTA, ONT, PE, SE | +| amrfinderplus_amr_betalactam_carbapenem_genes | String | Beta-lactam AMR genes identified by AMRFinderPlus that are known to confer resistance to carbapenem | FASTA, ONT, PE, SE | +| amrfinderplus_amr_betalactam_cephalosporin_genes | String | Beta-lactam AMR genes identified by AMRFinderPlus that are known to confer resistance to cephalosporin | FASTA, ONT, PE, SE | +| amrfinderplus_amr_betalactam_cephalothin_genes | String | Beta-lactam AMR genes identified by AMRFinderPlus that are known to confer resistance to cephalothin | FASTA, ONT, PE, SE | +| amrfinderplus_amr_betalactam_genes | String | Beta-lactam AMR genes identified by AMRFinderPlus | 
FASTA, ONT, PE, SE | +| amrfinderplus_amr_betalactam_methicillin_genes | String | Beta-lactam AMR genes identified by AMRFinderPlus that are known to confer resistance to methicilin | FASTA, ONT, PE, SE | +| amrfinderplus_amr_classes | String | AMRFinderPlus predictions for classes of drugs that genes found in the reads are known to confer resistance to | FASTA, ONT, PE, SE | +| amrfinderplus_amr_core_genes | String | AMR genes identified by AMRFinderPlus where the scope is "core" | FASTA, ONT, PE, SE | +| amrfinderplus_amr_plus_genes | String | AMR genes identified by AMRFinderPlus where the scope is "plus" | FASTA, ONT, PE, SE | +| amrfinderplus_amr_report | File | TSV file detailing AMR genes only, from the amrfinderplus_all_report | FASTA, ONT, PE, SE | +| amrfinderplus_amr_subclasses | String | More specificity about the drugs that genes identified in the reads confer resistance to | FASTA, ONT, PE, SE | +| amrfinderplus_db_version | String | AMRFinderPlus database version used | FASTA, ONT, PE, SE | +| amrfinderplus_stress_genes | String | Stress genes identified by AMRFinderPlus | FASTA, ONT, PE, SE | +| amrfinderplus_stress_report | File | TSV file detailing stress genes only, from the amrfinderplus_all_report | FASTA, ONT, PE, SE | +| amrfinderplus_version | String | AMRFinderPlus version used | FASTA, ONT, PE, SE | +| amrfinderplus_virulence_genes | String | Virulence genes identified by AMRFinderPlus | FASTA, ONT, PE, SE | +| amrfinderplus_virulence_report | File | TSV file detailing virulence genes only, from the amrfinderplus_all_report | FASTA, ONT, PE, SE | +| ani_highest_percent | Float | Highest ANI between query and any given reference genome (top species match) | FASTA, ONT, PE, SE | +| ani_highest_percent_bases_aligned | Float | Percentage of bases aligned between query genome and top species match | FASTA, ONT, PE, SE | +| ani_mummer_docker | String | Docker image used to run the ANI_mummer task | FASTA, ONT, PE, SE | +| ani_mummer_version | 
String | Version of MUMmer used | FASTA, ONT, PE, SE | +| ani_output_tsv | File | Full output TSV from ani-m | FASTA, ONT, PE, SE | +| ani_top_species_match | String | Species of genome with highest ANI to query FASTA | FASTA, ONT, PE, SE | +| assembly_fasta | File | https://github.com/tseemann/shovill#contigsfa | ONT, PE, SE | +| assembly_length | Int | Length of assembly (total contig length) as determined by QUAST | FASTA, ONT, PE, SE | +| bakta_gbff | File | Genomic GenBank format annotation file | FASTA, ONT, PE, SE | +| bakta_gff3 | File | Generic Feature Format Version 3 file | FASTA, ONT, PE, SE | +| bakta_summary | File | Bakta summary output TXT file | FASTA, ONT, PE, SE | +| bakta_tsv | File | Annotations as simple human readable TSV | FASTA, ONT, PE, SE | +| bakta_version | String | Bakta version used | FASTA, ONT, PE, SE | +| bbduk_docker | String | BBDuk docker image used | PE, SE | +| busco_database | String | BUSCO database used | FASTA, ONT, PE, SE | +| busco_docker | String | BUSCO docker image used | FASTA, ONT, PE, SE | +| busco_report | File | A plain text summary of the results in BUSCO notation | FASTA, ONT, PE, SE | +| busco_results | String | BUSCO results (see https://www.notion.so/TheiaProk-Workflow-Series-68c34aca2a0240ef94fef0acd33651b9?pvs=21) | FASTA, ONT, PE, SE | +| busco_version | String | BUSCO software version used | FASTA, ONT, PE, SE | +| cg_pipeline_docker | String | Docker file used for running CG-Pipeline on cleaned reads | PE, SE | +| cg_pipeline_report_clean | File | TSV file of read metrics from clean reads, including average read length, number of reads, and estimated genome coverage | PE, SE | +| cg_pipeline_report_raw | File | TSV file of read metrics from raw reads, including average read length, number of reads, and estimated genome coverage | PE, SE | +| clockwork_decontaminated_read1 | File | Decontaminated forward reads by Clockwork | PE | +| clockwork_decontaminated_read2 | File | Decontaminated reverse reads by 
Clockwork | PE | +| combined_mean_q_clean | Float | Mean quality score for the combined clean reads | PE | +| combined_mean_q_raw | Float | Mean quality score for the combined raw reads | PE | +| combined_mean_readlength_clean | Float | Mean read length for the combined clean reads | PE | +| combined_mean_readlength_raw | Float | Mean read length for the combined raw reads | PE | +| contigs_fastg | File | Assembly graph if megahit used for genome assembly | PE | +| contigs_gfa | File | Assembly graph if spades used for genome assembly | ONT, PE, SE | +| contigs_lastgraph | File | Assembly graph if velvet used for genome assembly | PE | +| dragonflye_version | String | Version of dragonflye used for de novo assembly | ONT | +| ectyper_predicted_serotype | String | Serotype predicted by ECTyper | FASTA, ONT, PE, SE | +| ectyper_results | File | TSV file of evidence for ECTyper predicted serotype (see https://github.com/phac-nml/ecoli_serotyping#report-format) | FASTA, ONT, PE, SE | +| ectyper_version | String | Version of ECTyper used | FASTA, ONT, PE, SE | +| emmtypingtool_docker | String | Docker image for emm-typing-tool | PE | +| emmtypingtool_emm_type | String | emm-type predicted | PE | +| emmtypingtool_results_xml | File | XML file with emm-typing-tool resuls | PE | +| emmtypingtool_version | String | Version of emm-typing-tool used | PE | +| est_coverage_clean | Float | Estimated coverage calculated from clean reads and genome length | ONT, PE, SE | +| est_coverage_raw | Float | Estimated coverage calculated from raw reads and genome length | ONT, PE, SE | +| fastp_html_report | File | The HTML report made with fastp | PE, SE | +| fastp_version | String | Version of fastp software used | PE, SE | +| fastq_scan_num_reads_clean_pairs | String | Number of read pairs after cleaning as calculated by fastq_scan | PE | +| fastq_scan_num_reads_clean1 | Int | Number of forward reads after cleaning as calculated by fastq_scan | PE, SE | +| fastq_scan_num_reads_clean2 | 
Int | Number of reverse reads after cleaning as calculated by fastq_scan | PE | +| fastq_scan_num_reads_raw_pairs | String | Number of input read pairs calculated by fastq_scan | PE | +| fastq_scan_num_reads_raw1 | Int | Number of input forward reads calculated by fastq_scan | PE, SE | +| fastq_scan_num_reads_raw2 | Int | Number of input reverse reads calculated by fastq_scan | PE | +| fastq_scan_version | String | Version of fastq-scan software used | PE, SE | +| fastqc_clean1_html | File | Graphical visualization of clean forward read quality from fastqc to open in an internet browser | PE, SE | +| fastqc_clean2_html | File | Graphical visualization of clean reverse read quality from fastqc to open in an internet browser | PE | +| fastqc_docker | String | Docker container used with fastqc | PE, SE | +| fastqc_num_reads_clean_pairs | String | Number of read pairs after cleaning by fastqc | PE | +| fastqc_num_reads_clean1 | Int | Number of forward reads after cleaning by fastqc | PE, SE | +| fastqc_num_reads_clean2 | Int | Number of reverse reads after cleaning by fastqc | PE | +| fastqc_num_reads_raw_pairs | String | Number of input read pairs by fastqc | PE | +| fastqc_num_reads_raw1 | Int | Number of input forward reads by fastqc | PE, SE | +| fastqc_num_reads_raw2 | Int | Number of input reverse reads by fastqc | PE | +| fastqc_raw1_html | File | Graphical visualization of raw forward read quality from fastqc to open in an internet browser | PE, SE | +| fastqc_raw2_html | File | Graphical visualization of raw reverse read quality from fastqc to open in an internet browser | PE | +| fastqc_version | String | Version of fastqc software used | PE, SE | +| gambit_closest_genomes | File | CSV file listing genomes in the GAMBIT database that are most similar to the query assembly | FASTA, ONT, PE, SE | +| gambit_db_version | String | Version of GAMBIT used | FASTA, ONT, PE, SE | +| gambit_docker | String | GAMBIT docker file used | FASTA, ONT, PE, SE | +| 
gambit_predicted_taxon | String | Taxon predicted by GAMBIT | FASTA, ONT, PE, SE | +| gambit_predicted_taxon_rank | String | Taxon rank of GAMBIT taxon prediction | FASTA, ONT, PE, SE | +| gambit_report | File | GAMBIT report in a machine-readable format | FASTA, ONT, PE, SE | +| gambit_version | String | Version of GAMBIT software used | FASTA, ONT, PE, SE | +| genotyphi_final_genotype | String | Final genotype call from GenoTyphi | ONT, PE, SE | +| genotyphi_genotype_confidence | String | Confidence in the final genotype call made by GenoTyphi | ONT, PE, SE | +| genotyphi_mykrobe_json | File | JSON file of GenoTyphi output, described https://github.com/katholt/genotyphi#explanation-of-columns-in-the-output | ONT, PE, SE | +| genotyphi_report_tsv | File | TSV file of GenoTyphi output, described https://github.com/katholt/genotyphi#explanation-of-columns-in-the-output | ONT, PE, SE | +| genotyphi_species | String | Species call from Mykrobe, used to run GenoTyphi | ONT, PE, SE | +| genotyphi_st_probes_percent_coverage | Float | Percentage coverage to the Typhi MLST probes | ONT, PE, SE | +| genotyphi_version | String | Version of GenoTyphi used | ONT, PE, SE | +| hicap_docker | String | Docker image used for hicap | ONT, PE, SE | +| hicap_genes | String | cap genes identified. genes on different contigs delimited by;. 
truncation shown by trailing * | ONT, PE, SE | +| hicap_results_tsv | File | TSV file of hicap output | ONT, PE, SE | +| hicap_serotype | String | hicap serotype | ONT, PE, SE | +| hicap_version | String | hicap version used | ONT, PE, SE | +| kaptive_k_locus | String | Best matching K locus identified by Kaptive | FASTA, ONT, PE, SE | +| kaptive_k_type | String | Best matching K type identified by Kaptive | FASTA, ONT, PE, SE | +| kaptive_kl_confidence | String | Kaptive’s confidence in the KL match (see https://github.com/katholt/Kaptive/wiki/Interpreting-the-results) | FASTA, ONT, PE, SE | +| kaptive_oc_locus | String | Best matching OC locus identified by Kaptive | FASTA, ONT, PE, SE | +| kaptive_ocl_confidence | String | Kaptive’s confidence in the OCL match (see https://github.com/katholt/Kaptive/wiki/Interpreting-the-results) | FASTA, ONT, PE, SE | +| kaptive_output_file_k | File | TSV https://github.com/katholt/Kaptive/wiki/How-to-run#output-files from the K locus from Kaptive | FASTA, ONT, PE, SE | +| kaptive_output_file_oc | File | TSV https://github.com/katholt/Kaptive/wiki/How-to-run#output-files from the OC locus from Kaptive | FASTA, ONT, PE, SE | +| kaptive_version | String | Version of Kaptive used | FASTA, ONT, PE, SE | +| kleborate_docker | String | Kleborate docker image used | FASTA, ONT, PE, SE | +| kleborate_genomic_resistance_mutations | String | Genomic resistance mutations identified by Kleborate | FASTA, ONT, PE, SE | +| kleborate_key_resistance_genes | String | Key resistance genes identified by Kleborate | FASTA, ONT, PE, SE | +| kleborate_klocus | String | Best matching K locus identified by Kleborate via Kaptive | FASTA, ONT, PE, SE | +| kleborate_klocus_confidence | String | Kaptive’s confidence in the KL match (see https://github.com/katholt/Kaptive/wiki/Interpreting-the-results) | FASTA, ONT, PE, SE | +| kleborate_ktype | String | Best matching K type identified by Kleborate via Kaptive | FASTA, ONT, PE, SE | +| 
kleborate_mlst_sequence_type | String | https://github.com/katholt/Kleborate/wiki/MLST#multi-locus-sequence-typing-mlst call by Kleborate | FASTA, ONT, PE, SE | +| kleborate_olocus | String | Best matching OC locus identified by Kleborate via Kaptive | FASTA, ONT, PE, SE | +| kleborate_olocus_confidence | String | Kaptive’s confidence in the OCL match (see https://github.com/katholt/Kaptive/wiki/Interpreting-the-results) | FASTA, ONT, PE, SE | +| kleborate_otype | String | Best matching OC type identified by Kleborate via Kaptive | FASTA, ONT, PE, SE | +| kleborate_output_file | File | https://github.com/katholt/Kleborate/wiki/Scores-and-counts | FASTA, ONT, PE, SE | +| kleborate_resistance_score | String | Resistance score as given by kleborate | FASTA, ONT, PE, SE | +| kleborate_version | String | Version of Kleborate used | FASTA, ONT, PE, SE | +| kleborate_virulence_score | String | Virulence score as given by kleborate | FASTA, ONT, PE, SE | +| kmerfinder_database | String | Database used to run KmerFinder | FASTA, ONT, PE, SE | +| kmerfinder_docker | String | Docker image used to run KmerFinder | FASTA, ONT, PE, SE | +| kmerfinder_query_coverage | String | KmerFinder’s query coverage of the top hit result | FASTA, ONT, PE, SE | +| kmerfinder_results_tsv | File | Output TSV file created by KmerFinder | FASTA, ONT, PE, SE | +| kmerfinder_template_coverage | String | | FASTA, ONT, PE, SE | +| kmerfinder_top_hit | String | Top hit species of KmerFinder | FASTA, ONT, PE, SE | +| kraken2_database | String | Kraken2 database used for the taxonomic assignment | ONT, PE, SE | +| kraken2_docker | String | Docker container for Kraken2 | ONT, PE, SE | +| kraken2_report | File | Report, in text format, of Kraken2 results | ONT, PE, SE | +| kraken2_version | String | Kraken2 version | ONT, PE, SE | +| legsta_predicted_sbt | String | Sequence based type predicted by Legsta | FASTA, ONT, PE, SE | +| legsta_results | File | TSV file of legsta results (see 
https://github.com/tseemann/legsta#output) | FASTA, ONT, PE, SE | +| legsta_version | String | Version of legsta used | FASTA, ONT, PE, SE | +| lissero_results | File | TSV results file from LisSero (see https://github.com/MDU-PHL/LisSero#example-output) | FASTA, ONT, PE, SE | +| lissero_serotype | String | Serotype predicted by LisSero | FASTA, ONT, PE, SE | +| lissero_version | String | Version of LisSero used | FASTA, ONT, PE, SE | +| meningotype_BAST | String | BAST type | FASTA, ONT, PE, SE | +| meningotype_FetA | String | FetA type | FASTA, ONT, PE, SE | +| meningotype_fHbp | String | fHbp type | FASTA, ONT, PE, SE | +| meningotype_NadA | String | NadA type | FASTA, ONT, PE, SE | +| meningotype_NHBA | String | NHBA type | FASTA, ONT, PE, SE | +| meningotype_PorA | String | PorA type | FASTA, ONT, PE, SE | +| meningotype_PorB | String | PorB type | FASTA, ONT, PE, SE | +| meningotype_serogroup | String | Serogroup | FASTA, ONT, PE, SE | +| meningotype_tsv | File | Full result file | FASTA, ONT, PE, SE | +| meningotype_version | String | Version of meningotype used | FASTA, ONT, PE, SE | +| midas_docker | String | MIDAS docker image used | PE, SE | +| midas_primary_genus | String | Genus of most abundant species in reads | PE, SE | +| midas_report | File | TSV report of full MIDAS results | PE, SE | +| midas_secondary_genus | String | Genus of the next most abundant species after removing all species of the most abundant genus | PE, SE | +| midas_secondary_genus_abundance | String | Relative abundance of secondary genus | PE, SE | +| midas_secondary_genus_coverage | String | Absolute coverage of secondary genus | PE, SE | +| n50_value | Int | N50 of assembly calculated by QUAST | FASTA, ONT, PE, SE | +| nanoplot_docker | String | Docker image for nanoplot | ONT | +| nanoplot_html_clean | File | Clean read file | ONT | +| nanoplot_html_raw | File | Raw read file | ONT | +| nanoplot_num_reads_clean1 | Int | Number of clean reads | ONT | +| nanoplot_num_reads_raw1 
| Int | Number of raw reads | ONT | +| nanoplot_r1_est_coverage_clean | Float | Estimated coverage on the clean reads by nanoplot | ONT | +| nanoplot_r1_est_coverage_raw | Float | Estimated coverage on the raw reads by nanoplot | ONT | +| nanoplot_r1_mean_q_clean | Float | Mean quality score of clean forward reads | ONT | +| nanoplot_r1_mean_q_raw | Float | Mean quality score of raw forward reads | ONT | +| nanoplot_r1_mean_readlength_clean | Float | Mean read length of clean forward reads | ONT | +| nanoplot_r1_mean_readlength_raw | Float | Mean read length of raw forward reads | ONT | +| nanoplot_r1_median_q_clean | Float | Median quality score of clean forward reads | ONT | +| nanoplot_r1_median_q_raw | Float | Median quality score of raw forward reads | ONT | +| nanoplot_r1_median_readlength_clean | Float | Median read length of clean forward reads | ONT | +| nanoplot_r1_median_readlength_raw | Float | Median read length of raw forward reads | ONT | +| nanoplot_r1_n50_clean | Float | N50 of clean forward reads | ONT | +| nanoplot_r1_n50_raw | Float | N50 of raw forward reads | ONT | +| nanoplot_r1_stdev_readlength_clean | Float | Standard deviation read length of clean forward reads | ONT | +| nanoplot_r1_stdev_readlength_raw | Float | Standard deviation read length of raw forward reads | ONT | +| nanoplot_tsv_clean | File | Output TSV file created by nanoplot | ONT | +| nanoplot_tsv_raw | File | Output TSV file created by nanoplot | ONT | +| nanoplot_version | String | Version of nanoplot used for analysis | ONT | +| nanoq_version | String | Version of nanoq used in analysis | ONT | +| ngmaster_ngmast_porB_allele | String | porB allele number | FASTA, ONT, PE, SE | +| ngmaster_ngmast_sequence_type | String | NG-MAST sequence type | FASTA, ONT, PE, SE | +| ngmaster_ngmast_tbpB_allele | String | tbpB allele number | FASTA, ONT, PE, SE | +| ngmaster_ngstar_23S_allele | String | 23S rRNA allele number | FASTA, ONT, PE, SE | +| ngmaster_ngstar_gyrA_allele | String 
| gyrA allele number | FASTA, ONT, PE, SE | +| ngmaster_ngstar_mtrR_allele | String | mtrR allele number | FASTA, ONT, PE, SE | +| ngmaster_ngstar_parC_allele | String | parC allele number | FASTA, ONT, PE, SE | +| ngmaster_ngstar_penA_allele | String | penA allele number | FASTA, ONT, PE, SE | +| ngmaster_ngstar_ponA_allele | String | ponA allele number | FASTA, ONT, PE, SE | +| ngmaster_ngstar_porB_allele | String | porB allele number | FASTA, ONT, PE, SE | +| ngmaster_ngstar_sequence_type | String | NG-STAR sequence type | FASTA, ONT, PE, SE | +| ngmaster_tsv | File | TSV file with NG-MAST/NG-STAR typing | FASTA, ONT, PE, SE | +| ngmaster_version | String | ngmaster version | FASTA, ONT, PE, SE | +| number_contigs | Int | Total number of contigs in assembly | FASTA, ONT, PE, SE | +| pasty_all_serogroups | File | TSV file with details of each serogroup from pasty (see https://github.com/rpetit3/pasty#example-prefixdetailstsv) | FASTA, ONT, PE, SE | +| pasty_blast_hits | File | TSV file of BLAST hits from pasty (see https://github.com/rpetit3/pasty#example-prefixblastntsv) | FASTA, ONT, PE, SE | +| pasty_comment | String | | FASTA, ONT, PE, SE | +| pasty_docker | String | pasty docker image used | FASTA, ONT, PE, SE | +| pasty_serogroup | String | Serogroup predicted by pasty | FASTA, ONT, PE, SE | +| pasty_serogroup_coverage | Float | The breadth of coverage of the O-antigen by pasty | FASTA, ONT, PE, SE | +| pasty_serogroup_fragments | Int | Number of BLAST hits included in the prediction (fewer is better) | FASTA, ONT, PE, SE | +| pasty_summary_tsv | File | TSV summary file of pasty outputs (see https://github.com/rpetit3/pasty#example-prefixtsv) | FASTA, ONT, PE, SE | +| pasty_version | String | Version of pasty used | FASTA, ONT, PE, SE | +| pbptyper_docker | String | pbptyper docker image used | FASTA, ONT, PE, SE | +| pbptyper_pbptype_predicted_tsv | File | TSV file of pbptyper results (see https://github.com/rpetit3/pbptyper#example-prefixtsv) | FASTA, 
ONT, PE, SE | +| pbptyper_predicted_1A_2B_2X | String | PBP type predicted by pbptyper | FASTA, ONT, PE, SE | +| pbptyper_version | String | Version of pbptyper used | FASTA, ONT, PE, SE | +| plasmidfinder_db_version | String | Version of PlasmidFinder used | FASTA, ONT, PE, SE | +| plasmidfinder_docker | String | PlasmidFinder docker image used | FASTA, ONT, PE, SE | +| plasmidfinder_plasmids | String | Names of plasmids identified by PlasmidFinder | FASTA, ONT, PE, SE | +| plasmidfinder_results | File | Output file from PlasmidFinder in TSV format | FASTA, ONT, PE, SE | +| plasmidfinder_seqs | File | Hit_in_genome_seq.fsa file produced by PlasmidFinder | FASTA, ONT, PE, SE | +| poppunk_docker | String | PopPUNK docker image with GPSC database used | FASTA, ONT, PE, SE | +| poppunk_gps_cluster | String | GPS cluster predicted by PopPUNK | FASTA, ONT, PE, SE | +| poppunk_GPS_db_version | String | Version of GPSC database used | FASTA, ONT, PE, SE | +| poppunk_gps_external_cluster_csv | File | GPSC v6 scheme designations | FASTA, ONT, PE, SE | +| poppunk_version | String | Version of PopPUNK used | FASTA, ONT, PE, SE | +| prokka_gbk | File | GenBank file produced from Prokka annotation of input FASTA | FASTA, ONT, PE, SE | +| prokka_gff | File | Prokka output GFF3 file containing sequence and annotation (you can view this in IGV) | FASTA, ONT, PE, SE | +| prokka_sqn | File | A Sequin file for GenBank submission | FASTA, ONT, PE, SE | +| qc_check | String | A string that indicates whether or not the sample passes a set of pre-determined and user-provided QC thresholds | FASTA, ONT, PE, SE | +| qc_standard | File | The user-provided file that contains the QC thresholds used for the QC check | FASTA, ONT, PE, SE | +| quast_gc_percent | Float | The GC percent of your sample | FASTA, ONT, PE, SE | +| quast_report | File | TSV report from QUAST | FASTA, ONT, PE, SE | +| quast_version | String | Software version of QUAST used | FASTA, ONT, PE, SE | +| r1_mean_q_clean | 
Float | Mean quality score of clean forward reads | PE, SE | +| r1_mean_q_raw | Float | Mean quality score of raw forward reads | PE, SE | +| r1_mean_readlength_clean | Float | Mean read length of clean forward reads | PE, SE | +| r1_mean_readlength_raw | Float | Mean read length of raw forward reads | PE, SE | +| r2_mean_q_clean | Float | Mean quality score of clean reverse reads | PE | +| r2_mean_q_raw | Float | Mean quality score of raw reverse reads | PE | +| r2_mean_readlength_clean | Float | Mean read length of clean reverse reads | PE | +| r2_mean_readlength_raw | Float | Mean read length of raw reverse reads | PE | +| rasusa_version | String | Version of RASUSA used for analysis | ONT | +| read_screen_clean | String | PASS or FAIL result from clean read screening; FAIL accompanied by the reason for failure | ONT, PE, SE | +| read_screen_raw | String | PASS or FAIL result from raw read screening; FAIL accompanied by the reason for failure | ONT, PE, SE | +| read1_clean | File | Clean forward reads file | ONT, PE, SE | +| read2_clean | File | Clean reverse reads file | PE | +| resfinder_db_version | String | Version of ResFinder database | FASTA, ONT, PE, SE | +| resfinder_docker | String | ResFinder docker image used | FASTA, ONT, PE, SE | +| resfinder_pheno_table | File | Table containing all AMR phenotypes | FASTA, ONT, PE, SE | +| resfinder_pheno_table_species | File | Table with species-specific AMR phenotypes | FASTA, ONT, PE, SE | +| resfinder_pointfinder_pheno_table | File | TSV showing presence(1)/absence(0) of predicted resistance against an antibiotic class | FASTA, ONT, PE, SE | +| resfinder_pointfinder_results | File | Predicted point mutations, grouped by the gene they occur in | FASTA, ONT, PE, SE | +| resfinder_predicted_pheno_resistance | String | Semicolon delimited list of antimicrobial drugs and associated genes and/or point mutations. 
Format: drug1: gene1, gene2, mutation1; drug2: gene3, mutation2; (placeholder example reconstructed from garbled text — confirm against original docs) | FASTA, ONT, PE, SE | +| resfinder_predicted_resistance_Amp | String | States either Resistance or No Resistance predicted to Ampicillin based on resfinder phenotypic predictions | FASTA, ONT, PE, SE | +| resfinder_predicted_resistance_Axo | String | States either Resistance or No Resistance predicted to Ceftriaxone based on resfinder phenotypic predictions | FASTA, ONT, PE, SE | +| resfinder_predicted_resistance_Azm | String | States either Resistance or No Resistance predicted to Azithromycin based on resfinder phenotypic predictions | FASTA, ONT, PE, SE | +| resfinder_predicted_resistance_Cip | String | States either Resistance or No Resistance predicted to Ciprofloxacin based on resfinder phenotypic predictions | FASTA, ONT, PE, SE | +| resfinder_predicted_resistance_Smx | String | States either Resistance or No Resistance predicted to Sulfamethoxazole based on resfinder phenotypic predictions | FASTA, ONT, PE, SE | +| resfinder_predicted_resistance_Tmp | String | States either Resistance or No Resistance predicted to Trimethoprim based on resfinder phenotypic predictions | FASTA, ONT, PE, SE | +| resfinder_predicted_xdr_shigella | String | Final prediction of XDR Shigella status based on CDC definition. Explanation can be found in the description above this table. 
| FASTA, ONT, PE, SE | +| resfinder_results | File | Predicted resistance genes grouped by antibiotic class | FASTA, ONT, PE, SE | +| resfinder_seqs | File | FASTA of resistance gene sequences from user’s input sequence | FASTA, ONT, PE, SE | +| seq_platform | String | Sequencing platform input by the user | FASTA, ONT, PE, SE | +| seqsero2_predicted_antigenic_profile | String | Antigenic profile predicted for Salmonella spp by SeqSero2 | ONT, PE, SE | +| seqsero2_predicted_contamination | String | Indicates whether contamination between Salmonella with different serotypes was predicted by SeqSero2 | ONT, PE, SE | +| seqsero2_predicted_serotype | String | Serotype predicted by SeqSero2 | ONT, PE, SE | +| seqsero2_report | File | TSV report produced by SeqSero2 | ONT, PE, SE | +| seqsero2_version | String | Version of SeqSero2 used | ONT, PE, SE | +| seroba_ariba_identity | String | Percentage identity between the query sequence and ARIBA-predicted serotype | PE | +| seroba_ariba_serotype | String | Serotype predicted by ARIBA, via SeroBA | PE | +| seroba_details | File | Detailed TSV file from SeroBA | PE | +| seroba_docker | String | SeroBA docker image used | PE | +| seroba_serotype | String | Serotype predicted by SeroBA | PE | +| seroba_version | String | SeroBA version used | PE | +| serotypefinder_docker | String | SerotypeFinder docker image used | FASTA, ONT, PE, SE | +| serotypefinder_report | File | TSV report produced by SerotypeFinder | FASTA, ONT, PE, SE | +| serotypefinder_serotype | String | Serotype predicted by SerotypeFinder | FASTA, ONT, PE, SE | +| shigatyper_docker | String | ShigaTyper docker image used | ONT, PE, SE | +| shigatyper_hits_tsv | File | Detailed TSV report from ShigaTyper (see https://github.com/CFSAN-Biostatistics/shigatyper#example-prefix-hitstsv) | ONT, PE, SE | +| shigatyper_ipaB_presence_absence | String | Presence (+) or absence (-) of ipaB identified by ShigaTyper | ONT, PE, SE | +| shigatyper_notes | String | Any notes 
output from ShigaTyper | ONT, PE, SE | +| shigatyper_predicted_serotype | String | Serotype predicted by ShigaTyper | ONT, PE, SE | +| shigatyper_summary_tsv | File | TSV summary report from ShigaTyper (see https://github.com/CFSAN-Biostatistics/shigatyper#example-prefixtsv) | ONT, PE, SE | +| shigatyper_version | String | Version of ShigaTyper used | ONT, PE, SE | +| shigeifinder_cluster | String | Shigella/EIEC cluster identified by ShigEiFinder | FASTA, ONT, PE, SE | +| shigeifinder_cluster_reads | String | Shigella/EIEC cluster identified by ShigEiFinder using read files as inputs | PE, SE | +| shigeifinder_docker | String | ShigEiFinder docker image used | FASTA, ONT, PE, SE | +| shigeifinder_docker_reads | String | ShigEiFinder docker image used using read files as inputs | PE, SE | +| shigeifinder_H_antigen | String | H-antigen gene identified by ShigEiFinder | FASTA, ONT, PE, SE | +| shigeifinder_H_antigen_reads | String | H-antigen gene identified by ShigEiFinder using read files as inputs | PE, SE | +| shigeifinder_ipaH_presence_absence | String | Presence (+) or absence (-) of ipaH identified by ShigEiFinder | FASTA, ONT, PE, SE | +| shigeifinder_ipaH_presence_absence_reads | String | Presence (+) or absence (-) of ipaH identified by ShigEiFinder using read files as inputs | PE, SE | +| shigeifinder_notes | String | Any notes output from ShigEiFinder | FASTA, ONT, PE, SE | +| shigeifinder_notes_reads | String | Any notes output from ShigEiFinder using read files as inputs | PE, SE | +| shigeifinder_num_virulence_plasmid_genes | String | Number of virulence plasmid genes identified by ShigEiFinder | FASTA, ONT, PE, SE | +| shigeifinder_num_virulence_plasmid_genes_reads | String | Number of virulence plasmid genes identified by ShigEiFinder using read files as inputs | PE, SE | +| shigeifinder_O_antigen | String | O-antigen gene identified by ShigEiFinder | FASTA, ONT, PE, SE | +| shigeifinder_O_antigen_reads | String | O-antigen gene identified by 
ShigEiFinder using read files as inputs | PE, SE | +| shigeifinder_report | File | TSV report from ShigEiFinder (see https://github.com/LanLab/ShigEiFinder#shigeifinder) | FASTA, ONT, PE, SE | +| shigeifinder_report_reads | File | TSV report from ShigEiFinder (see https://github.com/LanLab/ShigEiFinder#shigeifinder) using read files as inputs | PE, SE | +| shigeifinder_serotype | String | Serotype predicted by ShigEiFinder | FASTA, ONT, PE, SE | +| shigeifinder_serotype_reads | String | Serotype predicted by ShigEiFinder using read files as inputs | PE, SE | +| shigeifinder_version | String | ShigEiFinder version used | FASTA, ONT, PE, SE | +| shigeifinder_version_reads | String | ShigEiFinder version used using read files as inputs | PE, SE | +| shovill_pe_version | String | Shovill version used | PE | +| shovill_se_version | String | Shovill version used | SE | +| sistr_allele_fasta | File | FASTA file of novel cgMLST alleles from SISTR | FASTA, ONT, PE, SE | +| sistr_allele_json | File | JSON file of cgMLST allele sequences and information (see https://github.com/phac-nml/sistr_cmd#cgmlst-allele-search-results) | FASTA, ONT, PE, SE | +| sistr_cgmlst | File | CSV file of the cgMLST allelic profile from SISTR (see https://github.com/phac-nml/sistr_cmd#cgmlst-allelic-profiles-output---cgmlst-profiles-cgmlst-profilescsv) | FASTA, ONT, PE, SE | +| sistr_predicted_serotype | String | Serotype predicted by SISTR | FASTA, ONT, PE, SE | +| sistr_results | File | TSV results file produced by SISTR (see https://github.com/phac-nml/sistr_cmd#primary-results-output--o-sistr-results) | FASTA, ONT, PE, SE | +| sistr_version | String | Version of SISTR used | FASTA, ONT, PE, SE | +| sonneityping_final_genotype | String | Final genotype call from Mykrobe, via sonneityper | ONT, PE, SE | +| sonneityping_final_report_tsv | File | Detailed TSV report from mykrobe, via sonneityper (see https://github.com/katholt/sonneityping#example-output) | ONT, PE, SE | +| 
sonneityping_genotype_confidence | String | Confidence in the final genotype call from sonneityper | ONT, PE, SE | +| sonneityping_genotype_name | String | Human readable alias for genotype, where available provided by sonneityper | ONT, PE, SE | +| sonneityping_mykrobe_docker | String | sonneityping docker image used | ONT, PE, SE | +| sonneityping_mykrobe_report_csv | File | CSV report from mykrobe via sonneityper (see https://github.com/Mykrobe-tools/mykrobe/wiki/AMR-prediction-output#csv-file) | ONT, PE, SE | +| sonneityping_mykrobe_report_json | File | JSON report from mykrobe via sonneityper (see https://github.com/Mykrobe-tools/mykrobe/wiki/AMR-prediction-output#json-file) | ONT, PE, SE | +| sonneityping_mykrobe_version | String | Version of sonneityping used | ONT, PE, SE | +| sonneityping_species | String | Species call from Mykrobe via sonneityping | ONT, PE, SE | +| spatyper_docker | String | spatyper docker image used | FASTA, ONT, PE, SE | +| spatyper_repeats | String | order of identified repeats | FASTA, ONT, PE, SE | +| spatyper_tsv | File | TSV report with spatyper results | FASTA, ONT, PE, SE | +| spatyper_type | String | spa type | FASTA, ONT, PE, SE | +| spatyper_version | String | spatyper version used | FASTA, ONT, PE, SE | +| srst2_vibrio_biotype | String | Biotype classification according to tcpA gene sequence (Classical or ElTor) | PE, SE | +| srst2_vibrio_ctxA | String | Presence or absence of the ctxA gene | PE, SE | +| srst2_vibrio_detailed_tsv | File | Detailed https://github.com/katholt/srst2 output file | PE, SE | +| srst2_vibrio_ompW | String | Presence or absence of the ompW gene | PE, SE | +| srst2_vibrio_serogroup | String | Serotype classification as O1 (wbeN gene), O139 (wbfR gene) or not detected. 
| PE, SE | +| srst2_vibrio_toxR | String | Presence or absence of the toxR gene | PE, SE | +| srst2_vibrio_version | String | The SRST2 version run | PE, SE | +| staphopiasccmec_docker | String | staphopia-sccmec docker image used | FASTA, ONT, PE, SE | +| staphopiasccmec_hamming_distance_tsv | File | staphopia-sccmec Hamming distance file | FASTA, ONT, PE, SE | +| staphopiasccmec_results_tsv | File | staphopia-sccmec presence and absence TSV file | FASTA, ONT, PE, SE | +| staphopiasccmec_types_and_mecA_presence | String | sccmec types and mecA presence | FASTA, ONT, PE, SE | +| staphopiasccmec_version | String | staphopia-sccmec version | FASTA, ONT, PE, SE | +| taxon_table_status | String | Status of the taxon table upload | FASTA, ONT, PE, SE | +| tbp_parser_average_genome_depth | Float | Optional output. Average genome depth across the reference genome | ONT, PE, SE | +| tbp_parser_coverage_report | File | Optional output. TSV file with breadth of coverage of each gene associated with antimicrobial resistance in mycobacterium tuberculosis. | ONT, PE, SE | +| tbp_parser_docker | String | Optional output. The docker image for tbp-parser | ONT, PE | +| tbp_parser_genome_percent_coverage | Float | Optional output. The percent of the genome covered at a depth greater than the specified minimum (default 10) | ONT, PE, SE | +| tbp_parser_laboratorian_report_csv | File | Optional output. Human-readable laboratorian report file containing the list of mutations found to be conferring resistance, both by WHO classification and expert rule implementation. 
The file contains the following columns: sample_id, tbprofiler_gene_name, tbprofiler_variant_locus_tag, tbprofiler_variant_substitution_type, tbprofiler_variant_substitution_nt, tbprofiler_variant_substitution_aa, confidence according to WHO, antimicrobial, depth, frequency, read_support, rationale ( WHO or expert rule), and warning if the coverage is below specified minimum (default 10) | ONT, PE, SE | +| tbp_parser_lims_report_csv | File | Optional output. LIMS digestable CSV report containing information on resistance for a set of antimicrobials ( No resistance to X detected, The detected genetic determinant(s) have uncertain significance, resistance to X cannot be ruled out and Genetic determinant(s) associated with resistance to X detected). For each antimicrobial, the mutations found are reported in the mutation_nucleotide; (mutation_protein) format, otherwise No mutations detected is reported. | ONT, PE, SE | +| tbp_parser_looker_report_csv | File | Optional output. Looker digestible CSV report containing information on resistance for a set of antimicrobials (R for resistant, S for susceptible) | ONT, PE, SE | +| tbp_parser_version | String | Optional output. 
The version of tbp-parser | ONT, PE | +| tbprofiler_dr_type | String | Drug resistance type predicted by TB-Profiler (sensitive, Pre-MDR, MDR, Pre-XDR, XDR) | ONT, PE, SE | +| tbprofiler_main_lineage | String | Lineage(s) predicted by TBProfiler | ONT, PE, SE | +| tbprofiler_median_coverage | Int | The median coverage of the H37Rv TB reference genome | ONT, PE | +| tbprofiler_output_bai | File | Index BAM file generated by mapping sequencing reads to reference genome by TBProfiler | ONT, PE, SE | +| tbprofiler_output_bam | File | BAM alignment file produced by TBProfiler | ONT, PE, SE | +| tbprofiler_output_file | File | CSV report from TBProfiler | ONT, PE, SE | +| tbprofiler_output_vcf | File | VCF file output from TBProfiler; the concatenation of all of the different VCF files produced during TBProfiler analysis | ONT, PE, SE | +| tbprofiler_pct_reads_mapped | Float | The percentage of reads mapped to the H37Rv TB reference genome | ONT, PE | +| tbprofiler_resistance_genes | String | List of resistance mutations detected by TBProfiler | ONT, PE, SE | +| tbprofiler_sub_lineage | String | Sub-lineage(s) predicted by TBProfiler | ONT, PE, SE | +| tbprofiler_version | String | Version of TBProfiler used | ONT, PE, SE | +| theiaprok_fasta_analysis_date | String | Date of TheiaProk FASTA workflow execution | FASTA | +| theiaprok_fasta_version | String | Version of TheiaProk FASTA workflow execution | FASTA | +| theiaprok_illumina_pe_analysis_date | String | Date of TheiaProk PE workflow execution | PE | +| theiaprok_illumina_pe_version | String | Version of TheiaProk PE workflow execution | PE | +| theiaprok_illumina_se_analysis_date | String | Date of TheiaProk SE workflow execution | SE | +| theiaprok_illumina_se_version | String | Version of TheiaProk SE workflow execution | SE | +| theiaprok_ont_analysis_date | String | Date of TheiaProk ONT workflow execution | ONT | +| theiaprok_ont_version | String | Version of TheiaProk ONT workflow execution | ONT | +| 
tiptoft_plasmid_replicon_fastq | File | File produced by tiptoft that contains reads containing plasmid rep/inc genes | ONT | +| tiptoft_plasmid_replicon_genes | String | Rep/inc genes found in sample | ONT | +| tiptoft_version | String | Version of tiptoft used for analysis | ONT | +| trimmomatic_docker | String | Docker image used for trimmomatic | PE, SE | +| trimmomatic_version | String | Version of trimmomatic used | PE, SE | +| ts_mlst_allelic_profile | String | Profile of MLST loci and allele numbers predicted by MLST | FASTA, ONT, PE, SE | +| ts_mlst_docker | String | Docker image used for MLST | FASTA, ONT, PE, SE | +| ts_mlst_novel_alleles | File | FASTA file containing nucleotide sequence of any alleles that are not in the MLST database used by TheiaProk | FASTA, ONT, PE, SE | +| ts_mlst_predicted_st | String | ST predicted by MLST | FASTA, ONT, PE, SE | +| ts_mlst_pubmlst_scheme | String | PubMLST scheme used by MLST | FASTA, ONT, PE, SE | +| ts_mlst_results | File | TSV report with detailed MLST profile, including https://github.com/tseemann/mlst#missing-data | FASTA, ONT, PE, SE | +| ts_mlst_version | String | Version of Torsten Seemann’s MLST tool used | FASTA, ONT, PE, SE | +| virulencefinder_docker | String | VirulenceFinder docker image used | FASTA, ONT, PE, SE | +| virulencefinder_hits | String | Virulence genes detected by VirulenceFinder | FASTA, ONT, PE, SE | +| virulencefinder_report_tsv | File | Output TSV file created by VirulenceFinder | FASTA, ONT, PE, SE | \ No newline at end of file diff --git a/docs/workflows/genomic_characterization/vadr_update.md b/docs/workflows/genomic_characterization/vadr_update.md new file mode 100644 index 000000000..4743222fb --- /dev/null +++ b/docs/workflows/genomic_characterization/vadr_update.md @@ -0,0 +1,56 @@ +# VADR_Update + +## Quick Facts + + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Genomic 
Characterization](../../workflows_overview/workflows_type.md/#genomic-characterization) | [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v1.2.1 | Yes | Sample-level | + +## Vadr_Update_PHB + +The VADR_Update workflow updates prior VADR assessments for each sample in line with the assessment criteria in an alternative docker image. This may be useful when samples have previously been subject to VADR alerts as updates to VADR assessment criteria may mean that the sample no longer raises concern about quality. The latest docker image SARS-CoV-2 for VADR can be found [here](https://www.notion.so/Docker-Image-and-Reference-Materials-for-SARS-CoV-2-Genomic-Characterization-98328c61f5cb4f77975f512b55d09108?pvs=21). + +Various models are available for many organisms. The following table provides an overview of the recommended container to be used and what options should be passed on to VADR. + +| **Organism** | **docker** | **vadr_opts** | max_length | +| --- | --- | --- | --- | +| sars-cov-2 | "us-docker.pkg.dev/general-theiagen/staphb/vadr:1.6.3" | "--noseqnamemax --glsearch -s -r --nomisc --mkey sarscov2 --lowsim5seq 6 --lowsim3seq 6 --alt_fail lowscore,insertnn,deletinn --out_allfasta" | 30000 | +| MPXV | "us-docker.pkg.dev/general-theiagen/staphb/vadr:1.6.3" | "--glsearch -s -r --nomisc --mkey mpxv --r_lowsimok --r_lowsimxd 100 --r_lowsimxl 2000 --alt_pass discontn,dupregin --out_allfasta --minimap2 --s_overhang 150" | 210000 | +| WNV | "us-docker.pkg.dev/general-theiagen/staphb/vadr:1.6.3" | "--mkey flavi --mdir /opt/vadr/vadr-models-flavi/ --nomisc --noprotid --out_allfasta" | 11000 | +| flu | "us-docker.pkg.dev/general-theiagen/staphb/vadr:1.6.3" | "--atgonly --xnocomp --nomisc --alt_fail extrant5,extrant3 --mkey flu" | 13500 | +| rsv_a | "us-docker.pkg.dev/general-theiagen/staphb/vadr:1.6.3" | "-r --mkey rsv --xnocomp" | 15500 | +| rsv_b | "us-docker.pkg.dev/general-theiagen/staphb/vadr:1.6.3" | "-r --mkey rsv --xnocomp" | 15500 | +| HAV | 
"us-docker.pkg.dev/general-theiagen/staphb/vadr:1.6.3-hav" | "-r -xnocomp -mkey hav.vadr" | 10500 | + +### Inputs + +Please note the default values are for SARS-CoV-2. + +This workflow runs on the sample level. + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| vadr_update | **assembly_length_unambiguous** | Int | Number of unambiguous basecalls within the consensus assembly | | Required | +| vadr_update | **docker** | String | The Docker container to use for the task | | Required | +| vadr_update | **genome_fasta** | File | Consensus genome assembly | | Required | +| vadr | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| vadr | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| vadr | **max_length** | Int | Maximum length for the fasta-trim-terminal-ambigs.pl VADR script | 30000 | Optional | +| vadr | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| vadr | **min_length** | Int | Minimum length subsequence to possibly replace Ns for the fasta-trim-terminal-ambigs.pl VADR script | 50 | Optional | +| vadr | **skip_length** | Int | Minimum assembly length (unambiguous) to run vadr | 10000 | Optional | +| vadr | **vadr_opts** | String | Options for the v-annotate.pl VADR script | ''--glsearch -s -r --nomisc --mkey sarscov2 --alt_fail lowscore,fstukcnf,insertnn,deletinn --mdir /opt/vadr/vadr-models/'' | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| vadr_alerts_list | File | File containing all of the fatal alerts as determined 
by VADR | +| vadr_docker | String | Docker image used to run VADR | +| vadr_fastas_zip_archive | File | Archive file (in zip format) of all VADR outputs | +| vadr_num_alerts | String | Number of fatal alerts as determined by VADR | +| vadr_update_analysis_date | String | Date of analysis | +| vadr_update_version | String | Version of the Public Health Bioinformatics (PHB) repository used | diff --git a/docs/workflows/phylogenetic_construction/augur.md b/docs/workflows/phylogenetic_construction/augur.md new file mode 100644 index 000000000..7023d6f1f --- /dev/null +++ b/docs/workflows/phylogenetic_construction/augur.md @@ -0,0 +1,305 @@ +# Augur + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.1.0 | Yes | Sample-level, Set-level | + +## Augur Workflows + +Genomic Epidemiology is an important approach in the effort to understand and mitigate against disease transmission. An often-critical step in viral genomic epidemiology is the generation of phylogenetic trees to explore the genetic relationship between viruses on a local, regional, national or global scale. The Augur workflows, currently only targeted for viral pathogens, facilitate this process by generating files for the visualization of phylogenetic trees with accompanying metadata. + +Two workflows are offered: **Augur_Prep_PHB** and **Augur_PHB**. These must be run sequentially, respectively, to first prepare each individual sample for running Augur, and secondly to run Augur itself on the set of samples, generating the phylogenetic tree files with accompanying metadata. 
The outputs from these workflows can be visualized in [Auspice](https://docs.nextstrain.org/projects/auspice/en/latest/) and [UShER](https://github.com/yatisht/usher). + +!!! dna "**Helpful resources for epidemiological interpretation**" + + - [introduction to Nextstrain](https://www.cdc.gov/amd/training/covid-toolkit/module3-1.html) (which includes Auspice) + - guide to Nextstrain [interactive trees](https://www.cdc.gov/amd/training/covid-toolkit/module3-4.html) + - an [introduction to UShER](https://www.cdc.gov/amd/training/covid-toolkit/module3-3.html) + - a video about [how to read trees](https://www.cdc.gov/amd/training/covid-toolkit/module1-3.html) if this is new to you + - documentation on [how to identify SARS-CoV-2 recombinants](https://github.com/pha4ge/pipeline-resources/blob/main/docs/sc2-recombinants.md) + +### Augur_Prep_PHB + +The Augur_Prep_PHB workflow was written to prepare individual sample assemblies and their metadata for running the Augur_PHB analysis. + +#### Augur_Prep Inputs + +The Augur_Prep_PHB workflow takes assembly FASTA files and associated metadata formatted in a data table. FASTA files may be generated with one of the TheiaCoV Characterization workflows and should adhere to quality control guidelines, (e.g. [QC guidelines produced by PHA4GE](https://github.com/pha4ge/pipeline-resources/blob/main/docs/qc-solutions.md)). The metadata can be uploaded to Terra as TSV file, formatted as in [this example](https://docs.google.com/spreadsheets/d/1PF1u3R-ZGm53UiVsTlIcpg9Qk2dUJgtx/edit#gid=253517867). + +This workflow runs on the sample level. 
+ +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| augur_prep | **assembly** | File | Assembly/consensus file (single FASTA file per sample) | | Required | +| augur_prep | **collection_date** | String | Collection date of the sample | | Optional | +| augur_prep | **continent** | String | Continent where sample was collected | | Optional | +| augur_prep | **country** | String | Country where sample was collected | | Optional | +| augur_prep | **county** | String | County (or smaller locality) where sample was collected | | Optional | +| augur_prep | **nextclade_clade** | String | The Nextclade clade of the sample | | Optional | +| augur_prep | **pango_lineage** | String | The Pangolin lineage of the sample | | Optional | +| augur_prep | **state** | String | State (or province) where sample was collected | | Optional | +| prep_augur_metadata | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| prep_augur_metadata | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 10 | Optional | +| prep_augur_metadata | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/utility:1.1 | Optional | +| prep_augur_metadata | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 3 | Optional | +| prep_augur_metadata | **organism** | String | The organism to be analyzed in Augur; options: "sars-cov-2", "flu", "MPXV", "rsv-a", "rsv-b" | sars-cov-2 | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +#### Augur_Prep Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| augur_metadata | File | TSV 
file of the metadata provided as input to the workflow in the proper format for Augur analysis | +| augur_prep_phb_analysis_date | String | Date of analysis | +| augur_prep_phb_version | String | Version of the Public Health Bioinformatics (PHB) repository used | + +### Augur_PHB + +!!! info "Helpful Hint" + You may have to generate phylogenies multiple times, running the Augur_PHB workflow, assessing results, and amending inputs to generate a final tree with sufficient diversity and high-quality data of interest. + +The Augur_PHB workflow takes a **set** of assembly/consensus files (FASTA format) and sample metadata files (TSV format) that have been reformatted using Augur_Prep_PHB and runs Augur to generate the phylogenetic tree files with accompanying metadata. Additionally, the workflow infers pairwise SNP distances. + +#### Augur Inputs + +The Augur_PHB workflow takes in a ***set*** of SARS-CoV-2 (or any other viral pathogen) FASTA and metadata files. If running the workflow via Terra, individual samples will need to be added to a set before running the workflow. Input FASTAs should meet QA metrics. Sets of FASTAs with highly discordant quality metrics may result in the inaccurate inference of genetic relatedness. There **must** be some sequence diversity among the set of input assemblies. If insufficient diversity is present, it may be necessary to add a more divergent sequence to the set. + +!!! dna "Optional Inputs" + There are **many** optional user inputs. For SARS-CoV-2, Flu, rsv-a, rsv-b, and mpxv, default values that mimic the NextStrain builds have been preselected. To use these defaults, you must write either `"sars-cov-2"`,`"flu"`, `"rsv-a"`, `"rsv-b"`, or `"mpxv"` for the `organism` variable. + + For Flu - it is **required** to set `flu_segment` to either `"HA"` or `"NA"` & `flu_subtype` to either `"H1N1"` or `"H3N2"` or `"Victoria"` or `"Yamagata"` depending on your set of samples. + +???+ toggle "A Note on Optional Inputs" + ??? 
toggle "Default values for SARS-CoV-2" + - min_num_unambig = 27000 + - clades_tsv = [defaults/clades.tsv](https://github.com/nextstrain/ncov/tree/23d1243127e8838a61b7e5c1a72bc419bf8c5a0d/defaults/clades.tsv) + - lat_longs_tsv = [defaults/lat_longs.tsv](https://github.com/nextstrain/ncov/blob/23d1243127e8838a61b7e5c1a72bc419bf8c5a0d/defaults/lat_longs.tsv) + - reference_fasta = [defaults/reference_seq.fasta](https://github.com/nextstrain/ncov/blob/23d1243127e8838a61b7e5c1a72bc419bf8c5a0d/defaults/reference_seq.fasta) + - reference_genbank = [defaults/reference_seq.gb](https://github.com/nextstrain/ncov/blob/23d1243127e8838a61b7e5c1a72bc419bf8c5a0d/defaults/reference_seq.gb) + - auspice_config = [defaults/auspice_config.json](https://github.com/nextstrain/ncov/blob/23d1243127e8838a61b7e5c1a72bc419bf8c5a0d/defaults/auspice_config.json) + - min_date = 2020.0 + - pivot_interval = 1 + - pivot_interval_units = "weeks" + - narrow_bandwidth = 0.05 + - proportion_wide = 0.0 + + ??? toggle "Default values for Flu" + - lat_longs_tsv = `"gs://theiagen-public-files-rp/terra/flu-references/lat_longs.tsv"` + - min_num_unambig = 900 + - min_date = 2020.0 + - pivot_interval = 1 + - narrow_bandwidth = 0.1666667 + - proportion_wide = 0.0 + ??? toggle "H1N1" + - auspice_config = `"gs://theiagen-public-files-rp/terra/flu-references/auspice_config_h1n1pdm.json"` + - HA + - reference_fasta = `"gs://theiagen-public-files-rp/terra/flu-references/reference_h1n1pdm_ha.gb"` + - clades_tsv = `"gs://theiagen-public-files-rp/terra/flu-references/clades_h1n1pdm_ha.tsv"` + - NA + - reference_fasta = `"gs://theiagen-public-files-rp/terra/flu-references/reference_h1n1pdm_na.gb"` + ??? 
toggle "H3N2" + - auspice_config = `"gs://theiagen-public-files-rp/terra/flu-references/auspice_config_h3n2.json"` + - HA + - reference_fasta = `"gs://theiagen-public-files-rp/terra/flu-references/reference_h3n2_ha.gb"` + - clades_tsv = `"gs://theiagen-public-files-rp/terra/flu-references/clades_h3n2_ha.tsv"` + - NA + - reference_fasta = `"gs://theiagen-public-files-rp/terra/flu-references/reference_h3n2_na.gb"` + ??? toggle "Victoria" + - auspice_config = `"gs://theiagen-public-files-rp/terra/flu-references/auspice_config_vic.json"` + - HA + - reference_fasta = `"gs://theiagen-public-files-rp/terra/flu-references/reference_vic_ha.gb"` + - clades_tsv = `"gs://theiagen-public-files-rp/terra/flu-references/clades_vic_ha.tsv"` + - NA + - reference_fasta = `"gs://theiagen-public-files-rp/terra/flu-references/reference_vic_na.gb"` + ??? toggle "Yamagata" + - auspice_config = `"gs://theiagen-public-files-rp/terra/flu-references/auspice_config_yam.json"` + - HA + - reference_fasta = `"gs://theiagen-public-files-rp/terra/flu-references/reference_yam_ha.gb"` + - clades_tsv = `"gs://theiagen-public-files-rp/terra/flu-references/clades_yam_ha.tsv"` + - NA + - reference_fasta = `"gs://theiagen-public-files-rp/terra/flu-references/reference_yam_na.gb"` + + ??? toggle "Default values for MPXV" + - min_num_unambig = 150000 + - clades_tsv = `"gs://theiagen-public-files-rp/terra/augur-mpox-references/mpox_clades.tsv"` + - lat_longs_tsv = `"gs://theiagen-public-files-rp/terra/flu-references/lat_longs.tsv"` + - reference_fasta = `"gs://theiagen-public-files-rp/terra/augur-mpox-references/NC_063383.1.reference.fasta"` + - reference_genbank = `"gs://theiagen-public-files-rp/terra/augur-mpox-references/NC_063383.1_reference.gb"` + - auspice_config = `"gs://theiagen-public-files-rp/terra/augur-mpox-references/mpox_auspice_config_mpxv.json"` + - min_date = 2020.0 + - pivot_interval = 1 + - narrow_bandwidth = 0.1666667 + - proportion_wide = 0.0 + + ??? 
toggle "Default values for RSV-A" + - min_num_unambig = 10850 + - clades_tsv = `"gs://theiagen-public-files-rp/terra/rsv_references/rsv_a_clades.tsv"` + - lat_longs_tsv = `"gs://theiagen-public-files-rp/terra/flu-references/lat_longs.tsv"` + - reference_fasta = `"gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_a.fasta"` + - reference_genbank = `"gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_a.gb"` + - auspice_config = `"gs://theiagen-public-files-rp/terra/rsv_references/rsv_auspice_config.json"` + - min_date = 2020.0 + - pivot_interval = 1 + - narrow_bandwidth = 0.1666667 + - proportion_wide = 0.0 + + ??? toggle "Default values for RSV-B" + - min_num_unambig = 10850 + - clades_tsv = `"gs://theiagen-public-files-rp/terra/rsv_references/rsv_b_clades.tsv"` + - lat_longs_tsv = `"gs://theiagen-public-files-rp/terra/flu-references/lat_longs.tsv"` + - reference_fasta = `"gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_b.fasta"` + - reference_genbank = `"gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_b.gb"` + - auspice_config = `"gs://theiagen-public-files-rp/terra/rsv_references/rsv_auspice_config.json"` + - min_date = 2020.0 + - pivot_interval = 1 + - narrow_bandwidth = 0.1666667 + - proportion_wide = 0.0 + + For more information regarding these optional inputs, please view [Nextstrain's detailed documentation on Augur](https://docs.nextstrain.org/projects/augur/en/stable/usage/usage.html) + + !!! info "What's required or not?" + For organisms _other_ than SARS-CoV-2 or Flu, the required variables have both the "required" and "optional" tags. + +This workflow runs on the set level. Please note that for every task, runtime parameters are modifiable (cpu, disk_size, docker, and memory); most of these values have been excluded from the table below for convenience. 
+ +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| augur | **assembly_fastas** | Array[File] | An array of the assembly files to use; use either the HA or NA segment for flu samples | | Required | +| augur | **build_name** | String | Name to give to the Augur build | | Required | +| augur | **auspice_config** | File | Auspice config file for customizing visualizations; takes priority over the other customization values available for augur_export | Defaults are organism-specific. Please find default values for all organisms (and for Flu - their respective genome segments and subtypes) here: . For an organism without set defaults, a minimal auspice config file is provided to prevent workflow failure, "gs://theiagen-public-files-rp/terra/augur-defaults/minimal-auspice-config.json", but will not be as useful as an organism specific config file. | Optional | +| augur | **clades_tsv** | File | TSV file containing clade mutation positions in four columns | Defaults are organism-specific. Please find default values for all organisms (and for Flu - their respective genome segments and subtypes) here: . For an organism without set defaults, an empty clades file is provided to prevent workflow failure, "gs://theiagen-public-files-rp/terra/augur-defaults/minimal-clades.tsv", but will not be as useful as an organism specific clades file. | Optional, Required | +| augur | **distance_tree_only** | Boolean | Create only a distance tree (skips all Augur steps after augur_tree) | TRUE | Optional | +| augur | **flu_segment** | String | Required if organism = "flu". The name of the segment to be analyzed; options: "HA" or "NA" | "HA" (only used if organism = "flu") | Optional, Required | +| augur | **flu_subtype** | String | Required if organism = "flu". 
The subtype of the flu samples being analyzed; options: "H1N1", "H3N2", "Victoria", "Yamagata" | | Optional, Required | +| augur | **lat_longs_tsv** | File | Tab-delimited file of geographic location names with corresponding latitude and longitude values | Defaults are organism-specific. Please find default values for all organisms (and for Flu - their respective genome segments and subtypes) here: . For an organism without set defaults, a minimal lat-long file is provided to prevent workflow failure, "gs://theiagen-public-files-rp/terra/augur-defaults/minimal-lat-longs.tsv", but will not be as useful as a detailed lat-longs file covering all the locations for the samples to be visualized. | Optional | +| augur | **min_date** | Float | Minimum date to begin filtering or frequencies calculations | Defaults are organism-specific. Please find default values for all organisms (and for Flu - their respective genome segments and subtypes) here: . For an organism without set defaults, the default value is 0.0 | Optional | +| augur | **min_num_unambig** | Int | Minimum number of called bases in genome to pass prefilter | Defaults are organism-specific. Please find default values for all organisms (and for Flu - their respective genome segments and subtypes) here: . For an organism without set defaults, the default value is 0 | Optional | +| augur | **organism** | String | Organism used to preselect default values; options: "sars-cov-2", "flu", "mpxv", "rsv-a", "rsv-b" | sars-cov-2 | Optional | +| augur | **reference_fasta** | File | The reference FASTA file used to align the genomes and build the trees | Defaults are organism-specific. Please find default values for all organisms (and for Flu - their respective genome segments and subtypes) here: . For an organism without set defaults, a reference fasta file must be provided otherwise the workflow fails. 
| Optional, Required | +| augur | **reference_genbank** | File | The GenBank .gb file for the same reference genome used for the reference_fasta | Defaults are organism-specific. Please find default values for all organisms (and for Flu - their respective genome segments and subtypes) here: l. For an organism without set defaults, a reference genbank file must be provided otherwise the workflow fails. | Optional, Required | +| augur | **sample_metadata_tsvs** | Array[File] | An array of the metadata files produced in Augur_Prep_PHB | | Optional | +| augur | **build_name_updated** | String | Internal component, do not modify. Used for replacing spaces with underscores _ | | Do Not Modify | +| augur_align | **fill_gaps** | Boolean | If true, gaps represent missing data rather than true indels and so are replaced by N after aligning. | FALSE | Optional | +| augur_ancestral | **infer_ambiguous** | Boolean | If true, infer nucleotides and ambiguous sites and replace with most likely | FALSE | Optional | +| augur_ancestral | **inference** | String | Calculate joint or marginal maximum likelihood ancestral sequence states; options: "joint", "marginal" | joint | Optional | +| augur_ancestral | **keep_ambiguous** | Boolean | If true, do not infer nucleotides at ambiguous (N) sides | FALSE | Optional | +| augur_ancestral | **keep_overhangs** | Boolean | If true, do not infer nucleotides for gaps on either side of the alignment | FALSE | Optional | +| augur_export | **colors_tsv** | File | Custom color definitions, one per line in the format TRAIT_TYPE \| TRAIT_VALUE\tHEX_CODE | | Optional | +| augur_export | **description_md** | File | Markdown file with description of build and/or acknowledgements | | Optional | +| augur_export | **include_root_sequence** | Boolean | Export an additional JSON containing the root sequence used to identify mutations | FALSE | Optional | +| augur_export | **title** | String | Title to be displayed by Auspice | | Optional | +| augur_refine | 
**branch_length_inference** | String | Branch length mode of timetree to use; options: "auto", "joint", "marginal", "input" | auto | Optional | +| augur_refine | **clock_filter_iqd** | Int | Remove tips that deviate more than n_iqd interquartile ranges from the root-to-tip vs time regression | 4 | Optional | +| augur_refine | **clock_rate** | Float | Fixed clock rate to use for time tree calculations | | Optional | +| augur_refine | **clock_std_dev** | Float | Standard deviation of the fixed clock_rate estimate | | Optional | +| augur_refine | **coalescent** | String | Coalescent time scale in units of inverse clock rate (float), optimize as scalar ("opt") or skyline ("skyline") | | Optional | +| augur_refine | **covariance** | Boolean | If true, account for covariation when estimating rates and/or rerooting | TRUE | Optional | +| augur_refine | **date_confidence** | Boolean | If true, calculate confidence intervals for node dates | TRUE | Optional | +| augur_refine | **date_inference** | String | Assign internal nodes to their marginally most likely dates; options: "joint", "marginal" | marginal | Optional | +| augur_refine | **divergence_units** | String | Units in which sequence divergences is exported; options: "mutations" or "mutations-per-site" | mutations | Optional | +| augur_refine | **gen_per_year** | Int | Number of generations per year | 50 | Optional | +| augur_refine | **keep_polytomies** | Boolean | If true, don't attempt to resolve polytomies | FALSE | Optional | +| augur_refine | **keep_root** | Boolean | If true, do not reroot the tree; use it as-is (overrides anything specified by root) | TRUE | Optional | +| augur_refine | **precision** | String | Precision used to determine the number of grid points; options: 0 (rough) to 3 (ultra fine) | auto | Optional | +| augur_refine | **root** | String | Rooting mechanism; options: "best", "least-squares", "min_dev", "oldest", etc. 
| | Optional | +| augur_translate | **genes** | File | A file containing a list of genes to translate (from nucleotides to amino acids) | | Optional | +| augur_tree | **exclude_sites** | File | File of one-based sites to exclude for raw tree building (BED format in .bed files, DRM format in tab-delimited files, or one position per line) | | Optional | +| augur_tree | **method** | String | Which method to use to build the tree; options: "fasttree", "raxml", "iqtree" | iqtree | Optional | +| augur_tree | **override_default_args** | Boolean | If true, override default tree builder arguments instead of augmenting them | FALSE | Optional | +| augur_tree | **substitution_model** | String | The substitution model to use; only available for iqtree. Specify "auto" to run ModelTest; options: "GTR" | GTR | Optional | +| augur_tree | **tree_builder_args** | String | Additional tree builder arguments either augmenting or overriding the default arguments. FastTree defaults: "-nt -nosupport". RAxML defaults: "-f d -m GTRCAT -c 25 -p 235813". IQ-TREE defaults: "-ninit 2 -n 2 -me 0.05 -nt AUTO -redo" | | Optional | +| sc2_defaults | **nextstrain_ncov_repo_commit** | String | The version of the nextstrain/ncov repository from which to draw default values for SARS-CoV-2. | `23d1243127e8838a61b7e5c1a72bc419bf8c5a0d` | Optional | +| organism_parameters | **gene_locations_bed_file** | File | Use to provide locations of interest where average coverage will be calculated | Defaults are organism-specific. Please find default values for some organisms here: . For an organism without set defaults, an empty file is provided, "gs://theiagen-public-files/terra/theiacov-files/empty.bed", but will not be as useful as an organism specific gene locations bed file. | Optional | +| organism_parameters | **genome_length_input** | Int | Use to specify the expected genome length; provided by default for all supported organisms | Defaults are organism-specific. 
Please find default values for all organisms (and for Flu - their respective genome segments and subtypes) here: . For an organism without set defaults, the genome length input must be provided otherwise the workflow fails. | Optional, Required | +| organism_parameters | **hiv_primer_version** | String | The version of HIV primers used. Options are and . This input is ignored if provided for TheiaCoV_Illumina_SE and TheiaCoV_ClearLabs | v1 | Optional | +| organism_parameters | **kraken_target_organism_input** | String | The organism whose abundance the user wants to check in their reads. This should be a proper taxonomic name recognized by the Kraken database. | Defaults are organism-specific. Please find default values for all organisms here: . For an organism without set defaults, the default is "". | Optional | +| organism_parameters | **nextclade_dataset_name_input** | String | NextClade organism dataset name | Defaults are organism-specific. Please find default values for all organisms (and for Flu - their respective genome segments and subtypes) here: . For an organism without set defaults, the default is "NA". | Optional | +| organism_parameters | **nextclade_dataset_tag_input** | String | NextClade organism dataset tag | Defaults are organism-specific. Please find default values for all organisms (and for Flu - their respective genome segments and subtypes) here: . For an organism without set defaults, the default is "NA". | Optional | +| organism_parameters | **pangolin_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/pangolin:4.3.1-pdata-1.26 | Optional | +| organism_parameters | **primer_bed_file** | File | The bed file containing the primers used when sequencing was performed | Defaults are organism-specific. Please find default values for all organisms here: . 
For an organism without set defaults, an empty primer bed file is provided, "gs://theiagen-public-files/terra/theiacov-files/empty.bed", but will not be as useful as an organism specific primer bed file. | Optional | +| organism_parameters | **reference_gff_file** | File | Reference GFF file for the organism being analyzed | Defaults are organism-specific. Please find default values for all organisms here: . For an organism without set defaults, an empty gff file is provided, "gs://theiagen-public-files/terra/theiacov-files/empty.gff3", but will not be as useful as an organism specific gff file. | Optional | +| organism_parameters | **vadr_max_length** | Int | Maximum length for the `fasta-trim-terminal-ambigs.pl` VADR script | Defaults are organism-specific. Please find default values for all organisms here: . For an organism without set defaults, the default is 0. | Optional | +| organism_parameters | **vadr_mem** | Int | Memory, in GB, allocated to this task | 32 (RSV-A and RSV-B) and 8 (all other TheiaCoV organisms) | | +| organism_parameters | **vadr_options** | String | Options for the `v-annotate.pl` VADR script | Defaults are organism-specific. Please find default values for all organisms here: . For an organism without set defaults, the default is "NA". | Optional | +| organism_parameters | **vadr_skip_length** | Int | Minimum assembly length (unambiguous) to run VADR | Defaults are organism-specific. Please find default values for all organisms here: . For an organism without set defaults, the default is 0. | Optional | +| mutation_context | **cpu** | Int | CPUs requested for the mutation_context task that is specific to Mpox. | 1 | Optional | +| mutation_context | **disk_size** | Int | Disk size in GB requested for the mutation_context task that is specific to Mpox. | 50 | Optional | +| mutation_context | **docker** | String | Docker image used for the mutation_context task that is specific to Mpox. Do not modify. 
| us-docker.pkg.dev/general-theiagen/theiagen/nextstrain-mpox-mutation-context:2024-06-27 | Do Not Modify, Optional | +| mutation_context | **memory** | Int | Memory size in GB requested for the mutation_context task that is specific to Mpox. | 4 | Optional | + +??? task "Workflow Tasks" + ##### Augur Workflow Tasks {#augur-tasks} + + The Augur_PHB workflow uses the inputs to generate a phylogenetic tree in JSON format that is compatible with phylogenetic tree visualization software. + + In Augur_PHB, the tasks below are called. For the Augur subcommands, please view the [Nextstrain Augur documentation](https://docs.nextstrain.org/projects/augur/en/stable/usage/usage.html) for more details and explanations. + + 1. `cat_files` - concatenate all of the input fasta files together + 2. `sc2_defaults` - if organism is SARS-CoV-2, establish default parameters + 3. `flu_defaults` - if organism is Flu, establish default parameters + 4. `filter_sequences_by_length` - remove any sequences that do not meet the quality threshold set by `min_num_unambig` + 5. `tsv_join` - merge the metadata files + 6. `fasta_to_ids` - extract a list of remaining sequences so we know which ones were dropped + 7. `augur_align` - perform MAFFT alignment on the sequences + 8. `augur_tree` - create a distance tree + 9. `augur_refine` - create a timetree + 10. `augur_ancestral` - infer ancestral sequences + 11. `augur_translate` - translate gene regions from nucleotides to amino acids + 12. `mutation_context` - if organism is MPXV, calculates the mutation fraction of G->A or C->T changes + 13. `augur_clades` - if clade information is provided, assign clades to nodes based on amino-acid or nucleotide signatures + 14. `augur_export` - export all the results in a JSON file suitable for Auspice visualization + 15. `snp_dists` - create a SNP matrix from the alignment + 16. `reorder_matrix` - reorder the SNP matrix to match the distance tree + +#### Augur Outputs + +!!! 
dna "Diversity dependent" + Note that the node & branch coloring by clade or lineage assignment might be dependent on the diversity of your input dataset. This is because the clade assignment is done using the ancestrally reconstructed amino acid or nucleotide changes at the tree nodes rather than a direct sequence-to-reference mutation comparison. You may notice this happening when you get clade/lineage assignments from NextClade when running TheiaCoV workflows, but no clade/lineage assignment on the Augur Auspice tree. + + To get around this issue, you can upload the Augur output file `merged-metadata.tsv` to Auspice that includes the correct clade/lineage assignments to allow for coloring by Clade. + +!!! dna "Flu clade assignments" + Note that for flu, the clade assignment is usually mostly done for the more recent seasonal influenza viruses. Older strains may get an "unassigned" designation for clades. Therefore, it is important to counter check with the NextClade results from TheiaCoV if the lack of clade assignment is due to analyzing older sequences or sequence related. + +The `auspice_input_json` is intended to be uploaded to [Auspice](https://auspice.us/) to view the phylogenetic tree. This provides a visualization of the genetic relationships between your set of samples. The `metadata_merged` output can also be uploaded to add context to the phylogenetic visualization. The `combined_assemblies` output can be uploaded to [UShER](https://genome.ucsc.edu/cgi-bin/hgPhyloPlace) to view the samples on a global tree of representative sequences from the public repositories. + +The Nextstrain team hosts documentation surrounding the Augur workflow → Auspice visualization here, which details the various components of the Auspice interface: [How data is exported by Augur for visualisation in Auspice](https://docs.nextstrain.org/en/latest/learn/augur-to-auspice.html). 
+ +| **Variable** | **Type** | **Description** | +| --- | --- | --- | +| aligned_fastas | File | A FASTA file of the aligned genomes | +| augur_phb_analysis_date | String | The date the analysis was run | +| augur_phb_version | String | The version of the Public Health Bioinformatics (PHB) repository used | +| augur_version | String | Version of Augur used | +| auspice_input_json | File | JSON file used as input to Auspice | +| combined_assemblies | File | Concatenated FASTA file containing all samples | +| distance_tree | File | The distance tree created in Newick (.nwk) format | +| keep_list | File | A list of samples included in the phylogenetic tree | +| metadata_merged | File | Tab-delimited text file of the merged augur_metadata input files from all samples | +| snp_matrix | File | The SNP distance matrix for all samples used in the phylogenetic tree | +| time_tree | File | The time tree created in Newick (.nwk) format | +| traits_json | File | A JSON file containing sample traits | + +#### Mpox-specific Auspice Output JSON + +If you are building a tree for Mpox samples and set the optional input parameter `organism` to `"mpox"` , an additional step will be carried out in the Augur_PHB workflow. This additional step will calculate the mutation fraction of G→A or C→T changes. These mutations have been shown to be a characteristic of APOBEC3-type editing, which indicate adaptation of the virus to circulation among humans as was observed with the 2022 clade IIb outbreak, and more recently (2024) with the clade Ib outbreak in South Kivu, Democratic Republic of the Congo. + +When visualizing the output `auspice_input_json` file, there will be 2 new choices in the drop-down menu for "Color By": + +- G→A or C→T fraction +- NGA/TCN context of G→A or C→T mutations. 
+ +An example Mpox tree with these "Color By" options can be viewed here: + +### References + +When publishing work using the Augur_PHB workflow, please reference the following: + +> Nextstrain: Hadfield J, Megill C, Bell SM, Huddleston J, Potter B, Callender C, Sagulenko P, Bedford T, Neher RA. Nextstrain: real-time tracking of pathogen evolution. Bioinformatics. 2018 Dec 1;34(23):4121-3. + +When publishing work using inferences from UShER, please reference: + +> UShER: Turakhia Y, Thornlow B, Hinrichs AS, De Maio N, Gozashti L, Lanfear R, Haussler D, Corbett-Detig R. Ultrafast Sample placement on Existing tRees (UShER) enables real-time phylogenetics for the SARS-CoV-2 pandemic. Nature Genetics. 2021 Jun;53(6):809-16. diff --git a/docs/workflows/phylogenetic_construction/core_gene_snp.md b/docs/workflows/phylogenetic_construction/core_gene_snp.md new file mode 100644 index 000000000..197dcb58e --- /dev/null +++ b/docs/workflows/phylogenetic_construction/core_gene_snp.md @@ -0,0 +1,129 @@ +# Core_Gene_SNP + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria) | PHB v2.1.0 | Yes, some optional features incompatible | Set-level | + +## Core_Gene_SNP_PHB + +!!! caption "Core Gene SNP Workflow Diagram" + ![Core Gene SNP Workflow Diagram](../../assets/figures/Core_Gene_SNP.png){width:45%} + +The Core_Gene_SNP workflow is intended for pangenome analysis, core gene alignment, and phylogenetic analysis. The workflow takes in gene sequence data in GFF3 format from a set of samples. It first produces a pangenome summary using [`Pirate`](https://github.com/SionBayliss/PIRATE), which clusters genes within the sample set into orthologous gene families. 
By default, the workflow also instructs `Pirate` to produce both core gene and pangenome alignments. The workflow subsequently triggers the generation of a phylogenetic tree and SNP distance matrix from the core gene alignment using [`iqtree`](https://github.com/iqtree/iqtree2/tree/v1.6.7) and [`snp-dists`](https://github.com/tseemann/snp-dists), respectively. Optionally, the workflow will also run this analysis using the pangenome alignment. This workflow also features an optional module, `summarize_data`, that creates a presence/absence matrix for the analyzed samples from a list of indicated columns (such as AMR genes, etc.) that can be used in Phandango. + +!!! info "Default Parameters" +    Please note that while default parameters for pangenome construction and phylogenetic tree generation are provided, **these default parameters may not suit every dataset and have not been validated against known phylogenies**. Users should take care to select the parameters that are most appropriate for their dataset. Please reach out to [support@theiagen.com](mailto:support@theiagen.com) or one of the other resources listed at the bottom of this page if you would like assistance with this task. + +### Inputs + +### Optional User Inputs + +For further detail regarding Pirate options, please see [PIRATE's documentation](https://github.com/SionBayliss/PIRATE). For further detail regarding IQ-TREE options, please see [IQ-TREE's command reference](http://www.iqtree.org/doc/Command-Reference). + +This workflow runs on the set level. 
+ +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| core_gene_snp_workflow | **cluster_name** | String | Name of sample set | | Required | +| core_gene_snp_workflow | **gff3** | Array[File] | Array of gff3 files to include in analysis, output gff files from both prokka and bakta using TheiaProk workflows are compatible | | Required | +| core_gene_snp_workflow | **midpoint_root_tree** | Boolean | Boolean variable that will instruct the workflow to reroot the tree at the midpoint | FALSE | Optional | +| core_gene_snp_workflow | **phandango_coloring** | Boolean | Boolean variable that tells the data summary task and the reorder matrix task to include a suffix that enables consistent coloring on Phandango; by default, this suffix is not added. To add this suffix set this variable to true. | FALSE | Optional | +| core_gene_snp_workflow | **data_summary_terra_table** | String | The name of the Terra data table that you want data pulled from | | Optional | +| core_gene_snp_workflow | **data_summary_column_names** | String | A comma-delimited list of columns in the origin data table that contain the data that you would like a presence/absence .csv matrix generated for | | Optional | +| core_gene_snp_workflow | **core_tree** | Boolean | Boolean variable that instructs the workflow to create a phylogenetic tree and SNP distance matrix from the core gene alignment. Align must also be set to true. | TRUE | Optional | +| core_gene_snp_workflow | **pan_tree** | Boolean | Boolean variable that instructs the workflow to create a phylogenetic tree and SNP distance matrix from the pangenome alignment. Align must also be set to true. | FALSE | Optional | +| core_gene_snp_workflow | **data_summary_terra_workspace** | String | The name of the current Terra workspace you are in; this can be found at the top of the webpage, or in the URL after the billing project. 
| | Optional | +| core_gene_snp_workflow | **align** | Boolean | Boolean variable that instructs the workflow to generate core and pangenome alignments if "true". If "false", the workflow will produce only a pangenome summary. | TRUE | Optional | +| core_gene_snp_workflow | **data_summary_terra_project** | String | The billing project for the current workspace; can be found after the "#workspaces/" section in the workflow's URL | | Optional | +| core_gene_snp_workflow | **sample_names** | Array[String] | Array of sample_ids from the data table used | | Optional | +| core_iqtree | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | +| core_iqtree | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| core_iqtree | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| core_iqtree | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/iqtree:1.6.7 | Optional | +| core_iqtree | **iqtree_model** | String | Substitution model, frequency type (optional) and rate heterogeneity type (optional) used by IQ-TREE. This string follows the IQ-TREE "-m" option. For comparison to other tools use HKY for Bactopia, GTR+F+I for Grandeur, GTR+G4 for Nullarbor, GTR+G for Dryad | GTR+I+G | Optional | +| core_iqtree | **iqtree_opts** | String | Additional options for IQ-TREE, see | | Optional | +| core_iqtree | **iqtree_bootstraps** | String | Number of ultrafast bootstrap replicates. Follows IQ-TREE "-bb" option. | 1000 | Optional | +| core_iqtree | **alrt** | String | Number of replicates to perform SH-like approximate likelihood ratio test (SH-aLRT). 
Follows IQ-TREE "-alrt" option | 1000 | Optional | +| core_reorder_matrix | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| core_reorder_matrix | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| core_reorder_matrix | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/mykrobe:0.12.1 | Optional | +| core_reorder_matrix | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| core_snp_dists | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| core_snp_dists | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/snp-dists:0.8.2 | Optional | +| core_snp_dists | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | +| core_snp_dists | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| pan_iqtree | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | +| pan_iqtree | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| pan_iqtree | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| pan_iqtree | **alrt** | String | Number of replicates to perform SH-like approximate likelihood ratio test (SH-aLRT). Follows IQ-TREE "-alrt" option | 1000 | Optional | +| pan_iqtree | **iqtree_model** | String | Substitution model, frequency type (optional) and rate heterogeneity type (optional) used by IQ-TREE. This string follows the IQ-TREE "-m" option. For comparison to other tools use HKY for Bactopia, GTR+F+I for Grandeur, GTR+G4 for Nullarbor, GTR+G for Dryad | GTR+I+G | Optional | +| pan_iqtree | **iqtree_bootstraps** | String | Number of ultrafast bootstrap replicates. Follows IQ-TREE "-bb" option. 
| 1000 | Optional | +| pan_iqtree | **iqtree_opts** | String | Additional options for IQ-TREE, see | | Optional | +| pan_iqtree | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/iqtree:1.6.7 | Optional | +| pan_reorder_matrix | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| pan_reorder_matrix | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| pan_reorder_matrix | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/mykrobe:0.12.1 | Optional | +| pan_reorder_matrix | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| pan_snp_dists | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| pan_snp_dists | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | +| pan_snp_dists | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/snp-dists:0.8.2 | Optional | +| pan_snp_dists | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| pirate | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| pirate | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| pirate | **nucl** | Boolean | Boolean variable that instructs pirate to create a pangenome on CDS features using nucleotide identity, rather than amino acid identity, if true. 
| FALSE | Optional | +| pirate | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | +| pirate | **panopt** | String | Additional arguments for Pirate | | Optional | +| pirate | **docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/pirate:1.0.5--hdfd78af_0 | Optional | +| pirate | **features** | String | Features to use for pangenome construction [default: CDS] | CDS | Optional | +| pirate | **steps** | String | Identity thresholds to use for pangenome construction | 50,60,70,80,90,95,98 | Optional | +| summarize_data | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| summarize_data | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-03-16 | Optional | +| summarize_data | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 1 | Optional | +| summarize_data | **id_column_name** | String | Use in the case your sample IDs are not in the table ID column | 1 | Optional | +| summarize_data | **cpu** | Int | Number of CPUs to allocate to the task | 8 | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Workflow Tasks + +By default, the Core_Gene_SNP workflow will begin by analyzing the input sample set using [PIRATE](https://github.com/SionBayliss/PIRATE). Pirate takes in GFF3 files and classifies the genes into gene families by sequence identity, outputting a pangenome summary file. The workflow will instruct Pirate to create core gene and pangenome alignments using this gene family data. 
Setting the "align" input variable to false will turn off this behavior, and the workflow will output only the pangenome summary. The workflow will then use the core gene alignment from `Pirate` to infer a phylogenetic tree using `IQ-TREE`. It will also produce an SNP distance matrix from this alignment using [snp-dists](https://github.com/tseemann/snp-dists). This behavior can be turned off by setting the `core_tree` input variable to false. The workflow will not create a pangenome tree or SNP-matrix by default, but this behavior can be turned on by setting the `pan_tree` input variable to true. + +The optional `summarize_data` task performs the following only if all of the `data_summary_*` and `sample_names` optional variables are filled out: + +1. Digests a _comma-separated_ list of column names, such as `"amrfinderplus_virulence_genes,amrfinderplus_stress_genes"`, etc. that can be found within the origin Terra data table. +2. It will then parse through those column contents and extract each value; for example, if the `amrfinder_amr_genes` column for a sample contains these values: `"aph(3')-IIIa,tet(O),blaOXA-193"`, the `summarize_data` task will check each sample in the set to see if they also have those AMR genes detected. +3. Outputs a .csv file that indicates presence (TRUE) or absence (empty) for each item in those columns; that is, it will check each sample in the set against the detected items in each column to see if that value was also detected. + +By default, this task appends a Phandango coloring tag to color all items from the same column the same; this can be turned off by setting the optional `phandango_coloring` variable to `false`. 
+ +### Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| core_gene_snp_wf_analysis_date | String | Date of analysis using Core_Gene_SNP workflow | +| core_gene_snp_wf_version | String | Version of PHB used for analysis | +| pirate_core_alignment_fasta | File | Nucleotide alignments of the core genes as created using MAFFT within Pirate. Loci are ordered according to the gene_families.ordered file. | +| pirate_core_alignment_gff | File | Annotation data for the gene family within the corresponding fasta file | +| pirate_core_snp_matrix | File | SNP distance matrix created from the core gene alignment | +| pirate_docker_image | String | Pirate docker image used | +| pirate_gene_families_ordered | File | Summary of all gene families, as estimated by Pirate | +| pirate_iqtree_core_tree | File | Phylogenetic tree produced by IQ-TREE from the core gene alignment | +| pirate_iqtree_pan_tree | File | Phylogenetic tree produced by IQ-TREE from the pangenome alignment | +| pirate_iqtree_version | String | IQ-TREE version used | +| pirate_pan_alignment_fasta | File | Nucleotide alignments of the pangenome by gene as created using MAFFT within Pirate. Loci are ordered according to the gene_families.ordered file. 
| +| pirate_pan_alignment_gff | File | Annotation data for the gene family within the corresponding fasta file | +| pirate_pan_snp_matrix | File | SNP distance matrix created from the pangenome alignment | +| pirate_pangenome_summary | File | Summary of the number and frequency of genes in the pangenome, as estimated by Pirate | +| pirate_presence_absence_csv | File | A file generated by Pirate that allows many post-alignment tools created for Roary to be used on the output from Pirate | +| pirate_snp_dists_version | String | Version of snp-dists used | +| pirate_summarized_data | File | The presence/absence matrix generated by the summarize_data task from the list of columns provided | + +## References + +>Sion C Bayliss, Harry A Thorpe, Nicola M Coyle, Samuel K Sheppard, Edward J Feil, PIRATE: A fast and scalable pangenomics toolbox for clustering diverged orthologues in bacteria, *GigaScience*, Volume 8, Issue 10, October 2019, giz119,  + +> Lam-Tung Nguyen, Heiko A. Schmidt, Arndt von Haeseler, Bui Quang Minh, IQ-TREE: A Fast and Effective Stochastic Algorithm for Estimating Maximum-Likelihood Phylogenies, *Molecular Biology and Evolution*, Volume 32, Issue 1, January 2015, Pages 268–274,  + +> diff --git a/docs/workflows/phylogenetic_construction/czgenepi_prep.md b/docs/workflows/phylogenetic_construction/czgenepi_prep.md new file mode 100644 index 000000000..1c2b88804 --- /dev/null +++ b/docs/workflows/phylogenetic_construction/czgenepi_prep.md @@ -0,0 +1,62 @@ +# CZGenEpi_Prep + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v1.3.0 | No | Set-level | + +## CZGenEpi_Prep_PHB + +The CZGenEpi_Prep workflow prepares data for upload to the Chan Zuckerberg GEN EPI platform, 
where phylogenetic trees and additional data processing can occur. This workflow extracts the necessary metadata fields from your Terra table. + +### Inputs + +In order to enable customization for where certain fields should be pulled from the Terra table, the user can specify different column names in the appropriate location. For example, if the user wants to use the "clearlabs_fasta" column for the assembly file _instead_ of the default "assembly_fasta" column, they can write "clearlabs_fasta" for the `assembly_fasta_column_name` optional variable. + +Variables with both the "Optional" and "Required" tag require the column (regardless of name) to be present in the data table. + +This workflow runs on the set level. + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| czgenepi_prep | **sample_names** | Array[String] | The array of sample ids you want to prepare for CZ GEN EPI | | Required | +| czgenepi_prep | **terra_table_name** | String | The name of the Terra table where the data is hosted | | Required | +| czgenepi_prep | **terra_project_name** | String | The name of the Terra project where the data is hosted | | Required | +| czgenepi_prep | **terra_workspace_name** | String | The name of the Terra workspace where the data is hosted | | Required | +| download_terra_table | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 10 | Optional | +| download_terra_table | **docker** | String | The Docker container to use for the task | quay.io/theiagen/terra-tools:2023-06-21 | Optional | +| download_terra_table | **disk_size** | String | The size of the disk used when running this task | 1 | Optional | +| download_terra_table | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| czgenepi_prep | **assembly_fasta_column_name** | String | The column name where the sample's assembly file can be found | assembly_fasta | Optional, Required | 
+| czgenepi_prep | **county_column_name** | String | The column name where the samples' originating county can be found | county | Optional, Required | +| czgenepi_prep | **organism** | String | The organism for data preparation. Options: "mpox" or "sars-cov-2" | sars-cov-2 | Optional | +| czgenepi_prep | **is_private** | Boolean | Sets whether the sample status is private or not | true | Optional | +| czgenepi_prep | **genbank_accession_column_name** | String | The column name where the genbank accession for the sample can be found | genbank_accession | Optional | +| czgenepi_prep | **country_column_name** | String | The column name where the sample's originating country can be found | country | Optional, Required | +| czgenepi_prep | **collection_date_column_name** | String | The column name where the sample's collection date can be found | collection_date | Optional, Required | +| czgenepi_prep | **state_column_name** | String | The column name where the sample's originating state can be found | state | Optional, Required | +| czgenepi_prep | **continent_column_name** | String | The column name where the sample's originating continent can be found | continent | Optional, Required | +| czgenepi_prep | **sequencing_date_column_name** | String | The column name where the sample's sequencing date can be found | sequencing_date | Optional | +| czgenepi_prep | **private_id_column_name** | String | The column name where the Private ID for the sample can be found | terra_table_name_id | Optional, Required | +| czgenepi_wrangling | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| czgenepi_wrangling | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-08-08-2 | Optional | +| czgenepi_wrangling | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| czgenepi_wrangling | **cpu** | Int | Number of CPUs to allocate to the 
task | 1 | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Outputs + +The concatenated_czgenepi_fasta and concatenated_czgenepi_metadata files can be uploaded directly to CZ GEN EPI without any adjustments. + +| **Variable** | **Type** | **Description** | +|---|---|---| +| concatenate_czgenepi_fasta | File | The concatenated fasta file with the renamed headers (the headers are renamed to account for clearlabs data which has unique headers) | +| concatenate_czgenepi_metadata | File | The concatenated metadata that was extracted from the terra table using the specified columns | +| czgenepi_prep_version | String | The version of PHB the workflow is in | +| czgenepi_prep_analysis_date | String | The date the workflow was run | + +## References + +> CZ GEN EPI Help Center "Uploading Data" diff --git a/docs/workflows/phylogenetic_construction/find_shared_variants.md b/docs/workflows/phylogenetic_construction/find_shared_variants.md new file mode 100644 index 000000000..49908c0e0 --- /dev/null +++ b/docs/workflows/phylogenetic_construction/find_shared_variants.md @@ -0,0 +1,89 @@ +# Find_Shared_Variants + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria), [Mycotics](../../workflows_overview/workflows_kingdom.md#mycotics) | PHB v2.0.0 | Yes | Set-level | + +## Find_Shared_Variants_PHB + +`Find_Shared_Variants_PHB` is a workflow for concatenating the variant results produced by the `Snippy_Variants_PHB` workflow across 
multiple samples and reshaping the data to illustrate variants that are shared among multiple samples. + +!!! caption "Find_Shared_Variants Workflow Diagram" + + ![Find_Shared_Variants Workflow Diagram](../../assets/figures/Find_Shared_Variants_PHB.png) + +### Inputs + +The primary intended input of the workflow is the `snippy_variants_results` output from `Snippy_Variants_PHB` or the `theiaeuk_snippy_variants_results` output of the TheiaEuk workflow. Variant results files from other tools may not be compatible at this time. + +All variant data included in the sample set should be generated from aligning sequencing reads to the **same reference genome**. If variant data was generated using different reference genomes, shared variants cannot be identified and results will be less useful. + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +| --- | --- | --- | --- | --- | --- | +| shared_variants_wf | **concatenated_file_name** | String | String of your choice to prefix output files | | Required | +| shared_variants_wf | **samplenames** | Array[String] | The samples to be included in the analysis | | Required | +| shared_variants_wf | **variants_to_cat** | Array[File] | The result file from the Snippy_Variants workflow | | Required | +| cat_variants | **docker_image** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/utility:1.1" | Optional | +| shared_variants | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| shared_variants | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| shared_variants | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-03-16" | Optional | +| shared_variants | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| version_capture | **docker** | String | The 
Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Tasks + +??? task "Concatenate Variants" + + ##### Concatenate Variants Task {#concatenate_variants_task} + + The `cat_variants` task concatenates variant data from multiple samples into a single file `concatenated_variants`. It is very similar to the `cat_files` task, but also adds a column to the output file that indicates the sample associated with each row of data. + + The `concatenated_variants` file will be in the following format: + + | samplename | CHROM | POS | TYPE | REF | ALT | EVIDENCE | FTYPE | STRAND | NT_POS | AA_POS | EFFECT | LOCUS_TAG | GENE | PRODUCT | + | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | + | sample1 | PEKT02000007 | 5224 | snp | C | G | G:21 C:0 | | | | | | | | | + | sample2 | PEKT02000007 | 34112 | snp | C | G | G:32 C:0 | CDS | + | 153/1620 | 51/539 | missense_variant c.153C>G p.His51Gln | B9J08_002604 | hypothetical protein | | + | sample3 | PEKT02000007 | 34487 | snp | T | A | A:41 T:0 | CDS | + | 528/1620 | 176/539 | missense_variant c.528T>A p.Asn176Lys | B9J08_002604 | hypothetical protein | | + + !!! techdetails "Technical Details" + + | | Links | + | --- | --- | + | Task | /tasks/utilities/file_handling/task_cat_files.wdl | + | Software Source Code | [task_cat_files.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/utilities/file_handling/task_cat_files.wdl) | + +??? task "Shared Variants Task" + + ##### Shared Variants Task {#shared_variants_task} + + The `shared_variants` task takes in the `concatenated_variants` output from the `cat_variants` task and reshapes the data so that variants are rows and samples are columns. 
For each variant, samples where the variant was detected are populated with a "1" and samples where **either the variant was not detected or there was insufficient coverage to call variants** are populated with a "0". The resulting table is available as the `shared_variants_table` output. + +    The `shared_variants_table` file will be in the following format: + +    | CHROM | POS | TYPE | REF | ALT | FTYPE | STRAND | NT_POS | AA_POS | EFFECT | LOCUS_TAG | GENE | PRODUCT | sample1 | sample2 | sample3 | +    | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +    | PEKT02000007 | 2693938 | snp | T | C | CDS | - | 1008/3000 | 336/999 | synonymous_variant c.1008A>G p.Lys336Lys | B9J08_003879 | NA | chitin synthase 1 | 1 | 1 | 0 | +    | PEKT02000007 | 2529234 | snp | G | C | CDS | + | 282/336 | 94/111 | missense_variant c.282G>C p.Lys94Asn | B9J08_003804 | NA | cytochrome c | 1 | 1 | 1 | +    | PEKT02000002 | 1043926 | snp | A | G | CDS | - | 542/1464 | 181/487 | missense_variant c.542T>C p.Ile181Thr | B9J08_000976 | NA | dihydrolipoyl dehydrogenase | 1 | 1 | 0 | + +    !!! techdetails "Technical Details" + +        | | Links | +        | --- | --- | +        | Task | task_shared_variants.wdl | +        | Software Source Code | [task_shared_variants.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/phylogenetic_inference/utilities/task_shared_variants.wdl) | + +### Outputs + +The outputs of this workflow are the `concatenated_variants` file and the `shared_variants_table` file. 
+ +| **Variable** | **Type** | **Description** | +| --- | --- | --- | +| concatenated_variants | File | The concatenated variants without presence/absence | +| shared_variants_analysis_date | String | The date the workflow was run | +| shared_variants_table | File | The shared variants table listing presence/absence for each mutation identified in the samples | +| shared_variants_version | String | The version of PHB the workflow is in | diff --git a/docs/workflows/phylogenetic_construction/ksnp3.md b/docs/workflows/phylogenetic_construction/ksnp3.md new file mode 100644 index 000000000..e54a0c27b --- /dev/null +++ b/docs/workflows/phylogenetic_construction/ksnp3.md @@ -0,0 +1,116 @@ +# kSNP3 + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria), [Mycotics](../../workflows_overview/workflows_kingdom.md#mycotics), [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.1.0 | Yes; some optional features incompatible | Set-level | + +## kSNP3_PHB + +The kSNP3 workflow is for phylogenetic analysis of bacterial genomes using single nucleotide polymorphisms (SNPs). The kSNP3 workflow identifies SNPs amongst a set of genome assemblies, then calculates a number of phylogenetic trees based on those SNPs: + +- **Pan-genome phylogenetic trees:** The term "pan-genome" is used here to describe the collective genetic content amongst the set of genomes, including regions outside of genes and other coding sequences. Outputs based on the pan-genome are labeled with `_pan`. +- **Core-genome phylogenetic trees:** The kSNP3 workflow will also generate phylogenetic trees based on the core genome (genetic content that is present in all members of the set of genomes). 
Outputs based on the core-genome are labeled with `_core`. + +This workflow also features an optional module, `summarize_data` that creates a presence/absence matrix for the analyzed samples from a list of indicated columns (such as AMR genes, plasmid types etc.). If the `phandango_coloring` variable is set to `true`, this will be formatted for visualization in [Phandango](https://jameshadfield.github.io/phandango/#/), else it can be viewed in Excel. + +You can learn more about the kSNP3 workflow, including how to visualize the outputs with MicrobeTrace in the following video: **📺 [Using KSNP3 in Terra and Visualizing Bacterial Genomic Networks in MicrobeTrace](https://www.youtube.com/watch?v=iRpNDun46R8)** + +### Inputs + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| ksnp3_workflow | **assembly_fasta** | Array[File] | The assembly files to be analyzed | | Required | +| ksnp3_workflow | **cluster_name** | String | Free text string used to label output files | | Required | +| ksnp3_workflow | **samplename** | Array[String] | The set of sample names | | Required | +| core_ksnp3_shared_snps_task | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| core_reorder_matrix | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| core_reorder_matrix | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| core_reorder_matrix | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/mykrobe:0.12.1 | Optional | +| core_reorder_matrix | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| core_snp_dists | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| core_snp_dists | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | +| 
core_snp_dists | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/snp-dists:0.8.2 | Optional | +| core_snp_dists | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| ksnp3_task | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| ksnp3_task | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| ksnp3_task | **docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/ksnp3:3.1 | Optional | +| ksnp3_task | **kmer_size** | Int | The length of kmer containing the SNP you want kSNP3 to use | 19 | Optional | +| ksnp3_task | **ksnp3_args** | String | Additional arguments you want kSNP3 to use; e.g., "-ML" or "-NJ" | | Optional | +| ksnp3_task | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| ksnp3_task | **previous_ksnp3_snps** | File | File with existing SNPs for the current run to be appended to. | | Optional | +| ksnp3_workflow | **data_summary_column_names** | String | A comma-separated list of the column names from the sample-level data table for generating a data summary (presence/absence .csv matrix); e.g., "amrfinderplus_amr_genes,amrfinderplus_virulence_genes" | | Optional | +| ksnp3_workflow | **data_summary_terra_project** | String | The billing project for your current workspace. This can be found after the "#workspaces/" section in the workspace's URL | | Optional | +| ksnp3_workflow | **data_summary_terra_table** | String | The name of the sample-level Terra data table that will be used for generating a data summary | | Optional | +| ksnp3_workflow | **data_summary_terra_workspace** | String | The name of the Terra workspace you are in. This can be found at the top of the webpage, or in the URL after the billing project. 
| | Optional | +| ksnp3_workflow | **midpoint_root_tree** | Boolean | If true, midpoint root the final tree | FALSE | Optional | +| ksnp3_workflow | **phandango_coloring** | Boolean | Boolean variable that tells the data summary task and the reorder matrix task to include a suffix that enables consistent coloring on Phandango; by default, this suffix is not added. To add this suffix set this variable to true. | FALSE | Optional | +| pan_reorder_matrix | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| pan_reorder_matrix | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| pan_reorder_matrix | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/mykrobe:0.12.1 | Optional | +| pan_reorder_matrix | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| pan_snp_dists | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| pan_snp_dists | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | +| pan_snp_dists | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/snp-dists:0.8.2 | Optional | +| pan_snp_dists | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| summarize_data | **cpu** | Int | Number of CPUs to allocate to the task | 8 | Optional | +| summarize_data | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| summarize_data | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-03-16 | Optional | +| summarize_data | **id_column_name** | String | If the sample IDs are in a different column to samplenames, it can be passed here and it will be used instead. 
| | Optional | +| summarize_data | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Workflow Actions + +The `ksnp3` workflow is run on the set of assembly files to produce both pan-genome and core-genome phylogenies. This also results in alignment files which are used by [`snp-dists`](https://github.com/tseemann/snp-dists) to produce a pairwise SNP distance matrix for both the pan-genome and core-genomes. + +If you fill out the `data_summary_*` and `sample_names` optional variables, you can use the optional `summarize_data` task. The task takes a comma-separated list of column names from the Terra data table, which should each contain a list of comma-separated items. For example, `"amrfinderplus_virulence_genes,amrfinderplus_stress_genes"` (with quotes, comma separated, no spaces) for these output columns from running TheiaProk. The task checks whether those comma-separated items are present in each row of the data table (sample), then creates a CSV file of these results. The CSV file indicates presence (TRUE) or absence (empty) for each item. By default, the task adds a Phandango coloring tag to group items from the same column, but you can turn this off by setting `phandango_coloring` to `false`. + +??? toggle "**Example output CSV**" + + ```text linenums="1" + Sample_Name,aph(3')-IIa,blaCTX-M-65,blaOXA-193,tet(O) + sample1,TRUE,,TRUE,TRUE + sample2,,,FALSE,TRUE + sample3,,,FALSE, + ``` + +??? toggle "**Example use of Phandango coloring**" + + Data summary produced using the `phandango_coloring` option, visualized alongside Newick tree at + + !!! 
caption "Example phandango_coloring output" + ![Phandango coloring example](../../assets/figures/example_phandango_coloring.png) + +### Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| ksnp3_core_snp_matrix | File | The SNP matrix made with the core genome; formatted for Phandango if `phandango_coloring` input is `true` | +| ksnp3_core_snp_matrix_status | String | Will print either `The core SNP matrix was produced` OR `The core SNP matrix could not be produced` | +| ksnp3_core_snp_table | File | Formatted version of ksnp3_vcf_ref_genome file with only core SNPs, sorted by number of occurrences in the sample set | +| ksnp3_core_tree | File | The phylogenetic tree made with the core genome | +| ksnp3_docker | String | The docker image used | +| ksnp3_filtered_metadata | File | Optional output file with filtered metadata that is only produced if the optional `summarize_data` task is used. | +| ksnp3_ml_tree | File | Maximum likelihood tree that is only produced if `ksnp3_args` includes `"-ML"` | +| ksnp3_nj_tree | File | Neighbor joining tree that is only produced if `ksnp3_args` includes `"-NJ"` | +| ksnp3_number_core_snps | String | Number of core SNPs in the sample set | +| ksnp3_number_snps | String | Number of SNPs in the sample set | +| ksnp3_pan_snp_matrix | File | The SNP matrix made with the pangenome; formatted for Phandango if `phandango_coloring` input is `true` | +| ksnp3_pan_tree | File | The phylogenetic tree made with the pangenome | +| ksnp3_snp_dists_version | String | The version of snp_dists used in the workflow | +| ksnp3_snps | File | File containing the set of SNPs used in the analysis. Required if more trees are to be appended to the existing one. 
| +| ksnp3_summarized_data | File | CSV presence/absence matrix generated by the `summarize_data` task from the list of columns provided; formatted for Phandango if `phandango_coloring` input is `true` | +| ksnp3_vcf_ref_genome | File | A VCF file containing the variants detected in the core genome | +| ksnp3_vcf_ref_samplename | String | The name of the (user-supplied) sample used as the reference for calling SNPs. | +| ksnp3_vcf_snps_not_in_ref | File | A TSV file of the SNPs not present in the reference genome, but were identified by kSNP3. | +| ksnp3_wf_analysis_date | String | The date the workflow was run | +| ksnp3_wf_version | String | The version of the repository the workflow is hosted in | + +## References + +>Shea N Gardner, Tom Slezak, Barry G. Hall, kSNP3.0: SNP detection and phylogenetic analysis of genomes without genome alignment or reference genome, *Bioinformatics*, Volume 31, Issue 17, 1 September 2015, Pages 2877–2878,  + + diff --git a/docs/workflows/phylogenetic_construction/lyve_set.md b/docs/workflows/phylogenetic_construction/lyve_set.md new file mode 100644 index 000000000..4476cf02b --- /dev/null +++ b/docs/workflows/phylogenetic_construction/lyve_set.md @@ -0,0 +1,88 @@ +# Lyve_SET + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria) | PHB v2.1.0 | Yes | Set-level | + +## Lyve_SET_PHB + +The Lyve_SET WDL workflow runs the [Lyve-SET](https://github.com/lskatz/lyve-SET) pipeline developed by Lee Katz et al. for phylogenetic analysis of bacterial genomes using high quality single nucleotide polymorphisms (hqSNPs). 
The Lyve_SET workflow identifies SNPs amongst a set of samples by mapping sequencing reads to a reference genome, identifying high quality SNPs, and inferring phylogeny using RAxML. + +### Lyve-SET Pipeline (from [Lyve-SET paper](https://www.frontiersin.org/articles/10.3389/fmicb.2017.00375/full)) + +!!! caption "Lyve-SET Workflow Diagram" + ![Lyve-SET Workflow Diagram](../../assets/figures/Lyve_Set.png) + +### Inputs + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| lyveset_workflow | **dataset_name** | String | Free text string used to label output files | | Required | +| lyveset_workflow | **read1** | Array[File] | Array of read1 files for sample set. We recommend using cleaned rather than raw reads. | | Required | +| lyveset_workflow | **read2** | Array[File] | Array of read2 files for sample set. We recommend using cleaned rather than raw reads. | | Required | +| lyveset_workflow | **reference_genome** | File | Path to reference genome in a Terra-accessible Google bucket. For considerations when choosing a reference genome, see: | | Required | +| lyveset | **allowedFlanking** | Int | Allowed flanking distance in base pairs. Nucleotides this close together cannot be considered as high-quality. | 0 | Optional | +| lyveset | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| lyveset | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| lyveset | **docker_image** | String | Docker image used for running Lyve-SET | "us-docker.pkg.dev/general-theiagen/staphb/lyveset:1.1.4f" | Optional | +| lyveset | **downsample** | Boolean | If true, downsample all reads to 50x. 
Approximated according to the ref genome assembly | FALSE | Optional | +| lyveset | **fast** | Boolean | Shorthand for `--downsample --mapper snap --nomask-phages --nomask-cliffs --sample-sites` | FALSE | Optional | +| lyveset | **mapper** | String | Which mapper? Choices: "smalt", "snap" | "smalt" | Optional | +| lyveset | **mask_cliffs** | Boolean | If true, search for and mask 'Cliffs' in pileups | FALSE | Optional | +| lyveset | **mask_phages** | Boolean | If true, search for and mask phages in the reference genome | FALSE | Optional | +| lyveset | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | +| lyveset | **min_alt_frac** | Float | The percent consensus that needs to be reached before a SNP is called. Otherwise, 'N' | 0.75 | Optional | +| lyveset | **min_coverage** | Int | Minimum coverage needed before a SNP is called. Otherwise, 'N' | 10 | Optional | +| lyveset | **nomatrix** | Boolean | If true, do not create an hqSNP matrix | FALSE | Optional | +| lyveset | **nomsa** | Boolean | If true, do not make a multiple sequence alignment | FALSE | Optional | +| lyveset | **notrees** | Boolean | If true, do not make phylogenies | FALSE | Optional | +| lyveset | **presets** | String | See [presets.conf](https://github.com/lskatz/lyve-SET/blob/v1.1.4-head/config/original/presets.conf) for more information | | Optional | +| lyveset | **read_cleaner** | String | Which read cleaner? Choices: "none", "CGP", "BayesHammer" | "CGP" | Optional | +| lyveset | **sample_sites** | Boolean | If true, randomly choose a genome and find SNPs in a quick and dirty way. Then on the SNP-calling stage, only interrogate those sites for SNPs for each genome (including the randomly-sampled genome). | FALSE | Optional | +| lyveset | **snpcaller** | String | Which SNP caller? 
Choices: "varscan", "vcftools" | "varscan" | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Workflow Actions + +The Lyve_SET WDL workflow is run using read data from a set of samples. The workflow will produce a pairwise SNP matrix for the sample set and a maximum likelihood phylogenetic tree. Details regarding the default implementation of Lyve_SET and optional modifications are listed below. + +1. Read processing + 1. By default, the Lyve_SET WDL workflow will perform read cleaning using the CG-Pipeline "CGP". However, read cleaning can be turned off or performed using "BayesHammer" using the `read_cleaner` input variable. +2. Reference procurement + 1. By default, the Lyve_SET WDL workflow will **not** mask phages or cliffs in the reference genome. Cliffs refer to regions of the reference genome where read coverage rises or falls abruptly. Masking phages and cliffs is intended to remove low quality SNPs. Users can invoke phage and cliff masking by setting the `mask_cliffs` and `mask_phages` variables to "true". +3. SNP discovery + 1. The Lyve_SET WDL workflow uses the default read mapper and variant caller from the Lyve-SET pipeline (`smalt` and `varscan`). Additional options for each are available using the `mapper` and `snpcaller` input variables. + 2. The workflow also uses the default parameters for variant calling from the Lyve-SET pipeline: the minimum percent consensus to call a base is 0.75 and minimum read depth is 10X. These parameters can be manually modified using the `min_alt_frac` and `min_coverage` input variables. +4. Phylogenetic analysis + 1. The Lyve_SET workflow will attempt to produce a multiple sequence alignment, SNP distance matrix, and phylogenetic tree. 
These actions can be skipped by indicating `nomsa` = true, `nomatrix` = true, or `notrees` = true, respectively. + +### Outputs + +For full descriptions of Lyve-SET pipeline outputs, we recommend consulting the Lyve-SET documentation: + +The following output files are populated to the Terra data table. However, please note that certain files may not appear in the data table following a run for two main reasons: + +1. The user instructed the workflow to skip an analysis step + 1. For example, if `notrees` = true, no tree file will appear +2. The workflow skipped an analysis step due to an issue with the input data + 1. For example, the workflow will not attempt to produce a phylogenetic tree if there are too few samples or if samples are too closely related + +| **Variable** | **Type** | **Description** | **Equivalent file from Lyve-SET pipeline** | +| --- | --- | --- | --- | +| lyveset_alignment_fasta | File | The output alignment file in fasta format | project/msa/out.aln.fasta | +| lyveset_docker_image | String | Lyve_SET docker image used for analysis | | +| lyveset_log | File | Lyve_SET task log file | | +| lyveset_pairwise_matrix | File | Pairwise SNP distances matrix | project/msa/out.pairwiseMatrix.tsv | +| lyveset_pooled_snps_vcf | File | SNPs vcf | project/msa/out.pooled.snps.vcf.gz | +| lyveset_raxml_tree | File | RAxML-generated tree in newick format | project/msa/out.RAxML_bipartitions | +| lyveset_wf_analysis_date | String | Date analysis was run | | +| lyveset_wf_version | String | Version of PHB used when running Lyveset_PHB | | + +In addition to these outputs, all of the files produced by the Lyve-SET pipeline are available in the task-level outputs, including intermediate files and individual bam and vcf files for each sample. These files can be accessed viewing the execution directory for the run. + +## References + +> **Lyve-SET** Katz LS, Griswold T, Williams-Newkirk AJ, Wagner D, Petkau A, et al. 
(2017) A Comparative Analysis of the Lyve-SET Phylogenomics Pipeline for Genomic Epidemiology of Foodborne Pathogens. Frontiers in Microbiology 8. diff --git a/docs/workflows/phylogenetic_construction/mashtree_fasta.md b/docs/workflows/phylogenetic_construction/mashtree_fasta.md new file mode 100644 index 000000000..53ad8f24d --- /dev/null +++ b/docs/workflows/phylogenetic_construction/mashtree_fasta.md @@ -0,0 +1,80 @@ +# MashTree_FASTA + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria) | PHB v2.1.0 | Yes | Set-level | + +## MashTree_FASTA_PHB + +`MashTree_FASTA` creates a phylogenetic tree using Mash distances. + +Mash distances are representations of how many kmers two sequences have in common. These distances are generated by transforming all kmers from a sequence into an integer value with hashing and Bloom filters. The hashed kmers are sorted and a "sketch" is created by only using the kmers that appear at the top of the sorted list. These sketches can be compared by counting the number of hashed kmers they have in common. Mashtree uses a neighbor-joining algorithm to cluster these "distances" into phylogenetic trees. + +This workflow also features an optional module, `summarize_data`, that creates a presence/absence matrix for the analyzed samples from a list of indicated columns (such as AMR genes, etc.) that can be used in Phandango. 
+ +### Inputs + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| mashtree_fasta | **assembly_fasta** | Array[File] | The set of assembly fastas | | Required | +| mashtree_fasta | **cluster_name** | String | Free text string used to label output files | | Required | +| mashtree_fasta | **data_summary_column_names** | String | A comma-separated list of the column names from the sample-level data table for generating a data summary (presence/absence .csv matrix); e.g., "amrfinderplus_amr_genes,amrfinderplus_virulence_genes" | | Optional | +| mashtree_fasta | **data_summary_terra_project** | String | The billing project for your current workspace. This can be found after the "#workspaces/" section in the workspace's URL | | Optional | +| mashtree_fasta | **data_summary_terra_table** | String | The name of the sample-level Terra data table that will be used for generating a data summary | | Optional | +| mashtree_fasta | **data_summary_terra_workspace** | String | The name of the Terra workspace you are in. This can be found at the top of the webpage, or in the URL after the billing project. | | Optional | +| mashtree_fasta | **midpoint_root_tree** | Boolean | If true, midpoint root the final tree | FALSE | Optional | +| mashtree_fasta | **phandango_coloring** | Boolean | Boolean variable that tells the data summary task and the reorder matrix task to include a suffix that enables consistent coloring on Phandango; by default, this suffix is not added. To add this suffix set this variable to true. 
| FALSE | Optional | +| mashtree_fasta | **sample_names** | Array[String] | The list of samples | | Optional | +| mashtree_task | **cpu** | Int | Number of CPUs to allocate to the task | 16 | Optional | +| mashtree_task | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| mashtree_task | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/staphb/mashtree:1.2.0" | Optional | +| mashtree_task | **genomesize** | Int | Genome size of the input samples | 5000000 | Optional | +| mashtree_task | **kmerlength** | Int | Hashes will be based on strings of this many nucleotides | 21 | Optional | +| mashtree_task | **mindepth** | Int | If set to zero, mashtree will run in "accurate" mode as it will choose a mindepth by itself in a slower method; this value otherwise indicates the minimum number of times a kmer must appear in order to be included | 5 | Optional | +| mashtree_task | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 64 | Optional | +| mashtree_task | **sketchsize** | Int | Each sketch will have at most this many non-redundant min-hashes | 10000 | Optional | +| mashtree_task | **sort_order** | String | For neighbor-joining, the sort order can make a difference. 
Options include: "ABC" (alphabetical), "random", "input-order" | "ABC" | Optional | +| mashtree_task | **truncLength** | Int | How many characters to keep in a filename | 250 | Optional | +| reorder_matrix | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| reorder_matrix | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| reorder_matrix | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/mykrobe:0.12.1 | Optional | +| reorder_matrix | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| summarize_data | **cpu** | Int | Number of CPUs to allocate to the task | 8 | Optional | +| summarize_data | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| summarize_data | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-03-16 | Optional | +| summarize_data | **id_column_name** | String | If the sample IDs are in a different column to samplenames, it can be passed here and it will be used instead. | | Optional | +| summarize_data | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Workflow Actions + +`MashTree_Fasta` is run on a set of assembly fastas and creates a phylogenetic tree and matrix. These outputs are passed to a task that will rearrange the matrix to match the order of the terminal ends in the phylogenetic tree. 
+ +The optional `summarize_data` task performs the following only if all of the `data_summary_*` and `sample_names` optional variables are filled out: + +1. Digests a _comma-separated_ list of column names, such as `"amrfinderplus_virulence_genes,amrfinderplus_stress_genes"`, etc. that can be found within the origin Terra data table. +2. It will then parse through those column contents and extract each value; for example, if the `amrfinder_amr_genes` column for a sample contains these values: `"aph(3')-IIIa,tet(O),blaOXA-193"`, the `summarize_data` task will check each sample in the set to see if they also have those AMR genes detected. +3. Outputs a .csv file that indicates presence (TRUE) or absence (empty) for each item in those columns; that is, it will check each sample in the set against the detected items in each column to see if that value was also detected. + +By default, this task appends a Phandango coloring tag to color all items from the same column the same; this can be turned off by setting the optional `phandango_coloring` variable to `false`. + +### Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| mashtree_docker | String | The Docker image used to run the mashtree task | +| mashtree_filtered_metadata | File | Optional output file with filtered metadata that is only produced if the optional `summarize_data` task is used | +| mashtree_matrix | File | The SNP matrix made | +| mashtree_summarized_data | File | CSV presence/absence matrix generated by the `summarize_data` task from the list of columns provided; formatted for Phandango if `phandango_coloring` input is `true` | +| mashtree_tree | File | The phylogenetic tree made | +| mashtree_version | String | The version of mashtree used in the workflow | +| mashtree_wf_analysis_date | String | The date the workflow was run | +| mashtree_wf_version | String | The version of PHB the workflow is hosted in | + +## References + +> Katz, L. 
S., Griswold, T., Morrison, S., Caravas, J., Zhang, S., den Bakker, H.C., Deng, X., and Carleton, H. A., (2019). Mashtree: a rapid comparison of whole genome sequence files. Journal of Open Source Software, 4(44), 1762,  + +>Ondov, B. D., Treangen, T. J., Melsted, P., Mallonee, A. B., Bergman, N. H., Koren, S., & Phillippy, A. M. (2016). Mash: Fast genome and metagenome distance estimation using minhash. Genome Biology, 17(1), 132. doi:10.1186/s13059-016-0997-x diff --git a/docs/workflows/phylogenetic_construction/snippy_streamline.md b/docs/workflows/phylogenetic_construction/snippy_streamline.md new file mode 100644 index 000000000..7c2fa47df --- /dev/null +++ b/docs/workflows/phylogenetic_construction/snippy_streamline.md @@ -0,0 +1,223 @@ +# Snippy_Streamline + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria) | PHB v2.2.0 | Yes; some optional features incompatible | Set-level | + +## Snippy_Streamline_PHB + +!!! caption "Snippy_Streamline_PHB Workflow Diagram" +
+ ![Snippy_Streamline_PHB Workflow Diagram](../../assets/figures/Snippy_Streamline.png){width=50%} +
+ +The `Snippy_Streamline` workflow is an all-in-one approach to generating a reference-based phylogenetic tree and associated SNP-distance matrix. The workflow can be run in multiple ways with options for: + +- The reference genome to be provided by the user, or automatically selected using the `Centroid` task and `Assembly_Fetch` sub-workflow to find a close reference genome to your dataset + - Note: If no reference genome is provided, then the user MUST fill in the `assembly_fasta` field for automatic reference genome selection. +- The phylogeny to be generated by optionally + - masking user-specified regions of the genome (providing a bed file to `snippy_core_bed`) + - producing either a core or pan-genome phylogeny and SNP-matrix (`core_genome`; default = true) + - masking recombination detected by gubbins, or not (`use_gubbins`; default=true) + - choosing the nucleotide substitution (by specifying `iqtree2_model`), or allowing IQ-Tree's ModelFinder to identify the best model for your dataset (default) + +!!! info "Sequencing Data Requirements" + + **Sequencing data used in the Snippy_Streamline workflow must:** + + - Be Illumina reads + - Be generated by unbiased whole genome shotgun sequencing + - Pass appropriate QC thresholds for the taxa to ensure that the reads represent reasonably complete genomes that are free of contamination from other taxa or cross-contamination of the same taxon. + - If masking recombination with `Gubbins`, input data should represent complete genomes from the same strain/lineage (e.g. MLST) that share a recent common ancestor. + +!!! warning "Reference Genomes" + + **If reference genomes have multiple contigs, they will not be compatible with using Gubbins** to mask recombination in the phylogenetic tree. The automatic selection of a reference genome by the workflow may result in a reference with multiple contigs. In this case, an alternative reference genome should be sought. 
+ +### Inputs + +To run Snippy_Streamline, either a reference genome must be provided (`reference_genome_file`), or you must provide assemblies of the samples in your tree so that the workflow can automatically find and download the closest reference genome to your dataset (via `assembly_fasta`) + +!!! tip "Guidance for optional inputs" + + Several core and optional tasks can be used to generate the Snippy phylogenetic tree, making it highly flexible and suited to a wide range of datasets. You will need to decide which tasks to use depending on the genomes that you are analyzing. Some guidelines for the optional tasks to use for different genome types are provided below. + + ??? toggle "Default settings (suitable for most bacteria)" + + The default settings are as follows and are suitable for generating phylogenies for most bacteria + + - `core_genome` = true (creates core genome phylogeny) + - `use_gubbins` = true (recombination masked) + - nucleotide substitution model will be defined by IQTree's Model Finder + + ??? toggle "Phylogenies of _Mycobacterium tuberculosis_ complex" + + Phylogenies of MTBC are typically constructed + + - Using the H37Rv reference genome + - `reference_genome_file` = gs://theiagen-public-files-rp/terra/theiaprok-files/Mtb_NC_000962.3.fasta + - Masking repetitive regions of the genome (e.g. 
PE/PPE genes) that are often misaligned + - `snippy_core_bed` = gs://theiagen-public-files/terra/theiaprok-files/Mtb_NC_000962.3.bed + - Without masking recombination because TB can be considered non-recombinant + - `use_gubbins` = false + - Using the core genome + - `core_genome` = true (as default) + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| snippy_streamline | **read1** | Array[File] | The forward read files | | Required | +| snippy_streamline | **read2** | Array[File] | The reverse read files | | Required | +| snippy_streamline | **samplenames** | Array[String] | The names of your samples | | Required | +| snippy_streamline | **tree_name** | String | String of your choice to prefix output files | | Required | +| centroid | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| centroid | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | +| centroid | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/centroid:0.1.0 | Optional | +| centroid | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | +| snippy_streamline | **assembly_fasta** | Array[File] | The assembly files for your samples | | Optional | +| snippy_streamline | **reference_genome_file** | File | Reference genome in FASTA or GENBANK format (must be the same reference used in Snippy_Variants workflow); provide this if you want to skip the detection of a suitable reference | | Optional | +| ncbi_datasets_download_genome_accession | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| ncbi_datasets_download_genome_accession | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | +| ncbi_datasets_download_genome_accession | **docker** | String | The Docker container to use for the task | 
us-docker.pkg.dev/general-theiagen/staphb/ncbi-datasets:14.13.2 | Optional | +| ncbi_datasets_download_genome_accession | **include_gbff3** | Boolean | When set to true, outputs a gbff3 file (Genbank file) | FALSE | Optional | +| ncbi_datasets_download_genome_accession | **include_gff** | Boolean | When set to true, outputs a gff file (Annotation file) | FALSE | Optional | +| ncbi_datasets_download_genome_accession | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | +| referenceseeker | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| referenceseeker | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 200 | Optional | +| referenceseeker | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/referenceseeker:1.8.0--pyhdfd78af_0 | Optional | +| referenceseeker | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | +| referenceseeker | **referenceseeker_ani_threshold** | Float | Bidirectional average nucleotide identity to use as a cut off for identifying reference assemblies with ReferenceSeeker; default value set according to | 0.95 | Optional | +| referenceseeker | **referenceseeker_conserved_dna_threshold** | Float | Conserved DNA % to use as a cut off for identifying reference assemblies with ReferenceSeeker; default value set according to | 0.69 | Optional | +| referenceseeker | **referenceseeker_db** | File | Database to use with ReferenceSeeker | gs://theiagen-public-files-rp/terra/theiaprok-files/referenceseeker-bacteria-refseq-205.v20210406.tar.gz | Optional | +| snippy_tree_wf | **call_shared_variants** | Boolean | Activates the shared variants analysis task | TRUE | Optional | +| snippy_tree_wf | **core_genome** | Boolean | When "true", workflow generates core genome phylogeny; when "false", whole genome is used | TRUE | Optional | +| snippy_tree_wf | 
**data_summary_column_names** | String | A comma-separated list of the column names from the sample-level data table for generating a data summary (presence/absence .csv matrix) | | Optional | +| snippy_tree_wf | **data_summary_terra_project** | String | The billing project for your current workspace. This can be found after the "#workspaces/" section in the workspace's URL | | Optional | +| snippy_tree_wf | **data_summary_terra_table** | String | The name of the sample-level Terra data table that will be used for generating a data summary | | Optional | +| snippy_tree_wf | **data_summary_terra_workspace** | String | The name of the Terra workspace you are in. This can be found at the top of the webpage, or in the URL after the billing project. | | Optional | +| snippy_tree_wf | **gubbins_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| snippy_tree_wf | **gubbins_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| snippy_tree_wf | **gubbins_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/gubbins:3.3--py310pl5321h8472f5a_0 | Optional | +| snippy_tree_wf | **gubbins_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | +| snippy_tree_wf | **iqtree2_bootstraps** | String | Number of replicates for (Minimum recommended= 1000) | 1000 | Optional | +| snippy_tree_wf | **iqtree2_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| snippy_tree_wf | **iqtree2_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| snippy_tree_wf | **iqtree2_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/iqtree2:2.1.2 | Optional | +| snippy_tree_wf | **iqtree2_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | +| snippy_tree_wf | **iqtree2_model** | String | Nucelotide substitution 
model to use when generating the final tree with IQTree2. By default, IQtree runs its ModelFinder algorithm to identify the model it thinks best fits your dataset | | Optional | +| snippy_tree_wf | **iqtree2_opts** | String | Additional options to pass to IQTree2 | | Optional | +| snippy_tree_wf | **midpoint_root_tree** | Boolean | A True/False option that determines whether the tree used in the SNP matrix re-ordering task should be re-rooted or not. Options: true of false | TRUE | Optional | +| snippy_tree_wf | **phandango_coloring** | Boolean | Boolean variable that tells the data summary task and the reorder matrix task to include a suffix that enables consistent coloring on Phandango; by default, this suffix is not added. To add this suffix set this variable to true. | FALSE | Optional | +| snippy_tree_wf | **snippy_core_bed** | File | User-provided bed file to mask out regions of the genome when creating multiple sequence alignments | | Optional | +| snippy_tree_wf | **snippy_core_cpu** | Int | Number of CPUs to allocate to the task | 8 | Optional | +| snippy_tree_wf | **snippy_core_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| snippy_tree_wf | **snippy_core_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/snippy:4.6.0 | Optional | +| snippy_tree_wf | **snippy_core_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | +| snippy_tree_wf | **snp_dists_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/snp-dists:0.8.2 | Optional | +| snippy_tree_wf | **snp_sites_cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| snippy_tree_wf | **snp_sites_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| snippy_tree_wf | **snp_sites_docker** | String | The Docker container to use for the task | 
us-docker.pkg.dev/general-theiagen/staphb/snp-sites:2.5.1 | Optional | +| snippy_tree_wf | **snp_sites_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | +| snippy_tree_wf | **use_gubbins** | Boolean | When "true", workflow removes recombination with gubbins tasks; when "false", gubbins is not used | TRUE | Optional | +| snippy_variants_wf | **base_quality** | Int | Minimum quality for a nucleotide to be used in variant calling | 13 | Optional | +| snippy_variants_wf | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| snippy_variants_wf | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/snippy:4.6.0 | Optional | +| snippy_variants_wf | **map_qual** | Int | Minimum mapping quality to accept in variant calling | | Optional | +| snippy_variants_wf | **maxsoft** | Int | Number of bases of alignment to soft-clip before discarding the alignment | | Optional | +| snippy_variants_wf | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | +| snippy_variants_wf | **min_coverage** | Int | Minimum read coverage of a position to identify a mutation | 10 | Optional | +| snippy_variants_wf | **min_frac** | Float | Minimum fraction of bases at a given position to identify a mutation | 0.9 | Optional | +| snippy_variants_wf | **min_quality** | Int | Minimum VCF variant call "quality" | 100 | Optional | +| snippy_variants_wf | **query_gene** | String | Indicate a particular gene of interest | | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Workflow Tasks + +For automatic reference selection by the workflow (optional): + +??? 
task "Centroid (optional)" + + ##### Centroid {#centroid} + + Centroid selects the most central genome among a list of assemblies by computing pairwise mash distances. In `Snippy_Streamline`, this centroid assembly is then used to find a closely related reference genome that can be used to generate the tree. In order to use `Centroid`, you should complete the `samplenames` input. + + !!! techdetails "Centroid Technical Details" + + | | Links | + | --- | --- | + | Task | [task_centroid.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/phylogenetic_inference/task_centroid.wdl) | + | Software Source Code | | + | Software Documentation | | + +??? task "Assembly_Fetch workflow (optional)" + + ##### Assembly_Fetch {#assembly_fetch} + + The `Assembly_Fetch` workflow compares the centroid assembly with the RefSeq database to identify the closest reference and then downloads this assembly in FASTA format, and optionally also in GFF3 and/or GBFF format. The Reference database is for bacteria by default but this can be changed by adjusting the `referenceseeker_db` input to the appropriate database. See the [Assembly_Fetch](../data_import/assembly_fetch.md) workflow documentation for more information. + + !!! info "Call-Caching Disabled" + + If using Snippy_Streamline workflow (which runs the Assembly_Fetch workflow if no reference genome is provided by user) version 1.3.0 or higher, the call-caching feature of Terra has been DISABLED to ensure that the workflow is run from the beginning and data is downloaded fresh. Call-caching will not be enabled, even if the user checks the box ✅ in the Terra workflow interface. + +For all cases: + +??? task "Snippy_Variants workflow" + + ##### Snippy_Variants {#snippy_variants} + + `Snippy_Variants` aligns reads for each sample against the reference genome. As part of `Snippy_Streamline`, the only output from this workflow is the `snippy_variants_outdir_tarball` which is provided in the set-level data table. 
Please see the full documentation for [Snippy_Variants](./snippy_variants.md) for more information. + +??? task "Snippy_Tree workflow" + + ##### Snippy_Tree {#snippy_tree} + + A simplified version of `Snippy_Tree` is used to build the phylogeny in the `Snippy_Streamline` workflow. The tasks undertaken are exactly the same between both workflows, but the user inputs and outputs have been reduced for clarity and ease. Please see the full documentation for [Snippy_Tree](./snippy_tree.md) for more information. + + In Snippy Streamline, the nucleotide substitution model used by gubbins will always be GTR+GAMMA. + +### Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| snippy_centroid_docker | String | Docker file used for Centroid | +| snippy_centroid_fasta | File | FASTA file for the centroid sample | +| snippy_centroid_mash_tsv | File | TSV file containing mash distances computed by centroid | +| snippy_centroid_samplename | String | Name of the centroid sample | +| snippy_centroid_version | String | Centroid version used | +| snippy_cg_snp_matrix | File | CSV file of core genome pairwise SNP distances between samples, calculated from the final alignment | +| snippy_concatenated_variants | File | The concatenated variants file | +| snippy_filtered_metadata | File | TSV recording the columns of the Terra data table that were used in the summarize_data task | +| snippy_final_alignment | File | Final alignment (FASTA file) used to generate the tree (either after snippy alignment, gubbins recombination removal, and/or core site selection with SNP-sites) | +| snippy_final_tree | File | Final phylogenetic tree produced by Snippy_Streamline | +| snippy_gubbins_branch_stats | File | CSV file showing for each branch of the tree | +| snippy_gubbins_docker | String | Docker file used for Gubbins | +| snippy_gubbins_recombination_gff | File | Recombination statistics in GFF format; these can be viewed in Phandango against the phylogenetic tree | +| 
snippy_gubbins_version | String | Gubbins version used | +| snippy_iqtree2_docker | String | Docker file used for IQTree2 | +| snippy_iqtree2_model_used | String | Nucleotide substitution model used by IQTree2 | +| snippy_iqtree2_version | String | IQTree2 version used | +| snippy_msa_snps_summary | File | CSV file showing SNPs for each branch of the tree | +| snippy_ncbi_datasets_docker | String | Docker file used for NCBI datasets | +| snippy_ncbi_datasets_version | String | NCBI datasets version used | +| snippy_ref | File | Reference genome used by Snippy | +| snippy_ref_metadata_json | File | Metadata associated with the reference genome used by Snippy, in JSON format | +| snippy_referenceseeker_database | String | ReferenceSeeker database used | +| snippy_referenceseeker_docker | String | Docker file used for ReferenceSeeker | +| snippy_referenceseeker_top_hit_ncbi_accession | String | NCBI Accession for the top hit identified by Assembly_Fetch | +| snippy_referenceseeker_tsv | File | TSV file of the top hits between the query genome and the Reference Seeker database | +| snippy_referenceseeker_version | String | ReferenceSeeker version used | +| snippy_snp_dists_docker | String | Docker file used for SNP-dists | +| snippy_snp_dists_version | String | SNP-dists version used | +| snippy_snp_sites_docker | String | Docker file used for SNP-sites | +| snippy_snp_sites_version | String | SNP-sites version used | +| snippy_streamline_analysis_date | String | Date of workflow run | +| snippy_streamline_version | String | Version of Snippy_Streamline used | +| snippy_summarized_data | File | CSV presence/absence matrix generated by the summarize_data task (within Snippy_Tree workflow) from the list of columns provided | +| snippy_tree_snippy_docker | String | Docker file used for Snippy in the Snippy_Tree subworkflow | +| snippy_tree_snippy_version | String | Version of Snippy_Tree subworkflow used | +| snippy_variants_outdir_tarball | Array[File] | A compressed file 
containing the whole directory of snippy output files. This is used when running Snippy_Tree | +| snippy_variants_snippy_docker | Array[String] | Docker file used for Snippy in the Snippy_Variants subworkfow | +| snippy_variants_snippy_version | Array[String] | Version of Snippy_Tree subworkflow used | +| snippy_wg_snp_matrix | File | CSV file of whole genome pairwise SNP distances between samples, calculated from the final alignment | diff --git a/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md b/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md new file mode 100644 index 000000000..11f482891 --- /dev/null +++ b/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md @@ -0,0 +1,153 @@ +# Snippy_Streamline_FASTA + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria) | PHB v2.2.0 | Yes; some optional features incompatible | Set-level | + +## Snippy_Streamline_FASTA_PHB + +This workflow is a FASTA-compatible version of [Snippy_Streamline](./snippy_streamline.md). Please see the [Snippy_Streamline](./snippy_streamline.md) documentation for more information regarding the workflow tasks. + +!!! caption "Snippy_Streamline_FASTA_PHB Workflow Diagram" +
+ ![Snippy_Streamline_FASTA_PHB Workflow Diagram](../../assets/figures/Snippy_Streamline_FASTA.png){width=50%} +
+ +The `Snippy_Streamline_FASTA` workflow is an all-in-one approach to generating a reference-based phylogenetic tree and associated SNP-distance matrix. The workflow can be run in multiple ways with options for: + +- The reference genome to be provided by the user, or automatically selected using the `Centroid` task and `Assembly_Fetch` sub-workflow to find a close reference genome to your dataset +- The phylogeny to be generated by optionally + - masking user-specified regions of the genome (providing a bed file to `snippy_core_bed`) + - producing either a core or pan-genome phylogeny and SNP-matrix (`core_genome`; default = true) + - masking recombination detected by gubbins, or not (`use_gubbins`; default=true) + - choosing the nucleotide substitution (by specifying `iqtree2_model`), or allowing IQ-Tree's ModelFinder to identify the best model for your dataset (default) + +!!! info "Assembly Data Requirements" + + Input data used in the Snippy_Streamline_FASTA workflow must: + + - Be assembled genomes in FASTA format + - Be generated by unbiased whole genome shotgun sequencing + - Pass appropriate QC thresholds for the taxa to ensure that the assemblies represent reasonably complete genomes that are free of contamination from other taxa or cross-contamination of the same taxon. + - If masking recombination with `Gubbins`, input data should represent complete genomes from the same strain/lineage (e.g. MLST) that share a recent common ancestor. + +!!! warning "Reference Genomes" + + **If reference genomes have multiple contigs, they will not be compatible with using Gubbins** to mask recombination in the phylogenetic tree. The automatic selection of a reference genome by the workflow may result in a reference with multiple contigs. In this case, an alternative reference genome should be sought. 
+ +### Inputs + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| snippy_streamline_fasta | **assembly_fasta** | Array[File] | The assembly files for your samples | | Required | +| snippy_streamline_fasta | **samplenames** | Array[String] | The names of your samples | | Required | +| snippy_streamline_fasta | **tree_name** | String | String of your choice to prefix output files | | Required | +| snippy_streamline_fasta | **reference_genome_file** | File | Reference genome in FASTA or GENBANK format (must be the same reference used in Snippy_Variants workflow); provide this if you want to skip the detection of a suitable reference | | Optional | +| centroid | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| centroid | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | +| centroid | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/centroid:0.1.0 | Optional | +| centroid | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | +| ncbi_datasets_download_genome_accession | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| ncbi_datasets_download_genome_accession | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | +| ncbi_datasets_download_genome_accession | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/ncbi-datasets:14.13.2 | Optional | +| ncbi_datasets_download_genome_accession | **include_gbff3** | Boolean | When set to true, outputs a gbff3 file (Genbank file) | FALSE | Optional | +| ncbi_datasets_download_genome_accession | **include_gff** | Boolean | When set to true, outputs a gff file (Annotation file) | FALSE | Optional | +| ncbi_datasets_download_genome_accession | **memory** | Int | Amount of 
memory/RAM (in GB) to allocate to the task | 4 | Optional | +| referenceseeker | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| referenceseeker | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 200 | Optional | +| referenceseeker | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/referenceseeker:1.8.0--pyhdfd78af_0 | Optional | +| referenceseeker | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | +| referenceseeker | **referenceseeker_ani_threshold** | Float | Bidirectional average nucleotide identity to use as a cut off for identifying reference assemblies with ReferenceSeeker; default value set according to | 0.95 | Optional | +| referenceseeker | **referenceseeker_conserved_dna_threshold** | Float | Conserved DNA % to use as a cut off for identifying reference assemblies with ReferenceSeeker; default value set according to | 0.69 | Optional | +| referenceseeker | **referenceseeker_db** | File | Database to use with ReferenceSeeker | gs://theiagen-public-files-rp/terra/theiaprok-files/referenceseeker-bacteria-refseq-205.v20210406.tar.gz | Optional | +| snippy_tree_wf | **call_shared_variants** | Boolean | Activates the shared variants analysis task | TRUE | Optional | +| snippy_tree_wf | **core_genome** | Boolean | When "true", workflow generates core genome phylogeny; when "false", whole genome is used | TRUE | Optional | +| snippy_tree_wf | **data_summary_column_names** | String | A comma-separated list of the column names from the sample-level data table for generating a data summary (presence/absence .csv matrix) | | Optional | +| snippy_tree_wf | **data_summary_terra_project** | String | The billing project for your current workspace. 
This can be found after the "#workspaces/" section in the workspace's URL | | Optional | +| snippy_tree_wf | **data_summary_terra_table** | String | The name of the sample-level Terra data table that will be used for generating a data summary | | Optional | +| snippy_tree_wf | **data_summary_terra_workspace** | String | The name of the Terra workspace you are in. This can be found at the top of the webpage, or in the URL after the billing project. | | Optional | +| snippy_tree_wf | **gubbins_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| snippy_tree_wf | **gubbins_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| snippy_tree_wf | **gubbins_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/gubbins:3.3--py310pl5321h8472f5a_0 | Optional | +| snippy_tree_wf | **gubbins_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | +| snippy_tree_wf | **iqtree2_bootstraps** | String | Number of replicates for (Minimum recommended= 1000) | 1000 | Optional | +| snippy_tree_wf | **iqtree2_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| snippy_tree_wf | **iqtree2_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| snippy_tree_wf | **iqtree2_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/iqtree2:2.1.2 | Optional | +| snippy_tree_wf | **iqtree2_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | +| snippy_tree_wf | **iqtree2_model** | String | Nucelotide substitution model to use when generating the final tree with IQTree2. 
By default, IQtree runs its ModelFinder algorithm to identify the model it thinks best fits your dataset | | Optional | +| snippy_tree_wf | **iqtree2_opts** | String | Additional options to pass to IQTree2 | | Optional | +| snippy_tree_wf | **midpoint_root_tree** | Boolean | A True/False option that determines whether the tree used in the SNP matrix re-ordering task should be re-rooted or not. Options: true of false | TRUE | Optional | +| snippy_tree_wf | **phandango_coloring** | Boolean | Boolean variable that tells the data summary task and the reorder matrix task to include a suffix that enables consistent coloring on Phandango; by default, this suffix is not added. To add this suffix set this variable to true. | FALSE | Optional | +| snippy_tree_wf | **snippy_core_bed** | File | User-provided bed file to mask out regions of the genome when creating multiple sequence alignments | | Optional | +| snippy_tree_wf | **snippy_core_cpu** | Int | Number of CPUs to allocate to the task | 8 | Optional | +| snippy_tree_wf | **snippy_core_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| snippy_tree_wf | **snippy_core_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/snippy:4.6.0 | Optional | +| snippy_tree_wf | **snippy_core_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | +| snippy_tree_wf | **snp_dists_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/snp-dists:0.8.2 | Optional | +| snippy_tree_wf | **snp_sites_cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| snippy_tree_wf | **snp_sites_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| snippy_tree_wf | **snp_sites_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/snp-sites:2.5.1 | Optional | +| snippy_tree_wf | 
**snp_sites_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | +| snippy_tree_wf | **use_gubbins** | Boolean | When "true", workflow removes recombination with gubbins tasks; when "false", gubbins is not used | TRUE | Optional | +| snippy_variants_wf | **base_quality** | Int | Minimum quality for a nucleotide to be used in variant calling | 13 | Optional | +| snippy_variants_wf | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| snippy_variants_wf | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/snippy:4.6.0 | Optional | +| snippy_variants_wf | **map_qual** | Int | Minimum mapping quality to accept in variant calling | | Optional | +| snippy_variants_wf | **maxsoft** | Int | Number of bases of alignment to soft-clip before discarding the alignment | | Optional | +| snippy_variants_wf | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | +| snippy_variants_wf | **min_coverage** | Int | Minimum read coverage of a position to identify a mutation | 10 | Optional | +| snippy_variants_wf | **min_frac** | Float | Minimum fraction of bases at a given position to identify a mutation | 0.9 | Optional | +| snippy_variants_wf | **min_quality** | Int | Minimum VCF variant call "quality" | 100 | Optional | +| snippy_variants_wf | **query_gene** | String | Indicate a particular gene of interest | | Optional | +| snippy_variants_wf | **read1** | File | Internal component, do not modify. | | Do Not Modify, Optional | +| snippy_variants_wf | **read2** | File | Internal component, do not modify. 
| | Do Not Modify, Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| snippy_centroid_docker | String | Docker file used for Centroid | +| snippy_centroid_fasta | File | FASTA file for the centroid sample | +| snippy_centroid_mash_tsv | File | TSV file containing mash distances computed by centroid | +| snippy_centroid_samplename | String | Name of the centroid sample | +| snippy_centroid_version | String | Centroid version used | +| snippy_cg_snp_matrix | File | CSV file of core genome pairwise SNP distances between samples, calculated from the final alignment | +| snippy_concatenated_variants | File | The concatenated variants file | +| snippy_filtered_metadata | File | TSV recording the columns of the Terra data table that were used in the summarize_data task | +| snippy_final_alignment | File | Final alignment (FASTA file) used to generate the tree (either after snippy alignment, gubbins recombination removal, and/or core site selection with SNP-sites) | +| snippy_final_tree | File | Final phylogenetic tree produced by Snippy_Streamline | +| snippy_gubbins_branch_stats | File | CSV file showing for each branch of the tree | +| snippy_gubbins_docker | String | Docker file used for Gubbins | +| snippy_gubbins_recombination_gff | File | Recombination statistics in GFF format; these can be viewed in Phandango against the phylogenetic tree | +| snippy_gubbins_version | String | Gubbins version used | +| snippy_iqtree2_docker | String | Docker file used for IQTree2 | +| snippy_iqtree2_model_used | String | Nucleotide substitution model used by IQTree2 | +| snippy_iqtree2_version | String | IQTree2 version used | +| 
snippy_msa_snps_summary | File | CSV file showing SNPs for each branch of the tree | +| snippy_ncbi_datasets_docker | String | Docker file used for NCBI datasets | +| snippy_ncbi_datasets_version | String | NCBI datasets version used | +| snippy_ref | File | Reference genome used by Snippy | +| snippy_ref_metadata_json | File | Metadata associated with the reference genome used by Snippy, in JSON format | +| snippy_referenceseeker_database | String | ReferenceSeeker database used | +| snippy_referenceseeker_docker | String | Docker file used for ReferenceSeeker | +| snippy_referenceseeker_top_hit_ncbi_accession | String | NCBI Accession for the top hit identified by Assembly_Fetch | +| snippy_referenceseeker_tsv | File | TSV file of the top hits between the query genome and the Reference Seeker database | +| snippy_referenceseeker_version | String | ReferenceSeeker version used | +| snippy_snp_dists_docker | String | Docker file used for SNP-dists | +| snippy_snp_dists_version | String | SNP-dists version used | +| snippy_snp_sites_docker | String | Docker file used for SNP-sites | +| snippy_snp_sites_version | String | SNP-sites version used | +| snippy_streamline_analysis_date | String | Date of workflow run | +| snippy_streamline_version | String | Version of Snippy_Streamline used | +| snippy_summarized_data | File | CSV presence/absence matrix generated by the summarize_data task (within Snippy_Tree workflow) from the list of columns provided | +| snippy_tree_snippy_docker | String | Docker file used for Snippy in the Snippy_Tree subworkflow | +| snippy_tree_snippy_version | String | Version of Snippy_Tree subworkflow used | +| snippy_variants_outdir_tarball | Array[File] | A compressed file containing the whole directory of snippy output files. 
This is used when running Snippy_Tree | +| snippy_variants_snippy_docker | Array[String] | Docker file used for Snippy in the Snippy_Variants subworkfow | +| snippy_variants_snippy_version | Array[String] | Version of Snippy_Tree subworkflow used | +| snippy_wg_snp_matrix | File | CSV file of whole genome pairwise SNP distances between samples, calculated from the final alignment | diff --git a/docs/workflows/phylogenetic_construction/snippy_tree.md b/docs/workflows/phylogenetic_construction/snippy_tree.md new file mode 100644 index 000000000..9f643803d --- /dev/null +++ b/docs/workflows/phylogenetic_construction/snippy_tree.md @@ -0,0 +1,345 @@ +# Snippy_Tree + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria) | PHB v2.1.0 | Yes; some optional features incompatible | Set-level | + +## Snippy_Tree_PHB + +`Snippy_Tree` is a workflow for generating high-quality bacterial phylogenies. It produces a phylogenetic tree and pairwise SNP-distance matrix, with the option to summarize additional metadata to visualize with the tree. + +The tree produced by Snippy_Tree will always be a maximum-likelihood phylogeny using a reference-based alignment. There are key options for whether to: + +- Generate a core-genome or whole-genome phylogeny (`core_genome`) +- Mask specified regions of the genome with a bed file (e.g. known repetitive regions for TB) (`bed_file`) +- Mask recombination (`use_gubbins`) +- Decide which nucleotide substitution model to use + +### Inputs + +`Snippy_Tree` is intended to be run after the `Snippy_Variants` workflow. 
It is a set-level workflow that takes in an array of directories generated by the `Snippy_Variants` workflow, which must be run for each sample that you wish to include in the phylogenetic tree. You should ensure that for all samples included in the phylogeny, `Snippy_Variants` has been run with identical inputs including the same reference genome. When running the `Snippy_Tree` workflow, you will need to provide the same reference genome that you used when running `Snippy_Variants`. `Snippy_Variants` and `Snippy_Tree` can both automatically be run by using the `Snippy_Streamline` workflow. + +Sequencing data used in the Snippy_Tree workflow must: + +- Be Illumina reads +- Be generated by unbiased whole genome shotgun sequencing +- Pass appropriate QC thresholds for the taxa to ensure that the reads represent reasonably complete genomes that are free of contamination from other taxa or cross-contamination of the same taxa. +- If masking recombination with `Gubbins`, input data should represent whole genomes from the same strain/lineage (e.g. MLST) that share a recent common ancestor. + +!!! tip "Guidance for optional inputs" + + Several core and optional tasks can be used to generate the Snippy phylogenetic tree, making it highly flexible and suited to a wide range of datasets. You will need to decide which tasks to use depending on the genomes that you are analyzing. Some guidelines for the optional tasks to use for different genome types are provided below. + + ??? toggle "Default settings (suitable for most bacteria)" + + The default settings are as follows and are suitable for generating phylogenies for most bacteria + + - `core_genome` = true (creates core genome phylogeny) + - `use_gubbins` = true (recombination masked) + - nucleotide substitution model will be defined by IQTree's Model Finder + + ??? 
toggle "Phylogenies of _Mycobacterium tuberculosis_ complex" + + Phylogenies of MTBC are typically constructed: + + - Using the H37Rv reference genome + - `reference_genome_file` = gs://theiagen-public-files-rp/terra/theiaprok-files/Mtb_NC_000962.3.fasta + - Masking repetitive regions of the genome (e.g. PE/PPE genes) that are often misaligned + - `snippy_core_bed` = gs://theiagen-public-files/terra/theiaprok-files/Mtb_NC_000962.3.bed + - Without masking recombination because TB can be considered non-recombinant + - `use_gubbins` = false + - Using the core genome + - `core_genome` = true (as default) + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| snippy_tree_wf | **tree_name_updated** | String | Internal component, do not modify. Used for replacing spaces with underscores | | Do not modify | +| snippy_tree_wf | **reference_genome_file** | File | Reference genome in FASTA or GENBANK format (must be the same reference used in Snippy_Variants workflow) | | Required | +| snippy_tree_wf | **samplenames** | Array[String] | Samplenames for each input genome | | Required | +| snippy_tree_wf | **snippy_variants_outdir_tarball** | Array[File] | Output from the Snippy_Variants workflow | | Required | +| snippy_tree_wf | **tree_name** | String | String of your choice to prefix output files | | Required | +| cg_reorder_matrix | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| cg_reorder_matrix | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| cg_reorder_matrix | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/mykrobe:0.12.1 | Optional | +| cg_reorder_matrix | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| cg_snp_dists | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| cg_snp_dists | 
**disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | +| cg_snp_dists | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| concatenate_variants | **docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/utility:1.1 | Optional | +| gubbins | **filter_percent** | Int | Maximum % gaps to include a sample in gubbins analysis and downstream analyses | 25 | Optional | +| gubbins | **iterations** | Int | Maximum number of trees to iteratively build to remove recombination | 5 | Optional | +| gubbins | **nuc_subst_model** | String | Nucleotide substitution model to use with Gubbins: "JC", "K2P", "HKY", "GTR", "GTRGAMMA" or "GTRCAT" (see the [Gubbins manual](https://github.com/nickjcroucher/gubbins/blob/master/docs/gubbins_manual.md#nucleotide-substitution-model-options)) | GTRGAMMA | Optional | +| gubbins | **tree_args** | String | Quoted string of further arguments passed to tree building algorithm | | Optional | +| gubbins | **tree_builder** | String | Application to use for Gubbins tree building algorithm: "raxml", "raxmlng", "iqtree", "iqtree-fast", "fasttree", "hybrid" (fasttree is used for the first tree, and raxml is used for later iterations), "rapidnj" | raxml | Optional | +| iqtree2 | **alrt** | Int | Number of replicates to use for the SH-like approximate likelihood ratio test (Minimum recommended= 1000) | 1000 | Optional | +| shared_variants | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| shared_variants | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| shared_variants | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-03-16 | Optional | +| shared_variants | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| snippy_tree_wf | **call_shared_variants** | Boolean | When true, workflow generates table that combines variants across all samples and a table showing variants 
shared across samples | TRUE | Optional | +| snippy_tree_wf | **core_genome** | Boolean | When true, workflow generates core genome phylogeny; when false, whole genome is used | TRUE | Optional | +| snippy_tree_wf | **data_summary_column_names** | String | A comma-separated list of the column names from the sample-level data table for generating a data summary (presence/absence .csv matrix) | | Optional | +| snippy_tree_wf | **data_summary_terra_project** | String | The billing project for your current workspace. This can be found after the "#workspaces/" section in the workspace's URL | | Optional | +| snippy_tree_wf | **data_summary_terra_table** | String | The name of the sample-level Terra data table that will be used for generating a data summary | | Optional | +| snippy_tree_wf | **data_summary_terra_workspace** | String | The name of the Terra workspace you are in. This can be found at the top of the webpage, or in the URL after the billing project. | | Optional | +| snippy_tree_wf | **gubbins_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| snippy_tree_wf | **gubbins_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/gubbins:3.3--py310pl5321h8472f5a_0 | Optional | +| snippy_tree_wf | **gubbins_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | +| snippy_tree_wf | **gubbins_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| snippy_tree_wf | **iqtree2_bootstraps** | String | Number of replicates for ultrafast bootstrapping (Minimum recommended = 1000) | 1000 | Optional | +| snippy_tree_wf | **iqtree2_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| snippy_tree_wf | **iqtree2_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| snippy_tree_wf | **iqtree2_docker** | String | The Docker container to use for the task | 
us-docker.pkg.dev/general-theiagen/staphb/iqtree2:2.1.2 | Optional | +| snippy_tree_wf | **iqtree2_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | +| snippy_tree_wf | **iqtree2_model** | String | Nucleotide substitution model to use when generating the final tree with IQTree2. By default, IQtree runs its ModelFinder algorithm to identify the model it thinks best fits your dataset | | Optional | +| snippy_tree_wf | **iqtree2_opts** | String | Additional options to pass to IQTree2 | | Optional | +| snippy_tree_wf | **midpoint_root_tree** | Boolean | If true, midpoint root the final tree | | Optional | +| snippy_tree_wf | **phandango_coloring** | Boolean | Boolean variable that tells the data summary task and the reorder matrix task to include a suffix that enables consistent coloring on Phandango; by default, this suffix is not added. To add this suffix, set this variable to true. | FALSE | Optional | +| snippy_tree_wf | **snippy_core_bed** | File | Bed file with locations to be masked from the core genome alignment | | Optional | +| snippy_tree_wf | **snippy_core_cpu** | Int | Number of CPUs to allocate to the task | 8 | Optional | +| snippy_tree_wf | **snippy_core_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| snippy_tree_wf | **snippy_core_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/snippy:4.6.0 | Optional | +| snippy_tree_wf | **snippy_core_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | +| snippy_tree_wf | **snp_dists_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/snp-dists:0.8.2 | Optional | +| snippy_tree_wf | **snp_sites_cpus** | Int | CPUs to allocate to SNP-sites | 1 | Optional | +| snippy_tree_wf | **snp_sites_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| snippy_tree_wf | 
**snp_sites_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/snp-sites:2.5.1 | Optional | +| snippy_tree_wf | **snp_sites_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | +| snippy_tree_wf | **use_gubbins** | Boolean | When "true", workflow removes recombination with gubbins tasks; when "false", gubbins is not used | true | Optional | +| summarize_data | **cpu** | Int | Number of CPUs to allocate to the task | 8 | Optional | +| summarize_data | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| summarize_data | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-03-16 | Optional | +| summarize_data | **id_column_name** | String | Name of the column in the input table that contains the sample IDs, if different from default | | Optional | +| summarize_data | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 1 | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | +| wg_reorder_matrix | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| wg_reorder_matrix | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| wg_reorder_matrix | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/mykrobe:0.12.1 | Optional | +| wg_reorder_matrix | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| wg_snp_dists | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| wg_snp_dists | **disk_size** | Int | Amount of storage 
(in GB) to allocate to the task | 50 | Optional | +| wg_snp_dists | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | + +### Workflow Tasks + +??? task "Snippy" + + ##### Snippy {#snippy_task} + + Snippy is a pipeline for calling SNPs and INDELs in haploid genomes. Before running `Snippy_Tree`, you must run `Snippy_Variants`, another workflow that uses the Snippy tool to align reads against a reference genome for individual samples. In `Snippy_Tree`, the snippy tool is used again to generate a whole-genome multiple sequence alignment (fasta file) of reads from all the samples we'd like in our tree. + + When generating the multiple sequence alignment, a bed file can be provided by users to mask certain areas of the genome in the alignment. This is particularly relevant for masking known repetitive regions in _Mycobacterium tuberculosis_ genomes, or masking known regions containing phage sequences. + + !!! info "Why do I see `snippy_core` in Terra?" + In Terra, this task is named "snippy_core" after the name of the command in the original Snippy tool. Despite the name, this command is NOT being used to make a core genome, but instead a multiple sequence alignment of the whole genome (without any sections masked using a bed file). + + !!! techdetails "Snippy Technical Details" + + | | Links | + | --- | --- | + | Task | [task_snippy_core.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/phylogenetic_inference/task_snippy_core.wdl) | + | Default software version | v4.6.0 (us-docker.pkg.dev/general-theiagen/staphb/snippy:4.6.0) | + | Software Source Code | [Snippy on GitHub](https://github.com/tseemann/snippy) | + | Software Documentation | [Snippy on GitHub](https://github.com/tseemann/snippy) | + +??? task "Gubbins (optional)" + + ##### Gubbins (optional) {#gubbins_task} + + !!! info "Optional" + Gubbins is used when `use_gubbins` is set to `true` (default=true). 
+ + **G**enealogies **U**nbiased **B**y recom**B**inations **I**n **N**ucleotide **S**equences (Gubbins) identifies and masks genomic regions that are predicted to have arisen via recombination. It works by iteratively identifying loci containing elevated densities of SNPs and constructing phylogenies based on the putative single nucleotide variants outside these regions (for more details, see [here](https://github.com/nickjcroucher/gubbins/blob/v3.3/docs/gubbins_manual.md#description-of-the-algorithm)). By default, these phylogenies are constructed using RaxML and a GTR-GAMMA nucleotide substitution model, which will be the most suitable model for most bacterial phylogenetics, though this can be modified with the `tree_builder` and `nuc_subst_model` inputs. + + Gubbins is the industry standard for masking recombination from bacterial genomes when building phylogenies, but limitations to recombination removal exist. Gubbins cannot distinguish recombination from high densities of SNPs that may result from assembly or alignment errors, mutational hotspots, or regions of the genome with relaxed selection. The tool is also intended only to find recombinant regions that are short relative to the length of the genome, so large regions of recombination may not be masked. These factors should be considered when interpreting resulting phylogenetic trees, but overwhelmingly Gubbins improves our ability to understand ancestral relationships between bacterial genomes. + + There are few optional inputs for Gubbins that can be modified by the user: + + - `iterations`: Gubbins works by iteratively identifying loci containing elevated densities of SNPs, while constructing phylogenies based on the putative single nucleotide variants outside these regions. It may take many iterations for Gubbins to converge on an alignment that it considers free of recombination, especially for phylogenies that contain large numbers of genomes. 
By default, Gubbins is limited to 5 iterations though this may be increased by the user with the `iterations` optional input (incurring increased computing time and cost, and possibly requiring increased memory allocation). + - `nuc_subst_model`, `tree_builder` and `tree_args`: When Gubbins constructs phylogenies, it can use a number of phylogenetic inference tools, each with [different nucleotide substitution models](https://github.com/nickjcroucher/gubbins/blob/master/docs/gubbins_manual.md#nucleotide-substitution-model-options) and [tree-building models](https://github.com/nickjcroucher/gubbins/blob/master/docs/gubbins_manual.md#tree-building-options). By default, the `Snippy_Tree` workflow uses a GTRGAMMA substitution model and RaxML for tree building (typically suitable for bacterial genomes), but these can be modified by the user depending on the genome sequences being used with the `nuc_subst_model` and `tree_builder` optional inputs, respectively. The nucleotide substitution models that are available depend on the tree building algorithm being used (see [here](https://github.com/nickjcroucher/gubbins/blob/v3.3/docs/gubbins_manual.md#nucleotide-substitution-model-options)). Additional options for generating the phylogenetic trees in Gubbins can be specified with the `tree_args` optional input, providing an input string that is consistent with the option formats of the Gubbins command. + - `filter_percent`: By default, Gubbins removes genomes from the multiple sequence alignment if more than 25% of the genome is represented by gaps. The percentage of gaps can be modified by the user using the `filter_percent` optional input. + + !!! 
techdetails "Gubbins Technical Details" + + | | Links | + | --- | --- | + | Task | [task_gubbins.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/phylogenetic_inference/task_gubbins.wdl) | + | Software Source Code | [Gubbins on GitHub](https://github.com/nickjcroucher/gubbins) | + | Software Documentation | [Gubbins v3.3 manual](https://github.com/nickjcroucher/gubbins/blob/v3.3/docs/gubbins_manual.md) | + | Original Publication(s) | [Rapid phylogenetic analysis of large samples of recombinant bacterial whole genome sequences using Gubbins](https://academic.oup.com/nar/article/43/3/e15/2410982) | + | Default software version | us-docker.pkg.dev/general-theiagen/biocontainers/gubbins:3.3--py310pl5321h8472f5a_0 | + +??? task "SNP-sites (optional)" + + ##### SNP-sites (optional) {#snp_sites_task} + + !!! tip "Turn on SNP-Sites with `core_genome`" + SNP-sites runs when the `core_genome` option is set to true. + + SNP-sites is used to filter out invariant sites in the whole-genome alignment, thereby creating a core genome alignment for phylogenetic inference. The output is a fasta file containing the core genome of each sample only. If Gubbins has been used, this output fasta will not contain any sites that are predicted to have arisen via recombination. + + !!! 
techdetails "SNP-sites technical details" + + | | Links | + | --- | --- | + | Task | [task_snp_sites.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/phylogenetic_inference/task_snp_sites.wdl) | + | Default software version | 2.5.1 (us-docker.pkg.dev/general-theiagen/biocontainers/snp-sites:2.5.1--hed695b0_0) | + | Software Source Code | [SNP-sites on GitHub](https://github.com/sanger-pathogens/snp-sites) | + | Software Documentation | [SNP-sites on GitHub](https://github.com/sanger-pathogens/snp-sites) | + | Original Publication(s) | [SNP-sites: rapid efficient extraction of SNPs from multi-FASTA alignments](https://www.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000056) | + +??? task "IQTree2" + + ##### IQTree2 {#iqtree2_task} + + IQTree2 is used to build the final phylogeny. It uses the alignment generated in the previous steps of the workflow. The contents of this alignment will depend on whether any sites were masked with recombination. + + The phylogeny is generated using the maximum-likelihood method and a specified nucleotide substitution model. By default, the Snippy_Tree workflow will run Model Finder to determine the most appropriate nucleotide substitution model for your data, but you may specify the nucleotide substitution model yourself using the `iqtree2_model` optional input (see [here](http://www.iqtree.org/doc/Substitution-Models) for available models). + + IQTree will perform assessments of the tree using the Shimodaira–Hasegawa approximate likelihood-ratio test ([SH-aLRT test](https://academic.oup.com/sysbio/article/59/3/307/1702850?login=false)), and ultrafast bootstrapping with [UFBoot2](https://academic.oup.com/mbe/article/35/2/518/4565479), a quicker but less biased alternative to standard bootstrapping. A clade should not typically be trusted if it has less than 80% support from the SH-aLRT test and less than 95% support with ultrafast bootstrapping. + + !!! 
tip "Nucleotide substitution model" + When `core_genome` = `true`, the default nucleotide substitution model is set to the General Time Reversible model with Gamma distribution (GTR+G). + + When the user sets `core_genome` = `false`, the default nucleotide substitution model is set to the General Time Reversible model with invariant sites and Gamma distribution (`GTR+I+G`). + + !!! techdetails "IQTree2 technical details" + + | | Links | + | --- | --- | + | Task | [task_iqtree2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/phylogenetic_inference/task_iqtree2.wdl) | + | Software Source Code | [IQ-TREE on GitHub](https://github.com/iqtree/iqtree2) | + | Software Documentation | [IQTree documentation](http://www.iqtree.org/doc/) for the latest version (not necessarily the version used in this workflow) | + | Original Publication(s) | [IQ-TREE 2: New Models and Efficient Methods for Phylogenetic Inference in the Genomic Era](https://academic.oup.com/mbe/article/37/5/1530/5721363) | + | Publication for the SH-alRT test | [New Algorithms and Methods to Estimate Maximum-Likelihood Phylogenies: Assessing the Performance of PhyML 3.0](https://academic.oup.com/sysbio/article/59/3/307/1702850?login=false) | + | Publication for ultrafast bootstrapping integration to IQTree | [Ultrafast Approximation for Phylogenetic Bootstrap](https://academic.oup.com/mbe/article/30/5/1188/997508?login=false); [UFBoot2: Improving the Ultrafast Bootstrap Approximation](https://academic.oup.com/mbe/article/35/2/518/4565479?login=false) | + | Publication for ModelFinder | [ModelFinder: fast model selection for accurate phylogenetic estimates](https://www.nature.com/articles/nmeth.4285) | + +??? task "SNP-dists" + + ##### SNP-dists {#snp_dists_task} + + `SNP-dists` computes pairwise SNP distances between genomes. It takes the same alignment of genomes used to generate your phylogenetic tree and produces a matrix of pairwise SNP distances between sequences. 
This means that if you generated pairwise core-genome phylogeny, the output will consist of pairwise core-genome SNP (cgSNP) distances. Otherwise, these will be whole-genome SNP distances. Regardless of whether core-genome or whole-genome SNPs, this SNP distance matrix will exclude all SNPs in masked regions (i.e. masked with a bed file or gubbins). + + The SNP-distance output can be visualized using software such as [Phandango](http://jameshadfield.github.io/phandango/#/main) to explore the relationships between the genomic sequences. The task adds a Phandango coloring tag (:c1) to the column names in the output matrix to ensure that all columns are colored with the same color scheme throughout. + + - **Technical details** + + | | Links | + | --- | --- | + | Task | [task_snp_dists.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/phylogenetic_inference/task_snp_dists.wdl) | + | Default software version | 0.8.2 (us-docker.pkg.dev/general-theiagen/staphb/snp-dists:0.8.2) | + | Software Source Code | [SNP-dists on GitHub](https://github.com/tseemann/snp-dists) | + | Software Documentation | [SNP-dists on GitHub](https://github.com/tseemann/snp-dists) | + | Original Publication(s) | Not known to be published | + +??? task "Data summary (optional)" + + ##### Data Summary (optional) {#data_summary_task} + + If you fill out the `data_summary_*` and `sample_names` optional variables, you can use the optional `summarize_data` task. The task takes a comma-separated list of column names from the Terra data table, which should each contain a list of comma-separated items. For example, `"amrfinderplus_virulence_genes,amrfinderplus_stress_genes"` (with quotes, comma separated, no spaces) for these output columns from running TheiaProk. The task checks whether those comma-separated items are present in each row of the data table (sample), then creates a CSV file of these results. The CSV file indicates presence (TRUE) or absence (empty) for each item. 
By default, the task adds a Phandango coloring tag to group items from the same column, but you can turn this off by setting `phandango_coloring` to `false`. + + ??? toggle "**Example output CSV**" + + ```text linenums="1" + Sample_Name,aph(3')-IIa,blaCTX-M-65,blaOXA-193,tet(O) + sample1,TRUE,,TRUE,TRUE + sample2,,,FALSE,TRUE + sample3,,,FALSE, + ``` + + ??? toggle "**Example use of Phandango coloring**" + + Data summary produced using the `phandango_coloring` option, visualized alongside Newick tree at + + !!! caption "Example phandango_coloring output" + ![Phandango coloring example](../../assets/figures/example_phandango_coloring.png) + + !!! techdetails "Data summary technical details" + + | | Links | + | --- | --- | + | Task | [task_summarize_data.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/utilities/task_summarize_data.wdl) | + +??? task "Concatenate Variants (optional)" + + ##### Concatenate Variants (optional) {#concatenate_variants_task} + + The `cat_variants` task concatenates variant data from multiple samples into a single file `concatenated_variants`. It is very similar to the `cat_files` task, but also adds a column to the output file that indicates the sample associated with each row of data. + + The `concatenated_variants` file will be in the following format: + + | samplename | CHROM | POS | TYPE | REF | ALT | EVIDENCE | FTYPE | STRAND | NT_POS | AA_POS | EFFECT | LOCUS_TAG | GENE | PRODUCT | + | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | + | sample1 | PEKT02000007 | 5224 | snp | C | G | G:21 C:0 | | | | | | | | | + | sample2 | PEKT02000007 | 34112 | snp | C | G | G:32 C:0 | CDS | + | 153/1620 | 51/539 | missense_variant c.153C>G p.His51Gln | B9J08_002604 | hypothetical protein | | + | sample3 | PEKT02000007 | 34487 | snp | T | A | A:41 T:0 | CDS | + | 528/1620 | 176/539 | missense_variant c.528T>A p.Asn176Lys | B9J08_002604 | hypothetical protein | | + + !!! 
techdetails "Technical Details" + + | | Links | + | --- | --- | + | Task | /tasks/utilities/file_handling/task_cat_files.wdl | + | Software Source Code | [task_cat_files.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/utilities/file_handling/task_cat_files.wdl) | + +??? task "Shared Variants Task (Optional)" + + ##### Shared Variants (optional) {#shared_variants_task} + + The `shared_variants` task takes in the `concatenated_variants` output from the `cat_variants` task and reshapes the data so that variants are rows and samples are columns. For each variant, samples where the variant was detected are populated with a "1" and samples were **either the variant was not detected or there was insufficient coverage to call variants** are populated with a "0". The resulting table is available as the `shared_variants_table` output. + + The `shared_variants_table` file will be in the following format: + + | CHROM | POS | TYPE | REF | ALT | FTYPE | STRAND | NT_POS | AA_POS | EFFECT | LOCUS_TAG | GENE | PRODUCT | sample1 | sample2 | sample3 | + | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | + | PEKT02000007 | 2693938 | snp | T | C | CDS | - | 1008/3000 | 336/999 | synonymous_variant c.1008A>G p.Lys336Lys | B9J08_003879 | NA | chitin synthase 1 | 1 | 1 | 0 | + | PEKT02000007 | 2529234 | snp | G | C | CDS | + | 282/336 | 94/111 | missense_variant c.282G>C p.Lys94Asn | B9J08_003804 | NA | cytochrome c | 1 | 1 | 1 | + | PEKT02000002 | 1043926 | snp | A | G | CDS | - | 542/1464 | 181/487 | missense_variant c.542T>C p.Ile181Thr | B9J08_000976 | NA | dihydrolipoyl dehydrogenase | 1 | 1 | 0 | + + !!! 
techdetails "Technical Details" + + | | Links | + | --- | --- | + | Task | task_shared_variants.wdl | + | Software Source Code | [task_shared_variants.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/phylogenetic_inference/utilities/task_shared_variants.wdl) | + +### Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| snippy_cg_snp_matrix | File | CSV file of core genome pairwise SNP distances between samples, calculated from the final alignment | +| snippy_concatenated_variants | File | Concatenated snippy_results file across all samples in the set | +| snippy_filtered_metadata | File | TSV recording the columns of the Terra data table that were used in the summarize_data task | +| snippy_final_alignment | File | Final alignment (FASTA file) used to generate the tree (either after snippy alignment, gubbins recombination removal, and/or core site selection with SNP-sites) | +| snippy_final_tree | File | Newick tree produced from the final alignment. 
Depending on user input for core_genome, the tree could be a core genome tree (default when core_genome is true) or whole genome tree (if core_genome is false) | +| snippy_gubbins_branch_stats | File | CSV file showing [output statistics](https://github.com/nickjcroucher/gubbins/blob/master/docs/gubbins_manual.md#output-statistics) for each branch of the tree | +| snippy_gubbins_docker | String | Docker file used for running Gubbins | +| snippy_gubbins_recombination_gff | File | Recombination statistics in GFF format; these can be viewed in Phandango against the phylogenetic tree | +| snippy_gubbins_version | String | Gubbins version used | +| snippy_iqtree2_docker | String | Docker file used for running IQTree2 | +| snippy_iqtree2_model_used | String | Nucleotide substitution model used by IQTree2 | +| snippy_iqtree2_version | String | IQTree2 version used | +| snippy_msa_snps_summary | File | TXT file containing summary statistics for each alignment of each input genome against the reference. This indicates how good the alignment is. Pay particular attention to # unaligned sites, and heterogeneous positions. 
| +| snippy_ref | File | Reference genome (FASTA or GenBank file) used for generating phylogeny | +| snippy_shared_snp_table | File | Table illustrating variants shared among samples | +| snippy_snp_dists_docker | String | Docker file used for running SNP-dists | +| snippy_snp_dists_version | String | SNP-dists version used | +| snippy_snp_sites_docker | String | Docker file used for running SNP-sites | +| snippy_snp_sites_version | String | SNP-sites version used | +| snippy_summarized_data | File | CSV presence/absence matrix generated by the summarize_data task from the list of columns provided; formatted for Phandango if phandango_coloring input is true | +| snippy_tree_analysis_date | String | Date of workflow run | +| snippy_tree_snippy_docker | String | Docker file used for running Snippy | +| snippy_tree_snippy_version | String | Snippy version used | +| snippy_tree_version | String | Version of Snippy_Tree workflow | +| snippy_wg_snp_matrix | File | CSV file of whole genome pairwise SNP distances between samples, calculated from the final alignment | + +## References + +> **Gubbins:** Croucher, Nicholas J., Andrew J. Page, Thomas R. Connor, Aidan J. Delaney, Jacqueline A. Keane, Stephen D. Bentley, Julian Parkhill, and Simon R. Harris. 2015. "Rapid Phylogenetic Analysis of Large Samples of Recombinant Bacterial Whole Genome Sequences Using Gubbins." Nucleic Acids Research 43 (3): e15. + +> **SNP-sites:** Page, Andrew J., Ben Taylor, Aidan J. Delaney, Jorge Soares, Torsten Seemann, Jacqueline A. Keane, and Simon R. Harris. 2016. "SNP-Sites: Rapid Efficient Extraction of SNPs from Multi-FASTA Alignments." Microbial Genomics 2 (4): e000056. + +> **IQTree:** Nguyen, Lam-Tung, Heiko A. Schmidt, Arndt von Haeseler, and Bui Quang Minh. 2015. "IQ-TREE: A Fast and Effective Stochastic Algorithm for Estimating Maximum-Likelihood Phylogenies." Molecular Biology and Evolution 32 (1): 268–74. 
diff --git a/docs/workflows/phylogenetic_construction/snippy_variants.md b/docs/workflows/phylogenetic_construction/snippy_variants.md new file mode 100644 index 000000000..9f62e6018 --- /dev/null +++ b/docs/workflows/phylogenetic_construction/snippy_variants.md @@ -0,0 +1,83 @@ +# Snippy_Variants + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria), [Mycotics](../../workflows_overview/workflows_kingdom.md#mycotics), [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.2.0 | Yes | Sample-level | + +## Snippy_Variants_PHB + +The `Snippy_Variants` workflow aligns single-end or paired-end reads (in FASTQ format), or assembled sequences (in FASTA format), against a reference genome, then identifies single-nucleotide polymorphisms (SNPs), multi-nucleotide polymorphisms (MNPs), and insertions/deletions (INDELs) across the alignment. If a GenBank file is used as the reference, mutations associated with user-specified query strings (e.g. genes of interest) can additionally be reported to the Terra data table. + +!!! caption "Snippy_Variants Workflow Diagram" + ![Snippy_Variants Workflow Diagram](../../assets/figures/Snippy_Variants.png) + +!!! tip "Example Use Cases" + - **Finding mutations** (SNPs, MNPs, and INDELs) in your own sample's reads relative to a reference, e.g. mutations in genes of phenotypic interest. + - **Quality control:** When undertaking quality control of sequenced isolates, it is difficult to identify contamination between multiple closely related genomes using the conventional approaches in TheiaProk (e.g. isolates from an outbreak or transmission cluster). 
Such contamination may be identified as allele heterogeneity at a significant number of genome positions. `Snippy_Variants` may be used to identify these heterogeneous positions by aligning reads to the assembly of the same reads, or to a closely related reference genome and lowering the thresholds to call SNPs. + - **Assessing support for a mutation**: `Snippy_Variants` produces a BAM file of the reads aligned to the reference genome. This BAM file can be visualized in IGV (see Theiagen Office Hours recordings) to assess the position of a mutation in supporting reads, or if the assembly of the reads was used as a reference, the position in the contig. + - Mutations that are only found at the ends of supporting reads may be an error of sequencing. + - Mutations found at the end of contigs may be assembly errors. + +### Inputs + +- Single or paired-end reads resulting from Illumina or IonTorrent sequencing can be used. For single-end data, simply omit a value for `read2` +- Assembled genomes can be used. Use the `assembly_fasta` input and omit `read1` and `read2` +- The reference file should be in fasta (e.g. `.fa`, `.fasta`) or [full GenBank](https://github.com/tseemann/snippy/issues/463#issuecomment-863344618) (`.gbk`) format. The mutations identified by Snippy_Variants are highly dependent on the choice of reference genome. Mutations cannot be identified in genomic regions that are present in your query sequence and not the reference. + +!!! 
info "Query String" + The query string can be a gene or any other annotation that matches the GenBank file/output VCF **EXACTLY** + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| snippy_variants_wf | **reference_genome_file** | File | Reference genome (GenBank file or fasta) | | Required | +| snippy_variants_wf | **samplename** | String | Names of samples | | Required | +| snippy_gene_query | **cpu** | Int | Number of CPUs to allocate to the task | 8 | Optional | +| snippy_gene_query | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| snippy_gene_query | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-06-21 | Optional | +| snippy_gene_query | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | +| snippy_variants | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| snippy_variants_wf | **assembly_fasta** | File | Assembly file | | Optional | +| snippy_variants_wf | **base_quality** | Int | Minimum quality for a nucleotide to be used in variant calling | 13 | Optional | +| snippy_variants_wf | **cpus** | Int | Number of CPUs to use | 4 | Optional | +| snippy_variants_wf | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/snippy:4.6.0 | Optional | +| snippy_variants_wf | **map_qual** | Int | Minimum mapping quality to accept in variant calling, default from snippy tool is 60 | | Optional | +| snippy_variants_wf | **maxsoft** | Int | Number of bases of alignment to soft-clip before discarding the alignment, default from snippy tool is 10 | | Optional | +| snippy_variants_wf | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | +| snippy_variants_wf | **min_coverage** | Int | Minimum read 
coverage of a position to identify a mutation | 10 | Optional | +| snippy_variants_wf | **min_frac** | Float | Minimum fraction of bases at a given position to identify a mutation, default from snippy tool is 0 | 0.9 | Optional | +| snippy_variants_wf | **min_quality** | Int | Minimum VCF variant call "quality" | 100 | Optional | +| snippy_variants_wf | **query_gene** | String | Comma-separated strings (e.g. gene names) in which to search for mutations to output to data table | | Optional | +| snippy_variants_wf | **read1** | File | Forward read file | | Optional | +| snippy_variants_wf | **read2** | File | Reverse read file | | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Workflow Tasks + +`Snippy_Variants` uses the snippy tool to align reads to the reference and call SNPs, MNPs and INDELs according to optional input parameters. The output includes a file of variants that is then queried using the `grep` bash command to identify any mutations in specified genes or annotations of interest. The query string MUST match the gene name or annotation as specified in the GenBank file and provided in the output variant file in the `snippy_results` column. + +### Outputs + +!!! tip "Visualize your outputs in IGV" + Output bam/bai files may be visualized using IGV to manually assess read placement and SNP support. 
+ +| **Variable** | **Type** | **Description** | +|---|---|---| +| snippy_variants_bai | File | Indexed bam file of the reads aligned to the reference | +| snippy_variants_bam | File | Bam file of reads aligned to the reference | +| snippy_variants_coverage_tsv | File | Coverage stats tsv file output by the samtools coverage command | +| snippy_variants_docker | String | Docker image for snippy variants task | +| snippy_variants_gene_query_results | File | CSV file detailing results for mutations associated with the query strings specified by the user | +| snippy_variants_hits | String | A summary of mutations associated with the query strings specified by the user | +| snippy_variants_num_reads_aligned | Int | Number of reads that aligned to the reference genome as calculated by samtools view -c command | +| snippy_variants_num_variants | Int | Number of variants detected between sample and reference genome | +| snippy_variants_outdir_tarball | File | A compressed file containing the whole directory of snippy output files. 
This is used when running Snippy_Tree | +| snippy_variants_percent_ref_coverage | Float | Proportion of reference genome with depth greater than or equal to min_coverage | +| snippy_variants_query | String | Query strings specified by the user when running the workflow | +| snippy_variants_query_check | String | Verification that query strings are found in the reference genome | +| snippy_variants_results | File | CSV file detailing results for all mutations identified in the query sequence relative to the reference | +| snippy_variants_summary | File | A summary TXT file showing the number of mutations identified for each mutation type | +| snippy_variants_version | String | Version of Snippy used | +| snippy_variants_wf_version | String | Version of Snippy_Variants used | diff --git a/docs/workflows/phylogenetic_placement/samples_to_ref_tree.md b/docs/workflows/phylogenetic_placement/samples_to_ref_tree.md new file mode 100644 index 000000000..308b02b60 --- /dev/null +++ b/docs/workflows/phylogenetic_placement/samples_to_ref_tree.md @@ -0,0 +1,47 @@ +# Samples_to_Ref_Tree + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Phylogenetic Placement](../../workflows_overview/workflows_type.md/#phylogenetic-placement) | [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.1.0 | Yes | Sample-level, Set-level | + +## Samples_to_Ref_Tree_PHB + +[Nextclade](https://docs.nextstrain.org/projects/nextclade/en/stable/index.html) rapidly places new samples onto an existing reference phylogenetic tree. Phylogenetic placement is done by comparing the mutations of the query sequence (relative to the reference) with the mutations of every node and tip in the reference tree, and finding the node which has the most similar set of mutations. This operation is repeated for each query sequence, until all of them are placed onto the tree. 
This workflow uses the Nextstrain-maintained [nextclade datasets](https://github.com/nextstrain/nextclade_data) for SARS-CoV-2, mpox, influenza A and B, and RSV-A and RSV-B. The organism must be specified as input in the field `organism`, and these align with the nextclade dataset names, i.e. "sars-cov-2", "flu_h1n1pdm_ha", "flu_h1n1pdm_na", "flu_h3n2_ha", "flu_h3n2_na", "flu_vic_ha", "flu_vic_na", "flu_yam_ha", "hMPXV", "hMPXV_B1", "MPXV", "rsv_a" and "rsv_b". + +However, nextclade can be used on any organism as long as an existing, high-quality input reference tree with representative samples on it is provided, in addition to other optional inputs. Contact us if you need help generating your own mutation-annotated tree, or follow the instructions available on the Augur wiki [here](https://docs.nextstrain.org/projects/augur/en/stable/index.html). + +!!! info "_Placement_ not _construction_" + This workflow is not for building a tree from scratch, but rather for the placement of new sequences onto an existing high-quality input reference tree with representative samples on it. In effect, query samples are only compared to reference samples and never to the other query samples. 
+ +### Inputs + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| nextclade_addToRefTree | **assembly_fasta** | File | A fasta file with query sequence(s) to be placed onto the global tree | | Required | +| nextclade_addToRefTree | **nextclade_dataset_name** | String | What nextclade dataset name to run nextclade on; the options are: "sars-cov-2", "flu_h1n1pdm_ha", "flu_h1n1pdm_na", "flu_h3n2_ha", "flu_h3n2_na", "flu_vic_ha", "flu_vic_na", "flu_yam_ha", "hMPXV", "hMPXV_B1", "MPXV", "rsv_a" and "rsv_b" | | Required | +| nextclade_add_ref | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| nextclade_add_ref | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| nextclade_add_ref | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/nextstrain/nextclade:3.3.1 | Optional | +| nextclade_add_ref | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | +| nextclade_add_ref | **verbosity** | String | Set the nextclade output verbosity level. Options: off, error, warn, info, debug, trace | "warn" | Optional | +| nextclade_addToRefTree | **dataset_tag** | String | nextclade dataset tag | Uses the dataset tag associated with the nextclade docker image version | Optional | +| nextclade_addToRefTree | **gene_annotations_gff** | File | A genome annotations file for codon-aware alignment, gene translation and calling of aminoacid mutations | Uses the genome annotation associated with the nextclade dataset name | Optional | +| nextclade_addToRefTree | **input_ref** | File | An optional FASTA file containing reference sequence. This file should contain exactly 1 sequence. 
| Uses the reference fasta associated with the specified nextclade dataset name | Optional | +| nextclade_addToRefTree | **nextclade_pathogen_json** | File | An optional pathogen JSON file containing configuration and data specific to a pathogen. | Uses the reference pathogen JSON file associated with the specified nextclade dataset name | Optional | +| nextclade_addToRefTree | **reference_tree_json** | File | An optional phylogenetic reference tree file which serves as a target for phylogenetic placement | Uses the reference tree associated with the specified nextclade dataset name | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| treeUpdate_auspice_json | File | Phylogenetic tree with user placed samples | +| treeUpdate_nextclade_docker | String | Nextclade docker image used | +| treeUpdate_nextclade_json | File | JSON file with the results of the Nextclade analysis | +| treeUpdate_nextclade_tsv | File | Tab-delimited file with Nextclade results | +| treeUpdate_nextclade_version | String | Nextclade version used | +| samples_to_ref_tree_analysis_date | String | Date of analysis | +| samples_to_ref_tree_version | String | Version of the Public Health Bioinformatics (PHB) repository used | diff --git a/docs/workflows/phylogenetic_placement/usher.md b/docs/workflows/phylogenetic_placement/usher.md new file mode 100644 index 000000000..ccd09f92f --- /dev/null +++ b/docs/workflows/phylogenetic_placement/usher.md @@ -0,0 +1,43 @@ +# Usher + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Phylogenetic 
Placement](../../workflows_overview/workflows_type.md/#phylogenetic-placement) | [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.1.0 | Yes | Sample-level, set-level | + +## Usher_PHB + +[UShER](https://usher-wiki.readthedocs.io/en/latest/) (Ultrafast Sample Placement on Existing Trees) rapidly places new samples onto an existing phylogeny using maximum parsimony. This workflow uses the UCSC-maintained global trees for SARS-CoV-2, mpox, RSV-A, and RSV-B if those organisms are specified in the `organism` input field. However, UShER can be used on any organism as long as a mutation-annotated tree (MAT) is provided in protobuf format. Contact us if you need help generating your own mutation-annotated tree, or follow the instructions available on the UShER wiki [here](https://usher-wiki.readthedocs.io/en/latest/). + +### Inputs + +While this workflow is technically a set-level workflow, it works on the sample-level too. When run on the set-level, the samples are placed with respect to each other. + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | | +|---|---|---|---|---|---| +| usher_workflow | **assembly_fasta** | Array[File] | The assembly files for the samples you want to place on the pre-existing tree; can either be a set of samples, an individual sample, or multiple individual samples | | Required | +| usher_workflow | **organism** | String | What organism to run UShER on; the following organisms have default global phylogenies and reference files provided: sars-cov-2, mpox, RSV-A, RSV-B. | | Required | +| usher_workflow | **tree_name** | String | The output prefix for the uncondensed tree output and the clades output. 
| | Required | +| usher | **cpu** | Int | Number of CPUs to allocate to the task | 8 | Optional | +| usher | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 200 | Optional | +| usher | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/pathogengenomics/usher:0.6.2 | Optional | +| usher | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | +| usher | **mutation_annotated_tree_pb** | File | Required for organisms other than sars-cov-2, mpox, RSV-A or RSV-B. This is the mutation-annotated global phylogeny upon which your samples will be placed | | Optional, Required | +| usher | **reference_genome** | File | Required for organisms other than sars-cov-2, mpox, RSV-A or RSV-B. This is the reference genome used to determine your sequence's mutations to accurately place the sample on the phylogeny. | | Optional, Required | +| usher | **subtree_size** | Int | Indicates how many of the closest-related samples you want to show in a subtree; more subtrees are made if there is more sequence diversity in the set of input samples (multiple subtrees are only generated if this workflow is run on the set level). 
| 20 | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| usher_clades | File | The clades predicted for the samples | +| usher_phb_analysis_date | String | The date the analysis was run | +| usher_phb_version | String | The version of PHB the workflow is from | +| usher_protobuf_version | String | The version of the mutation-annotated protobuf tree (what day and what samples are included, if a default organism was used; otherwise, says it was user-provided) | +| usher_subtree_mutations | Array[File] | An array of files showing the mutations at each internal node for the subtree | +| usher_subtrees | Array[File] | An array of subtrees where your samples have been placed | +| usher_uncondensed_tree | File | The entire global tree with your samples included (warning: may be a very large file if the organism is "sars-cov-2") | +| usher_version | String | The version of UShER used | diff --git a/docs/workflows/public_data_sharing/mercury_prep_n_batch.md b/docs/workflows/public_data_sharing/mercury_prep_n_batch.md new file mode 100644 index 000000000..4d5029e43 --- /dev/null +++ b/docs/workflows/public_data_sharing/mercury_prep_n_batch.md @@ -0,0 +1,145 @@ +# Mercury_Prep_N_Batch + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Public Data Sharing](../../workflows_overview/workflows_type.md/#public-data-sharing) | [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.2.0 | Yes | Set-level | + +## Mercury_Prep_N_Batch_PHB + +Mercury prepares and formats metadata and sequencing files 
**located in Google Cloud Platform (GCP) buckets** for submission to national & international databases, currently NCBI & GISAID. Mercury was initially developed to ingest read, assembly, and metadata files associated with SARS-CoV-2 amplicon reads from clinical samples and format that data for submission per the [Public Health Alliance for Genomic Epidemiology (PH4GE)'s SARS-CoV-2 Contextual Data Specifications](https://github.com/pha4ge/SARS-CoV-2-Contextual-Data-Specification). + +Currently, Mercury supports submission preparation for SARS-CoV-2, mpox, and influenza. These organisms have different metadata requirements, and are submitted to different repositories; the following table lists the repositories for each organism & what is supported in Mercury: + +| | BankIt (NCBI) | BioSample (NCBI) | GenBank (NCBI) | GISAID | SRA (NCBI) | +| --- | --- | --- | --- | --- | --- | +| **`"flu"`** | | ✓ | | | ✓ | +| **`"mpox"`** | ✓ | ✓ | | ✓ | ✓ | +| **`"sars-cov-2"`** | | ✓ | ✓ | ✓ | ✓ | + +!!! dna "Mercury expects data tables made with TheiaCoV" + Mercury was designed to work with metadata tables that were partially created after running the TheiaCoV workflows. If you are using a different pipeline, please ensure that the metadata table is formatted correctly. See [this file](https://github.com/theiagen/mercury/blob/main/mercury/Metadata.py) for the hard-coded list of all of the different metadata fields expected for each organism. + +### Metadata Formatters + +To help users collect all required metadata, we have created the following Excel spreadsheets that can help you collect the necessary metadata and allow for easy upload of this metadata into your Terra data tables: + +??? toggle "**_For flu_**" + + [Flu Metadata Formatter](../../assets/metadata_formatters/Terra_2_NCBI-PATHOGEN-metadata-2024-04-30.xlsx) + + Flu uses the same metadata formatter as the Terra_2_NCBI Pathogen BioSample package. 
+ + If neither `strain` nor `isolate` are found in the Terra data table, Mercury will automatically generate an isolate, using the following format + `ABRicate flu type / State / sample name / year (ABRicate flu subtype)`. Example: `A/California/Sample-01/2024 (H1N1)` + + The ABRicate flu type and subtype (`abricate_flu_type` and `abricate_flu_subtype` columns) are extracted from your table, and are required to generate the isolate field if it is not provided. + +??? toggle "**_For mpox_**" + + [Mpox Metadata Formatter](../../assets/metadata_formatters/Mercury_Prep_N_Batch_MPXV_Metadata_Formatter_2022_12_23.xlsx) + +??? toggle "**_For sars-cov-2_**" + + [SARS-CoV-2 Metadata Formatter](../../assets/metadata_formatters/Mercury_Prep_N_Batch_SC2_Metadata_Formatter_2023_05_22.xlsx) + +!!! dna "Usage on Terra" + + ##### Usage on Terra {#usage-on-terra} + + **A note on the `using_clearlabs_data` & `using_reads_dehosted` optional input parameters** + + The `using_clearlabs_data` and `using_reads_dehosted` arguments change the default values for the `read1_column_name`, `assembly_fasta_column_name`, and `assembly_mean_coverage_column_name` metadata columns. The default values are shown in the table below in addition to what they are changed to depending on what arguments are used. + + | Variable | Default Value | with `using_clearlabs_data` | with `using_reads_dehosted` | with both  `using_clearlabs_data` ***and*** `using_reads_dehosted` | + | --- | --- | --- | --- | --- | + | `read1_column_name` | `"read1_dehosted"` | `"clearlabs_fastq_gz"` | `"reads_dehosted"` | `"reads_dehosted"` | + | `assembly_fasta_column_name` | `"assembly_fasta"` | `"clearlabs_fasta"` | `"assembly_fasta"` | `"clearlabs_fasta"` | + | `assembly_mean_coverage_column_name` | `"assembly_mean_coverage"` | `"clearlabs_assembly_coverage"` | `"assembly_mean_coverage"` | `"clearlabs_assembly_coverage"` | + +### Inputs + +This workflow runs on the set-level. 
+ +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | | +|---|---|---|---|---|---| +| mercury_prep_n_batch | **gcp_bucket_uri** | String | Google bucket where your SRA reads will be temporarily stored before transferring to SRA. Example: "gs://theiagen_sra_transfer" | | Required | +| mercury_prep_n_batch | **sample_names** | Array[String] | The samples you want to submit | | Required | +| mercury_prep_n_batch | **terra_project_name** | String | The name of your Terra project. You can find this information in the URL of the webpage of your Terra dashboard. For example, if your URL contains #workspaces/example/my_workspace/ then your project name is example | | Required | +| mercury_prep_n_batch | **terra_table_name** | String | The name of the Terra table where your samples can be found. Do not include the entity: prefix or the _id suffix, just the name of the table as listed in the sidebar on lefthand side of the Terra Data tab. | | Required | +| mercury_prep_n_batch | **terra_workspace_name** | String | The name of your Terra workspace where your samples can be found. 
For example, if your URL contains #workspaces/example/my_workspace/ then your project name is my_workspace | | Required | +| download_terra_table | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| download_terra_table | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 10 | Optional | +| download_terra_table | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-06-21 | Optional | +| download_terra_table | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 1 | Optional | +| mercury | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| mercury | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | +| mercury | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/mercury:1.0.7 | Optional | +| mercury | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| mercury | **number_N_threshold** | Int | Only for "sars-cov-2" submissions; used to filter out any samples that contain more than the indicated number of Ns in the assembly file | 5000 | Optional | +| mercury | **single_end** | Boolean | Set to true if your data is single-end; this ensures that a read2 column is not included in the metadata | FALSE | Optional | +| mercury | **skip_county** | Boolean | Use if your Terra table contains a county column that you do not want to include in your submission. | FALSE | Optional | +| mercury | **usa_territory** | Boolean | If true, the "state" column will be used in place of the "country" column. For example, if "state" is Puerto Rico, then the GISAID virus name will be `hCoV-19/Puerto Rico//`. The NCBI `geo_loc_name` will be "USA: Puerto Rico". This optional Boolean variable should only be used with clear understanding of what it does. 
| FALSE | Optional | +| mercury | **using_clearlabs_data** | Boolean | When set to true will change read1_dehosted → clearlabs_fastq_gz; assembly_fasta → clearlabs_fasta; assembly_mean_coverage → clearlabs_assembly_coverage | FALSE | Optional | +| mercury | **using_reads_dehosted** | Boolean | When set to true will only change read1_dehosted → reads_dehosted. Takes priority over the replacement for read1_dehosted made with the using_clearlabs_data Boolean input | FALSE | Optional | +| mercury | **vadr_alert_limit** | Int | Only for "sars-cov-2" submissions; used to filter out any samples that contain more than the indicated number of vadr alerts | 0 | Optional | +| mercury_prep_n_batch | **authors_sbt** | File | Only for "mpox" submissions; a file that contains author information. This file can be created here: | | Optional | +| mercury_prep_n_batch | **organism** | String | The organism that you want submission prepare for — each organism requires different metadata fields so please ensure this field is accurate. 
Options: "flu", "mpox", or "sars-cov-2" | sars-cov-2 | Optional | +| mercury_prep_n_batch | **output_name** | String | Free text prefix for all output files | mercury | Optional | +| mercury_prep_n_batch | **skip_ncbi** | Boolean | Set to true if you only want to prepare GISAID submission files | FALSE | Optional | +| table2asn | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| table2asn | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| table2asn | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/ncbi-table2asn:1.26.678 | Optional | +| table2asn | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 1 | Optional | +| trim_genbank_fastas | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| trim_genbank_fastas | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| trim_genbank_fastas | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/vadr:1.3 | Optional | +| trim_genbank_fastas | **max_length** | Int | Only for "sars-cov-2" submissions; the maximum genome length for trimming terminal ambiguous nucleotides. If your sample's genome is higher than this value, the workflow will error/fail. | 30000 | Optional | +| trim_genbank_fastas | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| trim_genbank_fastas | **min_length** | Int | Only for "sars-cov-2" submissions; the minimum genome length for trimming terminal ambiguous nucleotides. If your sample's genome is lower than this value, the workflow will error/fail. 
| 50 | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| bankit_sqn_to_email | File | **Only for mpox submission**: the sqn file that you will use to submit mpox assembly files to NCBI via email | +| biosample_metadata | File | BioSample metadata TSV file for upload to NCBI | +| excluded_samples | File | A file that contains the names and reasons why a sample was excluded from submission. **For SARS-CoV-2**, there are two sections: First, a section for any samples that failed to meet pre-determined quality thresholds (`number_N` and `vadr_num_alert`). Second, a section that includes a table that describes any missing required metadata for each sample. This table has the sample name for rows and any columns that have missing metadata as headers. If a sample is missing a piece of required metadata, the corresponding cell will be blank. However, if a different sample does have metadata for that column, the associated value will appear in the corresponding cell. **For flu and mpox**, only the second section described above exists. _Please see the example below for more details_. 
| +| genbank_fasta | File | **Only for SARS-CoV-2 submission**: GenBank fasta file for upload | +| genbank_metadata | File | **Only for SARS-CoV-2 submission**: GenBank metadata for upload | +| gisaid_fasta | File | **Only for mpox and SARS-CoV-2 submission**: GISAID fasta file for upload | +| gisaid_metadata | File | **Only for mpox and SARS-CoV-2 submission**: GISAID metadata for upload | +| mercury_prep_n_batch_analysis_date | String | Date analysis was run | +| mercury_prep_n_batch_version | String | Version of the PHB repository that hosts this workflow | +| mercury_script_version | String | Version of the Mercury tool that was used in this workflow | +| sra_metadata | File | SRA metadata TSV file for upload | + +???+ toggle "An example excluded_samples.tsv file" + + ##### An example excluded_samples.tsv file {#example-excluded-samples} + + Due to the nature of tsv files, it may be easier to download and open this file in Excel. + + [example_excluded_samples.tsv](../../assets/files/example_excluded_samples.tsv) + + ``` + Samples excluded for quality thresholds: + sample_name message + sample2 VADR skipped due to poor assembly + sample3 VADR number alerts too high: 3 greater than limit of 0 + sample4 Number of Ns was too high: 10000 greater than limit of 5000 + + Samples excluded for missing required metadata (will have empty values in indicated columns): + tablename_id organism country library_layout + sample5 paired + sample6 SARS-CoV-2 USA + ``` + + This example informs the user that samples 2-4 were excluded for quality reasons (the exact reason is listed in the `message` column), and that samples 5 and 6 were excluded because they were missing required metadata fields (sample5 was missing the `organism` and `country` fields, and sample6 was missing the `library_layout` field). + +## Usage outside of Terra + +This tool can also be used on the command-line. 
Please see [the Mercury GitHub](https://github.com/theiagen/mercury) for more information on how to run Mercury with a Docker image or in your local command-line environment. diff --git a/docs/workflows/public_data_sharing/terra_2_gisaid.md b/docs/workflows/public_data_sharing/terra_2_gisaid.md new file mode 100644 index 000000000..902521641 --- /dev/null +++ b/docs/workflows/public_data_sharing/terra_2_gisaid.md @@ -0,0 +1,54 @@ +# Terra_2_GISAID + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Public Data Sharing](../../workflows_overview/workflows_type.md/#public-data-sharing) | [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v1.2.1 | Yes | Set-level | + +## Terra_2_GISAID_PHB + +Terra_2_GISAID programmatically submits SARS-CoV-2 assembly files to GISAID. + +This workflow expects data that has been prepared for submission using either Mercury_Batch or Mercury_Prep_N_Batch (recommended). + +!!! dna "client-ID" + To obtain a client-ID, contact `clisupport@gisaid.org` and include your username in your request. + +### Inputs + +The optional variable `frameshift_notification` has three options that correspond to the associated web-browser options: + +- "**catch_all**" - "Notify me about ALL DETECTED FRAMESHIFTS in this submission for reconfirmation of affected sequences" +- "**catch_novel**" [DEFAULT] - "Notify me only about NOT PREVIOUSLY REPORTED FRAMESHIFTS in this submission for reconfirmation of affected sequences" +- "**catch_none**" - "I confirm ANY FRAMESHIFTS in this submission and request their release without confirmation by a curator" +- 
!!! warning "GISAID Credentials" + Please note that the user must provide either an authentication_file or a gisaid_credentials file to run this workflow; explanations for both can be found in the table below. + +This workflow runs on the set level. 
+ +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| Terra_2_GISAID | **client_id** | String | This value should be filled with the client-ID provided by GISAID | | Required | +| Terra_2_GISAID | **concatenated_fastas** | File | The GISAID FASTA file generated by Mercury_Prep_N_Batch (or Mercury_Prep) | | Required | +| Terra_2_GISAID | **concatenated_metadata** | File | The GISAID metadata file generated by Mercury_Prep_N_Batch (or Mercury_Prep) | | Required | +| gisaid_upload | **authentication_file** | File | [EITHER] The GISAID authentication file generated by running cli3 authenticate for the submitter. | | Optional, Required | +| gisaid_upload | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| gisaid_upload | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| gisaid_upload | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/broadinstitute/gisaid-cli:3.0 | Optional | +| gisaid_upload | **frameshift_notification** | String | See top of inputs section for explanation; the notification preference regarding frameshifts in your submission | catch_novel | Optional | +| gisaid_upload | **gisaid_credentials** | File | [EITHER] A tab-delimited file containing the submitter's GISAID username followed by their password, used to generate the GISAID authentication file. 
| | Optional, Required | +| gisaid_upload | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| failed_uploads | Boolean | The metadata for any failed uploads | +| gisaid_cli_version | String | The version of the GISAID CLI tool | +| gisaid_logs | File | The log files regarding the submission | +| terra_2_gisaid_analysis_date | String | The date of the analysis | +| terra_2_gisaid_version | String | The version of the PHB repository that this workflow is hosted in | diff --git a/docs/workflows/public_data_sharing/terra_2_ncbi.md b/docs/workflows/public_data_sharing/terra_2_ncbi.md new file mode 100644 index 000000000..e96b6d676 --- /dev/null +++ b/docs/workflows/public_data_sharing/terra_2_ncbi.md @@ -0,0 +1,226 @@ +# Terra_2_NCBI + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Public Data Sharing](../../workflows_overview/workflows_type.md/#public-data-sharing) | [Bacteria](../../workflows_overview/workflows_kingdom.md#bacteria), [Mycotics](../../workflows_overview/workflows_kingdom.md#mycotics), [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.1.0 | No | Set-level | + +## Terra_2_NCBI_PHB + +!!! warning "Do not resubmit!" + **If the Terra_2_NCBI workflow fails, DO NOT resubmit.** + + Resubmission risks duplicate submissions and future failures. + + Contact Theiagen (`support@theiagen.com`) to determine the reason for failure, and **only move forward with Theiagen's guidance**. + +!!! 
dna "Key Resources" + - [Pathogen metadata formatter](../../assets/metadata_formatters/Terra_2_NCBI-PATHOGEN-metadata-2024-04-30.xlsx) + - [Microbe metadata formatter](../../assets/metadata_formatters/Terra_2_NCBI-MICROBE-metadata-2022-07-11.xlsx) + - [Virus metadata formatter](../../assets/metadata_formatters/Terra_2_NCBI-VIRUS-metadata-2022-09-09.xlsx) + +The Terra_2_NCBI workflow is a programmatic data submission method to share metadata information with NCBI BioSample and paired-end Illumina reads with NCBI SRA directly from Terra without having to use the NCBI portal. + +### Prerequisites + +??? toggle "Before running the Terra_2_NCBI workflow" + + 1. The user **must** have access to the NCBI FTP. To gain these credentials, we recommend emailing `**sra@ncbi.nlm.nih.gov**` a variation of the following example, including all the information: + + > Hello, + > + >We would like to automate submissions to the Submission Portal using XML metadata to accompany our cloud-hosted data files.  We would like to upload via FTP and need to create a submission group. + > + >Here is the relevant information: + > + >1. Suggested group abbreviation: + >2. Full group name: + >3. Institution and department: + >4. Contact person (someone likely to remain at the location for an extended time): + >5. Contact email: + >6. Mailing address (including country and postcode): + > + >We will be using an existing submission pipeline that is known to work and would like to request that the production folder be activated. Thank you for your assistance! + + 2. From NCBI, you will need to get in response: + 1. an FTP address (it will likely be ftp-private.ncbi.nih.gov) + 2. Username (typically the suggested group abbreviation) + 3. Password + 4. an acknowledgment that the production folder has been activated. + + Please confirm that the production folder has been activated, or else the submission pipeline will either fail or only run test submissions and not actually submit to NCBI. + + 3. 
Before you can run the workflow for the first time, we also recommend scheduling a meeting with Theiagen to get additional things set up, including + - adding a correctly-formatted configuration file to your workspace data elements that includes your FTP username and password, laboratory details, and other important information. + - ensuring your proxy account has been given permission to write to the google bucket where SRA reads are temporarily stored before being transferred to NCBI. + + ??? toggle "What is the configuration file used for?" + The configuration file tells the workflow your username and password so you can access the FTP. It also provides important information about who should be contacted regarding the submission. We recommend contacting a member of Theiagen for help in the creation of this configuration file to ensure that everything is formatted correctly. + +### Collating BioSample Metadata + +In order to create BioSamples, you need to choose the correct BioSample package and have the appropriate metadata included in your data table. + +Currently, Terra_2_NCBI only supports _Pathogen_, _Virus_, and _Microbe_ BioSample packages. **Most organisms should be submitted using the Pathogen package** unless you have been specifically directed otherwise (either through CDC communications or another reliable source). Definitions of packages supported by Terra_2_NCBI are listed below with more requirements provided via the links: + +- [Pathogen.cl](https://www.ncbi.nlm.nih.gov/biosample/docs/packages/Pathogen.cl.1.0/) - any clinical or host-associated pathogen +- [Pathogen.env](https://www.ncbi.nlm.nih.gov/biosample/docs/packages/Pathogen.env.1.0/) - environmental, food or other pathogen *(no metadata formatter available at this time)* +- [Microbe](https://www.ncbi.nlm.nih.gov/biosample/docs/packages/Microbe.1.0/) - bacteria or other unicellular microbes that do not fit under the MIxS, Pathogen, or Virus packages. 
+- [Virus](https://www.ncbi.nlm.nih.gov/biosample/docs/packages/Virus.1.0/) - viruses **not** directly associated with disease + - Viral pathogens should be submitted using the Pathogen: Clinical or host-associated pathogen package. + +### Metadata Formatters + +For each package, we have created a metadata template spreadsheet to help you organize your metadata: + +Please note that the pathogen metadata formatter is for the _clinical_ pathogen package, not the environmental pathogen. + +- [Terra_2_NCBI-PATHOGEN-metadata-2024-04-30.xlsx](../../assets/metadata_formatters/Terra_2_NCBI-PATHOGEN-metadata-2024-04-30.xlsx) +- [Terra_2_NCBI-MICROBE-metadata-2022-07-11.xlsx](../../assets/metadata_formatters/Terra_2_NCBI-MICROBE-metadata-2022-07-11.xlsx) +- [Terra_2_NCBI-VIRUS-metadata-2022-09-09.xlsx](../../assets/metadata_formatters/Terra_2_NCBI-VIRUS-metadata-2022-09-09.xlsx) +- 
We are constantly working on improving these spreadsheets and they will be updated in due course. + +### Running the Workflow + +We recommend running a test submission before your first production submission to ensure that all data has been formatted correctly. Please contact Theiagen (`support@theiagen.com`) to get this set up. + +In the test submission, any real BioProject accession numbers you provide will not be recognized. You will have to make a "fake" or "test" BioProject. This cannot be done through the NCBI portal. Theiagen can provide assistance in creating this as it requires manual command-line work on the NCBI FTP using the account they provided for you. + +??? toggle "**What's the difference between a test submission and a production submission?**" + + A production submission means that your submission using Terra_2_NCBI will be submitted to NCBI as if you were using the online portal. That means that anything you submit on production will be given to the **real** NCBI servers and appear and become searchable on the NCBI website. 
+ + A test submission gives your data to a completely detached **replica** of the production server. This means that any data you submit as a test will behave exactly like a real submission, but since it's detached, **nothing** will appear on the NCBI website, and anything returned from the workflow (such as BioSample accession numbers) will be fake. If you search for these test BioSample accession numbers on the NCBI website, either (a) nothing will appear, or (b) it will link to a random sample. + + If you want your data to be on NCBI, you must run a production submission. Initially, NCBI locks the production folder so that the user doesn't accidentally submit test data to the main database. You must have requested activation of the production folder prior to your first production submission. + +### Inputs + +This workflow runs on set-level data tables. + +!!! info "Production Submissions" + Please note that an optional Boolean variable, `submit_to_production`, is **required** for a production submission. + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +| --- | --- | --- | --- | --- | --- | +| Terra_2_NCBI | **bioproject** | String | BioProject accession that the samples will be submitted to | | Required | +| Terra_2_NCBI | **biosample_package** | String | The BioSample package that the samples will be submitted under | | Required | +| Terra_2_NCBI | **ncbi_config_js** | File | Configuration file that contains your username and password for the NCBI FTP | | Required | +| Terra_2_NCBI | **project_name** | String | The name of your Terra project. You can find this information in the url of the webpage you are on. 
It is the section right after "#workspaces/" | | Required | +| Terra_2_NCBI | **sample_names** | Array[String] | The list of samples you want to submit | | Required | +| Terra_2_NCBI | **sra_transfer_gcp_bucket** | String | Google bucket where your SRA reads will be temporarily stored before transferring to SRA | | Required | +| Terra_2_NCBI | **table_name** | String | The name of the Terra table where your samples are found | | Required | +| Terra_2_NCBI | **workspace_name** | String | The name of the workspace where your samples are found | | Required | +| add_biosample_accessions | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| add_biosample_accessions | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| add_biosample_accessions | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/broadinstitute/ncbi-tools:2.10.7.10" | Optional | +| add_biosample_accessions | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| biosample_submit_tsv_ftp_upload | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| biosample_submit_tsv_ftp_upload | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| biosample_submit_tsv_ftp_upload | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/broadinstitute/ncbi-tools:2.10.7.10" | Optional | +| biosample_submit_tsv_ftp_upload | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| ncbi_sftp_upload | **additional_files** | Array[File] | Internal component; do not modify | [] | Optional | +| ncbi_sftp_upload | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| ncbi_sftp_upload | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| ncbi_sftp_upload | **docker** | String | 
The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/broadinstitute/ncbi-tools:2.10.7.10" | Optional | +| ncbi_sftp_upload | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| ncbi_sftp_upload | **wait_for** | String | Internal component; do not modify | "1" | Optional | +| prune_table | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| prune_table | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| prune_table | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/broadinstitute/ncbi-tools:2.10.7.10" | Optional | +| prune_table | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| prune_table | **read1_column_name** | String | The column header of the read1 column | | Optional | +| prune_table | **read2_column_name** | String | The column header of the read2 column | | Optional | +| sra_tsv_to_xml | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| sra_tsv_to_xml | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| sra_tsv_to_xml | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/broadinstitute/ncbi-tools:2.10.7.10" | Optional | +| sra_tsv_to_xml | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| Terra_2_NCBI | **input_table** | File | Internal component; do not modify | | Optional | +| Terra_2_NCBI | **skip_biosample** | Boolean | Boolean switch to skip BioSample submission and proceed directly to SRA submission | false | Optional | +| Terra_2_NCBI | **submit_to_production** | Boolean | Used to indicate whether or not the workflow should submit to NCBI's production environment. If set to true, then a Production submission will occur. Otherwise, by default (false), it will perform a Test submission. 
| false | Optional, Required | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +??? task "Workflow Tasks" + + ##### Workflow Tasks {#workflow-tasks} + + The workflow will perform the following tasks, each highlighted as `code` + + 1. `prune_table` formats all incoming metadata for submission. + 2. If you are submitting BioSamples: + 1. `biosample_submit_tsv_ftp_upload` will + 1. format the BioSample table into XML format + 2. submit BioSamples to NCBI + 3. return all NCBI communications in XML format, and + 4. parse those communications for any and all BioSample accessions. + 2. `add_biosample_accessions` will + 1. add the BioSample accessions to SRA metadata + 2. upload the BioSample accessions to the origin Terra table + + If BioSample accessions fail to be generated, this task ends the workflow and users should contact Theiagen for further support. Otherwise, the workflow will continue and outputs are returned to the Terra data table. + + 3. If BioSample accessions were generated or if BioSample submission was skipped + 1. `sra_tsv_to_xml` converts the SRA metadata (including any generated or pre-provided BioSample accessions) into XML format. + 2. `ncbi_sftp_upload` + 1. uploads the SRA metadata to NCBI + 2. returns any XML communications from NCBI. + +#### Workflow Success + +If the workflow ends successfully, it returns the outputs to the Terra data table and the XML communications from NCBI will say that submission is underway. The workflow does not declare successful sample submission since SRA sometimes takes a while to do this. If the submission was successful, the point of contact for the submission will receive the SRA accessions via email from NCBI. 
+ +If the workflow ends unsuccessfully, no outputs will be shown on Terra and the `biosample_status` output variable will indicate that the BioSample submission failed. + +### Outputs + +The output files contain information mostly for debugging purposes. Additionally, if your submission is successful, the point of contact for the submission should also receive an email from NCBI notifying them of their submission success. + +| Variable | Description | Type | +| --- | --- | --- | +| biosample_failures | Text file listing samples that failed BioSample submission | File | +| biosample_metadata | Metadata used for BioSample submission in proper BioSample formatting | File | +| biosample_report_xmls | One or more XML files that contain the response from NCBI regarding your BioSample submission. These can be pretty cryptic, but often contain information to determine if anything went wrong | Array[File] | +| biosample_status | String showing whether BioSample submission was successful | String | +| biosample_submission_xml | XML file used to submit your BioSamples to NCBI | File | +| excluded_samples | Text file listing samples that were excluded from BioSample submission for missing required metadata | File | +| generated_accessions | Text file mapping the BioSample accession with its sample name. | File | +| sra_metadata | Metadata used for SRA submission in proper SRA formatting | File | +| sra_report_xmls | One or more XML files containing the response from NCBI regarding your SRA submission. 
These can be pretty cryptic, but often contain information to determine if anything went wrong | Array[File] | +| sra_submission_xml | XML file that was used to submit your SRA reads to NCBI | File | +| terra_2_ncbi_analysis_date | Date that the workflow was run | String | +| terra_2_ncbi_version | Version of the PHB repository where the workflow is hosted | String | + +???+ toggle "An example excluded_samples.tsv file" + + ##### An example excluded_samples.tsv file {#example-excluded-samples} + + Due to the nature of tsv files, it may be easier to download and open this file in Excel. + + [example_excluded_samples.tsv](../../assets/files/example_excluded_samples.tsv) + + ``` + Samples excluded for quality thresholds: + sample_name message + sample2 VADR skipped due to poor assembly + sample3 VADR number alerts too high: 3 greater than limit of 0 + sample4 Number of Ns was too high: 10000 greater than limit of 5000 + + Samples excluded for missing required metadata (will have empty values in indicated columns): + tablename_id organism country library_layout + sample5 paired + sample6 SARS-CoV-2 USA + ``` + + This example informs the user that samples 2-4 were excluded for quality reasons (the exact reason is listed in the `message` column), and that samples 5 and 6 were excluded because they were missing required metadata fields (sample5 was missing the `organism` and `country` fields, and sample6 was missing the `library_layout` field). + +### Limitations + +- The maximum number of samples that can be submitted at once appears to be 300. We recommend submitting less than 300 samples at a time to avoid errors due to large submission sizes. +- A workflow on returning SRA accessions using the generated BioSample accessions is in progress. + +### Acknowledgments + +This workflow would not have been possible without the invaluable contributions of Dr. Danny Park. 
diff --git a/docs/workflows/standalone/cauris_cladetyper.md b/docs/workflows/standalone/cauris_cladetyper.md new file mode 100644 index 000000000..eca2023ec --- /dev/null +++ b/docs/workflows/standalone/cauris_cladetyper.md @@ -0,0 +1,22 @@ +# Cauris_CladeTyper + +!!! warning "NEEDS WORK!!!!" + This page is under construction and will be updated soon. + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Standalone](../../workflows_overview/workflows_type.md/#standalone) | [Mycotics](../../workflows_overview/workflows_kingdom.md#mycotics) | PHB v1.0.0 | Yes | Sample-level | + +## Cauris_CladeTyper_PHB + +The Cauris_CladeTyper_PHB Workflow is designed to assign clade to _Candida auris_ Whole Genome Sequencing assemblies based on their genomic sequence similarity to the five clade-specific reference files. Clade typing is essential for understanding the epidemiology and evolutionary dynamics of this emerging multidrug-resistant fungal pathogen. + +### Inputs + +### Workflow Tasks + +The Cauris_Cladetyper Workflow for _Candida auris_ employs GAMBIT for taxonomic identification, comparing whole genome sequencing data against reference databases to accurately classify _Candida auris_ isolates. A custom database featuring five clade-specific _Candida auris_ reference genomes facilitates clade typing. Sequences undergo genomic signature comparison against the custom database, enabling assignment to one of the five _Candida auris_ clades (Clade I to Clade V) based on sequence similarity and phylogenetic relationships. This integrated approach ensures precise clade assignments, crucial for understanding the genetic diversity and epidemiology of _Candida auris_. 
+ +### Outputs diff --git a/docs/workflows/standalone/gambit_query.md b/docs/workflows/standalone/gambit_query.md new file mode 100644 index 000000000..b49f76083 --- /dev/null +++ b/docs/workflows/standalone/gambit_query.md @@ -0,0 +1,55 @@ +# GAMBIT_Query + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Standalone](../../workflows_overview/workflows_type.md/#standalone) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria), [Mycotics](../../workflows_overview/workflows_kingdom.md#mycotics) | PHB v2.2.0 | Yes | Sample-level | + +## GAMBIT_Query_PHB + +The GAMBIT_Query_PHB workflow performs taxon assignment of a genome assembly using the GAMBIT task. + +### Inputs + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| gambit_query | **assembly_fasta** | File | Assembly file in FASTA format | | Required | +| gambit_query | **samplename** | String | Sample name | | Required | +| gambit | **cpu** | Int | Number of CPUs to allocate to the task | 8 | Optional | +| gambit | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| gambit | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | +| gambit | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/staphb/gambit:1.0.0" | Optional | +| gambit | **gambit_db_genomes** | File | Database of metadata for assembled query genomes; requires complementary signatures file. If not provided, uses default database "/gambit-db" | "gs://gambit-databases-rp/2.0.0/gambit-metadata-2.0.0-20240628.gdb" | Optional | +| gambit | **gambit_db_signatures** | File | Signatures file; requires complementary genomes file. If not specified, the file from the docker container will be used. 
| "gs://gambit-databases-rp/2.0.0/gambit-signatures-2.0.0-20240628.gs" | Optional | + +### Workflow Tasks + +[`GAMBIT`](https://github.com/jlumpe/gambit) determines the taxon of the genome assembly using a k-mer based approach to match the assembly sequence to the closest complete genome in a database, thereby predicting its identity. Sometimes, GAMBIT can confidently designate the organism to the species level. Other times, it is more conservative and assigns it to a higher taxonomic rank. + +For additional details regarding the GAMBIT tool and a list of available GAMBIT databases for analysis, please consult the [GAMBIT](https://www.notion.so/GAMBIT-7c1376b861d0486abfbc316480046bdc?pvs=21) tool documentation. + +!!! techdetails "GAMBIT Technical Details" + + | | Links | + | --- | --- | + | Task | [task_gambit.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/taxon_id/task_gambit.wdl) | + | Software Source Code | [GAMBIT on GitHub](https://github.com/jlumpe/gambit) | + | Software Documentation | [GAMBIT ReadTheDocs](https://gambit-genomics.readthedocs.io/en/latest/) | + | Original Publication(s) | [GAMBIT (Genomic Approximation Method for Bacterial Identification and Tracking): A methodology to rapidly leverage whole genome sequencing of bacterial isolates for clinical identification](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0277575) | + +### Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| gambit_closest_genomes | File | CSV file listing genomes in the GAMBIT database that are most similar to the query assembly | +| gambit_db_version | String | Version of the GAMBIT database used | +| gambit_docker | String | GAMBIT Docker used | +| gambit_predicted_taxon | String | Taxon predicted by GAMBIT | +| gambit_predicted_taxon_rank | String | Taxon rank of GAMBIT taxon prediction | +| gambit_query_wf_analysis_date | String | Date of analysis | +| gambit_query_wf_version | String | PHB 
repository version | +| gambit_report | File | GAMBIT report in a machine-readable format | +| gambit_version | String | Version of gambit software used + +> GAMBIT (Genomic Approximation Method for Bacterial Identification and Tracking): A methodology to rapidly leverage whole genome sequencing of bacterial isolates for clinical identification. Lumpe et al. PLOS ONE, 2022. DOI: [10.1371/journal.pone.0277575](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0277575) diff --git a/docs/workflows/standalone/kraken2.md b/docs/workflows/standalone/kraken2.md new file mode 100644 index 000000000..ffef97db3 --- /dev/null +++ b/docs/workflows/standalone/kraken2.md @@ -0,0 +1,149 @@ +# Kraken2 + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Standalone](../../workflows_overview/workflows_type.md/#standalone) | [Any Taxa](../../workflows_overview/workflows_kingdom.md/#any-taxa) | PHB v2.0.0 | Yes | Sample-level | + +## Kraken2 Workflows + +**The Kraken2 workflows assess the taxonomic profile of raw sequencing data (FASTQ files).** + +Kraken2 is a bioinformatics tool originally designed for metagenomic applications. It has additionally proven valuable for validating taxonomic assignments and checking contamination of single-species (e.g. bacterial isolate, eukaryotic isolate, viral isolate, etc.) whole genome sequence data. + +There are three Kraken2 workflows: + +- `Kraken2_PE` is compatible with **Illumina paired-end data** +- `Kraken2_SE` is compatible with **Illumina single-end data** +- `Kraken2_ONT` is compatible with **Oxford Nanopore data** + +Besides the data input types, there are minimal differences between these three workflows. + +!!! caption "Kraken2 Workflow Diagram" + ![Kraken2 Workflow Diagram](../../assets/figures/Kraken2.png) + +### Databases + +!!! 
info "Database selection" + The Kraken2 software is database-dependent and **taxonomic assignments are highly sensitive to the database used**. An appropriate database should contain the expected organism(s) (e.g. *Escherichia coli*) and other taxa that may be present in the reads (e.g. *Citrobacter freundii*, a common contaminant). + +#### Suggested databases + +| Database name | Database Description | Suggested Applications | GCP URI (for usage in Terra) | Source | Database Size (GB) | Date of Last Update | +| --- | --- | --- | --- | --- | --- | --- | +| **Kalamari v5.1** | Kalamari is a database of complete public assemblies, that has been fine-tuned for enteric pathogens and is backed by trusted institutions. [Full list available here ( in chromosomes.tsv and plasmids.tsv)](https://github.com/lskatz/Kalamari/tree/master/src) | Single-isolate enteric bacterial pathogen analysis (Salmonella, Escherichia, Shigella, Listeria, Campylobacter, Vibrio, Yersinia) | **`gs://theiagen-large-public-files-rp/terra/databases/kraken2/kraken2.kalamari_5.1.tar.gz`** | ‣ | 1.5 | 18/5/2022 | +| **standard 8GB** | Standard RefSeq database (archaea, bacteria, viral, plasmid, human, UniVec_Core) capped at 8GB | Prokaryotic or viral organisms, but for enteric pathogens, we recommend Kalamari | **`gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_standard_08gb_20240112.tar.gz`** | https://benlangmead.github.io/aws-indexes/k2 | 7.5 | 12/1/2024 | +| **standard 16GB** | Standard RefSeq database (archaea, bacteria, viral, plasmid, human, UniVec_Core) capped at 16GB | Prokaryotic or viral organisms, but for enteric pathogens, we recommend Kalamari | **`gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_standard_16gb_20240112.tar.gz`** | https://benlangmead.github.io/aws-indexes/k2 | 15 | 12/1/2024 | +| **standard** | Standard RefSeq database (archaea, bacteria, viral, plasmid, human, UniVec_Core) | Prokaryotic or viral organisms, but for enteric pathogens, we 
recommend Kalamari | **`gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_standard_20240112.tar.gz`** | https://benlangmead.github.io/aws-indexes/k2 | 72 | 18/4/2023 | +| **viral** | RefSeq viral | Viral metagenomics | **`gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_viral_20240112.tar.gz`** | https://benlangmead.github.io/aws-indexes/k2 | 0.6 | 12/1/2024 | +| **EuPathDB48** | Eukaryotic pathogen genomes with contaminants removed. [Full list available here](https://genome-idx.s3.amazonaws.com/kraken/k2_eupathdb48_20201113/EuPathDB48_Contents.txt) | Eukaryotic organisms (Candida spp., Aspergillus spp., etc) | **`gs://theiagen-public-files-rp/terra/theiaprok-files/k2_eupathdb48_20201113.tar.gz`** | https://benlangmead.github.io/aws-indexes/k2 | 30.3 | 13/11/2020 | +| **EuPathDB48** | Eukaryotic pathogen genomes with contaminants removed. [Full list available here](https://genome-idx.s3.amazonaws.com/kraken/k2_eupathdb48_20201113/EuPathDB48_Contents.txt) | Eukaryotic organisms (Candida spp., Aspergillus spp., etc) | **`gs://theiagen-large-public-files-rp/terra/databases/kraken/k2_eupathdb48_20230407.tar.gz`** | https://benlangmead.github.io/aws-indexes/k2 | 11 | 7/4/2023 | + +### Inputs + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | **Workflow** | +|---|---|---|---|---|---|---| +| *workflow_name | **kraken2_db** | File | A Kraken2 database in .tar.gz format | | Required | ONT, PE, SE | +| *workflow_name | **read1** | File | | | Required | ONT, PE, SE | +| *workflow_name | **read2** | File | | | Required for PE only | PE | +| *workflow_name | **samplename** | String | | | Required | ONT, PE, SE | +| kraken2_pe or kraken2_se | **classified_out** | String | Allows user to rename the classified FASTQ files output. 
Must include .fastq as the suffix | classified#.fastq | Optional | ONT, PE, SE | +| kraken2_pe or kraken2_se | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT, PE, SE | +| kraken2_pe or kraken2_se | **disk_size** | Int | GB of storage to request for VM used to run the kraken2 task. Increase this when using large (>30GB kraken2 databases such as the "k2_standard" database) | 100 | Optional | ONT, PE, SE | +| kraken2_pe or kraken2_se | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.1.2-no-db | Optional | ONT, PE, SE | +| kraken2_pe or kraken2_se | **kraken2_args** | String | Allows a user to supply additional kraken2 command-line arguments | | Optional | ONT, PE, SE | +| kraken2_pe or kraken2_se | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | ONT, PE, SE | +| kraken2_pe or kraken2_se | **unclassified_out** | String | Allows user to rename unclassified FASTQ files output. 
Must include .fastq as the suffix | unclassified#.fastq | Optional | ONT, PE, SE | +| krona | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | PE, SE | +| krona | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | PE, SE | +| krona | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/krona:2.7.1--pl526_5 | Optional | PE, SE | +| krona | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | PE, SE | +| kraken2_recalculate_abundances | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT | +| kraken2_recalculate_abundances | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT | +| kraken2_recalculate_abundances | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-08-28-v4 | Optional | ONT | +| kraken2_recalculate_abundances | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | ONT | +| kraken2_recalculate_abundances | **target_organism** | String | Target organism for the kraken2 abundance to be exported to the data table | | Optional | ONT | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | ONT, PE, SE | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | ONT, PE, SE | + +### Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| kraken2_classified_read1 | File | FASTQ file of classified forward/R1 reads | +| kraken2_classified_read2 | File | FASTQ file of classified reverse/R2 reads (if PE) | +| kraken2_classified_report | File | Standard Kraken2 output report. 
TXT filetype, but can be opened in Excel as a TSV file | +| kraken2_docker | String | Docker image used to run kraken2 | +| kraken2_*_wf_analysis_date | String | Date the workflow was run | +| kraken2_*_wf_version | String | Workflow version | +| kraken2_report | File | TXT document describing taxonomic prediction of every FASTQ record. This file is usually very large and cumbersome to open and view | +| kraken2_unclassified_read1 | File | FASTQ file of unclassified forward/R1 reads | +| kraken2_unclassified_read2 | File | FASTQ file of unclassified reverse/R2 reads (if PE) | +| kraken2_version | String | kraken2 version | +| krona_docker | String | Docker image used to run krona (if PE or SE) | +| krona_html | File | HTML report of krona with visualisation of taxonomic classification of reads (if PE or SE) | +| krona_version | String | krona version (if PE or SE) | + +#### Interpretation of results + +The most important outputs of the Kraken2 workflows are the `kraken2_report` files. These will include a breakdown of the number of sequences assigned to a particular taxon, and the percentage of reads assigned. [A complete description of the report format can be found here](https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown#standard-kraken-output-format). + +When assessing the taxonomic identity of a single isolate's sequence, it is normal that a few reads are assigned to very closely related taxa due to the shared sequence identity between them. "Very closely related taxa" may be genetically similar species in the same genus, or taxa with which the dominant species have undergone horizontal gene transfer. Unrelated taxa or a high abundance of these closely related taxa is indicative of contamination or sequencing of non-target taxa. Interpretation of the results is dependent on the biological context. + +??? toggle "Example Kraken2 report" + Below is an example `kraken2_report` for a _Klebsiella pneumoniae_ sample.
Only the first 30 lines are included here since rows near the bottom are often spurious results with only a few reads assigned to a non-target organism. + + From this report, we can see that 84.35 % of the reads were assigned at the species level (`S` in the 4th column) to "_Klebsiella pneumoniae_". Given almost 6 % of reads were "unclassified" and ~2 % of reads were assigned to very closely related taxa (in the _Klebsiella_ genus), this suggests the reads are from _Klebsiella pneumoniae_ with very little -if any- read contamination. + + ``` + 5.98 108155 108155 U 0 unclassified + 94.02 1699669 0 C 1 + 94.02 1699669 1862 C1 131567 cellular organisms + 93.91 1697788 2590 D 2 Bacteria + 93.75 1694805 6312 P 1224 Proteobacteria + 93.39 1688284 37464 C 1236 Gammaproteobacteria + 91.31 1650648 35278 O 91347 Enterobacterales + 89.31 1614639 43698 F 543 Enterobacteriaceae + 86.40 1561902 22513 G 570 Klebsiella + **84.35 1524918 1524918 S 573 Klebsiella pneumoniae** + 0.75 13596 13596 S 548 Klebsiella aerogenes + 0.03 600 600 S 244366 Klebsiella variicola + 0.01 253 253 S 571 Klebsiella oxytoca + 0.00 17 17 S 1134687 Klebsiella michiganensis + 0.00 3 0 G1 2608929 unclassified Klebsiella + 0.00 3 3 S 1972757 Klebsiella sp. PO552 + 0.00 2 2 S 1463165 Klebsiella quasipneumoniae + 0.17 3035 129 G 590 Salmonella + 0.15 2728 909 S 28901 Salmonella enterica + 0.03 582 582 S1 9000010 Salmonella enterica subsp. IIa + 0.02 306 306 S1 59201 Salmonella enterica subsp. enterica + 0.01 230 230 S1 9000014 Salmonella enterica subsp. IIIa + 0.01 221 221 S1 9000015 Salmonella enterica subsp. IIIb + 0.01 136 136 S1 9000016 Salmonella enterica subsp. IX + 0.01 132 132 S1 9000011 Salmonella enterica subsp. IIb + 0.01 122 122 S1 59208 Salmonella enterica subsp. VII + 0.00 41 41 S1 59207 Salmonella enterica subsp. indica + 0.00 25 25 S1 9000017 Salmonella enterica subsp. X + 0.00 24 24 S1 9000009 Salmonella enterica subsp. 
VIII + 0.01 178 178 S 54736 Salmonella bongori + ``` + +#### Krona visualisation of Kraken2 report + +[Krona](https://github.com/marbl/Krona) produces an interactive report that allows hierarchical data, such as the one from Kraken2, to be explored with zooming, multi-layered pie charts. These pie charts are intuitive and highly responsive. + +Krona will only output hierarchical results for bacterial organisms in its current implementation. + +??? toggle "Example Krona report" + + Below is an example of the `krona_html` for a metagenomic sample. Taxonomic rank is organised from the centre of the pie chart to the edge, with each slice representing the relative abundance of a given taxa in the sample. + + ![Example Krona Report](../../assets/figures/example_krona_report.png) + +!!! techdetails "Kraken2 Technical Details" + | | Links | + | --- | --- | + | Software Source Code | [Kraken2 on GitHub](https://github.com/DerrickWood/kraken2/) | + | Software Documentation | | + | Original Publication(s) | [Improved metagenomic analysis with Kraken 2](https://link.springer.com/article/10.1186/s13059-019-1891-0) | \ No newline at end of file diff --git a/docs/workflows/standalone/ncbi_amrfinderplus.md b/docs/workflows/standalone/ncbi_amrfinderplus.md new file mode 100644 index 000000000..237e79039 --- /dev/null +++ b/docs/workflows/standalone/ncbi_amrfinderplus.md @@ -0,0 +1,61 @@ +# NCBI-AMRFinderPlus + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Standalone](../../workflows_overview/workflows_type.md/#standalone) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria), [Mycotics](../../workflows_overview/workflows_kingdom.md#mycotics) | PHB v2.2.0 | Yes | Sample-level | + +## NCBIAMRFinderPlus_PHB + +AMRFinderPlus identifies acquired antimicrobial resistance (AMR) genes, virulence genes, and stress genes. 
Such AMR genes confer resistance to antibiotics, metals, biocides, heat, or acid. For some taxa (see [here](https://github.com/ncbi/amr/wiki/Running-AMRFinderPlus#--organism-option)), AMRFinderPlus will provide taxa-specific results including filtering out genes that are almost ubiquitous in the taxa (intrinsic genes) and identifying resistance-associated point mutations. In TheiaProk, the taxon used by AMRFinderPlus is specified based on the `gambit_predicted_taxon` or a user-provided `expected_taxon`. + +You can check if a gene or point mutation is in the AMRFinderPlus database [here](https://www.ncbi.nlm.nih.gov/pathogens/refgene/#), find the sequences of reference genes [here](https://www.ncbi.nlm.nih.gov/bioproject/PRJNA313047), and search the query Hidden Markov Models (HMMs) used by AMRFinderPlus to identify AMR genes and some stress and virulence proteins ([here](https://www.ncbi.nlm.nih.gov/pathogens/hmm/)). The AMRFinderPlus database is updated frequently. You can ensure you are using the most up-to-date version by specifying the docker image as a workflow input. You might like to save this docker image as a workspace data element to make this easier. + +### 📋 Use Cases + +- To run ONLY AMRFinderPlus software instead of running the entire TheiaProk workflow. This workflow will run much faster than the TheiaProk workflows. +- To update AMRFinderPlus results when a new version of the software and/or its database are released by the NCBI developers. + +### Inputs + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| amrfinderplus_wf | **assembly** | File | Genome assembly file in FASTA format. Can be generated by TheiaProk workflow or other bioinformatics workflows. 
| | Required | +| amrfinderplus_wf | **samplename** | String | Name of the sample to be analyzed | | Required | +| amrfinderplus_nuc | **cpu** | Int | Number of CPUs to allocate to the task | 8 | Optional | +| amrfinderplus_nuc | **detailed_drug_class** | Boolean | | False | Optional | +| amrfinderplus_nuc | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| amrfinderplus_nuc | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/staphb/ncbi-amrfinderplus:3.11.20-2023-09-26.1" | Optional | +| amrfinderplus_nuc | **hide_point_mutations** | Boolean | If set to true, the output File amrfinderplus_all_report will not include any POINT mutations identified by AMRFinderPlus. | False | Optional | +| amrfinderplus_nuc | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | +| amrfinderplus_nuc | **mincov** | Float | "Minimum proportion of reference gene covered for a BLAST-based hit (Methods BLAST or PARTIAL)." Attribute should be a float ranging from 0-1, such as 0.6 (equal to 60% coverage) | 0.5 | Optional | +| amrfinderplus_nuc | **minid** | Float | "Minimum identity for a blast-based hit (Methods BLAST or PARTIAL). -1 means use a curated threshold if it exists and 0.9 otherwise. Setting this value to something other than -1 will override any curated similarity cutoffs." Attribute should be a float ranging from 0-1, such as 0.95 (equal to 95% identity) | 0.90 | Optional | +| amrfinderplus_nuc | **organism** | String | If provided, this input will override the taxonomic assignment made by GAMBIT and launch the relevant taxon-specific submodules. It will also modify the organism flag used by AMRFinderPlus.
Example format: "Salmonella enterica" | | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| amrfinderplus_all_report | File | Output TSV file from AMRFinderPlus (described [here](https://github.com/ncbi/amr/wiki/Running-AMRFinderPlus#fields)) | +| amrfinderplus_amr_classes | String | AMRFinderPlus predictions for classes of drugs that genes found in the reads are known to confer resistance to | +| amrfinderplus_amr_core_genes | String | AMR genes identified by AMRFinderPlus where the scope is "core" | +| amrfinderplus_amr_plus_genes | String | AMR genes identified by AMRFinderPlus where the scope is "plus" | +| amrfinderplus_amr_report | File | TSV file detailing AMR genes only, from the amrfinderplus_all_report | +| amrfinderplus_amr_subclasses | String | More specificity about the drugs that genes identified in the reads confer resistance to | +| amrfinderplus_db_version | String | AMRFinderPlus database version used | +| amrfinderplus_stress_genes | String | Stress genes identified by AMRFinderPlus | +| amrfinderplus_stress_report | File | TSV file detailing stress genes only, from the amrfinderplus_all_report | +| amrfinderplus_version | String | AMRFinderPlus software version used | +| amrfinderplus_virulence_genes | String | Virulence genes identified by AMRFinderPlus | +| amrfinderplus_virulence_report | File | TSV file detailing virulence genes only, from the amrfinderplus_all_report | +| amrfinderplus_wf_analysis_date | String | Date of analysis | +| amrfinderplus_wf_version | String | Version of PHB used for the analysis | + +## References + +>Feldgarden M, Brover V, Gonzalez-Escalona N, Frye JG, Haendiges J, Haft 
DH, Hoffmann M, Pettengill JB, Prasad AB, Tillman GE, Tyson GH, Klimke W. AMRFinderPlus and the Reference Gene Catalog facilitate examination of the genomic links among antimicrobial resistance, stress response, and virulence. Sci Rep. 2021 Jun 16;11(1):12728. doi: 10.1038/s41598-021-91456-0. PMID: 34135355; PMCID: PMC8208984. + +> \ No newline at end of file diff --git a/docs/workflows/standalone/rasusa.md b/docs/workflows/standalone/rasusa.md new file mode 100644 index 000000000..055b235f5 --- /dev/null +++ b/docs/workflows/standalone/rasusa.md @@ -0,0 +1,68 @@ +# RASUSA + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Standalone](../../workflows_overview/workflows_type.md/#standalone) | [Any Taxa](../../workflows_overview/workflows_kingdom.md/#any-taxa) | PHB v2.0.0 | Yes | Sample-level | + +## RASUSA_PHB + +RASUSA functions to randomly downsample the number of raw reads to a user-defined threshold. + +### 📋 Use Cases + +- to reduce computing resources when samples end up with drastically more data than needed to perform analyses +- to perform limit of detection (LOD) studies to identify appropriate minimum coverage thresholds required to perform downstream analyses + +### 🔧 Desired size may be specified by inputting any one of the following + +- coverage (e.g. 20X) +- number of bases (e.g. "5m" for 5 megabases) +- number of reads (e.g. 100000 total reads) +- fraction of reads (e.g. 0.5 samples half the reads) + +!!! info "Call-caching disabled" + If using RASUSA_PHB workflow version v2.0.0 or higher, **the call-caching feature of Terra has been DISABLED to ensure that the workflow is run from the beginning and data is downloaded fresh.** Call-caching will not be enabled, even if the user checks the box ✅ in the Terra workflow interface. 
+ +### Inputs + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Attribute** | **Terra Status** | +|---|---|---|---|---|---| +| rasusa_workflow | **coverage** | Float | Use to specify the desired coverage of reads after downsampling; actual coverage of subsampled reads will not be exact and may be slightly higher; always check the estimated clean coverage after performing downstream workflows to verify coverage values, when necessary | | Required | +| rasusa_workflow | **genome_length** | String | Input the approximate genome size expected in quotations; this is used to determine the number of bases required to achieve the desired coverage; acceptable metric suffixes include: `b`, `k`, `m`, `g`, and `t` for base, kilo, mega, giga, and tera, respectively | | Required | +| rasusa_workflow | **read1** | File | FASTQ file containing read1 sequences | | Required | +| rasusa_workflow | **read2** | File | FASTQ file containing read2 sequences | | Required | +| rasusa_workflow | **samplename** | String | Name of the sample to be analyzed | | Required | +| rasusa_task | **bases** | String | Explicitly define the number of bases required in the downsampled reads in quotations; when used, genome size and coverage are ignored; acceptable metric suffixes include: `b`, `k`, `m`, `g`, and `t` for base, kilo, mega, giga, and tera, respectively | | Optional | +| rasusa_task | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| rasusa_task | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| rasusa_task | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/staphb/rasusa:0.7.0" | Optional | +| rasusa_task | **frac** | Float | Explicitly define the fraction of reads to keep in the subsample; when used, genome size and coverage are ignored; acceptable inputs include whole numbers and decimals, e.g. 
50.0 will leave 50% of the reads in the subsample | | Optional | +| rasusa_task | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| rasusa_task | **num** | Int | Optional: explicitly define the number of reads in the subsample; when used, genome size and coverage are ignored; acceptable metric suffixes include: `b`, `k`, `m`, `g`, and `t` for base, kilo, mega, giga, and tera, respectively | | Optional | +| rasusa_task | **seed** | Int | Use to assign a name to the "random seed" that is used by the subsampler; i.e. this allows the exact same subsample to be produced from the same input file/s in subsequent runs when providing the seed identifier; do not input values for random downsampling | | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| rasusa_version | String | Version of RASUSA used for the analysis | +| rasusa_wf_analysis_date | String | Date of analysis | +| rasusa_wf_version | String | Version of PHB used for the analysis | +| read1_subsampled | File | New read1 FASTQ files downsampled to desired coverage | +| read2_subsampled | File | New read2 FASTQ files downsampled to desired coverage | + +!!! tip "Don't Forget!" + Remember to use the subsampled reads in downstream analyses with `this.read1_subsampled` and `this.read2_subsampled` inputs. + +!!! info "Verify" + Confirm reads were successfully subsampled before downstream analyses by comparing read file size/s to the original read file size/s + + _View file sizes by clicking on the read file listed in the Terra data table and looking at the file size_ + +## References + +> Hall, M. B., (2022). 
Rasusa: Randomly subsample sequencing reads to a specified coverage. Journal of Open Source Software, 7(69), 3941,  diff --git a/docs/workflows/standalone/rename_fastq.md b/docs/workflows/standalone/rename_fastq.md new file mode 100644 index 000000000..19ec4b4a6 --- /dev/null +++ b/docs/workflows/standalone/rename_fastq.md @@ -0,0 +1,36 @@ +# Rename_FASTQ + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Standalone](../../workflows_overview/workflows_type.md/#standalone) | [Any Taxa](../../workflows_overview/workflows_kingdom.md/#any-taxa) | PHB v2.1.0 | Yes | Sample-level | + +## Rename_FASTQ_PHB + +This sample-level workflow receives a read file or a pair of read files (FASTQ), compressed or uncompressed, and returns a new, renamed and compressed FASTQ file. + +### Inputs + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| rename_fastq_files | **new_filename** | String | New name for the FASTQ file(s) | | Required | +| rename_fastq_files | **read1** | File | FASTQ file containing read1 sequences | | Required | +| rename_fastq_files | **read2** | File | FASTQ file containing read2 sequences | | Optional | +| rename_PE_files or rename_SE_files | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| rename_PE_files or rename_SE_files | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| rename_PE_files or rename_SE_files | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/ubuntu/ubuntu:jammy-20230816" | Optional | +| rename_PE_files or rename_SE_files | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | 
"us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Outputs + +If a reverse read (`read2`) is provided, the files get renamed to the provided `new_filename` input with the notation `_R1.fastq.gz` and `_R2.fastq.gz`. If only `read1` is provided, the file is renamed to `.fastq.gz`. + +| **Variable** | **Type** | **Description** | +|---|---|---| +| read1_renamed | File | New read1 FASTQ file renamed to desired filename | +| read2_renamed | File | New read2 FASTQ file renamed to desired filename | +| rename_fastq_files_analysis_date | String | Date of analysis | +| rename_fastq_files_version | String | Version of PHB used for the analysis | diff --git a/docs/workflows/standalone/tbprofiler_tngs.md b/docs/workflows/standalone/tbprofiler_tngs.md new file mode 100644 index 000000000..3e505cabb --- /dev/null +++ b/docs/workflows/standalone/tbprofiler_tngs.md @@ -0,0 +1,97 @@ +# TBProfiler_tNGS + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Standalone](../../workflows_overview/workflows_type.md/#standalone) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria) | PHB v2.0.0 | Yes | Sample-level | + +## TBProfiler_tNGS_PHB + +This workflow is still in experimental research stages. Documentation is minimal as changes may occur in the code; it will be fleshed out when a stable state has been achieved. 
+ +### Inputs + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| tbprofiler_tngs | **read1** | File | Illumina forward read file in FASTQ file format (compression optional) | | Required | +| tbprofiler_tngs | **read2** | File | Illumina reverse read file in FASTQ file format (compression optional) | | Required | +| tbprofiler_tngs | **samplename** | String | Name of sample to be analyzed | | Required | +| tbp_parser | **coverage_regions_bed** | File | A file that contains the regions to perform coverage analysis on | | Optional | +| tbp_parser | **coverage_threshold** | Int | The minimum percentage of a region to exceed the minimum depth for a region to pass QC in tbp_parser | 100 | Optional | +| tbp_parser | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| tbp_parser | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| tbp_parser | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:1.6.0 | Optional | +| tbp_parser | **etha237_frequency** | Float | Minimum frequency for a mutation in ethA at protein position 237 to pass QC in tbp-parser | 0.1 | Optional | +| tbp_parser | **expert_rule_regions_bed** | File | A file that contains the regions where R mutations and expert rules are applied | | Optional | +| tbp_parser | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | +| tbp_parser | **min_depth** | Int | Minimum depth for a variant to pass QC in tbp_parser | 10 | Optional | +| tbp_parser | **min_frequency** | Float | Minimum allele frequency for a variant to pass QC in tbp-parser | 0.1 | Optional | +| tbp_parser | **min_read_support** | Int | Minimum read support for a variant to pass QC in tbp-parser | 10 | Optional | +| tbp_parser | **operator** | String | Fills the "operator" field in the tbp_parser 
output files | | Optional | +| tbp_parser | **rpob449_frequency** | Float | Minimum frequency for a mutation at protein position 449 to pass QC in tbp-parser | 0.1 | Optional | +| tbp_parser | **rrl_frequency** | Float | Minimum frequency for a mutation in rrl to pass QC in tbp-parser | 0.1 | Optional | +| tbp_parser | **rrl_read_support** | Int | Minimum read support for a mutation in rrl to pass QC in tbp-parser | 10 | Optional | +| tbp_parser | **rrs_frequency** | Float | Minimum frequency for a mutation in rrs to pass QC in tbp-parser | 0.1 | Optional | +| tbp_parser | **rrs_read_support** | Int | Minimum read support for a mutation in rrs to pass QC in tbp-parser | 10 | Optional | +| tbp_parser | **sequencing_method** | String | Fills out the "seq_method" field in the tbp_parser output files | | Optional | +| tbp_parser | **tbp_parser_debug** | Boolean | Activate the debug mode on tbp_parser; increases logging outputs | FALSE | Optional | +| tbprofiler | **cov_frac_threshold** | Int | A cutoff used to calculate the fraction of the region covered by ≤ this value | 1 | Optional | +| tbprofiler | **cpu** | Int | Number of CPUs to allocate to the task | 8 | Optional | +| tbprofiler | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| tbprofiler | **mapper** | String | The mapping tool used in TBProfiler to align the reads to the reference genome; see TBProfiler's original documentation for available options. | bwa | Optional | +| tbprofiler | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | +| tbprofiler | **min_af** | Float | The minimum allele frequency to call a variant | 0.1 | Optional | +| tbprofiler | **min_af_pred** | Float | The minimum allele frequency to use a variant for resistance prediction | 0.1 | Optional | +| tbprofiler | **min_depth** | Int | The minimum depth for a variant to be called. 
| 10 | Optional | +| tbprofiler | **ont_data** | Boolean | Internal component; do not modify | | Do not modify, Optional | +| tbprofiler | **tbprofiler_custom_db** | File | TBProfiler uses by default the TBDB database; if you have a custom database you wish to use, you must provide a custom database in this field and set tbprofiler_run_custom_db to true | | Optional | +| tbprofiler | **tbprofiler_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/tbprofiler:4.4.2 | Optional | +| tbprofiler | **tbprofiler_run_custom_db** | Boolean | | FALSE | Optional | +| tbprofiler | **variant_caller** | String | Select a different variant caller for TBProfiler to use by writing it in this block; see TBProfiler's original documentation for available options. | freebayes | Optional | +| tbprofiler | **variant_calling_params** | String | Enter additional variant calling parameters in this free text input to customize how the variant caller works in TBProfiler | | Optional | +| tbprofiler | **bases_to_crop** | Int | Indicate the number of bases to remove from the start and end of the read | 30 | Optional | +| trimmomatic_pe | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| trimmomatic_pe | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| trimmomatic_pe | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/trimmomatic:0.39 | Optional | +| trimmomatic_pe | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| trimmomatic_pe | **trimmomatic_args** | String | Additional arguments to pass to trimmomatic. "-phred33" specifies the Phred Q score encoding which is almost always phred33 with modern sequence data. 
| -phred33 | Optional | +| trimmomatic_pe | **trimmomatic_min_length** | Int | Specifies minimum length of each read after trimming to be kept | 75 | Optional | +| trimmomatic_pe | **trimmomatic_quality_trim_score** | Int | The trimming quality score | 30 | Optional | +| trimmomatic_pe | **trimmomatic_window_size** | Int | The window size for trimming | 4 | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Terra Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| tbp_parser_average_genome_depth | Float | The mean depth of coverage across all target regions included in the analysis | +| tbp_parser_coverage_report | File | A file containing the breadth of coverage across each target loci | +| tbp_parser_docker | String | The docker image and version tag for the tbp_parser tool | +| tbp_parser_genome_percent_coverage | Float | The percent breadth of coverage across the entire genome | +| tbp_parser_laboratorian_report_csv | File | An output file containing information regarding each mutation and its associated drug resistance profile in a CSV file. This file also contains two interpretation fields -- "Looker" and "MDL" which are generated using the CDC's expert rules for interpreting the severity of potential drug resistance mutations. | +| tbp_parser_lims_report_csv | File | An output file formatted specifically for STAR LIMS. This CSV report summarizes the highest severity mutations for each antimicrobial and lists the relevant mutations for each gene. | +| tbp_parser_looker_report_csv | File | An output file that contains condensed information suitable for generating a dashboard in Google's Looker studio. 
| +| tbp_parser_version | String | The version number of tbp_parser | +| tbprofiler_dr_type | String | The drug resistance category as determined by TBProfiler | +| tbprofiler_main_lineage | String | The Mycobacterium tuberculosis lineage assignment as made by TBProfiler | +| tbprofiler_median_coverage | Int | The median depth of coverage across the target loci | +| tbprofiler_num_dr_variants | String | The total number of drug resistance conferring variants detected by TBProfiler | +| tbprofiler_num_other_variants | String | The total number of non-drug resistance conferring variants detected by TBProfiler | +| tbprofiler_output_alignment_bai | File | The index file associated with the binary alignment map of the input reads against the H37Rv genome | +| tbprofiler_output_alignment_bam | File | The binary alignment map of the input reads against the H37Rv genome | +| tbprofiler_pct_reads_mapped | Float | The percentage of reads that successfully mapped to the H37Rv genome | +| tbprofiler_report_csv | File | The raw output file from TBProfiler | +| tbprofiler_report_json | File | The json output file from TBProfiler | +| tbprofiler_report_tsv | File | The TSV output file from TBProfiler | +| tbprofiler_resistance_genes | String | The genes in which a mutation was detected that may be resistance conferring | +| tbprofiler_sub_lineage | String | The Mycobacterium tuberculosis sub-lineage assignment as made by TBProfiler | +| tbprofiler_tngs_wf_analysis_date | String | The date on which the workflow was run | +| tbprofiler_tngs_wf_version | String | The version of the tbprofiler_tngs workflow used for this analysis | +| tbprofiler_version | String | The version of TBProfiler used for this analysis | +| trimmomatic_docker | String | The docker image used for the trimmomatic module in this workflow | +| trimmomatic_read1_trimmed | File | The read1 file post trimming | +| trimmomatic_read2_trimmed | File | The read2 file post trimming | +| trimmomatic_stats | File | The 
read trimming statistics | +| trimmomatic_version | String | The version of trimmomatic used in this analysis | diff --git a/docs/workflows/standalone/theiavalidate.md b/docs/workflows/standalone/theiavalidate.md new file mode 100644 index 000000000..9aee82b32 --- /dev/null +++ b/docs/workflows/standalone/theiavalidate.md @@ -0,0 +1,158 @@ +# TheiaValidate + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Standalone](../../workflows_overview/workflows_type.md/#standalone) | [Any Taxa](../../workflows_overview/workflows_kingdom.md/#any-taxa) | PHB v2.0.0 | No | | + +## TheiaValidate_PHB + +!!! caption "TheiaValidate Workflow Diagram" + ![TheiaValidate Workflow Diagram](../../assets/figures/TheiaValidate.png) + +TheiaValidate performs basic comparisons between user-designated columns in two separate tables. We anticipate this workflow being run to determine if any differences exist between version releases or two workflows, such as TheiaProk_ONT vs TheiaProk_Illumina_PE. A summary PDF report is produced in addition to an Excel spreadsheet that lists the values for any columns that do not have matching content for a sample. + +!!! warning + The two tables being compared **must** have both identical sample names and an equal number of samples. If not, validation will not work or (in the case of unequal number of samples) not be attempted. + +In order to enable this workflow to function for different workflow series, we require users to provide a list of columns they want to compare between the two tables. Feel free to use the information below that Theiagen uses to compare versions of the three main workflow series as a _**starting point**_ for your own validations: + +!!!
tool "Validation Starting Points" + | Workflow Series | Validation Criteria TSV | Columns to Compare | + |---|---|---| + | TheiaCoV Workflows | [TheiaCov Validation Criteria](../../assets/files/theiavalidate/theiacov-validation-criteria.txt) | abricate_flu_subtype,abricate_flu_type,assembly_length_unambiguous,assembly_mean_coverage,irma_subtype,irma_type,kraken_human,kraken_human_dehosted,kraken_sc2,kraken_sc2_dehosted,kraken_target_org,kraken_target_org_dehosted,nextclade_aa_dels,nextclade_aa_subs,nextclade_clade,nextclade_lineage,nextclade_tamiflu_resistance_aa_subs,num_reads_clean1,num_reads_clean2,number_N,pango_lineage,percent_reference_coverage,vadr_num_alerts | + | TheiaEuk Workflows | [TheiaEuk Validation Criteria](../../assets/files/theiavalidate/theiaeuk-validation-criteria.txt) | assembly_length,busco_results,clade_type,est_coverage_clean,est_coverage_raw,gambit_predicted_taxon,n50_value,num_reads_clean1,num_reads_clean2,number_contigs,quast_gc_percent,theiaeuk_snippy_variants_hits | + | TheiaProk Workflows | [TheiaProk Validation Criteria](../../assets/files/theiavalidate/theiaprok-validation-criteria.txt) | 
abricate_abaum_plasmid_type_genes,agrvate_agr_group,amrfinderplus_amr_core_genes,amrfinderplus_amr_plus_genes,amrfinderplus_stress_genes,amrfinderplus_virulence_genes,ani_highest_percent,ani_top_species_match,assembly_length,busco_results,ectyper_predicted_serotype,emmtypingtool_emm_type,est_coverage_clean,est_coverage_raw,gambit_predicted_taxon,genotyphi_final_genotype,hicap_genes,hicap_serotype,kaptive_k_type,kleborate_genomic_resistance_mutations,kleborate_key_resistance_genes,kleborate_mlst_sequence_type,legsta_predicted_sbt,lissero_serotype,meningotype_serogroup,midas_primary_genus,midas_secondary_genus,midas_secondary_genus_abundance,n50_value,ngmaster_ngmast_sequence_type,ngmaster_ngstar_sequence_type,num_reads_clean1,num_reads_clean2,number_contigs,pasty_serogroup,pbptyper_predicted_1A_2B_2X,plasmidfinder_plasmids,poppunk_gps_cluster,seqsero2_predicted_serotype,seroba_ariba_serotype,seroba_serotype,serotypefinder_serotype,shigatyper_ipaB_presence_absence,shigatyper_predicted_serotype,shigeifinder_cluster,shigeifinder_serotype,sistr_predicted_serotype,sonneityping_final_genotype,spatyper_type,srst2_vibrio_serogroup,staphopiasccmec_types_and_mecA_presence,tbprofiler_main_lineage,tbprofiler_resistance_genes,ts_mlst_predicted_st,virulencefinder_hits | + +If additional validation metrics are desired, the user has the ability to provide a `validation_criteria_tsv` file that specifies what type of comparison should be performed. There are several options for additional validation checks: + +- **EXACT** performs an exact string match and counts the number of exact match failures/differences +- **IGNORE** does not check the values and says there are 0 failures +- **SET** checks list items (such as `amrfinder_plus_genes` which is a comma-delimited list of genes) for identical content — order does not matter; that is, `mdsA,mdsB` is determined to be same as `mdsB,mdsA`. The EXACT match does not consider these to be the same, but the SET match does. 
+- **a decimal percentage**, which is an actual decimal value such as **0.02**, calculates the percent difference between _numerical_ columns. If the columns are not numerical, this function will **not** work and will lead to workflow failure. For example, if the decimal percentage is 0.02, the test will indicate a failure if the values in the two columns are more than 2% different. +- Dates, integers, and object-type values are ignored and indicate 0 failures. + +### File Comparisons + +If a column consists of only GCP URIs (Google Cloud file paths), the files will be localized and compared with either an EXACT match or a SET match. In the SET match, the lines in the file are ordered before comparison. Results are returned to the summary table as expected. The results of each file comparison can be found in the `theiavalidate_diffs` output column. + +### Inputs + +Please note that all string inputs **must** be enclosed in quotation marks; for example, "column1,column2" or "workspace1" + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| theiavalidate | **columns_to_compare** | String | A comma-separated list of the columns the user wants to compare. Do not include whitespace.
| | Required | +| theiavalidate | **output_prefix** | String | The prefix for the output files | | Required | +| theiavalidate | **table1_name** | String | The name of the first table | | Required | +| theiavalidate | **table2_name** | String | The name of the second table | | Required | +| theiavalidate | **terra_project1_name** | String | The name of the Terra project where table1_name can be found | | Required | +| theiavalidate | **terra_workspace1_name** | String | The name of the Terra workspace where table1_name can be found | | Required | +| theiavalidate | **column_translation_tsv** | File | If the user wants to link two columns of different names, they may supply a TSV file that provides a "column translation" between the two files (see the section below this table). | | Optional | +| theiavalidate | **terra_project2_name** | String | If the table2_name is located in a different Terra project, indicate it here. Otherwise, the workflow will look for table2_name in the Terra project indicated in terra_project1_name. | value for `terra_project1_name` | Optional | +| theiavalidate | **terra_workspace2_name** | String | If the table2_name is located in a different Terra workspace, indicate it here. Otherwise, the workflow will look for table2_name in the Terra workspace indicated in terra_workspace1_name. | value for `terra_workspace1_name` | Optional | +| theiavalidate | **validation_criteria_tsv** | File | If the user wants to specify a different comparison than the default exact string match, they may supply a TSV file that indicates the different options (see the section below this table). 
| | Optional | +| compare_two_tsvs | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| compare_two_tsvs | **debug_output** | Boolean | Set to true to enable more outputs; useful when debugging | FALSE | Optional | +| compare_two_tsvs | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| compare_two_tsvs | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/theiavalidate:0.1.0 | Optional | +| compare_two_tsvs | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | +| compare_two_tsvs | **na_values** | String | If the user knows a particular value in either table that they would like to be considered N/A, they can indicate those values in a comma-separated list here. Any changes here will overwrite the default and not append to the default list. Do not include whitespace. | -1.#IND,1.#QNAN,1.#IND,-1.#QNAN,#N/A,N/A,n/a,,#NA,NULL,null,NaN,-NaN,nan,-nan,None | Optional | +| export_two_tsvs | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| export_two_tsvs | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 10 | Optional | + +The optional `validation_criteria_tsv` file takes the following format (tab-delimited; _a header line is required_): + +```text linenums="1" +column_name criteria +columnB SET +columnC IGNORE +columnD 0.01 +columnE EXACT +``` + +Please see above for a description of all available criteria options (EXACT, IGNORE, SET, and a decimal percentage). + +The optional `column_translation_tsv` file takes the following format (tab-delimited; _there can be **no** header line_): + +```text linenums="1" +column_name_in_table1 column_name_in_table2 +column_name_in_table2 column_name_in_table1 +internal_column_name display_column_name +``` + +Please note that the name in the **second column** will be displayed and used in all output files. + +!!!
warning "Known Bug" + There must be _**more**_ than one line in the `column_translation_tsv` file or else this error will appear: `AttributeError: 'str' object has no attribute 'to_dict'`. To fix this error, add an additional line in the `column_translation_tsv` file, like the following: `columnA columnA` + +!!! warning "Known Bug" + If performing a comparison, all samples must have values for that column. + +!!! info "Call Caching Disabled" + If using TheiaValidate workflow version 1.3.0 or higher, the call-caching feature of Terra has been DISABLED to ensure that the workflow is run from the beginning and data is compared fresh. Call-caching will not be enabled, even if the user checks the box ✅ in the Terra workflow interface. + +### Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| theiavalidate_criteria_differences | File | A TSV file that lists only the differences that fail to meet the validation criteria | +| theiavalidate_date | String | The date the analysis was run | +| theiavalidate_diffs | Array[File] | An array of files with a single file for each file comparison performed; only has values if a column with files is compared | +| theiavalidate_exact_differences | File | A TSV file that lists all exact string match differences between samples | +| theiavalidate_filtered_input_table1 | File | The first data table used for validation after removing unexamined columns and translating column names | +| theiavalidate_filtered_input_table2 | File | The second data table used for validation after removing unexamined columns and translating column names | +| theiavalidate_report | File | A PDF summary report | +| theiavalidate_status | String | Indicates whether or not validation was attempted | +| theiavalidate_version | String | The version of the TheiaValidate Python Docker | +| theiavalidate_wf_version | String | The version of the PHB repository | + +### Example Data and Outputs + +To help demonstrate how TheiaValidate works, 
please observe the following example and outputs: + +???+ toggle "Table1" + | entity:example_table1_id | columnA-string | columnB-set | columnC-ignore | columnD-float | columnE-missing | + | --- | --- | --- | --- | --- | --- | + | sample1 | option1 | item1,item2,item3 | cheese | 1000 | present | + | sample2 | option1 | item1,item3,item2 | cheesecake | 12 | present | + | sample3 | option2 | item1,item2,item3 | cake | 14 | present | + | sample4 | option1 | item2,item1 | cakebatter | 3492 | | + | sample5 | option2 | item1,item2 | batter | 3 | present | + +???+ toggle "Table2" + | entity:example_table2_id | columnA-string | columnB-set | columnC-ignore | columnD-float | missing | + | --- | --- | --- | --- | --- | --- | + | sample1 | option1 | item1,item3,item2 | cheesecake | 999 | present | + | sample2 | option2 | item1,item2,item3 | batter | 12 | present | + | sample3 | option1 | item1,item2 | cheese | 24 | | + | sample4 | option1 | item1,item2 | cakebatter | 728 | | + | sample5 | option2 | item1,item2,item3 | batter | 4 | present | + +???+ toggle "Validation Criteria" + | column | criteria | + | --- | --- | + | columnB-set | SET | + | columnC-ignore | IGNORE | + | columnD-float | 0.01 | + | columnE-missing | EXACT | + +???+ toggle "Column Translation" + | missing | columnE-missing | + | --- | --- | + | columnA-string | columnA-string | + + _Note: the second row translating_ `columnA-string` _to itself is included to prevent the known bug explained above._ + +If the above inputs are provided, then the following output files will be generated: + +[filtered_example_table1.tsv](../../assets/files/theiavalidate/filtered_example_table1.tsv) + +[filtered_example_table2.tsv](../../assets/files/theiavalidate/filtered_example_table2.tsv) + +[example_summary.pdf](../../assets/files/theiavalidate/example_summary.pdf) + +[example_exact_differences.tsv](../../assets/files/theiavalidate/example_exact_differences.tsv) + 
+[example_validation_criteria_differences.tsv](../../assets/files/theiavalidate/example_validation_criteria_differences.tsv) \ No newline at end of file diff --git a/docs/workflows_overview/workflows_alphabetically.md b/docs/workflows_overview/workflows_alphabetically.md new file mode 100644 index 000000000..0111bab8b --- /dev/null +++ b/docs/workflows_overview/workflows_alphabetically.md @@ -0,0 +1,65 @@ +--- +title: Alphabetical Workflows +--- + +[Sort by Workflow Type](workflows_type.md) | [Sort by Kingdom](workflows_kingdom.md) + +--- + +| **Name** | **Description** | **Applicable Kingdom** | **Workflow Level** | **Command-line Compatibility**[^1] | **Last Known Changes** | **Dockstore** | +|---|---|---|---|---|---|---| +| [**Assembly_Fetch**](../workflows/data_import/assembly_fetch.md) | Download assemblies from NCBI, after optionally identifying the closest RefSeq reference genome to your own draft assembly | Any taxa | Sample-level | Yes | v1.3.0 | [Assembly_Fetch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Assembly_Fetch_PHB:main?tab=info) | +| [**Augur**](../workflows/phylogenetic_construction/augur.md) | Phylogenetic analysis for viral pathogens | Viral | Sample-level, Set-level | Yes | v2.1.0 | [Augur_Prep_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Augur_Prep_PHB:main?tab=info), [Augur_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Augur_PHB:main?tab=info) | +| [**BaseSpace_Fetch**](../workflows/data_import/basespace_fetch.md)| Import data from BaseSpace into Terra | Any taxa | Sample-level | Yes | v2.0.0 | [BaseSpace_Fetch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/BaseSpace_Fetch_PHB:main?tab=info) | +| [**Cauris_CladeTyper**](../workflows/standalone/cauris_cladetyper.md)| C. 
auris clade assignment | Mycotics | Sample-level | Yes | v1.0.0 | [Cauris_CladeTyper_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Cauris_CladeTyper_PHB:main?tab=info) | +| [**Concatenate_Column_Content**](../workflows/data_export/concatenate_column_content.md) | Concatenate contents of a specified Terra data table column for many samples ("entities") | Any taxa | Set-level | Yes | v2.1.0 | [Concatenate_Column_Content_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Concatenate_Column_Content_PHB:main?tab=info) | +| [**Core_Gene_SNP**](../workflows/phylogenetic_construction/core_gene_snp.md) | Pangenome analysis | Bacteria | Set-level | Some optional features incompatible, Yes | v2.1.0 | [Core_Gene_SNP_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Core_Gene_SNP_PHB:main?tab=info) | +| [**Create_Terra_Table**](../workflows/data_import/create_terra_table.md)| Upload data to Terra and then run this workflow to have the table automatically created | Any taxa | | Yes | v2.2.0 | [Create_Terra_Table_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Create_Terra_Table_PHB:main?tab=info) | +| [**CZGenEpi_Prep**](../workflows/phylogenetic_construction/czgenepi_prep.md)| Prepare metadata and fasta files for easy upload to the CZ GEN EPI platform. 
| Monkeypox virus, SARS-CoV-2, Viral | Set-level | No | v1.3.0 | [CZGenEpi_Prep_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/CZGenEpi_Prep_PHB:main?tab=info) | +| [**Find_Shared_Variants**](../workflows/phylogenetic_construction/find_shared_variants.md)| Combines and reshapes variant data from Snippy_Variants to illustrate variants shared across multiple samples | Bacteria, Mycotics | Set-level | Yes | v2.0.0 | [Find_Shared_Variants_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Find_Shared_Variants_PHB:main?tab=info) | +| [**Freyja Workflow Series**](../workflows/genomic_characterization/freyja.md)| Recovers relative lineage abundances from mixed sample data and generates visualizations | SARS-CoV-2, Viral | Sample-level, Set-level | Yes | v2.2.0 | [Freyja_FASTQ_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_FASTQ_PHB:main?tab=info), [Freyja_Plot_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Plot_PHB:main?tab=info), [Freyja_Dashboard_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Dashboard_PHB:main?tab=info), [Freyja_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Update_PHB:main?tab=info) | +| [**GAMBIT_Query**](../workflows/standalone/gambit_query.md)| Taxon identification of genome assembly using GAMBIT | Bacteria, Mycotics | Sample-level | Yes | v2.0.0 | [Gambit_Query_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Gambit_Query_PHB:main?tab=info) | +| [**Kraken2**](../workflows/standalone/kraken2.md) | Taxa identification from reads | Any taxa | Sample-level | Yes | v2.0.0 | [Kraken2_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Kraken2_PE_PHB:main?tab=info), 
[Kraken2_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Kraken2_SE_PHB:main?tab=info) | +| [**kSNP3**](../workflows/phylogenetic_construction/ksnp3.md)| SNP-based phylogenetic analysis from assemblies | Bacteria, Mycotics, Viral | Set-level | Some optional features incompatible, Yes | v2.1.0 | [kSNP3_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/kSNP3_PHB:main?tab=info) | +| [**Lyve_SET**](../workflows/phylogenetic_construction/lyve_set.md)| Alignment of reads to a reference genome, SNP calling, curation of high quality SNPs, phylogenetic analysis | Bacteria | Set-level | Yes | v2.1.0 | [Lyve_SET_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Lyve_SET_PHB:main?tab=info) | +| [**MashTree_FASTA**](../workflows/phylogenetic_construction/mashtree_fasta.md)| Mash-distance based phylogenetic analysis from assemblies | Bacteria, Mycotics, Viral | Set-level | Some optional features incompatible, Yes | v2.1.0 | [MashTree_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/MashTree_FASTA_PHB:main?tab=info) | +| [**Mercury_Prep_N_Batch**](../workflows/public_data_sharing/mercury_prep_n_batch.md)| Prepare metadata and sequence data for submission to NCBI and GISAID | Influenza, Monkeypox virus, SARS-CoV-2, Viral | Set-level | No | v2.2.0 | [Mercury_Prep_N_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Mercury_Prep_N_Batch_PHB:main?tab=info) | +| [**NCBI-AMRFinderPlus**](../workflows/standalone/ncbi_amrfinderplus.md)| Runs NCBI's AMRFinderPlus on genome assemblies (bacterial and fungal) | Bacteria, Mycotics | Sample-level | Yes | v2.0.0 | [NCBI-AMRFinderPlus_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/NCBI-AMRFinderPlus_PHB:main?tab=info) | +| [**Pangolin_Update**](../workflows/genomic_characterization/pangolin_update.md) | Update 
Pangolin assignments | SARS-CoV-2, Viral | Sample-level | Yes | v2.0.0 | [Pangolin_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Pangolin_Update_PHB:main?tab=info) | +| [**RASUSA**](../workflows/standalone/rasusa.md)| Randomly subsample sequencing reads to a specified coverage | Any taxa | Sample-level | Yes | v2.0.0 | [RASUSA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/RASUSA_PHB:main?tab=info) | +| [**Rename_FASTQ**](../workflows/standalone/rename_fastq.md)| Rename paired-end or single-end read files in a Terra data table in a non-destructive way | Any taxa | Sample-level | Yes | v2.1.0 | [Rename_FASTQ_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Rename_FASTQ_PHB:im-utilities-rename-files?tab=info) | +| [**Samples_to_Ref_Tree**](../workflows/phylogenetic_placement/samples_to_ref_tree.md)| Use Nextclade to rapidly and accurately place your samples on any existing phylogenetic tree | Monkeypox virus, SARS-CoV-2, Viral | Sample-level, Set-level | Yes | v2.1.0 | [Samples_to_Ref_Tree_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Samples_to_Ref_Tree_PHB:main?tab=info) | +| [**Snippy_Streamline**](../workflows/phylogenetic_construction/snippy_streamline.md)| Implementation of Snippy workflows for phylogenetic analysis from reads, with optional dynamic reference selection | Bacteria | Set-level | Yes | v2.2.0 | [Snippy_Streamline_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Streamline_PHB:main?tab=info) | +| [**Snippy_Streamline_FASTA**](../workflows/phylogenetic_construction/snippy_streamline_fasta.md)| Implementation of Snippy workflows for phylogenetic analysis from assembled genomes (in FASTA format), with optional dynamic reference selection | Bacteria | Set-level | Yes | v2.2.0 | 
[Snippy_Streamline_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Streamline_FASTA_PHB:im-snippy-fasta-dev?tab=info) | +| [**Snippy_Tree**](../workflows/phylogenetic_construction/snippy_tree.md)| SNP-based phylogenetic analysis from reads, with option to mask recombination | Bacteria | Set-level | Some optional features incompatible, Yes | v2.1.0 | [Snippy_Tree_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Tree_PHB:main?tab=info) | +| [**Snippy_Variants**](../workflows/phylogenetic_construction/snippy_variants.md)| Alignment of reads to a reference genome, then SNP calling | Bacteria, Mycotics, Viral | Sample-level | Yes | v2.2.0 | [Snippy_Variants_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Variants_PHB:main?tab=info) | +| [**SRA_Fetch**](../workflows/data_import/sra_fetch.md)| Import publicly available reads from SRA using SRR#, ERR# or DRR# | Any taxa | Sample-level | Yes | v2.2.0 | [SRA_Fetch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/SRA_Fetch_PHB:main?tab=info) | +| [**TBProfiler_tNGS**](../workflows/standalone/tbprofiler_tngs.md)| Performs in silico antimicrobial susceptibility testing on Mycobacterium tuberculosis targeted-NGS samples with TBProfiler and tbp-parser | Bacteria, TB | Sample-level | Yes | v2.0.0 | [TBProfiler_tNGS_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TBProfiler_tNGS_PHB:smw-tngs-tbprofiler-dev?tab=info) | +| [**Terra_2_GISAID**](../workflows/public_data_sharing/terra_2_gisaid.md)| Upload of assembly data to GISAID | SARS-CoV-2, Viral | Set-level | Yes | v1.2.1 | [Terra_2_GISAID_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_GISAID_PHB:main?tab=info) | +| [**Terra_2_NCBI**](../workflows/public_data_sharing/terra_2_ncbi.md)| Upload of sequence data to NCBI | 
Bacteria, Mycotics, Viral | Set-level | No | v2.1.0 | [Terra_2_NCBI_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_NCBI_PHB:main?tab=info) | +| [**TheiaCov Workflow Series**](../workflows/genomic_characterization/theiacov.md) | Viral genome assembly, QC and characterization from amplicon sequencing | HIV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level, Set-level | Some optional features incompatible, Yes | v2.2.0 | [TheiaCoV_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_PE_PHB:main?tab=info), [TheiaCoV_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_SE_PHB:main?tab=info), [TheiaCoV_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ONT_PHB:main?tab=info), [TheiaCoV_ClearLabs_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ClearLabs_PHB:main?tab=info), [TheiaCoV_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_PHB:main?tab=info), [TheiaCoV_FASTA_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_Batch_PHB:main?tab=info) | +| [**TheiaEuk**](../workflows/genomic_characterization/theiaeuk.md) | Mycotic genome assembly, QC and characterization from WGS data | Mycotics | Sample-level | Some optional features incompatible, Yes | v2.0.1 | [TheiaEuk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaEuk_Illumina_PE_PHB:main?tab=info) | +| [**TheiaMeta**](../workflows/genomic_characterization/theiameta.md) | Genome assembly and QC from metagenomic sequencing | Any taxa | Sample-level | Yes | v2.0.0 | 
[TheiaMeta_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaMeta_Illumina_PE_PHB:main?tab=info) | +| [**TheiaProk Workflow Series**](../workflows/genomic_characterization/theiaprok.md) | Bacterial genome assembly, QC and characterization from WGS data | Bacteria | Sample-level | Some optional features incompatible, Yes | v2.2.0 | [TheiaProk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_PE_PHB:main?tab=info), [TheiaProk_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_SE_PHB:main?tab=info), [TheiaProk_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_ONT_PHB:main?tab=info), [TheiaProk_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_FASTA_PHB:main?tab=info) | +| [**TheiaValidate**](../workflows/standalone/theiavalidate.md)| This workflow performs basic comparisons between user-designated columns in two separate tables. 
| Any taxa | | No | v2.0.0 | [TheiaValidate_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaValidate_PHB:main?tab=info) | +| [**Transfer_Column_Content**](../workflows/data_export/transfer_column_content.md)| Transfer contents of a specified Terra data table column for many samples ("entities") to a GCP storage bucket location | Any taxa | Set-level | Yes | v1.3.0 | [Transfer_Column_Content_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Transfer_Column_Content_PHB:main?tab=info) | +| [**Usher_PHB**](../workflows/phylogenetic_placement/usher.md)| Use UShER to rapidly and accurately place your samples on any existing phylogenetic tree | Monkeypox virus, SARS-CoV-2, Viral | Sample-level, Set-level | Yes | v2.1.0 | [Usher_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Usher_PHB:main?tab=info) | +| [**VADR_Update**](../workflows/genomic_characterization/vadr_update.md)| Update VADR assignments | HAV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level | Yes | v1.2.1 | [VADR_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/VADR_Update_PHB:main?tab=info) | +| [**Zip_Column_Content**](../workflows/data_export/zip_column_content.md)| Zip contents of a specified Terra data table column for many samples ("entities") | Any taxa | Set-level | Yes | v2.1.0 | [Zip_Column_Content_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Zip_Column_Content_PHB:main?tab=info) | + + +*[Sample-level]: This workflow is run once for each sample +*[Set-level]: This workflow is run once on a group of samples + + +*[Any taxa]: This workflow is organism-agnostic and can be run with any taxa +*[Viral]: This workflow is compatible with any viral pathogen +*[Bacteria]: This workflow is compatible with any bacterial pathogen +*[Mycotics]: This workflow is compatible
with mycotic pathogens + + +[^1]: + Command-line compatibility is determined if the workflow can be run on a local command-line environment, providing all dependencies are installed, with either `miniwdl` or `cromwell`. +*[Some optional features incompatible]: Some optional features of this workflow are incompatible with command-line use and require modification +*[Yes]: This workflow is compatible with command-line use +*[No]: This workflow is not compatible with command-line use even with modifications diff --git a/docs/workflows_overview/workflows_kingdom.md b/docs/workflows_overview/workflows_kingdom.md new file mode 100644 index 000000000..095c8a170 --- /dev/null +++ b/docs/workflows_overview/workflows_kingdom.md @@ -0,0 +1,95 @@ +--- +title: Workflows by Kingdom +--- + +[Sort by Type](workflows_type.md) | [Sort Alphabetically](workflows_alphabetically.md) + +--- + +### Any Taxa + + +| **Name** | **Description** | **Taxa** | **Workflow Level** | **Command-line Compatible**[^1] | **Last known changes** | **Dockstore** | +|---|---|---|---|---|---|---| +| [**Assembly_Fetch**](../workflows/data_import/assembly_fetch.md) | Download assemblies from NCBI, after optionally identifying the closest RefSeq reference genome to your own draft assembly | Any taxa | Sample-level | Yes | v1.3.0 | [Assembly_Fetch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Assembly_Fetch_PHB:main?tab=info) | +| [**BaseSpace_Fetch**](../workflows/data_import/basespace_fetch.md)| Import data from BaseSpace into Terra | Any taxa | Sample-level | Yes | v2.0.0 | [BaseSpace_Fetch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/BaseSpace_Fetch_PHB:main?tab=info) | +| [**Concatenate_Column_Content**](../workflows/data_export/concatenate_column_content.md) | Concatenate contents of a specified Terra data table column for many samples ("entities") | Any taxa | Set-level | Yes | v2.1.0 | 
[Concatenate_Column_Content_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Concatenate_Column_Content_PHB:main?tab=info) | +| [**Create_Terra_Table**](../workflows/data_import/create_terra_table.md)| Upload data to Terra and then run this workflow to have the table automatically created | Any taxa | | Yes | v2.2.0 | [Create_Terra_Table_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Create_Terra_Table_PHB:main?tab=info) | +| [**Kraken2**](../workflows/standalone/kraken2.md) | Taxa identification from reads | Any taxa | Sample-level | Yes | v2.0.0 | [Kraken2_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Kraken2_PE_PHB:main?tab=info), [Kraken2_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Kraken2_SE_PHB:main?tab=info) | +| [**RASUSA**](../workflows/standalone/rasusa.md)| Randomly subsample sequencing reads to a specified coverage | Any taxa | Sample-level | Yes | v2.0.0 | [RASUSA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/RASUSA_PHB:main?tab=info) | +| [**Rename_FASTQ**](../workflows/standalone/rename_fastq.md)| Rename paired-end or single-end read files in a Terra data table in a non-destructive way | Any taxa | Sample-level | Yes | v2.1.0 | [Rename_FASTQ_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Rename_FASTQ_PHB:im-utilities-rename-files?tab=info) | +| [**SRA_Fetch**](../workflows/data_import/sra_fetch.md)| Import publicly available reads from SRA using SRR#, ERR# or DRR# | Any taxa | Sample-level | Yes | v2.2.0 | [SRA_Fetch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/SRA_Fetch_PHB:main?tab=info) | +| [**TheiaMeta**](../workflows/genomic_characterization/theiameta.md) | Genome assembly and QC from metagenomic sequencing | Any taxa | Sample-level | Yes | v2.0.0 | 
[TheiaMeta_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaMeta_Illumina_PE_PHB:main?tab=info) | +| [**TheiaValidate**](../workflows/standalone/theiavalidate.md)| This workflow performs basic comparisons between user-designated columns in two separate tables. | Any taxa | | No | v2.0.0 | [TheiaValidate_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaValidate_PHB:main?tab=info) | +| [**Transfer_Column_Content**](../workflows/data_export/transfer_column_content.md)| Transfer contents of a specified Terra data table column for many samples ("entities") to a GCP storage bucket location | Any taxa | Set-level | Yes | v1.3.0 | [Transfer_Column_Content_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Transfer_Column_Content_PHB:main?tab=info) | +| [**Zip_Column_Content**](../workflows/data_export/zip_column_content.md)| Zip contents of a specified Terra data table column for many samples ("entities") | Any taxa | Set-level | Yes | v2.1.0 | [Zip_Column_Content_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Zip_Column_Content_PHB:main?tab=info) | + +### Bacteria + +| **Name** | **Description** | **Applicable Kingdom** | **Workflow Level** | **Command-line Compatibility**[^1] | **Last Known Changes** | **Dockstore** | +|---|---|---|---|---|---|---| +| [**Core_Gene_SNP**](../workflows/phylogenetic_construction/core_gene_snp.md) | Pangenome analysis | Bacteria | Set-level | Some optional features incompatible, Yes | v2.1.0 | [Core_Gene_SNP_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Core_Gene_SNP_PHB:main?tab=info) | +| [**Find_Shared_Variants**](../workflows/phylogenetic_construction/find_shared_variants.md)| Combines and reshapes variant data from Snippy_Variants to illustrate variants shared across multiple samples | Bacteria, Mycotics | Set-level | Yes | v2.0.0 
| [Find_Shared_Variants_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Find_Shared_Variants_PHB:main?tab=info) | +| [**GAMBIT_Query**](../workflows/standalone/gambit_query.md)| Taxon identification of genome assembly using GAMBIT | Bacteria, Mycotics | Sample-level | Yes | v2.0.0 | [Gambit_Query_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Gambit_Query_PHB:main?tab=info) | +| [**kSNP3**](../workflows/phylogenetic_construction/ksnp3.md)| SNP-based phylogenetic analysis from assemblies | Bacteria, Mycotics, Viral | Set-level | Some optional features incompatible, Yes | v2.1.0 | [kSNP3_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/kSNP3_PHB:main?tab=info) | +| [**Lyve_SET**](../workflows/phylogenetic_construction/lyve_set.md)| Alignment of reads to a reference genome, SNP calling, curation of high quality SNPs, phylogenetic analysis | Bacteria | Set-level | Yes | v2.1.0 | [Lyve_SET_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Lyve_SET_PHB:main?tab=info) | +| [**MashTree_FASTA**](../workflows/phylogenetic_construction/mashtree_fasta.md)| Mash-distance based phylogenetic analysis from assemblies | Bacteria, Mycotics, Viral | Set-level | Some optional features incompatible, Yes | v2.1.0 | [MashTree_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/MashTree_FASTA_PHB:main?tab=info) | +| [**NCBI-AMRFinderPlus**](../workflows/standalone/ncbi_amrfinderplus.md)| Runs NCBI's AMRFinderPlus on genome assemblies (bacterial and fungal) | Bacteria, Mycotics | Sample-level | Yes | v2.0.0 | [NCBI-AMRFinderPlus_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/NCBI-AMRFinderPlus_PHB:main?tab=info) | +| [**Snippy_Streamline**](../workflows/phylogenetic_construction/snippy_streamline.md)| Implementation of Snippy workflows for phylogenetic analysis 
from reads, with optional dynamic reference selection | Bacteria | Set-level | Yes | v2.2.0 | [Snippy_Streamline_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Streamline_PHB:main?tab=info) | +| [**Snippy_Streamline_FASTA**](../workflows/phylogenetic_construction/snippy_streamline_fasta.md)| Implementation of Snippy workflows for phylogenetic analysis from assembled genomes (in FASTA format), with optional dynamic reference selection | Bacteria | Set-level | Yes | v2.2.0 | [Snippy_Streamline_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Streamline_FASTA_PHB:im-snippy-fasta-dev?tab=info) | +| [**Snippy_Tree**](../workflows/phylogenetic_construction/snippy_tree.md)| SNP-based phylogenetic analysis from reads, with option to mask recombination | Bacteria | Set-level | Some optional features incompatible, Yes | v2.1.0 | [Snippy_Tree_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Tree_PHB:main?tab=info) | +| [**Snippy_Variants**](../workflows/phylogenetic_construction/snippy_variants.md)| Alignment of reads to a reference genome, then SNP calling | Bacteria, Mycotics, Viral | Sample-level | Yes | v2.2.0 | [Snippy_Variants_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Variants_PHB:main?tab=info) | +| [**TBProfiler_tNGS**](../workflows/standalone/tbprofiler_tngs.md)| Performs in silico antimicrobial susceptibility testing on Mycobacterium tuberculosis targeted-NGS samples with TBProfiler and tbp-parser | Bacteria, TB | Sample-level | Yes | v2.0.0 | [TBProfiler_tNGS_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TBProfiler_tNGS_PHB:smw-tngs-tbprofiler-dev?tab=info) | +| [**Terra_2_NCBI**](../workflows/public_data_sharing/terra_2_ncbi.md)| Upload of sequence data to NCBI | Bacteria, Mycotics, Viral | Set-level | No | v2.1.0 | 
[Terra_2_NCBI_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_NCBI_PHB:main?tab=info) | +| [**TheiaProk Workflow Series**](../workflows/genomic_characterization/theiaprok.md) | Bacterial genome assembly, QC and characterization from WGS data | Bacteria | Sample-level | Some optional features incompatible, Yes | v2.2.0 | [TheiaProk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_PE_PHB:main?tab=info), [TheiaProk_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_SE_PHB:main?tab=info), [TheiaProk_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_ONT_PHB:main?tab=info), [TheiaProk_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_FASTA_PHB:main?tab=info) | + +### Mycotics + +| **Name** | **Description** | **Applicable Kingdom** | **Workflow Level** | **Command-line Compatibility**[^1] | **Last Known Changes** | **Dockstore** | +|---|---|---|---|---|---|---| +| [**Cauris_CladeTyper**](../workflows/standalone/cauris_cladetyper.md)| C. 
auris clade assignment | Mycotics | Sample-level | Yes | v1.0.0 | [Cauris_CladeTyper_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Cauris_CladeTyper_PHB:main?tab=info) | +| [**Find_Shared_Variants**](../workflows/phylogenetic_construction/find_shared_variants.md)| Combines and reshapes variant data from Snippy_Variants to illustrate variants shared across multiple samples | Bacteria, Mycotics | Set-level | Yes | v2.0.0 | [Find_Shared_Variants_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Find_Shared_Variants_PHB:main?tab=info) | +| [**GAMBIT_Query**](../workflows/standalone/gambit_query.md)| Taxon identification of genome assembly using GAMBIT | Bacteria, Mycotics | Sample-level | Yes | v2.0.0 | [Gambit_Query_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Gambit_Query_PHB:main?tab=info) | +| [**kSNP3**](../workflows/phylogenetic_construction/ksnp3.md)| SNP-based phylogenetic analysis from assemblies | Bacteria, Mycotics, Viral | Set-level | Some optional features incompatible, Yes | v2.1.0 | [kSNP3_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/kSNP3_PHB:main?tab=info) | +| [**MashTree_FASTA**](../workflows/phylogenetic_construction/mashtree_fasta.md)| Mash-distance based phylogenetic analysis from assemblies | Bacteria, Mycotics, Viral | Set-level | Some optional features incompatible, Yes | v2.1.0 | [MashTree_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/MashTree_FASTA_PHB:main?tab=info) | +| [**NCBI-AMRFinderPlus**](../workflows/standalone/ncbi_amrfinderplus.md)| Runs NCBI's AMRFinderPlus on genome assemblies (bacterial and fungal) | Bacteria, Mycotics | Sample-level | Yes | v2.0.0 | [NCBI-AMRFinderPlus_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/NCBI-AMRFinderPlus_PHB:main?tab=info) | +| 
[**Snippy_Variants**](../workflows/phylogenetic_construction/snippy_variants.md)| Alignment of reads to a reference genome, then SNP calling | Bacteria, Mycotics, Viral | Sample-level | Yes | v2.2.0 | [Snippy_Variants_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Variants_PHB:main?tab=info) | +| [**Terra_2_NCBI**](../workflows/public_data_sharing/terra_2_ncbi.md)| Upload of sequence data to NCBI | Bacteria, Mycotics, Viral | Set-level | No | v2.1.0 | [Terra_2_NCBI_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_NCBI_PHB:main?tab=info) | +| [**TheiaEuk**](../workflows/genomic_characterization/theiaeuk.md) | Mycotic genome assembly, QC and characterization from WGS data | Mycotics | Sample-level | Some optional features incompatible, Yes | v2.0.1 | [TheiaEuk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaEuk_Illumina_PE_PHB:main?tab=info) | + +### Viral + + +| **Name** | **Description** | **Applicable Kingdom** | **Workflow Level** | **Command-line Compatibility**[^1] | **Last Known Changes** | **Dockstore** | +|---|---|---|---|---|---|---| +| [**Augur**](../workflows/phylogenetic_construction/augur.md) | Phylogenetic analysis for viral pathogens | Viral | Sample-level, Set-level | Yes | v2.1.0 | [Augur_Prep_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Augur_Prep_PHB:main?tab=info), [Augur_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Augur_PHB:main?tab=info) | +| [**CZGenEpi_Prep**](../workflows/phylogenetic_construction/czgenepi_prep.md)| Prepare metadata and fasta files for easy upload to the CZ GEN EPI platform. 
| Monkeypox virus, SARS-CoV-2, Viral | Set-level | No | v1.3.0 | [CZGenEpi_Prep_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/CZGenEpi_Prep_PHB:main?tab=info) | +| [**Freyja Workflow Series**](../workflows/genomic_characterization/freyja.md)| Recovers relative lineage abundances from mixed sample data and generates visualizations | SARS-CoV-2, Viral | Sample-level, Set-level | Yes | v2.2.0 | [Freyja_FASTQ_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_FASTQ_PHB:main?tab=info), [Freyja_Plot_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Plot_PHB:main?tab=info), [Freyja_Dashboard_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Dashboard_PHB:main?tab=info), [Freyja_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Update_PHB:main?tab=info) | +| [**kSNP3**](../workflows/phylogenetic_construction/ksnp3.md)| SNP-based phylogenetic analysis from assemblies | Bacteria, Mycotics, Viral | Set-level | Some optional features incompatible, Yes | v2.1.0 | [kSNP3_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/kSNP3_PHB:main?tab=info) | +| [**MashTree_FASTA**](../workflows/phylogenetic_construction/mashtree_fasta.md)| Mash-distance based phylogenetic analysis from assemblies | Bacteria, Mycotics, Viral | Set-level | Some optional features incompatible, Yes | v2.1.0 | [MashTree_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/MashTree_FASTA_PHB:main?tab=info) | +| [**Mercury_Prep_N_Batch**](../workflows/public_data_sharing/mercury_prep_n_batch.md)| Prepare metadata and sequence data for submission to NCBI and GISAID | Influenza, Monkeypox virus, SARS-CoV-2, Viral | Set-level | No | v2.2.0 | 
[Mercury_Prep_N_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Mercury_Prep_N_Batch_PHB:main?tab=info) | +| [**Pangolin_Update**](../workflows/genomic_characterization/pangolin_update.md) | Update Pangolin assignments | SARS-CoV-2, Viral | Sample-level | Yes | v2.0.0 | [Pangolin_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Pangolin_Update_PHB:main?tab=info) | +| [**Samples_to_Ref_Tree**](../workflows/phylogenetic_placement/samples_to_ref_tree.md)| Use Nextclade to rapidly and accurately place your samples on any existing phylogenetic tree | Monkeypox virus, SARS-CoV-2, Viral | Sample-level, Set-level | Yes | v2.1.0 | [Samples_to_Ref_Tree_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Samples_to_Ref_Tree_PHB:main?tab=info) | +| [**Snippy_Variants**](../workflows/phylogenetic_construction/snippy_variants.md)| Alignment of reads to a reference genome, then SNP calling | Bacteria, Mycotics, Viral | Sample-level | Yes | v2.2.0 | [Snippy_Variants_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Variants_PHB:main?tab=info) | +| [**Terra_2_GISAID**](../workflows/public_data_sharing/terra_2_gisaid.md)| Upload of assembly data to GISAID | SARS-CoV-2, Viral | Set-level | Yes | v1.2.1 | [Terra_2_GISAID_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_GISAID_PHB:main?tab=info) | +| [**Terra_2_NCBI**](../workflows/public_data_sharing/terra_2_ncbi.md)| Upload of sequence data to NCBI | Bacteria, Mycotics, Viral | Set-level | No | v2.1.0 | [Terra_2_NCBI_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_NCBI_PHB:main?tab=info) | +| [**TheiaCov Workflow Series**](../workflows/genomic_characterization/theiacov.md) | Viral genome assembly, QC and characterization from amplicon sequencing | HIV, Influenza, Monkeypox virus, 
RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level, Set-level | Some optional features incompatible, Yes | v2.2.0 | [TheiaCoV_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_PE_PHB:main?tab=info), [TheiaCoV_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_SE_PHB:main?tab=info), [TheiaCoV_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ONT_PHB:main?tab=info), [TheiaCoV_ClearLabs_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ClearLabs_PHB:main?tab=info), [TheiaCoV_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_PHB:main?tab=info), [TheiaCoV_FASTA_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_Batch_PHB:main?tab=info) | +| [**Usher_PHB**](../workflows/phylogenetic_placement/usher.md)| Use UShER to rapidly and accurately place your samples on any existing phylogenetic tree | Monkeypox virus, SARS-CoV-2, Viral | Sample-level, Set-level | Yes | v2.1.0 | [Usher_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Usher_PHB:main?tab=info) | +| [**VADR_Update**](../workflows/genomic_characterization/vadr_update.md)| Update VADR assignments | HAV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level | Yes | v1.2.1 | [VADR_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/VADR_Update_PHB:main?tab=info) | + + +*[Sample-level]: This workflow is run once for each sample +*[Set-level]: This workflow is run once on a group of samples + + +*[Any taxa]: This workflow is organism-agnostic and can be run with any taxa +*[Viral]: This workflow is compatible with any viral pathogen +*[Bacteria]: This workflow is compatible with any 
bacterial pathogen +*[Mycotics]: This workflow is compatible with mycotic pathogens + + +[^1]: + Command-line compatibility is determined if the workflow can be run on a local command-line environment, providing all dependencies are installed, with either `miniwdl` or `cromwell`. +*[Some optional features incompatible]: Some optional features of this workflow are incompatible with command-line use and require modification +*[Yes]: This workflow is compatible with command-line use +*[No]: This workflow is not compatible with command-line use even with modifications diff --git a/docs/workflows_overview/workflows_type.md b/docs/workflows_overview/workflows_type.md new file mode 100644 index 000000000..1fb16c7c3 --- /dev/null +++ b/docs/workflows_overview/workflows_type.md @@ -0,0 +1,97 @@ +--- +title: Workflows by Type +--- + +[Sort by Kingdom](workflows_kingdom.md) | [Sort Alphabetically](workflows_alphabetically.md) + +--- + +### Data Import + +| **Name** | **Description** | **Applicable Kingdom** | **Workflow Level** | **Command-line Compatibility**[^1] | **Last Known Changes** | **Dockstore** | +|---|---|---|---|---|---|---| +| [**Assembly_Fetch**](../workflows/data_import/assembly_fetch.md) | Download assemblies from NCBI, after optionally identifying the closest RefSeq reference genome to your own draft assembly | Any taxa | Sample-level | Yes | v1.3.0 | [Assembly_Fetch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Assembly_Fetch_PHB:main?tab=info) | +| [**BaseSpace_Fetch**](../workflows/data_import/basespace_fetch.md)| Import data from BaseSpace into Terra | Any taxa | Sample-level | Yes | v2.0.0 | [BaseSpace_Fetch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/BaseSpace_Fetch_PHB:main?tab=info) | +| [**Create_Terra_Table**](../workflows/data_import/create_terra_table.md)| Upload data to Terra and then run this workflow to have the table automatically created | Any taxa | | Yes | 
v2.2.0 | [Create_Terra_Table_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Create_Terra_Table_PHB:main?tab=info) | +| [**SRA_Fetch**](../workflows/data_import/sra_fetch.md)| Import publicly available reads from SRA using SRR#, ERR# or DRR# | Any taxa | Sample-level | Yes | v2.2.0 | [SRA_Fetch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/SRA_Fetch_PHB:main?tab=info) | + +### Genomic Characterization + +| **Name** | **Description** | **Applicable Kingdom** | **Workflow Level** | **Command-line Compatibility**[^1] | **Last Known Changes** | **Dockstore** | +|---|---|---|---|---|---|---| +| [**Freyja Workflow Series**](../workflows/genomic_characterization/freyja.md)| Recovers relative lineage abundances from mixed sample data and generates visualizations | SARS-CoV-2, Viral | Sample-level, Set-level | Yes | v2.2.0 | [Freyja_FASTQ_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_FASTQ_PHB:main?tab=info), [Freyja_Plot_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Plot_PHB:main?tab=info), [Freyja_Dashboard_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Dashboard_PHB:main?tab=info), [Freyja_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Update_PHB:main?tab=info) | +| [**Pangolin_Update**](../workflows/genomic_characterization/pangolin_update.md) | Update Pangolin assignments | SARS-CoV-2, Viral | Sample-level | Yes | v2.0.0 | [Pangolin_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Pangolin_Update_PHB:main?tab=info) | +| [**TheiaCov Workflow Series**](../workflows/genomic_characterization/theiacov.md) | Viral genome assembly, QC and characterization from amplicon sequencing | HIV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level, 
Set-level | Some optional features incompatible, Yes | v2.2.0 | [TheiaCoV_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_PE_PHB:main?tab=info), [TheiaCoV_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_SE_PHB:main?tab=info), [TheiaCoV_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ONT_PHB:main?tab=info), [TheiaCoV_ClearLabs_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ClearLabs_PHB:main?tab=info), [TheiaCoV_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_PHB:main?tab=info), [TheiaCoV_FASTA_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_Batch_PHB:main?tab=info) | +| [**TheiaEuk**](../workflows/genomic_characterization/theiaeuk.md) | Mycotic genome assembly, QC and characterization from WGS data | Mycotics | Sample-level | Some optional features incompatible, Yes | v2.0.1 | [TheiaEuk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaEuk_Illumina_PE_PHB:main?tab=info) | +| [**TheiaMeta**](../workflows/genomic_characterization/theiameta.md) | Genome assembly and QC from metagenomic sequencing | Any taxa | Sample-level | Yes | v2.0.0 | [TheiaMeta_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaMeta_Illumina_PE_PHB:main?tab=info) | +| [**TheiaProk Workflow Series**](../workflows/genomic_characterization/theiaprok.md) | Bacterial genome assembly, QC and characterization from WGS data | Bacteria | Sample-level | Some optional features incompatible, Yes | v2.2.0 | [TheiaProk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_PE_PHB:main?tab=info), 
[TheiaProk_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_SE_PHB:main?tab=info), [TheiaProk_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_ONT_PHB:main?tab=info), [TheiaProk_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_FASTA_PHB:main?tab=info) | +| [**VADR_Update**](../workflows/genomic_characterization/vadr_update.md)| Update VADR assignments | HAV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level | Yes | v1.2.1 | [VADR_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/VADR_Update_PHB:main?tab=info) | + +### Phylogenetic Construction + +| **Name** | **Description** | **Applicable Kingdom** | **Workflow Level** | **Command-line Compatibility**[^1] | **Last Known Changes** | **Dockstore** | +|---|---|---|---|---|---|---| +| [**Augur**](../workflows/phylogenetic_construction/augur.md) | Phylogenetic analysis for viral pathogens | Viral | Sample-level, Set-level | Yes | v2.1.0 | [Augur_Prep_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Augur_Prep_PHB:main?tab=info), [Augur_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Augur_PHB:main?tab=info) | +| [**Core_Gene_SNP**](../workflows/phylogenetic_construction/core_gene_snp.md) | Pangenome analysis | Bacteria | Set-level | Some optional features incompatible, Yes | v2.1.0 | [Core_Gene_SNP_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Core_Gene_SNP_PHB:main?tab=info) | +| [**CZGenEpi_Prep**](../workflows/phylogenetic_construction/czgenepi_prep.md)| Prepare metadata and fasta files for easy upload to the CZ GEN EPI platform. 
| Monkeypox virus, SARS-CoV-2, Viral | Set-level | No | v1.3.0 | [CZGenEpi_Prep_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/CZGenEpi_Prep_PHB:main?tab=info) | +| [**Find_Shared_Variants**](../workflows/phylogenetic_construction/find_shared_variants.md)| Combines and reshapes variant data from Snippy_Variants to illustrate variants shared across multiple samples | Bacteria, Mycotics | Set-level | Yes | v2.0.0 | [Find_Shared_Variants_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Find_Shared_Variants_PHB:main?tab=info) | +| [**kSNP3**](../workflows/phylogenetic_construction/ksnp3.md)| SNP-based phylogenetic analysis from assemblies | Bacteria, Mycotics, Viral | Set-level | Some optional features incompatible, Yes | v2.1.0 | [kSNP3_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/kSNP3_PHB:main?tab=info) | +| [**Lyve_SET**](../workflows/phylogenetic_construction/lyve_set.md)| Alignment of reads to a reference genome, SNP calling, curation of high quality SNPs, phylogenetic analysis | Bacteria | Set-level | Yes | v2.1.0 | [Lyve_SET_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Lyve_SET_PHB:main?tab=info) | +| [**MashTree_FASTA**](../workflows/phylogenetic_construction/mashtree_fasta.md)| Mash-distance based phylogenetic analysis from assemblies | Bacteria, Mycotics, Viral | Set-level | Some optional features incompatible, Yes | v2.1.0 | [MashTree_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/MashTree_FASTA_PHB:main?tab=info) | +| [**Snippy_Streamline**](../workflows/phylogenetic_construction/snippy_streamline.md)| Implementation of Snippy workflows for phylogenetic analysis from reads, with optional dynamic reference selection | Bacteria | Set-level | Yes | v2.2.0 | 
[Snippy_Streamline_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Streamline_PHB:main?tab=info) | +| [**Snippy_Streamline_FASTA**](../workflows/phylogenetic_construction/snippy_streamline_fasta.md)| Implementation of Snippy workflows for phylogenetic analysis from assembled genomes (in FASTA format), with optional dynamic reference selection | Bacteria | Set-level | Yes | v2.2.0 | [Snippy_Streamline_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Streamline_FASTA_PHB:im-snippy-fasta-dev?tab=info) | +| [**Snippy_Tree**](../workflows/phylogenetic_construction/snippy_tree.md)| SNP-based phylogenetic analysis from reads, with option to mask recombination | Bacteria | Set-level | Some optional features incompatible, Yes | v2.1.0 | [Snippy_Tree_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Tree_PHB:main?tab=info) | +| [**Snippy_Variants**](../workflows/phylogenetic_construction/snippy_variants.md)| Alignment of reads to a reference genome, then SNP calling | Bacteria, Mycotics, Viral | Sample-level | Yes | v2.2.0 | [Snippy_Variants_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Variants_PHB:main?tab=info) | + +### Phylogenetic Placement + +| **Name** | **Description** | **Applicable Kingdom** | **Workflow Level** | **Command-line Compatibility**[^1] | **Last Known Changes** | **Dockstore** | +|---|---|---|---|---|---|---| +| [**Samples_to_Ref_Tree**](../workflows/phylogenetic_placement/samples_to_ref_tree.md)| Use Nextclade to rapidly and accurately place your samples on any existing phylogenetic tree | Monkeypox virus, SARS-CoV-2, Viral | Sample-level, Set-level | Yes | v2.1.0 | [Samples_to_Ref_Tree_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Samples_to_Ref_Tree_PHB:main?tab=info) | +| 
[**Usher_PHB**](../workflows/phylogenetic_placement/usher.md)| Use UShER to rapidly and accurately place your samples on any existing phylogenetic tree | Monkeypox virus, SARS-CoV-2, Viral | Sample-level, Set-level | Yes | v2.1.0 | [Usher_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Usher_PHB:main?tab=info) | + +### Public Data Sharing + +| **Name** | **Description** | **Applicable Kingdom** | **Workflow Level** | **Command-line Compatibility**[^1] | **Last Known Changes** | **Dockstore** | +|---|---|---|---|---|---|---| +| [**Mercury_Prep_N_Batch**](../workflows/public_data_sharing/mercury_prep_n_batch.md)| Prepare metadata and sequence data for submission to NCBI and GISAID | Influenza, Monkeypox virus, SARS-CoV-2, Viral | Set-level | No | v2.2.0 | [Mercury_Prep_N_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Mercury_Prep_N_Batch_PHB:main?tab=info) | +| [**Terra_2_GISAID**](../workflows/public_data_sharing/terra_2_gisaid.md)| Upload of assembly data to GISAID | SARS-CoV-2, Viral | Set-level | Yes | v1.2.1 | [Terra_2_GISAID_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_GISAID_PHB:main?tab=info) | +| [**Terra_2_NCBI**](../workflows/public_data_sharing/terra_2_ncbi.md)| Upload of sequence data to NCBI | Bacteria, Mycotics, Viral | Set-level | No | v2.1.0 | [Terra_2_NCBI_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_NCBI_PHB:main?tab=info) | + +### Exporting Data from Terra + +| **Name** | **Description** | **Applicable Kingdom** | **Workflow Level** | **Command-line Compatibility**[^1] | **Last Known Changes** | **Dockstore** | +|---|---|---|---|---|---|---| +| [**Concatenate_Column_Content**](../workflows/data_export/concatenate_column_content.md) | Concatenate contents of a specified Terra data table column for many samples ("entities") | Any taxa | Set-level | Yes | v2.1.0 | 
[Concatenate_Column_Content_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Concatenate_Column_Content_PHB:main?tab=info) | +| [**Transfer_Column_Content**](../workflows/data_export/transfer_column_content.md)| Transfer contents of a specified Terra data table column for many samples ("entities") to a GCP storage bucket location | Any taxa | Set-level | Yes | v1.3.0 | [Transfer_Column_Content_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Transfer_Column_Content_PHB:main?tab=info) | +| [**Zip_Column_Content**](../workflows/data_export/zip_column_content.md)| Zip contents of a specified Terra data table column for many samples ("entities") | Any taxa | Set-level | Yes | v2.1.0 | [Zip_Column_Content_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Zip_Column_Content_PHB:main?tab=info) | + +### Standalone + +| **Name** | **Description** | **Applicable Kingdom** | **Workflow Level** | **Command-line Compatibility**[^1] | **Last Known Changes** | **Dockstore** | +|---|---|---|---|---|---|---| +| [**Cauris_CladeTyper**](../workflows/standalone/cauris_cladetyper.md)| C. 
auris clade assignment | Mycotics | Sample-level | Yes | v1.0.0 | [Cauris_CladeTyper_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Cauris_CladeTyper_PHB:main?tab=info) | +| [**GAMBIT_Query**](../workflows/standalone/gambit_query.md)| Taxon identification of genome assembly using GAMBIT | Bacteria, Mycotics | Sample-level | Yes | v2.0.0 | [Gambit_Query_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Gambit_Query_PHB:main?tab=info) | +| [**Kraken2**](../workflows/standalone/kraken2.md) | Taxa identification from reads | Any taxa | Sample-level | Yes | v2.0.0 | [Kraken2_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Kraken2_PE_PHB:main?tab=info), [Kraken2_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Kraken2_SE_PHB:main?tab=info) | +| [**NCBI-AMRFinderPlus**](../workflows/standalone/ncbi_amrfinderplus.md)| Runs NCBI's AMRFinderPlus on genome assemblies (bacterial and fungal) | Bacteria, Mycotics | Sample-level | Yes | v2.0.0 | [NCBI-AMRFinderPlus_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/NCBI-AMRFinderPlus_PHB:main?tab=info) | +| [**RASUSA**](../workflows/standalone/rasusa.md)| Randomly subsample sequencing reads to a specified coverage | Any taxa | Sample-level | Yes | v2.0.0 | [RASUSA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/RASUSA_PHB:main?tab=info) | +| [**Rename_FASTQ**](../workflows/standalone/rename_fastq.md)| Rename paired-end or single-end read files in a Terra data table in a non-destructive way | Any taxa | Sample-level | Yes | v2.1.0 | [Rename_FASTQ_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Rename_FASTQ_PHB:im-utilities-rename-files?tab=info) | +| [**TBProfiler_tNGS**](../workflows/standalone/tbprofiler_tngs.md)| Performs in silico antimicrobial susceptibility 
testing on Mycobacterium tuberculosis targeted-NGS samples with TBProfiler and tbp-parser | Bacteria, TB | Sample-level | Yes | v2.0.0 | [TBProfiler_tNGS_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TBProfiler_tNGS_PHB:smw-tngs-tbprofiler-dev?tab=info) | +| [**TheiaValidate**](../workflows/standalone/theiavalidate.md)| This workflow performs basic comparisons between user-designated columns in two separate tables. | Any taxa | | No | v2.0.0 | [TheiaValidate_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaValidate_PHB:main?tab=info) | + + +*[Sample-level]: This workflow is run once for each sample +*[Set-level]: This workflow is run once on a group of samples + + +*[Any taxa]: This workflow is organism-agnostic and can be run with any taxa +*[Viral]: This workflow is compatible with any viral pathogen +*[Bacteria]: This workflow is compatible with any bacterial pathogen +*[Mycotics]: This workflow is compatible with mycotic pathogens + + +[^1]: + Command-line compatibility is determined if the workflow can be run on a local command-line environment, providing all dependencies are installed, with either `miniwdl` or `cromwell`. +*[Some optional features incompatible]: Some optional features of this workflow are incompatible with command-line use and require modification +*[Yes]: This workflow is compatible with command-line use +*[No]: This workflow is not compatible with command-line use even with modifications diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 000000000..bc77de2b0 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,251 @@ +site_name: Public Health Bioinformatics +site_author: Theiagen Genomics +site_description: A collection of bioinformatics workflows for genomic characterization, submission preparation, and genomic epidemiology of pathogens of public health concern. 
+site_url: https://theiagen.github.io/public_health_bioinformatics/ + +repo_url: https://github.com/theiagen/public_health_bioinformatics + +nav: + - Public Health Bioinformatics: + - Home: index.md + - Getting Started: + - With the Command-Line: getting_started/commandline.md + - With Terra.bio: getting_started/terra.md + - Workflows: + - Workflows by Type: + - Overview Table: workflows_overview/workflows_type.md + - Data Import: + - Assembly_Fetch: workflows/data_import/assembly_fetch.md + - BaseSpace_Fetch: workflows/data_import/basespace_fetch.md + - Create_Terra_Table: workflows/data_import/create_terra_table.md + - SRA_Fetch: workflows/data_import/sra_fetch.md + - Genomic Characterization: + - Freyja Workflow Series: workflows/genomic_characterization/freyja.md + - Pangolin_Update: workflows/genomic_characterization/pangolin_update.md + - TheiaCoV Workflow Series: workflows/genomic_characterization/theiacov.md + - TheiaEuk: workflows/genomic_characterization/theiaeuk.md + - TheiaMeta: workflows/genomic_characterization/theiameta.md + - TheiaProk Workflow Series: workflows/genomic_characterization/theiaprok.md + - VADR_Update: workflows/genomic_characterization/vadr_update.md + - Phylogenetic Construction: + - Augur: workflows/phylogenetic_construction/augur.md + - Core_Gene_SNP: workflows/phylogenetic_construction/core_gene_snp.md + - CZGenEpi_Prep: workflows/phylogenetic_construction/czgenepi_prep.md + - Find_Shared_Variants: workflows/phylogenetic_construction/find_shared_variants.md + - kSNP3: workflows/phylogenetic_construction/ksnp3.md + - Lyve_SET: workflows/phylogenetic_construction/lyve_set.md + - MashTree_FASTA: workflows/phylogenetic_construction/mashtree_fasta.md + - Snippy_Streamline: workflows/phylogenetic_construction/snippy_streamline.md + - Snippy_Streamline_FASTA: workflows/phylogenetic_construction/snippy_streamline_fasta.md + - Snippy_Tree: workflows/phylogenetic_construction/snippy_tree.md + - Snippy_Variants: 
workflows/phylogenetic_construction/snippy_variants.md + - Phylogenetic Placement: + - Samples_to_Ref_Tree: workflows/phylogenetic_placement/samples_to_ref_tree.md + - Usher_PHB: workflows/phylogenetic_placement/usher.md + - Public Data Sharing: + - Mercury_Prep_N_Batch: workflows/public_data_sharing/mercury_prep_n_batch.md + - Terra_2_GISAID: workflows/public_data_sharing/terra_2_gisaid.md + - Terra_2_NCBI: workflows/public_data_sharing/terra_2_ncbi.md + - Exporting Data from Terra: + - Concatenate_Column_Content: workflows/data_export/concatenate_column_content.md + - Transfer_Column_Content: workflows/data_export/transfer_column_content.md + - Zip_Column_Content: workflows/data_export/zip_column_content.md + - Standalone: + - Cauris_CladeTyper: workflows/standalone/cauris_cladetyper.md + - GAMBIT_Query: workflows/standalone/gambit_query.md + - Kraken2: workflows/standalone/kraken2.md + - NCBI-AMRFinderPlus: workflows/standalone/ncbi_amrfinderplus.md + - RASUSA: workflows/standalone/rasusa.md + - Rename_FASTQ: workflows/standalone/rename_fastq.md + - TBProfiler_tNGS: workflows/standalone/tbprofiler_tngs.md + - TheiaValidate: workflows/standalone/theiavalidate.md + - Workflows by Kingdom: + - Overview Table: workflows_overview/workflows_kingdom.md + - Any Taxa: + - Assembly_Fetch: workflows/data_import/assembly_fetch.md + - BaseSpace_Fetch: workflows/data_import/basespace_fetch.md + - Concatenate_Column_Content: workflows/data_export/concatenate_column_content.md + - Create_Terra_Table: workflows/data_import/create_terra_table.md + - Kraken2: workflows/standalone/kraken2.md + - RASUSA: workflows/standalone/rasusa.md + - Rename_FASTQ: workflows/standalone/rename_fastq.md + - SRA_Fetch: workflows/data_import/sra_fetch.md + - TheiaMeta: workflows/genomic_characterization/theiameta.md + - TheiaValidate: workflows/standalone/theiavalidate.md + - Transfer_Column_Content: workflows/data_export/transfer_column_content.md + - Zip_Column_Content: 
workflows/data_export/zip_column_content.md + - Bacteria: + - Core_Gene_SNP: workflows/phylogenetic_construction/core_gene_snp.md + - Find_Shared_Variants: workflows/phylogenetic_construction/find_shared_variants.md + - GAMBIT_Query: workflows/standalone/gambit_query.md + - kSNP3: workflows/phylogenetic_construction/ksnp3.md + - Lyve_SET: workflows/phylogenetic_construction/lyve_set.md + - MashTree_FASTA: workflows/phylogenetic_construction/mashtree_fasta.md + - NCBI-AMRFinderPlus: workflows/standalone/ncbi_amrfinderplus.md + - Snippy_Streamline: workflows/phylogenetic_construction/snippy_streamline.md + - Snippy_Streamline_FASTA: workflows/phylogenetic_construction/snippy_streamline_fasta.md + - Snippy_Tree: workflows/phylogenetic_construction/snippy_tree.md + - Snippy_Variants: workflows/phylogenetic_construction/snippy_variants.md + - TBProfiler_tNGS: workflows/standalone/tbprofiler_tngs.md + - Terra_2_NCBI: workflows/public_data_sharing/terra_2_ncbi.md + - TheiaProk Workflow Series: workflows/genomic_characterization/theiaprok.md + - Mycotics: + - Cauris_CladeTyper: workflows/standalone/cauris_cladetyper.md + - Find_Shared_Variants: workflows/phylogenetic_construction/find_shared_variants.md + - GAMBIT_Query: workflows/standalone/gambit_query.md + - kSNP3: workflows/phylogenetic_construction/ksnp3.md + - MashTree_FASTA: workflows/phylogenetic_construction/mashtree_fasta.md + - NCBI-AMRFinderPlus: workflows/standalone/ncbi_amrfinderplus.md + - Snippy_Variants: workflows/phylogenetic_construction/snippy_variants.md + - Terra_2_NCBI: workflows/public_data_sharing/terra_2_ncbi.md + - TheiaEuk: workflows/genomic_characterization/theiaeuk.md + - Viral: + - Augur: workflows/phylogenetic_construction/augur.md + - CZGenEpi_Prep: workflows/phylogenetic_construction/czgenepi_prep.md + - Freyja Workflow Series: workflows/genomic_characterization/freyja.md + - kSNP3: workflows/phylogenetic_construction/ksnp3.md + - MashTree_FASTA: 
workflows/phylogenetic_construction/mashtree_fasta.md + - Mercury_Prep_N_Batch: workflows/public_data_sharing/mercury_prep_n_batch.md + - Pangolin_Update: workflows/genomic_characterization/pangolin_update.md + - Samples_to_Ref_Tree: workflows/phylogenetic_placement/samples_to_ref_tree.md + - Snippy_Variants: workflows/phylogenetic_construction/snippy_variants.md + - Terra_2_GISAID: workflows/public_data_sharing/terra_2_gisaid.md + - Terra_2_NCBI: workflows/public_data_sharing/terra_2_ncbi.md + - TheiaCoV Workflow Series: workflows/genomic_characterization/theiacov.md + - Usher_PHB: workflows/phylogenetic_placement/usher.md + - VADR_Update: workflows/genomic_characterization/vadr_update.md + - Workflows Alphabetically: + - Overview Table: workflows_overview/workflows_alphabetically.md + - Assembly_Fetch: workflows/data_import/assembly_fetch.md + - Augur: workflows/phylogenetic_construction/augur.md + - BaseSpace_Fetch: workflows/data_import/basespace_fetch.md + - Cauris_CladeTyper: workflows/standalone/cauris_cladetyper.md + - Concatenate_Column_Content: workflows/data_export/concatenate_column_content.md + - Core_Gene_SNP: workflows/phylogenetic_construction/core_gene_snp.md + - Create_Terra_Table: workflows/data_import/create_terra_table.md + - CZGenEpi_Prep: workflows/phylogenetic_construction/czgenepi_prep.md + - Find_Shared_Variants: workflows/phylogenetic_construction/find_shared_variants.md + - Freyja Workflow Series: workflows/genomic_characterization/freyja.md + - GAMBIT_Query: workflows/standalone/gambit_query.md + - Kraken2: workflows/standalone/kraken2.md + - kSNP3: workflows/phylogenetic_construction/ksnp3.md + - Lyve_SET: workflows/phylogenetic_construction/lyve_set.md + - MashTree_FASTA: workflows/phylogenetic_construction/mashtree_fasta.md + - Mercury_Prep_N_Batch: workflows/public_data_sharing/mercury_prep_n_batch.md + - NCBI-AMRFinderPlus: workflows/standalone/ncbi_amrfinderplus.md + - Pangolin_Update: 
workflows/genomic_characterization/pangolin_update.md + - RASUSA: workflows/standalone/rasusa.md + - Rename_FASTQ: workflows/standalone/rename_fastq.md + - Samples_to_Ref_Tree: workflows/phylogenetic_placement/samples_to_ref_tree.md + - Snippy_Streamline: workflows/phylogenetic_construction/snippy_streamline.md + - Snippy_Streamline_FASTA: workflows/phylogenetic_construction/snippy_streamline_fasta.md + - Snippy_Tree: workflows/phylogenetic_construction/snippy_tree.md + - Snippy_Variants: workflows/phylogenetic_construction/snippy_variants.md + - SRA_Fetch: workflows/data_import/sra_fetch.md + - TBProfiler_tNGS: workflows/standalone/tbprofiler_tngs.md + - Terra_2_GISAID: workflows/public_data_sharing/terra_2_gisaid.md + - Terra_2_NCBI: workflows/public_data_sharing/terra_2_ncbi.md + - TheiaCoV Workflow Series: workflows/genomic_characterization/theiacov.md + - TheiaEuk: workflows/genomic_characterization/theiaeuk.md + - TheiaMeta: workflows/genomic_characterization/theiameta.md + - TheiaProk Workflow Series: workflows/genomic_characterization/theiaprok.md + - TheiaValidate: workflows/standalone/theiavalidate.md + - Transfer_Column_Content: workflows/data_export/transfer_column_content.md + - Usher_PHB: workflows/phylogenetic_placement/usher.md + - VADR_Update: workflows/genomic_characterization/vadr_update.md + - Zip_Column_Content: workflows/data_export/zip_column_content.md + - Contributing: + - Contributing to the Documentation: contributing/doc_contribution.md + - Contributing to the Code: contributing/code_contribution.md + +theme: + name: material + logo: assets/logos/Theiagen-Logo-White.png + favicon: assets/logos/Theiagen-Symbol-Standard-01.png + custom_dir: docs/overrides + features: + - content.code.annotate + - content.code.copy + - content.tabs.link + - content.tooltips + #- navigation.expand + - navigation.instant + #- navigation.instant.preview + - navigation.path + - navigation.tabs + - navigation.top + - navigation.tracking + - search.highlight + - 
search.suggest + - toc.follow + #- toc.integrate + language: en + palette: + - media: "(prefers-color-scheme: light)" + scheme: default + toggle: + icon: material/weather-night + name: Switch to Dark Mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + accent: indigo + primary: black + toggle: + icon: material/white-balance-sunny + name: Switch to Light Mode + +markdown_extensions: + - abbr + - admonition + - attr_list + - def_list + - footnotes + - md_in_html + - pymdownx.details + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + - pymdownx.inlinehilite + - pymdownx.keys + - pymdownx.mark + - pymdownx.caret + - pymdownx.snippets + - pymdownx.superfences + - pymdownx.tasklist: + custom_checkbox: true + - toc: + permalink: true + +plugins: + - git-revision-date-localized: + enable_creation_date: true + type: iso_date + - search + - glightbox + - mike + # - section-index + +extra_javascript: + - https://unpkg.com/tablesort@5.3.0/dist/tablesort.min.js + - javascripts/tablesort.js + +extra_css: + - stylesheets/extra.css + +extra: + social: + - icon: fontawesome/brands/github + link: https://github.com/theiagen + - icon: fontawesome/brands/twitter + link: https://twitter.com/theiagen + - icon: fontawesome/brands/linkedin + link: https://www.linkedin.com/company/theiagen + version: + provider: mike + default: latest + alias: true + homepage: https://www.theiagen.com + +copyright: | + © 2022-2024 Theiagen Genomics \ No newline at end of file diff --git a/tasks/utilities/data_import/task_create_terra_table.wdl b/tasks/utilities/data_import/task_create_terra_table.wdl index e3edadb36..638052ab0 100644 --- a/tasks/utilities/data_import/task_create_terra_table.wdl +++ b/tasks/utilities/data_import/task_create_terra_table.wdl @@ -3,7 +3,7 @@ version 1.0 task create_terra_table { input { String new_table_name - String data_location_path + String data_location_path # include final `/` if it is a directory String? 
file_ending # comma-delimited list Boolean paired_end Boolean assembly_data