diff --git a/.dockstore.yml b/.dockstore.yml index f289bd287..6b6bac60f 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -195,6 +195,11 @@ workflows: primaryDescriptorPath: /workflows/utilities/data_import/wf_terra_2_bq.wdl testParameterFiles: - /tests/inputs/empty.json + - name: Fetch_SRR_Accession_PHB + subclass: WDL + primaryDescriptorPath: /workflows/utilities/data_import/wf_fetch_srr_accession.wdl + testParameterFiles: + - /tests/inputs/empty.json - name: Concatenate_Column_Content_PHB subclass: WDL primaryDescriptorPath: /workflows/utilities/file_handling/wf_concatenate_column.wdl @@ -287,4 +292,9 @@ workflows: subclass: WDL primaryDescriptorPath: /workflows/utilities/wf_dorado_basecalling.wdl testParameterFiles: - - /tests/inputs/empty.json \ No newline at end of file + - /tests/inputs/empty.json + - name: Concatenate_Illumina_Lanes_PHB + subclass: WDL + primaryDescriptorPath: /workflows/utilities/file_handling/wf_concatenate_illumina_lanes.wdl + testParameterFiles: + - /tests/inputs/empty.json diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 3aebe21d9..96f294e54 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -45,7 +45,8 @@ This PR uses an element that could cause duplicate runs to have different result - [ ] The workflow/task has been tested and results, including file contents, are as anticipated - [ ] The CI/CD has been adjusted and tests are passing (Theiagen developers) - [ ] Code changes follow the [style guide](https://theiagen.notion.site/Style-Guide-WDL-Workflow-Development-51b66a47dde54c798f35d673fff80249) -- [ ] Documentation and/or workflow diagrams have been updated if applicable (Theiagen developers only) +- [ ] Documentation and/or workflow diagrams have been updated if applicable + - [ ] You have updated the latest version for any affected workflows in the respective workflow documentation page and for every entry in the three `workflows_overview` tables. 
## 🎯 Reviewer Checklist diff --git a/README.md b/README.md index 84bccba76..d4f1dea8e 100644 --- a/README.md +++ b/README.md @@ -42,30 +42,32 @@ You can expect a careful review of every PR and feedback as needed before mergin ### Authorship -(Ordered by contribution [# of lines changed] as of 2024-08-01) +(Ordered by contribution [# of lines changed] as of 2024-12-04) * **Sage Wright** ([@sage-wright](https://github.com/sage-wright)) - Conceptualization, Software, Validation, Supervision * **InĂȘs Mendes** ([@cimendes](https://github.com/cimendes)) - Software, Validation * **Curtis Kapsak** ([@kapsakcj](https://github.com/kapsakcj)) - Conceptualization, Software, Validation -* **James Otieno** ([@jrotieno](https://github.com/jrotieno)) - Software, Validation * **Frank Ambrosio** ([@frankambrosio3](https://github.com/frankambrosio3)) - Conceptualization, Software, Validation * **Michelle Scribner** ([@michellescribner](https://github.com/michellescribner)) - Software, Validation * **Kevin Libuit** ([@kevinlibuit](https://github.com/kevinlibuit)) - Conceptualization, Project Administration, Software, Validation, Supervision -* **Emma Doughty** ([@emmadoughty](https://github.com/emmadoughty)) - Software, Validation +* **Fraser Combe** ([@fraser-combe](https://github.com/fraser-combe)) - Software, Validation * **Andrew Page** ([@andrewjpage](https://github.com/andrewjpage)) - Project Administration, Software, Supervision +* **Michal Babinski** ([@Michal-Babins](https://github.com/Michal-Babins)) - Software, Validation * **Andrew Lang** ([@AndrewLangVt](https://github.com/AndrewLangVt)) - Software, Supervision * **Kelsey Kropp** ([@kelseykropp](https://github.com/kelseykropp)) - Validation -* **Emily Smith** ([@emily-smith1](https://github.com/emily-smith1)) - Validation * **Joel Sevinsky** ([@sevinsky](https://github.com/sevinsky)) - Conceptualization, Project Administration, Supervision ### External Contributors We would like to gratefully acknowledge the following 
individuals from the public health community for their contributions to the PHB repository: +* **James Otieno** ([@jrotieno](https://github.com/jrotieno)) * **Robert Petit** ([@rpetit3](https://github.com/rpetit3)) +* **Emma Doughty** ([@emmadoughty](https://github.com/emmadoughty)) * **Ash O'Farrel** ([@aofarrel](https://github.com/aofarrel)) * **Sam Baird** ([@sam-baird](https://github.com/sam-baird)) * **Holly Halstead** ([@HNHalstead](https://github.com/HNHalstead)) +* **Emily Smith** ([@emily-smith1](https://github.com/emily-smith1)) ### Maintaining PHB Pipelines diff --git a/docs/assets/figures/Freyja_FASTQ.png b/docs/assets/figures/Freyja_FASTQ.png index 1789c8c53..75eb466c2 100644 Binary files a/docs/assets/figures/Freyja_FASTQ.png and b/docs/assets/figures/Freyja_FASTQ.png differ diff --git a/docs/assets/figures/TheiaEuk_Illumina_PHB_20241106.png b/docs/assets/figures/TheiaEuk_Illumina_PHB_20241106.png new file mode 100644 index 000000000..241b7bb8b Binary files /dev/null and b/docs/assets/figures/TheiaEuk_Illumina_PHB_20241106.png differ diff --git a/docs/assets/new_workflow_template.md b/docs/assets/new_workflow_template.md index 9e7ef6799..41c2b1895 100644 --- a/docs/assets/new_workflow_template.md +++ b/docs/assets/new_workflow_template.md @@ -4,7 +4,7 @@ | **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | |---|---|---|---|---| -| [Workflow Type](../../workflows_overview/workflows_type.md/#link-to-workflow-type) | [Applicable Kingdom](../../workflows_overview/workflows_kingdom.md/#link-to-applicable-kingdom) | PHB | | | +| [Link to Workflow Type](../../workflows_overview/workflows_type.md/#link-to-workflow-type) | [Link to Applicable Kingdom](../../workflows_overview/workflows_kingdom.md/#link-to-applicable-kingdom) | PHB | | | ## Workflow_Name_On_Terra @@ -12,6 +12,8 @@ Description of the workflow. 
### Inputs +Input should be ordered as they appear on Terra + | **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | |---|---|---|---|---|---| | task_name | **variable_name** | Type | Description | Default Value | Required/Optional | @@ -24,12 +26,12 @@ Description of the workflow tasks Description of the task !!! techdetails "Tool Name Technical Details" - | | Links | - | --- | --- | + | | Links | + | --- | --- | | Task | [link to task on GitHub] | | Software Source Code | [link to tool's source code] | | Software Documentation | [link to tool's documentation] | - | Original Publication | [link to tool's publication] | + | Original Publication(s) | [link to tool's publication] | ### Outputs diff --git a/docs/contributing/code_contribution.md b/docs/contributing/code_contribution.md index cb7ba5727..d5819b9d3 100644 --- a/docs/contributing/code_contribution.md +++ b/docs/contributing/code_contribution.md @@ -8,8 +8,10 @@ Style guide inspired by Scott Frazer’s [WDL Best Practices Style Guide](http ## General Guidelines -- Put tasks and workflows in separate files in the appropriate folders. -- Always add a description as metadata +***Modularity and Metadata*** + +- **Best Practice:** Place tasks and workflows in separate files to maintain modularity and clarity. +- **Add a `meta` block** to every task and workflow to provide a brief description of its purpose. ```bash meta { @@ -17,163 +19,262 @@ Style guide inspired by Scott Frazer’s [WDL Best Practices Style Guide](http } ``` -- Ensure that the docker container is locked to a given version, not `latest` +***Docker Containers*** + +- Use a specific Docker container version instead of 'latest' to ensure reproducibility and prevent unexpected changes in container behavior. 
```bash - String docker = "quay.io/docker_image:version" + String docker = "us-docker.pkg.dev/docker_image:version" ``` - Preferentially use containers [`Google's Artifact Registry`](https://console.cloud.google.com/artifacts/docker/general-theiagen/us) rather than those from [`quay.io`](http://quay.io) or [`dockerhub`](https://hub.docker.com/) -- Use 2-space indents (no tabs) + +***Indentation and Whitespace*** + +- Use 2-space indentation for all blocks. Avoid using tabs to ensure uniform formatting across editors: ```bash # perform action - if [ this ]; then - action1(variable) + if [ condition ]; then + perform_action(variable) fi ``` -- Do not use line break for opening braces +- Use a single space when defining variables (`this = that` *not* `this= that` (unless a bash variable where `this=that` is required)) + +***Bracket and Spacing Conventions*** + +- Avoid line breaks for opening braces. Keep them on the same line as the declaration. i.e `input {` instead of `input\n{` + + ```bash + # Correct + input { + String input_variable + } + + # Incorrect + input + { + String input_variable + } + ``` + - Use single space when defining input/output variables & runtime attributes (`output {` instead of `output{`) -- Use single-line breaks between non-intended constructs -- Enclose task commands with triple angle brackets (`<<< ... >>>`) -- Consistently use white space with variables (`this = that` *not* `this= that` (unless a bash variable where `this=that` is required)) +- Separate non-indented constructs (like input and output sections) with a single-line break for readability. + +***Command Block Syntax*** + +- Enclose command blocks in triple angle brackets (<<< ... >>>) for consistency and easier handling of multi-line scripts. It also avoids issues with unescaped special characters in the command block: + + ```bash + command <<< + tool --input ~{input} --output ~{output} + >>> + ``` ## Task Blocks -The task should contain the following sections. 
Include _single_ spaces between input, command, output, and runtime closing and opening curly brackets. +A WDL task block defines a discrete, reusable step in a workflow. To ensure readability and consistency, follow these conventions when writing task blocks. Include single spaces between the input, command, output, and runtime sections and their enclosing curly brackets. ```bash -input { +task example_task { + input { -} -command <<< + } + command <<< + + >>> + output { ->>> -output { - -} -runtime { + } + runtime { + } } ``` -??? toggle "`input` block" - - The following conventions are used to expose docker, CPU, memory, and disk size +### The `input` block - ```bash - input { - String docker = "..." - Int cpu = x - Int memory = y - Int disk_size = z - } - ``` - - - If additional arguments should be allowed to be passed to the task, this input should follow the convention below: - - ```bash - input { - String args = "" - } - ``` - - - Input and output lists should not be formatted to have the equal sign aligned, but instead use a single space before and after the `=` - - ```bash - output1_x = string1 - output2_that_does_y = string2 - ``` - - - Ensure the docker container is exposed as an input and as an output string - - ```bash - input { - String docker = "" - } - ... - output { - String XX_docker = docker - } - runtime { - docker: docker - } - ``` +- The following conventions are used to expose docker, CPU, memory, and disk size: -??? toggle "`command` block" - - Ensure use of line breaks between different sections of code to improve readability - - ```bash - # if this, perform action 1 - if [ this ]; then - action1(variable) - fi - - # if that, perform action 2 - if [ that ]; then - action2(variable) - fi - ``` - - - Split command calls into multiple lines if they have user input variables and/or if the length of the command is very long to avoid text wrapping and/or side-scrolling, e.g. 
- - Use indentation as appropriate - - ```bash - tool \ - --option1 ~{option1} \ - --option2 ~{option2} \ - ... - --option999 ~{option999} - ``` - - - Add comments that - - Explain what the optional parameters are - - Provide links to the tool documentation so future readers of the code know where to find that information - - Explain what non-intuitive bash/python text wrangling actions do, e.g. - - ```bash - ## awk for gene column ($6) to grab subtype ($15) - cat ~{file} | awk -F '\t' '{if ($6=="M1") print $15}' > FLU_TYPE - ``` - -??? toggle "`output` block" - - File types should be clearly stated in the output name variables + ```bash + input { + Int cpu = 4 # Number of CPUs + Int disk_size = 100 # Disk space in GB + String docker = "us-docker.pkg.dev/example:1.0.0" # Docker container for the task + Int memory = 16 # Memory in GB + } + ``` + +- Include optional tool parameters as inputs to the task + + ```bash + input { + Int? optional_tool_parameter1 + String optional_tool_parameter2_with_default = "default_value" + } + ``` + +- Input and output lists should **not** be formatted to have the equal sign aligned, but instead **use a single space** before and after the `=` + + ```bash + correct_output = "output_file" + long_variable_name = "long_file_name" + ``` + +- Expose Docker as an input, an output (if versioning information not available), and runtime variable: + + ```bash + input { + String docker = "us-docker.pkg.dev/example:1.0.0" + } + ... 
+ output { + String used_docker = docker + } + runtime { + docker: docker + } + ``` + +### The `command` block + +- Ensure use of line breaks between different sections of code to improve readability + + ```bash + # Perform task step 1 + if [ condition ]; then + action1(variable) + fi + + # Perform task step 2 + if [ another_condition ]; then + action2(variable) + fi + ``` + +- Split command calls into multiple lines if they have user input variables and/or if the length of the command is very long to avoid text wrapping and/or side-scrolling, e.g. + - Use backslashes for continuation and indentation to clarify structure: + + ```bash + tool \ + --input ~{input_file} \ + --output ~{output_file} \ + --option1 ~{option1} \ + ... + --optionN ~{optionN} + ``` + +- Add comments that + - Explain what the optional parameters are + - Provide links to the tool documentation so future readers of the code know where to find that information + - Explain what non-intuitive bash/python text wrangling actions do, e.g. ```bash - output1_csv = file1.csv - output2_tsv = file2.tsv - ``` - - - Ensure the docker container is exposed as an output string, e.g. - - ```bash - input { - String docker - } - ... - output { - String XX_docker = docker - } - runtime { - docker: docker - } + ## awk for gene column ($6) to grab subtype ($15) + cat ~{file} | awk -F '\t' '{if ($6=="M1") print $15}' > FLU_TYPE ``` -??? toggle "`runtime` block" - - Always use a docker container +### The `output` block + +- The output block specifies the files or variables produced by the task. Follow these conventions: + + ```bash + output { + File result_csv = "output.csv" # CSV file generated + File result_log = "log.txt" # Log file + } + ``` + +- Ensure the docker container is exposed as an output string, e.g. + + ```bash + input { + String docker = "us-docker.pkg.dev/general-theiagen/tool:version" + } + ... 
+ output { + String XX_docker = docker + } + runtime { + docker: docker + } + ``` + +### The `runtime` block + +- The runtime block defines the compute resources and environment for the task. +- Always specify a Docker: + + ```bash + runtime { + docker: docker + cpu: cpu + memory: memory + disk: disk_size + } + ``` ## Workflow Blocks -The workflow/sub-workflow file should contain: +A WDL workflow block orchestrates the execution of tasks and subworkflows. It defines the inputs, calls tasks or subworkflows, and specifies the final outputs. To ensure readability and consistency, follow these conventions when writing workflow blocks: + +### The `import` section + +- Include a block of `import` statements (sorted in alphabetical order). + - When a workflow imports a task, ensure it is imported under a unique name to avoid conflicts. -- a block of `import` statements (alphabetical order), - - When a workflow imports a task, make sure that it is imported under a different name than the task it is calling -- a `workflow` block with - - an `input` section - - `call` sections for specified tasks - - an `output` section + ```bash + import "../tasks/task_task1.wdl" as task1_task + import "../tasks/task_task2.wdl" as task2_task + ``` -Example formatting is shown below. +- Order import statements alphabetically by the path of the imported file. + +### The `input` block + +- Optional inputs that should be able to be edited by the user, such as docker containers should be exposed on the workflow level as in the example +- In the case of subworkflows, all optional inputs should be exposed on the workflow level so that they can be modified by users on Terra + +```bash +input { + String input + String task1_docker = "us-docker.pkg.dev/general-theiagen/tool:version" + String? 
task1_optional_argument +} +``` + +### The `call` sections + +- Import task files as something other than the included task name in order to avoid namespace conflicts + +```bash +call task1_task.task1 { + input: + input = input, + docker = task1_docker +} +``` + +### The `output` block + +- Define all workflow outputs in this section. +- Use descriptive names for each output variable. +- Order outputs alphabetically by the name of the output variable + +```bash +output { + # Task 1 outputs + File task1_out_csv = task1.output_csv + String task1_version = task1.version + + # Subworkflow outputs + File subworkflow_out_tsv = subworkflow.task3_out_tsv + String subworkflow_version = subworkflow.task3_version +} +``` + +## Example Workflow formats ??? toggle "wf_example_wf.wdl" @@ -190,7 +291,6 @@ Example formatting is shown below. String task2_docker = "us-docker.pkg.dev/general-theiagen//task_2:version" String? hidden_task3_argument String? hidden_task3_docker - String? hidden_task4_argument String? hidden_task4_docker } call task1_task.task1 { @@ -205,7 +305,10 @@ Example formatting is shown below. } call subworkflow.subworkflow { input: - input = input + input = input, + task3_argument = hidden_task3_argument, + task3_docker = hidden_task3_docker, + task4_docker = hidden_task4_docker } output { # Task 1 outputs @@ -216,16 +319,19 @@ Example formatting is shown below. File task2_out_tsv = task2.output_tsv String task2_version = task2.version String task2_docker = task2.docker - # Subworkflow outputs + # Subworkflow outputs for task 3 File task3_out_tsv = subworkflow.task3_out_tsv String task3_version = subworkflow.task3_version String task3_docker = subworkflow.task3_docker + # Subworkflow outputs for task 4 + String task4_output = subworkflow.task4_output + String task4_version = subworkflow.task4_version } } ``` - ??? 
toggle "wf_subworkflow.wdl" + ```bash import "../tasks/task_task3.wdl" as task3_task import "../tasks/task_task4.wdl" as task4_task @@ -239,6 +345,7 @@ Example formatting is shown below. # level so they can be modified by a Terra user String? task3_argument String? task3_docker + String? task4_docker } call task3_task.task3 { input: @@ -246,38 +353,17 @@ Example formatting is shown below. args = task3_argument, docker = task3_docker } + call task4_task.task4 { + input: + input = task3.output_tsv, + docker = task4_docker + } output { File task3_out_tsv = task3.output_tsv String task3_version = task3.version String task3_docker = task3.docker + String task4_output = task4.output + String task4_version = task4.version } } ``` - ---- - -??? toggle "`input` section" - - Optional inputs that should be able to be edited by the user, such as docker containers should be exposed on the workflow level as in the example - - In the case of subworkflows, all optional inputs should be exposed on the workflow level so that they can be modified by users on Terra - -??? toggle "`call` task sections" - - There should be no blank lines between tasks in workflows - - ```bash - task A { - } - task B { - } - ``` - - - Label a group of outputs by the source/species for organizational purposes when a workflow has many different outputs - - ```ebnf - output { - ... - # task99 outputs - String task99_ouput - String task99_file - ... 
- } - ``` diff --git a/docs/contributing/doc_contribution.md b/docs/contributing/doc_contribution.md index 7f20e5491..4ddb7e0de 100644 --- a/docs/contributing/doc_contribution.md +++ b/docs/contributing/doc_contribution.md @@ -14,7 +14,7 @@ To test your documentation changes, you will need to have the following packages pip install mkdocs-material mkdocs-material-extensions mkdocs-git-revision-date-localized-plugin mike mkdocs-glightbox ``` -The live preview server can be activated by running the following command: +Once installed, navigate to the top directory in PHB. The live preview server can be activated by running the following command: ```bash mkdocs serve @@ -34,49 +34,7 @@ Here are some VSCode Extensions can help you write and edit your markdown files - [Excel to Markdown Table](https://tableconvert.com/excel-to-markdown) - This website will convert an Excel table into markdown format, which can be copied and pasted into your markdown file. - [Material for MkDocs Reference](https://squidfunk.github.io/mkdocs-material/reference/) - This is the official reference for the Material for MkDocs theme, which will help you understand how to use the theme's features. -- [Broken Link Check](https://www.brokenlinkcheck.com/) - This website will scan your website to ensure that all links are working correctly. This will only work on the deployed version of the documentation, not the local version. - -## Documentation Structure - -A brief description of the documentation structure is as follows: - -- `docs/` - Contains the Markdown files for the documentation. - - `assets/` - Contains images and other files used in the documentation. - - `figures/` - Contains images, figures, and workflow diagrams used in the documentation. For workflows that contain many images (such as BaseSpace_Fetch), it is recommended to create a subdirectory for the workflow. - - `files/` - Contains files that are used in the documentation. This may include example outputs or templates. 
For workflows that contain many files (such as TheiaValidate), it is recommended to create a subdirectory for the workflow. - - `logos/` - Contains Theiagen logos and symbols used int he documentation. - - `metadata_formatters/` - Contains the most up-to-date metadata formatters for our submission workflows. - - `new_workflow_template.md` - A template for adding a new workflow page to the documentation. - - `contributing/` - Contains the Markdown files for our contribution guides, such as this file - - `javascripts/` - Contains JavaScript files used in the documentation. - - `tablesort.js` - A JavaScript file used to enable table sorting in the documentation. - - `overrides/` - Contains HTMLs used to override theme defaults - - `main.html` - Contains the HTML used to display a warning when the latest version is not selected - - `stylesheets/` - Contains CSS files used in the documentation. - - `extra.css` - A custom CSS file used to style the documentation; contains all custom theme elements (scrollable tables, resizable columns, Theiagen colors), and custom admonitions. - - `workflows/` - Contains the Markdown files for each workflow, organized into subdirectories by workflow category - - `workflows_overview/` - Contains the Markdown files for the overview tables for each display type: alphabetically, by applicable kingdom, and by workflow type. - - `index.md` - The home/landing page for our documentation. - -### Adding a Page for a New Workflow {#new-page} - -If you are adding a new workflow, there are a number of things to do in order to include the page in the documentation: - -1. Add a page with the title of the workflow to appropriate subdirectory in `docs/workflows/`. Feel free to use the template found in the `assets/` folder. -2. 
Collect the following information for your new workflow: - - Workflow Name - Link the name with a relative path to the workflow page in appropriate `docs/workflows/` subdirectory - - Workflow Description - Brief description of the workflow - - Applicable Kingdom - Options: "Any taxa", "Bacteria", "Mycotics", "Viral" - - Workflow Level (_on Terra_) - Options: "Sample-level", "Set-level", or neither - - Command-line compatibility - Options: "Yes", "No", and/or "Some optional features incompatible" - - The version where the last known changes occurred (likely the upcoming version if it is a new workflow) - - Link to the workflow on Dockstore (if applicable) - Workflow name linked to the information tab on Dockstore. -3. Format this information in a table. -4. Copy the previously gathered information to ==**ALL THREE**== overview tables in `docs/workflows_overview/`: - - `workflows_alphabetically.md` - Add the workflow in the appropriate spot based on the workflow name. - - `workflows_kingdom.md` - Add the workflow in the appropriate spot(s) based on the kingdom(s) the workflow is applicable to. Make sure it is added alphabetically within the appropriate subsection(s). - - `workflows_type.md` - Add the workflow in the appropriate spot based on the workflow type. Make sure it is added alphabetically within the appropriate subsection. -5. Copy the path to the workflow to ==**ALL**== of the appropriate locations in the `mkdocs.yml` file (under the `nav:` section) in the main directory of this repository. These should be the exact same spots as in the overview tables but without additional information. This ensures the workflow can be accessed from the navigation sidebar. +- [Dead Link Check](https://www.deadlinkchecker.com/) - This website will scan your website to ensure that all links are working correctly. This will only work on the deployed version of the documentation, not the local version. 
## Standard Language & Formatting Conventions @@ -98,10 +56,11 @@ The following language conventions should be followed when writing documentation - **Bold Text** - Use `**bold text**` to indicate text that should be bolded. - _Italicized Text_ - Use `_italicized text_` to indicate text that should be italicized. - ==Highlighted Text== - Use `==highlighted text==` to indicate text that should be highlighted. -- `Code` - Use \`code\` to indicate text that should be formatted as code. +- `Code` - Use ````code` ``` (backticks) to indicate text that should be formatted as code. - ^^Underlined Text^^ - Use `^^underlined text^^` to indicate text that should be underlined (works with our theme; not all Markdown renderers support this). - > Citations - Use a `>` to activate quote formatting for a citation. Make sure to separate multiple citations with a comment line (``) to prevent the citations from running together. + - Use a reputable citation style (e.g., Vancouver, Nature, etc.) for all citations. - Callouts/Admonitions - These features are called "call-outs" in Notion, but are "Admonitions" in MkDocs. [I highly recommend referring to the Material for MkDocs documentation page on Admonitions to learn how best to use this feature](https://squidfunk.github.io/mkdocs-material/reference/admonitions/). Use the following syntax to create a callout: ```markdown @@ -116,18 +75,37 @@ The following language conventions should be followed when writing documentation !!! dna This is a DNA admonition. Admire the cute green DNA emoji. You can create this with the `!!! dna` syntax. + Use this admonition when wanting to convey general information or highlight specific facts. + ???+ toggle This is a toggle-able section. The emoji is an arrow pointing to the right downward. You can create this with the `??? toggle` syntax. I have added a `+` at the end of the question marks to make it open by default. 
+ Use this admonition when wanting to provide additional _optional_ information or details that are not strictly necessary, or take up a lot of space. + ???+ task This is a toggle-able section **for a workflow task**. The emoji is a gear. Use the `??? task` syntax to create this admonition. Use `!!! task` if you want to have it be permanently expanded. I have add a `+` at the end of the question marks to make this admonition open by default and still enable its collapse. + Use this admonition when providing details on a workflow, task, or tool. + !!! caption - This is a caption. The emoji is a painting. You can create this with the `!!! caption` syntax. This is used to enclose an image in a box and looks nice. A caption can be added beneath the picture and will also look nice. + This is a caption. The emoji is a painting. You can create this with the `!!! caption` syntax. A caption can be added beneath the picture and will also look nice. + + Use this admonition when including images or diagrams in the documentation. !!! techdetails This is where you will put technical details for a workflow task. You can create this by `!!! techdetails` syntax. + Use this admonition when providing technical details for a workflow task or tool. These admonitions should include the following table: + + | | Links | + | --- | --- | + | Task | [link to the task file in the PHB repository on GitHub] | + | Software Source Code | [link to tool's source code] | + | Software Documentation | [link to tool's documentation] | + | Original Publication(s) | [link to tool's publication] | + + If any of these items are unfillable, delete the row. + - Images - Use the following syntax to insert an image: ```markdown @@ -135,7 +113,7 @@ The following language conventions should be followed when writing documentation ![Alt Text](/path/to/image.png) ``` -- Indentation - **_FOUR_** spaces are required instead of the typical two. This is a side effect of using this theme. 
If you use two spaces, the list and/or indentations will not render correctly. This will make your linter sad :( +- Indentation - **_FOUR_** spaces are required instead of the typical two. This is a side effect of using this theme. If you use two spaces, the list and/or indentations will not render correctly. This will make your linter sad :( ```markdown - first item @@ -160,3 +138,45 @@ The following language conventions should be followed when writing documentation ``` - End all pages with an empty line + +## Documentation Structure + +A brief description of the documentation structure is as follows: + +- `docs/` - Contains the Markdown files for the documentation. + - `assets/` - Contains images and other files used in the documentation. + - `figures/` - Contains images, figures, and workflow diagrams used in the documentation. For workflows that contain many images (such as BaseSpace_Fetch), it is recommended to create a subdirectory for the workflow. + - `files/` - Contains files that are used in the documentation. This may include example outputs or templates. For workflows that contain many files (such as TheiaValidate), it is recommended to create a subdirectory for the workflow. + - `logos/` - Contains Theiagen logos and symbols used in the documentation. + - `metadata_formatters/` - Contains the most up-to-date metadata formatters for our submission workflows. + - `new_workflow_template.md` - A template for adding a new workflow page to the documentation. You can see this template [here](../assets/new_workflow_template.md) + - `contributing/` - Contains the Markdown files for our contribution guides, such as this file + - `javascripts/` - Contains JavaScript files used in the documentation. + - `tablesort.js` - A JavaScript file used to enable table sorting in the documentation. 
+ - `overrides/` - Contains HTMLs used to override theme defaults + - `main.html` - Contains the HTML used to display a warning when the latest version is not selected + - `stylesheets/` - Contains CSS files used in the documentation. + - `extra.css` - A custom CSS file used to style the documentation; contains all custom theme elements (scrollable tables, resizable columns, Theiagen colors), and custom admonitions. + - `workflows/` - Contains the Markdown files for each workflow, organized into subdirectories by workflow category + - `workflows_overview/` - Contains the Markdown files for the overview tables for each display type: alphabetically, by applicable kingdom, and by workflow type. + - `index.md` - The home/landing page for our documentation. + +### Adding a Page for a New Workflow {#new-page} + +If you are adding a new workflow, there are a number of things to do in order to include the page in the documentation: + +1. Add a page with the title of the workflow to appropriate subdirectory in `docs/workflows/`. Feel free to use the template found in the `assets/` folder. +2. Collect the following information for your new workflow: + - Workflow Name - Link the name with a relative path to the workflow page in appropriate `docs/workflows/` subdirectory + - Workflow Description - Brief description of the workflow + - Applicable Kingdom - Options: "Any taxa", "Bacteria", "Mycotics", "Viral" + - Workflow Level (_on Terra_) - Options: "Sample-level", "Set-level", or neither + - Command-line compatibility - Options: "Yes", "No", and/or "Some optional features incompatible" + - The version where the last known changes occurred (likely the upcoming version if it is a new workflow) + - Link to the workflow on Dockstore (if applicable) - Workflow name linked to the information tab on Dockstore. +3. Format this information in a table. +4. 
Copy the previously gathered information to ==**ALL THREE**== overview tables in `docs/workflows_overview/`: + - `workflows_alphabetically.md` - Add the workflow in the appropriate spot based on the workflow name. + - `workflows_kingdom.md` - Add the workflow in the appropriate spot(s) based on the kingdom(s) the workflow is applicable to. Make sure it is added alphabetically within the appropriate subsection(s). + - `workflows_type.md` - Add the workflow in the appropriate spot based on the workflow type. Make sure it is added alphabetically within the appropriate subsection. +5. Copy the path to the workflow to ==**ALL**== of the appropriate locations in the `mkdocs.yml` file (under the `nav:` section) in the main directory of this repository. These should be the exact same spots as in the overview tables but without additional information. This ensures the workflow can be accessed from the navigation sidebar. diff --git a/docs/index.md b/docs/index.md index 058b2149d..49a4ee157 100644 --- a/docs/index.md +++ b/docs/index.md @@ -46,7 +46,7 @@ When undertaking genomic analysis using the command-line, via Terra, or other da We continuously work to improve our codebase and usability of our workflows by the public health community, so changes from version to version are expected. This documentation page reflects the state of the workflow at the version stated in the title. !!! dna "What's new?" - You can see the changes since PHB v2.2.0 [**here**](https://theiagen.notion.site/Public-Health-Bioinformatics-v2-2-1-Patch-Release-Notes-104cb013bc9380bcbd70dab04bf671a8?pvs=74)! + You can see the changes since PHB v2.2.1 [**here**](https://theiagen.notion.site/public-health-bioinformatics-v2-3-0-minor-release-notes?pvs=4)! 
## Contributing to the PHB Repository @@ -60,30 +60,32 @@ You can expect a careful review of every PR and feedback as needed before mergin ### Authorship -(Ordered by contribution [# of lines changed] as of 2024-08-01) +(Ordered by contribution [# of lines changed] as of 2024-12-04) - **Sage Wright** ([@sage-wright](https://github.com/sage-wright)) - Conceptualization, Software, Validation, Supervision - **InĂȘs Mendes** ([@cimendes](https://github.com/cimendes)) - Software, Validation - **Curtis Kapsak** ([@kapsakcj](https://github.com/kapsakcj)) - Conceptualization, Software, Validation -- **James Otieno** ([@jrotieno](https://github.com/jrotieno)) - Software, Validation - **Frank Ambrosio** ([@frankambrosio3](https://github.com/frankambrosio3)) - Conceptualization, Software, Validation - **Michelle Scribner** ([@michellescribner](https://github.com/michellescribner)) - Software, Validation - **Kevin Libuit** ([@kevinlibuit](https://github.com/kevinlibuit)) - Conceptualization, Project Administration, Software, Validation, Supervision -- **Emma Doughty** ([@emmadoughty](https://github.com/emmadoughty)) - Software, Validation +- **Fraser Combe** ([@fraser-combe](https://github.com/fraser-combe)) - Software, Validation - **Andrew Page** ([@andrewjpage](https://github.com/andrewjpage)) - Project Administration, Software, Supervision +- **Michal Babinski** ([@Michal-Babins](https://github.com/Michal-Babins)) - Software, Validation - **Andrew Lang** ([@AndrewLangVt](https://github.com/AndrewLangVt)) - Software, Supervision - **Kelsey Kropp** ([@kelseykropp](https://github.com/kelseykropp)) - Validation -- **Emily Smith** ([@emily-smith1](https://github.com/emily-smith1)) - Validation - **Joel Sevinsky** ([@sevinsky](https://github.com/sevinsky)) - Conceptualization, Project Administration, Supervision ### External Contributors We would like to gratefully acknowledge the following individuals from the public health community for their contributions to the PHB 
repository: +- **James Otieno** ([@jrotieno](https://github.com/jrotieno)) - **Robert Petit** ([@rpetit3](https://github.com/rpetit3)) +- **Emma Doughty** ([@emmadoughty](https://github.com/emmadoughty)) - **Ash O'Farrel** ([@aofarrel](https://github.com/aofarrel)) - **Sam Baird** ([@sam-baird](https://github.com/sam-baird)) - **Holly Halstead** ([@HNHalstead](https://github.com/HNHalstead)) +- **Emily Smith** ([@emily-smith1](https://github.com/emily-smith1)) ### On the Shoulder of Giants diff --git a/docs/overrides/main.html b/docs/overrides/main.html index 54a833dfd..0df0d3be2 100644 --- a/docs/overrides/main.html +++ b/docs/overrides/main.html @@ -6,8 +6,3 @@ Click here to go to the latest version release. {% endblock %} - - -{% block announce %} -
đŸ—ïž I'm under construction! Pardon the dust while we remodel! đŸ‘·
-{% endblock %} diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css index e510ecedc..72b16bc01 100644 --- a/docs/stylesheets/extra.css +++ b/docs/stylesheets/extra.css @@ -200,7 +200,6 @@ div.searchable-table input.table-search-input { color: #000; border: 1px solid #E0E1E1; } - [data-md-color-scheme="light"] div.searchable-table input.table-search-input::placeholder { color: #888; font-style: italic; @@ -212,7 +211,6 @@ div.searchable-table input.table-search-input { color: #fff; border: 1px solid #373B40; } - [data-md-color-scheme="slate"] div.searchable-table input.table-search-input::placeholder { color: #bbb; font-style: italic; diff --git a/docs/workflows/genomic_characterization/freyja.md b/docs/workflows/genomic_characterization/freyja.md index f93428521..fc1094204 100644 --- a/docs/workflows/genomic_characterization/freyja.md +++ b/docs/workflows/genomic_characterization/freyja.md @@ -1,16 +1,10 @@ # Freyja Workflow Series -!!! dna inline end "Wastewater and more" - The typical use case of Freyja is to **analyze mixed SARS-CoV-2 samples** from a sequencing dataset, most often **wastewater**. - - !!! warning "Default Values" - The defaults included in the Freyja workflows reflect this use case but **can be adjusted for other pathogens**. See the [**Running Freyja on other pathogens**](freyja.md#running-freyja-on-other-pathogens) section for more information. 
- ## Quick Facts | **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | |---|---|---|---|---| -| [Genomic Characterization](../../workflows_overview/workflows_type.md/#genomic-characterization) | [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.2.0 | Yes | Sample-level, Set-level | +| [Genomic Characterization](../../workflows_overview/workflows_type.md/#genomic-characterization) | [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.3.0 | Yes | Sample-level, Set-level | ## Freyja Overview @@ -21,9 +15,15 @@ Additional post-processing steps can produce visualizations of aggregated samples. +!!! dna "Wastewater and more" + The typical use case of Freyja is to **analyze mixed SARS-CoV-2 samples** from a sequencing dataset, most often **wastewater**. + + !!! warning "Default Values" + The defaults included in the Freyja workflows reflect this use case but **can be adjusted for other pathogens**. See the [**Running Freyja on other pathogens**](freyja.md#running-freyja-on-other-pathogens) section for more information. + !!! caption "Figure 1: Workflow Diagram for Freyja_FASTQ_PHB workflow" ##### Figure 1 { #figure1 } - ![**Figure 1: Workflow diagram for Freyja_FASTQ_PHB workflow.**](../../assets/figures/Freyja_FASTQ.png){width=25%} + ![**Figure 1: Workflow diagram for Freyja_FASTQ_PHB workflow.**](../../assets/figures/Freyja_FASTQ.png){width=100%} Depending on the type of data (Illumina or Oxford Nanopore), the Read QC and Filtering steps, as well as the Read Alignment steps use different software. The user can specify if the barcodes and lineages file should be updated with `freyja update` before running Freyja or if bootstrapping is to be performed with `freyja boot`. 
@@ -63,7 +63,7 @@ We recommend running this workflow with **"Run inputs defined by file paths"** s | freyja_update | **gcp_uri** | String | The path where you want the Freyja reference files to be stored. Include gs:// at the beginning of the string. Full example with a Terra workspace bucket: "gs://fc-87ddd67a-c674-45a8-9651-f91e3d2f6bb7" | | Required | | freyja_update_refs | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | | freyja_update_refs | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | -| freyja_update_refs | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/staphb/freyja:1.5.1-07_02_2024-01-27-2024-07-22" | Optional | +| freyja_update_refs | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/staphb/freyja:1.5.2-11_30_2024-02-00-2024-12-02" | Optional | | freyja_update_refs | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | | transfer_files | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | | transfer_files | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | @@ -110,13 +110,16 @@ This workflow runs on the sample level. 
| freyja | **confirmed_only** | Boolean | Include only confirmed SARS-CoV-2 lineages | FALSE | Optional | | freyja | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | | freyja | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | -| freyja | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/staphb/freyja:1.5.1-07_02_2024-01-27-2024-07-22" | Optional | +| freyja | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/staphb/freyja:1.5.2-11_30_2024-02-00-2024-12-02" | Optional | | freyja | **eps** | Float | The minimum lineage abundance cut-off value | 0.001 | Optional | -| freyja | **freyja_lineage_metadata** | File | (found in the optional section, but is required) File containing the lineage metadata; the "curated_lineages.json" file found can be used for this variable. Does not need to be provided if update_db is true. | None | Optional, Required | +| freyja | **freyja_barcodes** | String | Custom barcode file. Does not need to be provided if update_db is true or if the freyja_pathogen is provided. | None | Optional | +| freyja | **freyja_lineage_metadata** | File | File containing the lineage metadata; the "curated_lineages.json" file found can be used for this variable. Does not need to be provided if update_db is true or if the freyja_pathogen is provided. | None | Optional, Required | +| freyja | **freyja_pathogen** | String | Pathogen of interest, used if not providing the barcodes and lineage metadata files. 
Options: SARS-CoV-2, MPXV, H5NX, H1N1pdm, FLU-B-VIC, MEASLESN450, MEASLES, RSVa, RSVb | None | Optional | | freyja | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | | freyja | **number_bootstraps** | Int | The number of bootstraps to perform (only used if bootstrap = true) | 100 | Optional | -| freyja | **update_db** | Boolean | Updates the Freyja reference files (the usher barcodes and lineage metadata files) but will not save them as output (use Freyja_Update for that purpose). If set to true, the `freyja_lineage_metadata` and `freyja_usher_barcodes` files are not required. | FALSE | Optional | +| freyja | **update_db** | Boolean | Updates the Freyja reference files (the usher barcodes and lineage metadata files) but will not save them as output (use Freyja_Update for that purpose). If set to true, the `freyja_lineage_metadata` and `freyja_barcodes` files are not required. | FALSE | Optional | | freyja_fastq | **depth_cutoff** | Int | The minimum coverage depth with which to exclude sites below this value and group identical barcodes | 10 | Optional | +| freyja_fastq | **kraken2_target_organism** | String | The organism whose abundance the user wants to check in their reads. This should be a proper taxonomic name recognized by the Kraken database. | "Severe acute respiratory syndrome coronavirus 2" | Optional | | freyja_fastq | **ont** | Boolean | Indicates if the input data is derived from an ONT instrument. 
| FALSE | Optional | | freyja_fastq | **read2** | File | The raw reverse-facing FASTQ file (Illumina only) | | Optional | | freyja_fastq | **trimmomatic_minlen** | Int | The minimum length cut-off when performing read cleaning | 25 | Optional | @@ -363,7 +366,7 @@ The main output file used in subsequent Freyja workflows is found under the `fre | freyja_fastq_wf_version | String | The version of the Public Health Bioinformatics (PHB) repository used | ONT, PE, SE | | freyja_lineage_metadata_file | File | Lineage metadata JSON file used. Can be the one provided as input or downloaded by Freyja if update_db is true | ONT, PE, SE | | freyja_metadata_version | String | Name of lineage metadata file used, or the date if update_db is true | ONT, PE, SE | -| freyja_usher_barcode_file | File | USHER barcode feather file used. Can be the one provided as input or downloaded by Freyja if update_db is true | ONT, PE, SE | +| freyja_barcode_file | File | Barcode feather file used. Can be the one provided as input or downloaded by Freyja if update_db is true | ONT, PE, SE | | freyja_variants | File | The TSV file containing the variants identified by Freyja | ONT, PE, SE | | freyja_version | String | version of Freyja used | ONT, PE, SE | | ivar_version_primtrim | String | Version of iVar for running the iVar trim command | ONT, PE, SE | @@ -371,8 +374,8 @@ The main output file used in subsequent Freyja workflows is found under the `fre | kraken_human_dehosted | Float | Percent of human read data detected using the Kraken2 software after host removal | ONT, PE, SE | | kraken_report | File | Full Kraken report | ONT, PE, SE | | kraken_report_dehosted | File | Full Kraken report after host removal | ONT, PE, SE | -| kraken_sc2 | Float | Percent of SARS-CoV-2 read data detected using the Kraken2 software | ONT, PE, SE | -| kraken_sc2_dehosted | Float | Percent of SARS-CoV-2 read data detected using the Kraken2 software after host removal | ONT, PE, SE | +| kraken_sc2 | String | 
Percent of SARS-CoV-2 read data detected using the Kraken2 software | ONT, PE, SE | +| kraken_sc2_dehosted | String | Percent of SARS-CoV-2 read data detected using the Kraken2 software after host removal | ONT, PE, SE | | kraken_version | String | Version of Kraken software used | ONT, PE, SE | | minimap2_docker | String | Docker image used to run minimap2 | ONT | | minimap2_version | String | Version of minimap2 used | ONT | @@ -430,7 +433,7 @@ This workflow runs on the set level. | freyja_plot | **collection_date** | Array[String] | An array containing the collection dates for the sample (YYYY-MM-DD format) | | Optional | | freyja_plot_task | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | | freyja_plot_task | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | -| freyja_plot_task | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/freyja:1.5.1-07_02_2024-01-27-2024-07-22 | Optional | +| freyja_plot_task | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/freyja:1.5.2-11_30_2024-02-00-2024-12-02 | Optional | | freyja_plot_task | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | | freyja_plot_task | **mincov** | Int | The minimum genome coverage used as a cut-off of data to include in the plot | 60 | Optional | | freyja_plot_task | **plot_day_window** | Int | The width of the rolling average window; only used if plot_time_interval is "D" | 14 | Optional | @@ -491,7 +494,7 @@ This workflow runs on the set level. | freyja_dashboard | **dashboard_intro_text** | File | A file containing the text to be contained at the top of the dashboard. | SARS-CoV-2 lineage de-convolution performed by the Freyja workflow (). 
| Optional | | freyja_dashboard_task | **config** | File | (found in the optional section, but is required) A yaml file that applies various configurations to the dashboard, such as grouping lineages together, applying colorings, etc. See also . | None | Optional, Required | | freyja_dashboard_task | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | -| freyja_dashboard_task | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/freyja:1.5.1-07_02_2024-01-27-2024-07-22 | Optional | +| freyja_dashboard_task | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/freyja:1.5.2-11_30_2024-02-00-2024-12-02 | Optional | | freyja_dashboard_task | **headerColor** | String | A hex color code to change the color of the header | | Optional | | freyja_dashboard_task | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | | freyja_dashboard_task | **mincov** | Float | The minimum genome coverage used as a cut-off of data to include in the dashboard. Default is set to 60 by the freyja command-line tool (not a WDL task default, per se) | None | Optional | @@ -531,26 +534,33 @@ This workflow runs on the set level. The main requirement to run Freyja on other pathogens is **the existence of a barcode file for your pathogen of interest**. Currently, barcodes exist for the following organisms -- MEASLES +- SARS-CoV-2 (default) - MPXV +- H5NX +- H1N1pdm +- FLU-B-VIC +- MEASLESN450 +- MEASLES - RSVa - RSVb -The appropriate barcode file and reference sequence need to be downloaded and uploaded to your [Terra.bio](http://Terra.bio) workspace. - !!! 
warning "Freyja barcodes for other pathogens" Data for various pathogens can be found in the following repository: [Freyja Barcodes](https://github.com/gp201/Freyja-barcodes) Folders are organized by pathogen, with each subfolder named after the date the barcode was generated, using the format YYYY-MM-DD. Barcode files are named `barcode.csv`, and reference genome files are named `reference.fasta`. +The appropriate barcode file and reference sequence need to be downloaded and uploaded to your [Terra.bio](http://Terra.bio) workspace. + + + When running **Freyja_FASTQ_PHB**, the appropriate reference and barcodes file need to be passed as inputs. The first is a required input and will show up at the top of the workflows inputs page on [Terra.bio](http://Terra.bio) ([Figure 2](freyja.md/#figure2)). !!! caption "Figure 2: Required input for Freyja_FASTQ_PHB to provide the reference genome to be used by Freyja" ##### Figure 2 { #figure2 } ![**Figure 2: Required input for Freyja_FASTQ_PHB to provide the reference genome to be used by Freyja.**](../../assets/figures/Freyja_figure2.png) -The barcodes file can be passed directly to Freyja by the `freyja_usher_barcodes` optional input ([Figure 3](freyja.md/#figure3)). +The barcodes file can be passed directly to Freyja by the `freyja_barcodes` optional input ([Figure 3](freyja.md/#figure3)). !!! caption "Figure 3: Optional input for Freyja_FASTQ_PHB to provide the barcodes file to be used by Freyja" ##### Figure 3 {#figure3} diff --git a/docs/workflows/genomic_characterization/pangolin_update.md b/docs/workflows/genomic_characterization/pangolin_update.md index 988db4404..a05756888 100644 --- a/docs/workflows/genomic_characterization/pangolin_update.md +++ b/docs/workflows/genomic_characterization/pangolin_update.md @@ -65,4 +65,8 @@ This workflow runs on the sample level. 
| **pangolin_updates** | String | Result of Pangolin Update (lineage changed versus unchanged) with lineage assignment and date of analysis | | **pangolin_versions** | String | All Pangolin software and database versions | - \ No newline at end of file + + +## References + +> **Pangolin**: Rambaut A, Holmes EC, O'Toole Á, Hill V, McCrone JT, Ruis C, du Plessis L, Pybus OG. A dynamic nomenclature proposal for SARS-CoV-2 lineages to assist genomic epidemiology. Nat Microbiol. 2020 Nov;5(11):1403-1407. doi: 10.1038/s41564-020-0770-5. Epub 2020 Jul 15. PMID: 32669681; PMCID: PMC7610519. diff --git a/docs/workflows/genomic_characterization/theiacov.md b/docs/workflows/genomic_characterization/theiacov.md index ffe0993f6..480bfbf04 100644 --- a/docs/workflows/genomic_characterization/theiacov.md +++ b/docs/workflows/genomic_characterization/theiacov.md @@ -4,7 +4,7 @@ | **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | |---|---|---|---|---| -| [Genomic Characterization](../../workflows_overview/workflows_type.md/#genomic-characterization) | [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.2.0 | Yes, some optional features incompatible | Sample-level | +| [Genomic Characterization](../../workflows_overview/workflows_type.md/#genomic-characterization) | [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.3.0 | Yes, some optional features incompatible | Sample-level | ## TheiaCoV Workflows @@ -221,14 +221,14 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) | ivar_consensus | **stats_n_coverage_primtrim_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | SE,PE | HIV, MPXV, WNV, rsv_a, rsv_b, sars-cov-2 | | kraken2_dehosted | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | CL | sars-cov-2 | | kraken2_dehosted | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | 
CL | sars-cov-2 | -| kraken2_dehosted | **docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.0.8-beta_hv | Optional | CL | sars-cov-2 | -| kraken2_dehosted | **kraken2_db** | String | The database used to run Kraken2 | /kraken2-db | Optional | CL | sars-cov-2 | +| kraken2_dehosted | **docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.1.2-no-db | Optional | CL | sars-cov-2 | +| kraken2_dehosted | **kraken2_db** | File | The database used to run Kraken2. Must contain viral and human sequences. | "gs://theiagen-large-public-files-rp/terra/databases/kraken2/kraken2_humanGRCh38_viralRefSeq_20240828.tar.gz" | Optional | CL | sars-cov-2 | | kraken2_dehosted | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | CL | sars-cov-2 | | kraken2_dehosted | **read2** | File | Internal component, do not modify | | Do not modify, Optional | CL | sars-cov-2 | | kraken2_raw | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | CL | sars-cov-2 | | kraken2_raw | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | CL | sars-cov-2 | -| kraken2_raw | **docker_image** | Int | Docker container used in this task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.0.8-beta_hv | Optional | CL | sars-cov-2 | -| kraken2_raw | **kraken2_db** | String | The database used to run Kraken2 | /kraken2-db | Optional | CL | sars-cov-2 | +| kraken2_raw | **docker_image** | Int | Docker container used in this task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.1.2-no-db | Optional | CL | sars-cov-2 | +| kraken2_raw | **kraken2_db** | File | The database used to run Kraken2. Must contain viral and human sequences. 
| "gs://theiagen-large-public-files-rp/terra/databases/kraken2/kraken2_humanGRCh38_viralRefSeq_20240828.tar.gz" | Optional | CL | sars-cov-2 | | kraken2_raw | **memory** | String | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | CL | sars-cov-2 | | kraken2_raw | **read_processing** | String | The tool used for trimming of primers from reads. Options are trimmomatic and fastp | trimmomatic | Optional | | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | kraken2_raw | **read2** | File | Internal component, do not modify | | Do not modify, Optional | CL | sars-cov-2 | @@ -300,8 +300,8 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) | qc_check_task | **gambit_predicted_taxon** | String | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | qc_check_task | **kraken_human** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | | | qc_check_task | **kraken_human_dehosted** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | | -| qc_check_task | **kraken_sc2** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | -| qc_check_task | **kraken_sc2_dehosted** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **kraken_sc2** | String | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **kraken_sc2_dehosted** | String | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | qc_check_task | **kraken_target_organism** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, 
ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | qc_check_task | **kraken_target_organism_dehosted** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | qc_check_task | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | @@ -341,7 +341,7 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) | read_QC_trim | **call_midas** | Boolean | True/False variable that determines if the MIDAS task should be called. | TRUE | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | read_QC_trim | **downsampling_coverage** | Float | The desired coverage to sub-sample the reads to with RASUSA | 150 | Optional | ONT | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | read_QC_trim | **fastp_args** | String | Additional fastp task arguments | --detect_adapter_for_pe -g -5 20 -3 20 | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | -| read_QC_trim | **kraken_db** | File | The database used to run Kraken2 | /kraken2-db | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| read_QC_trim | **kraken_db** | File | The database used to run Kraken2. Must contain viral and human sequences. 
| "gs://theiagen-large-public-files-rp/terra/databases/kraken2/kraken2_humanGRCh38_viralRefSeq_20240828.tar.gz" | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | read_QC_trim | **kraken_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | read_QC_trim | **kraken_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | read_QC_trim | **midas_db** | File | The database used by the MIDAS task | gs://theiagen-public-files-rp/terra/theiaprok-files/midas/midas_db_v1.2.tar.gz | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | @@ -481,34 +481,45 @@ The `organism_parameters` sub-workflow is the first step in all TheiaCoV workflo The following tables include the relevant organism-specific parameters; **all of these default values can be overwritten by providing a value for the "Overwrite Variable Name" field**. ??? toggle "SARS-CoV-2 Defaults" +
+ | **Overwrite Variable Name** | **Organism** | **Default Value** | |---|---|---| | gene_locations_bed_file | sars-cov-2 | `"gs://theiagen-public-files-rp/terra/sars-cov-2-files/sc2_gene_locations.bed"` | | genome_length_input | sars-cov-2 | `29903` | + | kraken_target_organism_input | sars-cov-2 | `"Severe acute respiratory syndrome coronavirus 2"` | | nextclade_dataset_name_input | sars-cov-2 | `"nextstrain/sars-cov-2/wuhan-hu-1/orfs"` | - | nextclade_dataset_tag_input | sars-cov-2 | `"2024-07-17--12-57-03Z"` | - | pangolin_docker_image | sars-cov-2 | `"us-docker.pkg.dev/general-theiagen/staphb/pangolin:4.3.1-pdata-1.29 "`| + | nextclade_dataset_tag_input | sars-cov-2 | `"2024-11-19--14-18-53Z"` | + | pangolin_docker_image | sars-cov-2 | `"us-docker.pkg.dev/general-theiagen/staphb/pangolin:4.3.1-pdata-1.31 "`| | reference_genome | sars-cov-2 | `"gs://theiagen-public-files-rp/terra/augur-sars-cov-2-references/MN908947.fasta"` | | vadr_max_length | sars-cov-2 | `30000` | | vadr_mem | sars-cov-2 | `8` | | vadr_options | sars-cov-2 | `"--noseqnamemax --glsearch -s -r --nomisc --mkey sarscov2 --lowsim5seq 6 --lowsim3seq 6 --alt_fail lowscore,insertnn,deletinn --out_allfasta"` | +
+ ??? toggle "Mpox Defaults" +
+ | **Overwrite Variable Name** | **Organism** | **Default Value** | |---|---|---| | gene_locations_bed_file | MPXV | `"gs://theiagen-public-files/terra/mpxv-files/mpox_gene_locations.bed"` | | genome_length_input | MPXV | `197200` | | kraken_target_organism_input | MPXV | `"Monkeypox virus"` | | nextclade_dataset_name_input | MPXV | `"nextstrain/mpox/lineage-b.1"` | - | nextclade_dataset_tag_input | MPXV | `"2024-04-19--07-50-39Z"` | + | nextclade_dataset_tag_input | MPXV | `"2024-11-19--14-18-53Z"` | | primer_bed_file | MPXV | `"gs://theiagen-public-files/terra/mpxv-files/MPXV.primer.bed"` | | reference_genome | MPXV | `"gs://theiagen-public-files/terra/mpxv-files/MPXV.MT903345.reference.fasta"` | | reference_gff_file | MPXV | `"gs://theiagen-public-files/terra/mpxv-files/Mpox-MT903345.1.reference.gff3"` | | vadr_max_length | MPXV | `210000` | | vadr_mem | MPXV | `8` | | vadr_options | MPXV | `"--glsearch -s -r --nomisc --mkey mpxv --r_lowsimok --r_lowsimxd 100 --r_lowsimxl 2000 --alt_pass discontn,dupregin --out_allfasta --minimap2 --s_overhang 150"` | + +
??? toggle "WNV Defaults" +
+ | **Overwrite Variable Name** | **Organism** | **Default Value** | **Notes** | |---|---|---|---| | genome_length_input | WNV | `11000` | | @@ -521,7 +532,11 @@ The `organism_parameters` sub-workflow is the first step in all TheiaCoV workflo | vadr_mem | WNV | `8` | | | vadr_options | WNV | `"--mkey flavi --mdir /opt/vadr/vadr-models-flavi/ --nomisc --noprotid --out_allfasta"` | | +
+ ??? toggle "Flu Defaults" +
+ | **Overwrite Variable Name** | **Organism** | **Flu Segment** | **Flu Subtype** | **Default Value** | **Notes** | |---|---|---|---|---|---| | flu_segment | flu | all | all | N/A | TheiaCoV will attempt to automatically assign a flu segment | @@ -531,58 +546,70 @@ The `organism_parameters` sub-workflow is the first step in all TheiaCoV workflo | vadr_mem | flu | all | all | `8` | | | vadr_options | flu | all | all | `"--atgonly --xnocomp --nomisc --alt_fail extrant5,extrant3 --mkey flu"` | | | nextclade_dataset_name_input | flu | ha | h1n1 | `"nextstrain/flu/h1n1pdm/ha/MW626062"` | | - | nextclade_dataset_tag_input | flu | ha | h1n1 | `"2024-07-03--08-29-55Z"` | | + | nextclade_dataset_tag_input | flu | ha | h1n1 | `"2024-11-27--02-51-00Z"` | | | reference_genome | flu | ha | h1n1 | `"gs://theiagen-public-files-rp/terra/flu-references/reference_h1n1pdm_ha.fasta"` | | | nextclade_dataset_name_input | flu | ha | h3n2 | `"nextstrain/flu/h3n2/ha/EPI1857216"` | | - | nextclade_dataset_tag_input | flu | ha | h3n2 | `"2024-08-08--05-08-21Z"` | | + | nextclade_dataset_tag_input | flu | ha | h3n2 | `"2024-11-27--02-51-00Z"` | | | reference_genome | flu | ha | h3n2 | `"gs://theiagen-public-files-rp/terra/flu-references/reference_h3n2_ha.fasta"` | | | nextclade_dataset_name_input | flu | ha | victoria | `"nextstrain/flu/vic/ha/KX058884"` | | - | nextclade_dataset_tag_input | flu | ha | victoria | `"2024-07-03--08-29-55Z"` | | + | nextclade_dataset_tag_input | flu | ha | victoria | `"2024-11-05--09-19-52Z"` | | | reference_genome | flu | ha | victoria | `"gs://theiagen-public-files-rp/terra/flu-references/reference_vic_ha.fasta"` | | | nextclade_dataset_name_input | flu | ha | yamagata | `"nextstrain/flu/yam/ha/JN993010"` | | | nextclade_dataset_tag_input | flu | ha | yamagata | `"2024-01-30--16-34-55Z"` | | | reference_genome | flu | ha | yamagata | `"gs://theiagen-public-files-rp/terra/flu-references/reference_yam_ha.fasta"` | | | nextclade_dataset_name_input | flu | ha | 
h5n1 | `"community/moncla-lab/iav-h5/ha/all-clades"` | | - | nextclade_dataset_tag_input | flu | ha | h5n1 | `"2024-05-08--11-39-52Z"` | | + | nextclade_dataset_tag_input | flu | ha | h5n1 | `"2024-12-04--17-05-31Z"` | | | reference_genome | flu | ha | h5n1 | `"gs://theiagen-public-files-rp/terra/flu-references/reference_h5n1_ha.fasta"` | | | nextclade_dataset_name_input | flu | na | h1n1 | `"nextstrain/flu/h1n1pdm/na/MW626056"` | | - | nextclade_dataset_tag_input | flu | na | h1n1 | `"2024-07-03--08-29-55Z"` | | + | nextclade_dataset_tag_input | flu | na | h1n1 | `"2024-11-05--09-19-52Z"` | | | reference_genome | flu | na | h1n1 | `"gs://theiagen-public-files-rp/terra/flu-references/reference_h1n1pdm_na.fasta"` | | | nextclade_dataset_name_input | flu | na | h3n2 | `"nextstrain/flu/h3n2/na/EPI1857215"` | | - | nextclade_dataset_tag_input | flu | na | h3n2 | `"2024-04-19--07-50-39Z"` | | + | nextclade_dataset_tag_input | flu | na | h3n2 | `"2024-11-05--09-19-52Z"` | | | reference_genome | flu | na | h3n2 | `"gs://theiagen-public-files-rp/terra/flu-references/reference_h3n2_na.fasta"` | | | nextclade_dataset_name_input | flu | na | victoria | `"nextstrain/flu/vic/na/CY073894"` | | - | nextclade_dataset_tag_input | flu | na | victoria | `"2024-04-19--07-50-39Z"` | | + | nextclade_dataset_tag_input | flu | na | victoria | `"2024-11-05--09-19-52Z"` | | | reference_genome | flu | na | victoria | `"gs://theiagen-public-files-rp/terra/flu-references/reference_vic_na.fasta"` | | | nextclade_dataset_name_input | flu | na | yamagata | `"NA"` | | | nextclade_dataset_tag_input | flu | na | yamagata | `"NA"` | | | reference_genome | flu | na | yamagata | `"gs://theiagen-public-files-rp/terra/flu-references/reference_yam_na.fasta"` | | +
+ ??? toggle "RSV-A Defaults" +
+ | **Overwrite Variable Name** | **Organism** | **Default Value** | |---|---|---| | genome_length_input | rsv_a | 16000 | - | kraken_target_organism | rsv_a | Respiratory syncytial virus | + | kraken_target_organism | rsv_a | "Human respiratory syncytial virus A" | | nextclade_dataset_name_input | rsv_a | nextstrain/rsv/a/EPI_ISL_412866 | - | nextclade_dataset_tag_input | rsv_a | 2024-08-01--22-31-31Z | + | nextclade_dataset_tag_input | rsv_a | "2024-11-27--02-51-00Z" | | reference_genome | rsv_a | gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_a.fasta | | vadr_max_length | rsv_a | 15500 | | vadr_mem | rsv_a | 32 | | vadr_options | rsv_a | -r --mkey rsv --xnocomp | +
+ ??? toggle "RSV-B Defaults" +
+ | **Overwrite Variable Name** | **Organism** | **Default Value** | |---|---|---| | genome_length_input | rsv_b | 16000 | - | kraken_target_organism | rsv_b | "Human orthopneumovirus" | + | kraken_target_organism | rsv_b | "human respiratory syncytial virus" | | nextclade_dataset_name_input | rsv_b | nextstrain/rsv/b/EPI_ISL_1653999 | - | nextclade_dataset_tag_input | rsv_b | "2024-08-01--22-31-31Z" | + | nextclade_dataset_tag_input | rsv_b | "2024-11-27--02-51-00Z" | | reference_genome | rsv_b | gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_b.fasta | | vadr_max_length | rsv_b | 15500 | | vadr_mem | rsv_b | 32 | | vadr_options | rsv_b | -r --mkey rsv --xnocomp | +
+ ??? toggle "HIV Defaults" +
+ | **Overwrite Variable Name** | **Organism** | **Default Value** | **Notes** | |---|---|---|---| | kraken_target_organism_input | HIV | Human immunodeficiency virus 1 | | @@ -595,6 +622,8 @@ The `organism_parameters` sub-workflow is the first step in all TheiaCoV workflo | reference_genome | HIV-v2 | gs://theiagen-public-files/terra/hivgc-files/AY228557.1.headerchanged.fasta | This version of HIV originates from Southern Africa | | reference_gff_file | HIV-v2 | gs://theiagen-public-files/terra/hivgc-files/AY228557.1.gff3 | This version of HIV originates from Southern Africa | +
+ ### Workflow Tasks All input reads are processed through "core tasks" in the TheiaCoV Illumina, ONT, and ClearLabs workflows. These undertake read trimming and assembly appropriate to the input data type. TheiaCoV workflows subsequently launch default genome characterization modules for quality assessment, and additional taxa-specific characterization steps. When setting up the workflow, users may choose to use "optional tasks" as additions or alternatives to tasks run in the workflow by default. @@ -630,8 +659,7 @@ All input reads are processed through "core tasks" in the TheiaCoV Illumina, ONT | Variable | Rationale | | --- | --- | - | `skip_screen` | Prevent the read screen from running | - | `skip_screen` | Saving waste of compute resources on insufficient data | + | `skip_screen` | Set to true to skip the read screen from running | | `min_reads` | Minimum number of base pairs for 10x coverage of the Hepatitis delta (of the *Deltavirus* genus) virus divided by 300 (longest Illumina read length) | | `min_basepairs` | Greater than 10x coverage of the Hepatitis delta (of the *Deltavirus* genus) virus | | `min_genome_size` | Based on the Hepatitis delta (of the *Deltavirus* genus) genome- the smallest viral genome as of 2024-04-11 (1,700 bp) | @@ -699,7 +727,7 @@ All input reads are processed through "core tasks" in the TheiaCoV Illumina, ONT Kraken2 is run on the set of raw reads, provided as input, as well as the set of clean reads that are resulted from the `read_QC_trim` workflow !!! info "Database-dependent" - TheiaCoV automatically uses a viral-specific Kraken2 database. + TheiaCoV automatically uses a viral-specific Kraken2 database. This database was generated in-house from RefSeq's viral sequence collection and human genome GRCh38. It's available at `gs://theiagen-large-public-files-rp/terra/databases/kraken2/kraken2_humanGRCh38_viralRefSeq_20240828.tar.gz` !!! 
techdetails "Kraken2 Technical Details" @@ -714,7 +742,7 @@ All input reads are processed through "core tasks" in the TheiaCoV Illumina, ONT | | Links | | --- | --- | - | Sub-workflow | [wf_read_QC_trim.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilties/wf_read_QC_trim.wdl) | + | Sub-workflow | [wf_read_QC_trim_pe.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim_pe.wdl)
[wf_read_QC_trim_se.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim_se.wdl) | | Tasks | [task_fastp.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_fastp.wdl)
[task_trimmomatic.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_trimmomatic.wdl)
[task_bbduk.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_bbduk.wdl)
[task_fastq_scan.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_fastq_scan.wdl)
[task_midas.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_midas.wdl)
[task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_kraken2.wdl) | | Software Source Code | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](https://github.com/usadellab/Trimmomatic); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2)| | Software Documentation | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic); [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2/wiki) | @@ -734,7 +762,7 @@ All input reads are processed through "core tasks" in the TheiaCoV Illumina, ONT | | Links | | --- | --- | - | Task | [task_ncbi_scrub.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_ncbi_scrub.wdl) | + | Task | [task_ncbi_scrub.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_ncbi_scrub.wdl) | | Software Source Code | [NCBI Scrub on GitHub](https://github.com/ncbi/sra-human-scrubber) | | Software Documentation | | @@ -749,13 +777,13 @@ All input reads are processed through "core tasks" in the TheiaCoV Illumina, ONT Kraken2 is run on the set of raw reads, provided as input, as well as the set of clean reads that are resulted from the `read_QC_trim` workflow !!! info "Database-dependent" - TheiaCoV automatically uses a viral-specific Kraken2 database. + TheiaCoV automatically uses a viral-specific Kraken2 database. This database was generated in-house from RefSeq's viral sequence collection and human genome GRCh38. It's available at `gs://theiagen-large-public-files-rp/terra/databases/kraken2/kraken2_humanGRCh38_viralRefSeq_20240828.tar.gz` !!! 
techdetails "Kraken2 Technical Details" | | Links | | --- | --- | - | Task | [task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/task_kraken2.wdl) | + | Task | [task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_kraken2.wdl) | | Software Source Code | [Kraken2 on GitHub](https://github.com/DerrickWood/kraken2/) | | Software Documentation | | | Original Publication(s) | [Improved metagenomic analysis with Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | @@ -900,6 +928,8 @@ All input reads are processed through "core tasks" in the TheiaCoV Illumina, ONT | Task | [task_pangolin.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/betacoronavirus/task_pangolin.wdl) | | Software Source Code | [Pangolin on GitHub](https://github.com/cov-lineages/pangolin) | | Software Documentation | [Pangolin website](https://cov-lineages.org/resources/pangolin.html) | + | Original Publication(s) | [A dynamic nomenclature proposal for SARS-CoV-2 lineages to assist genomic epidemiology](https://doi.org/10.1038/s41564-020-0770-5) | + ??? 
task "`nextclade`" @@ -1093,8 +1123,8 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) | kraken_human_dehosted | Float | Percent of human read data detected using the Kraken2 software after host removal | CL, ONT, PE | | kraken_report | File | Full Kraken report | CL, ONT, PE, SE | | kraken_report_dehosted | File | Full Kraken report after host removal | CL, ONT, PE | -| kraken_sc2 | Float | Percent of SARS-CoV-2 read data detected using the Kraken2 software | CL, ONT, PE, SE | -| kraken_sc2_dehosted | Float | Percent of SARS-CoV-2 read data detected using the Kraken2 software after host removal | CL, ONT, PE | +| kraken_sc2 | String | Percent of SARS-CoV-2 read data detected using the Kraken2 software | CL, ONT, PE, SE | +| kraken_sc2_dehosted | String | Percent of SARS-CoV-2 read data detected using the Kraken2 software after host removal | CL, ONT, PE | | kraken_target_organism | String | Percent of target organism read data detected using the Kraken2 software | CL, ONT, PE, SE | | kraken_target_organism_dehosted | String | Percent of target organism read data detected using the Kraken2 software after host removal | CL, ONT, PE | | kraken_target_organism_name | String | The name of the target organism; e.g., "Monkeypox" or "Human immunodeficiency virus" | CL, ONT, PE, SE | @@ -1142,10 +1172,10 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) | nextclade_json_flu_ha | File | Nextclade output in JSON file format, specific to Flu HA segment | ONT, PE | | nextclade_json_flu_na | File | Nextclade output in JSON file format, specific to Flu NA segment | ONT, PE | | nextclade_lineage | String | Nextclade lineage designation | CL, FASTA, ONT, PE, SE | -| nextclade_qc | String | QC metric as determined by Nextclade. (For Flu, this output will be specific to HA segment) | CL, FASTA, ONT, PE, SE | +| nextclade_qc | String | QC metric as determined by Nextclade. 
Will be blank for Flu | CL, FASTA, ONT, PE, SE | | nextclade_qc_flu_ha | String | QC metric as determined by Nextclade, specific to Flu HA segment | ONT, PE | | nextclade_qc_flu_na | String | QC metric as determined by Nextclade, specific to Flu NA segment | ONT, PE | -| nextclade_tsv | File | Nextclade output in TSV file format. (For Flu, this output will be specific to HA segment) | CL, FASTA, ONT, PE, SE | +| nextclade_tsv | File | Nextclade output in TSV file format. Will be blank for Flu | CL, FASTA, ONT, PE, SE | | nextclade_tsv_flu_ha | File | Nextclade output in TSV file format, specific to Flu HA segment | ONT, PE | | nextclade_tsv_flu_na | File | Nextclade output in TSV file format, specific to Flu NA segment | ONT, PE | | nextclade_version | String | The version of Nextclade software used | CL, FASTA, ONT, PE, SE | diff --git a/docs/workflows/genomic_characterization/theiaeuk.md b/docs/workflows/genomic_characterization/theiaeuk.md index bedeac0cf..bdfeb5d81 100644 --- a/docs/workflows/genomic_characterization/theiaeuk.md +++ b/docs/workflows/genomic_characterization/theiaeuk.md @@ -4,26 +4,26 @@ | **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibliity** | **Workflow Level** | |---|---|---|---|---| -| [Genomic Characterization](../../workflows_overview/workflows_type.md/#genomic-characterization) | [Mycotics](../../workflows_overview/workflows_kingdom.md/#mycotics) | PHB v2.2.0 | Yes | Sample-level | +| [Genomic Characterization](../../workflows_overview/workflows_type.md/#genomic-characterization) | [Mycotics](../../workflows_overview/workflows_kingdom.md/#mycotics) | PHB v2.3.0 | Yes | Sample-level | ## TheiaEuk Workflows -**The TheiaEuk_PE workflow is for the assembly, quality assessment, and characterization of fungal genomes.** It is designed to accept Illumina paired-end sequencing data as the primary input. 
**It is currently intended only for haploid fungal genomes like _Candida auris_.** Analyzing diploid genomes using TheiaEuk should be attempted only with expert attention to the resulting genome quality. +**The TheiaEuk_Illumina_PE workflow is for the assembly, quality assessment, and characterization of fungal genomes.** It is designed to accept Illumina paired-end sequencing data as the primary input. **It is currently intended only for ==haploid== fungal genomes like _Candida auris_.** Analyzing diploid genomes using TheiaEuk should be attempted only with expert attention to the resulting genome quality. -All input reads are processed through "core tasks" in each workflow. The core tasks include raw-read quality assessment, read cleaning (quality trimming and adapter removal), de novo assembly, assembly quality assessment, and species taxon identification. For some taxa identified, "taxa-specific sub-workflows" will be automatically activated, undertaking additional taxa-specific characterization steps, including clade-typing and/or antifungal resistance detection. +All input reads are processed through "core tasks" in each workflow. The core tasks include raw read quality assessment, read cleaning (quality trimming and adapter removal), de novo assembly, assembly quality assessment, and species taxon identification. For some taxa identified, taxa-specific sub-workflows will be automatically activated, undertaking additional taxa-specific characterization steps, including clade-typing and/or antifungal resistance detection. !!! caption "TheiaEuk Workflow Diagram" - ![TheiaEuk Workflow Diagram](../../assets/figures/TheiaEuk_Illumina_PE.png){width=75%} + ![TheiaEuk Workflow Diagram](../../assets/figures/TheiaEuk_Illumina_PHB_20241106.png){width=75%} ### Inputs !!! info "Input read data" - The TheiaEuk_PE workflow takes in Illumina paired-end read data. Read file names should end with `.fastq` or `.fq`, with the optional addition of `.gz`. 
When possible, Theiagen recommends zipping files with [gzip](https://www.gnu.org/software/gzip/) prior to Terra upload to minimize data upload time. + The TheiaEuk_Illumina_PE workflow takes in Illumina paired-end read data. Read file names should end with `.fastq` or `.fq`, with the optional addition of `.gz`. When possible, Theiagen recommends zipping files with [gzip](https://www.gnu.org/software/gzip/) prior to Terra upload to minimize data upload time. By default, the workflow anticipates 2 x 150bp reads (i.e. the input reads were generated using a 300-cycle sequencing kit). Modifications to the optional parameter for `trim_minlen` may be required to accommodate shorter read data, such as the 2 x 75bp reads generated using a 150-cycle sequencing kit. -
+
| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | |---|---|---|---|---|---| @@ -69,7 +69,7 @@ All input reads are processed through "core tasks" in each workflow. The core ta | merlin_magic | **staphopia_sccmec_docker_image** | String | Internal component, do not modify | us-docker.pkg.dev/general-theiagen/biocontainers/staphopia-sccmec:1.0.0--hdfd78af_0 | Do Not Modify, Optional | | merlin_magic | **tbp_parser_coverage_threshold** | Int | Internal component, do not modify | 100 | Do Not Modify, Optional | | merlin_magic | **tbp_parser_debug** | Boolean | Internal component, do not modify | FALSE | Do Not Modify, Optional | -| merlin_magic | **tbp_parser_docker_image** | String | Internal component, do not modify | us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:1.3.6 | Do Not Modify, Optional | +| merlin_magic | **tbp_parser_docker_image** | String | Internal component, do not modify | us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.2.2 | Do Not Modify, Optional | | merlin_magic | **tbp_parser_min_depth** | Int | Internal component, do not modify | 10 | Do Not Modify, Optional | | merlin_magic | **tbp_parser_operator** | String | Internal component, do not modify | "Operator not provided" | Do Not Modify, Optional | | merlin_magic | **tbp_parser_output_seq_method_type** | String | Internal component, do not modify | "WGS" | Do Not Modify, Optional | @@ -148,7 +148,7 @@ All input reads are processed through "core tasks" in each workflow. 
The core ta | read_QC_trim | **workflow_series** | String | Internal component, do not modify | | Do Not Modify, Optional | | shovill_pe | **assembler** | String | Assembler to use (spades, skesa, velvet or megahit), see | "skesa" | Optional | | shovill_pe | **assembler_options** | String | Assembler-specific options that you might choose, see | | Optional | -| shovill_pe | **depth** | Int | User specified depth of coverage for downsampling (see ) | 150 | Optional | +| shovill_pe | **depth** | Int | User specified depth of coverage for downsampling (see and ) | 150 | Optional | | shovill_pe | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | | shovill_pe | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/shovill:1.1.0 | Optional | | shovill_pe | **genome_length** | String | Internal component, do not modify | | Do Not Modify, Optional | @@ -177,7 +177,14 @@ All input reads are processed through "core tasks" in each workflow. The core ta
-### Workflow tasks (performed for all taxa) +### Workflow Tasks + +All input reads are processed through "core tasks" in the TheiaEuk workflows. These undertake read trimming and assembly appropriate to the input data type, currently only Illumina paired-end data. TheiaEuk workflows subsequently launch default genome characterization modules for quality assessment, and additional taxa-specific characterization steps. When setting up the workflow, users may choose to use "optional tasks" or alternatives to tasks run in the workflow by default. + +#### Core tasks + +!!! tip "" + These tasks are performed regardless of organism. They perform read trimming and various quality control steps. ??? task "`versioning`: Version capture for TheiaEuk" @@ -189,7 +196,7 @@ All input reads are processed through "core tasks" in each workflow. The core ta | --- | --- | | Task | [task_versioning.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/task_versioning.wdl) | -??? task "`screen`: Total Raw Read Quantification and Genome Size Estimation" +??? task "`screen`: Total Raw Read Quantification and Genome Size Estimation (optional, on by default)" The [`screen`](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/comparisons/task_screen.wdl) task ensures the quantity of sequence data is sufficient to undertake genomic analysis. It uses [`fastq-scan`](https://github.com/rpetit3/fastq-scan) and bash commands for quantification of reads and base pairs, and [mash](https://mash.readthedocs.io/en/latest/index.html) sketching to estimate the genome size and its coverage. At each step, the results are assessed relative to pass/fail criteria and thresholds that may be defined by optional user inputs. Samples that do not meet these criteria will not be processed further by the workflow: @@ -219,19 +226,22 @@ All input reads are processed through "core tasks" in each workflow.
The core ta | | Links | | --- | --- | - | Task | [task_screen.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_screen.wdl) | + | Task | [task_screen.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/comparisons/task_screen.wdl) | -??? task "`rasusa`: Read subsampling" +??? task "`Rasusa`: Read subsampling (optional, on by default)" - The RASUSA task performs subsampling of the raw reads. By default, this task will subsample reads to a depth of 150X using the estimated genome length produced during the preceding raw read screen. The user can prevent the task from being launched by setting the `call_rasusa`variable to false. + The Rasusa task performs subsampling of the raw reads. By default, this task will subsample reads to a depth of 150X using the estimated genome length produced during the preceding raw read screen. The user can prevent the task from being launched by setting the `call_rasusa` variable to false. The user can also provide an estimated genome length for the task to use for subsampling using the `genome_size` variable. In addition, the read depth can be modified using the `subsample_coverage` variable. - !!! techdetails "RASUSA Technical Details" + !!! techdetails "Rasusa Technical Details" - | | TheiaEuk_Illumina_PE_PHB | + | | Links | | --- | --- | | Task | [task_rasusa.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/utilities/task_rasusa.wdl) | + | Software Source Code | [Rasusa on GitHub](https://github.com/mbhall88/rasusa) | + | Software Documentation | [Rasusa on GitHub](https://github.com/mbhall88/rasusa) | + | Original Publication(s) | [Rasusa: Randomly subsample sequencing reads to a specified coverage](https://doi.org/10.21105/joss.03941) | ??? 
task "`read_QC_trim`: Read Quality Trimming, Adapter Removal, Quantification, and Identification" @@ -297,12 +307,17 @@ All input reads are processed through "core tasks" in each workflow. The core ta | | Links | | --- | --- | - | Sub-workflow | [wf_read_QC_trim.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim.wdl) | + | Sub-workflow | [wf_read_QC_trim_pe.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim_pe.wdl) | | Tasks | [task_fastp.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_fastp.wdl)
[task_trimmomatic.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_trimmomatic.wdl)
[task_bbduk.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_bbduk.wdl)
[task_fastq_scan.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_fastq_scan.wdl)
[task_midas.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_midas.wdl)
[task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_kraken2.wdl)| | Software Source Code | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](https://github.com/usadellab/Trimmomatic); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2)| | Software Documentation | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic); [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2/wiki) | | Original Publication(s) | [Trimmomatic: a flexible trimmer for Illumina sequence data](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4103590/)
[fastp: an ultra-fast all-in-one FASTQ preprocessor](https://academic.oup.com/bioinformatics/article/34/17/i884/5093234?login=false)
[An integrated metagenomics pipeline for strain profiling reveals novel patterns of bacterial transmission and biogeography](https://pubmed.ncbi.nlm.nih.gov/27803195/)
[Improved metagenomic analysis with Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | +#### Assembly tasks + +!!! tip "" + These tasks assemble the reads into a _de novo_ assembly and assess the quality of the assembly. + ??? task "`shovill`: _De novo_ Assembly" De Novo assembly will be undertaken only for samples that have sufficient read quantity and quality, as determined by the `screen` task assessment of clean reads. @@ -316,7 +331,8 @@ All input reads are processed through "core tasks" in each workflow. The core ta | | Links | | --- | --- | | TheiaEuk WDL Task | [task_shovill.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_shovill.wdl#L3) | - | Software code repository and documentation | [Shovill on GitHub](https://github.com/tseemann/shovill) | + | Software Source Code | [Shovill on GitHub](https://github.com/tseemann/shovill) | + | Software Documentation | [Shovill on GitHub](https://github.com/tseemann/shovill) | ??? task "`QUAST`: Assembly Quality Assessment" @@ -326,7 +342,7 @@ All input reads are processed through "core tasks" in each workflow. The core ta | | Links | | --- | --- | - | Task | [task_quast.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_quast.wdl) | + | Task | [task_quast.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_quast.wdl) | | Software Source Code | [QUAST on GitHub](https://github.com/ablab/quast) | | Software Documentation | https://quast.sourceforge.net/docs/manual.html | | Orginal publication | [QUAST: quality assessment tool for genome assemblies](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3624806/) | @@ -340,11 +356,16 @@ All input reads are processed through "core tasks" in each workflow. 
The core ta | | Links | | --- | --- | - | Task | [task_cg_pipeline.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_cg_pipeline.wdl) | + | Task | [task_cg_pipeline.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_cg_pipeline.wdl) | | Software Source Code | [CG-Pipeline on GitHub](https://github.com/lskatz/CG-Pipeline/) | | Software Documentation | [CG-Pipeline on GitHub](https://github.com/lskatz/CG-Pipeline/) | | Original Publication(s) | [A computational genomics pipeline for prokaryotic sequencing projects](https://academic.oup.com/bioinformatics/article/26/15/1819/188418) | +#### Organism-agnostic characterization + +!!! tip "" + These tasks are performed regardless of the organism and provide quality control and taxonomic assignment. + ??? task "`GAMBIT`: **Taxon Assignment**" [`GAMBIT`](https://github.com/jlumpe/gambit) determines the taxon of the genome assembly using a k-mer based approach to match the assembly sequence to the closest complete genome in a database, thereby predicting its identity. Sometimes, GAMBIT can confidently designate the organism to the species level. Other times, it is more conservative and assigns it to a higher taxonomic rank. @@ -360,7 +381,33 @@ All input reads are processed through "core tasks" in each workflow. The core ta | Software Documentation | [GAMBIT ReadTheDocs](https://gambit-genomics.readthedocs.io/en/latest/) | | Original Publication(s) | [GAMBIT (Genomic Approximation Method for Bacterial Identification and Tracking): A methodology to rapidly leverage whole genome sequencing of bacterial isolates for clinical identification](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0277575) | -??? task "**`QC_check`: Check QC Metrics Against User-Defined Thresholds (optional)**" +??? 
task "`BUSCO`: Assembly Quality Assessment" + + BUSCO (**B**enchmarking **U**niversal **S**ingle-**C**opy **O**rthologue) attempts to quantify the completeness and contamination of an assembly to generate quality assessment metrics. It uses taxa-specific databases containing genes that are all expected to occur in the given taxa, each in a single copy. BUSCO examines the presence or absence of these genes, whether they are fragmented, and whether they are duplicated (suggestive that additional copies came from contaminants). + + **BUSCO notation** + + Here is an example of BUSCO notation: `C:99.1%[S:98.9%,D:0.2%],F:0.0%,M:0.9%,n:440`. There are several abbreviations used in this output: + + - Complete (C) - genes are considered "complete" when their lengths are within two standard deviations of the BUSCO group mean length. + - Single-copy (S) - genes that are complete and have only one copy. + - Duplicated (D) - genes that are complete and have more than one copy. + - Fragmented (F) - genes that are only partially recovered. + - Missing (M) - genes that were not recovered at all. + - Number of genes examined (n) - the number of genes examined. + + A high quality assembly will use the appropriate database for the taxa, have high complete (C) and single-copy (S) percentages, and low duplicated (D), fragmented (F) and missing (M) percentages. + + !!! techdetails "BUSCO Technical Details" + + | | Links | + | --- | --- | + | Task | [task_busco.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/advanced_metrics/task_busco.wdl) | + | Software Source Code | [BUSCO on GitLab](https://gitlab.com/ezlab/busco) | + | Software Documentation | https://busco.ezlab.org/ | + | Original publication | [BUSCO: assessing genome assembly and annotation completeness with single-copy orthologs](https://academic.oup.com/bioinformatics/article/31/19/3210/211866) | + +??? 
task "`QC_check`: Check QC Metrics Against User-Defined Thresholds (optional)" The `qc_check` task compares generated QC metrics against user-defined thresholds for each metric. This task will run if the user provides a `qc_check_table` .tsv file. If all QC metrics meet the threshold, the `qc_check` output variable will read `QC_PASS`. Otherwise, the output will read `QC_NA` if the task could not proceed or `QC_ALERT` followed by a string indicating what metric failed. @@ -383,96 +430,167 @@ All input reads are processed through "core tasks" in each workflow. The core ta | | Links | | --- | --- | - | Task | [task_qc_check.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/comparisons/task_qc_check.wdl) | + | Task | [task_qc_check_phb.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/comparisons/task_qc_check_phb.wdl) | -### Organism-specific Characterization +#### Organism-specific characterization -The TheiaEuk workflow automatically activates taxa-specific tasks after identification of relevant taxa using `GAMBIT`. Many of these taxa-specific tasks do not require any additional workflow tasks from the user. +!!! tip "" + The TheiaEuk workflow automatically activates taxa-specific tasks after identification of the relevant taxa using `GAMBIT`. Many of these taxa-specific tasks do not require any additional inputs from the user. ??? toggle "_Candida auris_" + Two tools are deployed when _Candida auris_ is identified. + + ??? task "Cladetyping: clade determination" + GAMBIT is used to determine the clade of the specimen by comparing the sequence to five clade-specific reference files. The output of the clade typing task will be used to specify the reference genome for the antifungal resistance detection tool. + + ??? toggle "Default reference genomes used for clade typing and antimicrobial resistance gene detection of _C. 
auris_" + | Clade | Genome Accession | Assembly Name | Strain | NCBI Submitter | Included mutations in AMR genes (not comprehensive) | + | --- | --- | --- | --- | --- | --- | + | _Candida auris_ Clade I | GCA_002759435.2 | Cand_auris_B8441_V2 | B8441 | Centers for Disease Control and Prevention | | + | _Candida auris_ Clade II | GCA_003013715.2 | ASM301371v2 | B11220 | Centers for Disease Control and Prevention | | + | _Candida auris_ Clade III | GCA_002775015.1 | Cand_auris_B11221_V1 | B11221 | Centers for Disease Control and Prevention | _ERG11_ V125A/F126L | + | _Candida auris_ Clade IV | GCA_003014415.1 | Cand_auris_B11243 | B11243 | Centers for Disease Control and Prevention | _ERG11_ Y132F | + | _Candida auris_ Clade V | GCA_016809505.1 | ASM1680950v1 | IFRC2087 | Centers for Disease Control and Prevention | | + + !!! techdetails "Cladetyping Technical Details" + | | Links | + | --- | --- | + | Task | [task_cauris_cladetyping.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/candida/task_cauris_cladetyper.wdl) | + | Software Source Code | [GAMBIT on GitHub](https://github.com/jlumpe/gambit) | + | Software Documentation | [GAMBIT Overview](https://theiagen.notion.site/GAMBIT-7c1376b861d0486abfbc316480046bdc?pvs=4) + | Original Publication(s) | [GAMBIT (Genomic Approximation Method for Bacterial Identification and Tracking): A methodology to rapidly leverage whole genome sequencing of bacterial isolates for clinical identification](https://doi.org/10.1371/journal.pone.0277575)
[TheiaEuk: a species-agnostic bioinformatics workflow for fungal genomic characterization](https://doi.org/10.3389/fpubh.2023.1198213) | + + ??? task "Snippy Variants: antifungal resistance detection" + To detect mutations that may confer antifungal resistance, `Snippy` is used to find all variants relative to the clade-specific reference, then these variants are queried for product names associated with resistance. + + The genes in which there are known resistance-conferring mutations for this pathogen are: + + - FKS1 + - ERG11 (lanosterol 14-alpha demethylase) + - FUR1 (uracil phosphoribosyltransferase) + + We query `Snippy` results to see if any mutations were identified in those genes. By default, we automatically check for the following loci (which can be overwritten by the user). You will find the mutations next to the locus tag in the `theiaeuk_snippy_variants_hits` column corresponding gene name (see below): + + | **TheiaEuk Search Term** | **Corresponding Gene Name** | + |---|---| + | B9J08_005340 | ERG6 | + | B9J08_000401 | FLO8 | + | B9J08_005343 | Hypothetical protein (PSK74852) | + | B9J08_003102 | MEC3 | + | B9J08_003737 | ERG3 | + | lanosterol.14-alpha.demethylase | ERG11 | + | uracil.phosphoribosyltransferase | FUR1 | + | FKS1 | FKS1 | + + For example, one sample may have the following output for the `theiaeuk_snippy_variants_hits` column: + + ```plaintext + lanosterol.14-alpha.demethylase: lanosterol 14-alpha demethylase (missense_variant c.428A>G p.Lys143Arg; C:266 T:0),B9J08_000401: hypothetical protein (stop_gained c.424C>T p.Gln142*; A:70 G:0) + ``` + + Based on this, we can tell that ERG11 has a missense variant at position 143 (Lysine to Arginine) and B9J08_000401 (which is FLO8) has a stop-gained variant at position 142 (Glutamine to Stop). + + ??? 
toggle "Known resistance-conferring mutations for _Candida auris_" + Mutations in these genes that are known to confer resistance are shown below + + | **Organism** | **Found in** | **Gene name** | **Gene locus** | **AA mutation** | **Drug** | **Reference** | + | --- | --- | --- | --- | --- | --- | --- | + | **Candida auris** | **Human** | **ERG11** | | **Y132F** | **Fluconazole** | [Simultaneous Emergence of Multidrug-Resistant _Candida auris_ on 3 Continents Confirmed by Whole-Genome Sequencing and Epidemiological Analyses](https://academic.oup.com/cid/article/64/2/134/2706620/Simultaneous-Emergence-of-Multidrug-Resistant) | + | **Candida auris** | **Human** | **ERG11** | | **K143R** | **Fluconazole** | [Simultaneous Emergence of Multidrug-Resistant _Candida auris_ on 3 Continents Confirmed by Whole-Genome Sequencing and Epidemiological Analyses](https://academic.oup.com/cid/article/64/2/134/2706620/Simultaneous-Emergence-of-Multidrug-Resistant) | + | **Candida auris** | **Human** | **ERG11** | | **F126T** | **Fluconazole** | [Simultaneous Emergence of Multidrug-Resistant _Candida auris_ on 3 Continents Confirmed by Whole-Genome Sequencing and Epidemiological Analyses](https://academic.oup.com/cid/article/64/2/134/2706620/Simultaneous-Emergence-of-Multidrug-Resistant) | + | **Candida auris** | **Human** | **FKS1** | | **S639P** | **Micafungin** | [Activity of CD101, a long-acting echinocandin, against clinical isolates of Candida auris](https://www.sciencedirect.com/science/article/pii/S0732889317303498) | + | **Candida auris** | **Human** | **FKS1** | | **S639P** | **Caspofungin** | [Activity of CD101, a long-acting echinocandin, against clinical isolates of Candida auris](https://www.sciencedirect.com/science/article/pii/S0732889317303498) | + | **Candida auris** | **Human** | **FKS1** | | **S639P** | **Anidulafungin** | [Activity of CD101, a long-acting echinocandin, against clinical isolates of Candida 
auris](https://www.sciencedirect.com/science/article/pii/S0732889317303498) | + | **Candida auris** | **Human** | **FKS1** | | **S639F** | **Micafungin** | [A multicentre study of antifungal susceptibility patterns among 350 _Candida auris_ isolates (2009–17) in India: role of the ERG11 and FKS1 genes in azole and echinocandin resistance](https://academic.oup.com/jac/advance-article/doi/10.1093/jac/dkx480/4794718) | + | **Candida auris** | **Human** | **FKS1** | | **S639F** | **Caspofungin** | [A multicentre study of antifungal susceptibility patterns among 350 _Candida auris_ isolates (2009–17) in India: role of the ERG11 and FKS1 genes in azole and echinocandin resistance](https://academic.oup.com/jac/advance-article/doi/10.1093/jac/dkx480/4794718) | + | **Candida auris** | **Human** | **FKS1** | | **S639F** | **Anidulafungin** | [A multicentre study of antifungal susceptibility patterns among 350 _Candida auris_ isolates (2009–17) in India: role of the ERG11 and FKS1 genes in azole and echinocandin resistance](https://academic.oup.com/jac/advance-article/doi/10.1093/jac/dkx480/4794718) | + | **Candida auris** | **Human** | **FUR1** | **CAMJ_004922** | **F211I** | **5-flucytosine** | [Genomic epidemiology of the UK outbreak of the emerging human fungal pathogen Candida auris](https://doi.org/10.1038/s41426-018-0045-x) | + + !!! techdetails "Snippy Variants Technical Details" + | | Links | + | --- | --- | + | Task | [task_snippy_variants.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_variants.wdl)
[task_snippy_gene_query.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_gene_query.wdl) | + | Software Source Code | [Snippy on GitHub](https://github.com/tseemann/snippy) | + | Software Documentation | [Snippy on GitHub](https://github.com/tseemann/snippy) | + +??? toggle "_Candida albicans_" + When this species is detected by the taxon ID tool, an antifungal resistance detection task is deployed. - Two tools are deployed when _Candida auris is_ identified. First, the Cladetyping tool is launched to determine the clade of the specimen by comparing the sequence to five clade-specific reference files. The output of the clade typing task will be used to specify the reference genome for the antifungal resistance detection tool. To detect mutations that may confer antifungal resistance, `Snippy` is used to find all variants relative to the clade-specific reference, then these variants are queried for product names associated with resistance according to the MARDy database (). - - The genes in which there are known resistance-conferring mutations for this pathogen are: - - - FKS1 - - ERG11 (lanosterol 14-alpha demethylase) - - FUR1 (uracil phosphoribosyltransferase) - - We query `Snippy` results to see if any mutations were identified in those genes. In addition, _C. auris_ automatically checks for the following loci. 
You will find the mutations next to the locus tag in the `theiaeuk_snippy_variants_hits` column corresponding gene name followings: - - | **TheiaEuk Search Term** | **Corresponding Gene Name** | - |---|---| - | B9J08_005340 | ERG6 | - | B9J08_000401 | FLO8 | - | B9J08_005343 | Hypothetical protein (PSK74852) | - | B9J08_003102 | MEC3 | - | B9J08_003737 | ERG3 | - | lanosterol.14-alpha.demethylase | ERG11 | - | uracil.phosphoribosyltransferase | FUR1 | - | FKS1 | FKS1 | - - For example, one sample may have the following output for the `theiaeuk_snippy_variants_hits` column: - - ```plaintext - lanosterol.14-alpha.demethylase: lanosterol 14-alpha demethylase (missense_variant c.428A>G p.Lys143Arg; C:266 T:0),B9J08_000401: hypothetical protein (stop_gained c.424C>T p.Gln142*; A:70 G:0) - ``` - - Based on this, we can tell that ERG11 has a missense variant at position 143 (Lysine to Arginine) and B9J08_000401 (which is FLO8) has a stop-gained variant at position 142 (Glutamine to Stop). - - ??? toggle "Default reference genomes used for clade typing and antimicrobial resistance gene detection of _C. auris_" - | Clade | Genome Accession | Assembly Name | Strain | NCBI Submitter | Included mutations in AMR genes (not comprehensive) | - | --- | --- | --- | --- | --- | --- | - | Candida auris Clade I | GCA_002759435.2 | Cand_auris_B8441_V2 | B8441 | Centers for Disease Control and Prevention | | - | Candida auris Clade II | GCA_003013715.2 | ASM301371v2 | B11220 | Centers for Disease Control and Prevention | | - | Candida auris Clade III | GCA_002775015.1 | Cand_auris_B11221_V1 | B11221 | Centers for Disease Control and Prevention | _ERG11_ V125A/F126L | - | Candida auris Clade IV | GCA_003014415.1 | Cand_auris_B11243 | B11243 | Centers for Disease Control and Prevention | _ERG11_ Y132F | - | Candida auris Clade V | GCA_016809505.1 | ASM1680950v1 | IFRC2087 | Centers for Disease Control and Prevention | | - - ??? 
toggle "Known resistance-conferring mutations for _Candida auris_" - Mutations in these genes that are known to confer resistance are shown below (source: MARDy database http://mardy.dide.ic.ac.uk/index.php) - - | **Organism** | **Found in** | **Gene name** | **Gene locus** | **AA mutation** | **Drug** | **Tandem repeat name** | **Tandem repeat sequence** | **Reference** | - | --- | --- | --- | --- | --- | --- | --- | --- | --- | - | **Candida auris** | **Human** | **ERG11** | | **Y132F** | **Fluconazole** | | | [**10.1093/cid/ciw691**](https://academic.oup.com/cid/article/64/2/134/2706620/Simultaneous-Emergence-of-Multidrug-Resistant) | - | **Candida auris** | **Human** | **ERG11** | | **K143R** | **Fluconazole** | | | [**10.1093/cid/ciw691**](https://academic.oup.com/cid/article/64/2/134/2706620/Simultaneous-Emergence-of-Multidrug-Resistant) | - | **Candida auris** | **Human** | **ERG11** | | **F126T** | **Fluconazole** | | | [**10.1093/cid/ciw691**](https://academic.oup.com/cid/article/64/2/134/2706620/Simultaneous-Emergence-of-Multidrug-Resistant) | - | **Candida auris** | **Human** | **FKS1** | | **S639P** | **Micafungin** | | | [**10.1016/j.diagmicrobio.2017.10.021**](https://www.sciencedirect.com/science/article/pii/S0732889317303498) | - | **Candida auris** | **Human** | **FKS1** | | **S639P** | **Caspofungin** | | | [**10.1016/j.diagmicrobio.2017.10.021**](https://www.sciencedirect.com/science/article/pii/S0732889317303498) | - | **Candida auris** | **Human** | **FKS1** | | **S639P** | **Anidulafungin** | | | [**10.1016/j.diagmicrobio.2017.10.021**](https://www.sciencedirect.com/science/article/pii/S0732889317303498) | - | **Candida auris** | **Human** | **FKS1** | | **S639F** | **Micafungin** | | | [**10.1093/jac/dkx480**](https://academic.oup.com/jac/advance-article/doi/10.1093/jac/dkx480/4794718) | - | **Candida auris** | **Human** | **FKS1** | | **S639F** | **Caspofungin** | | | 
[**10.1093/jac/dkx480**](https://academic.oup.com/jac/advance-article/doi/10.1093/jac/dkx480/4794718) | - | **Candida auris** | **Human** | **FKS1** | | **S639F** | **Anidulafungin** | | | [**10.1093/jac/dkx480**](https://academic.oup.com/jac/advance-article/doi/10.1093/jac/dkx480/4794718) | - | **Candida auris** | **Human** | **FUR1** | **CAMJ_004922** | **F211I** | **5-flucytosine** | | | [**https://doi.org/10.1038/s41426-018-0045-x**](https://www.nature.com/articles/s41426-018-0045-x) | + ??? task "Snippy Variants: antifungal resistance detection" + To detect mutations that may confer antifungal resistance, `Snippy` is used to find all variants relative to the clade-specific reference, and these variants are queried for product names associated with resistance. -??? toggle "_Candida albicans_" + The genes in which there are known resistance-conferring mutations for this pathogen are: - When this species is detected by the taxon ID tool, an antifungal resistance detection task is deployed. To detect mutations that may confer antifungal resistance, `Snippy` is used to find all variants relative to the clade-specific reference, and these variants are queried for product names associated with resistance according to the MARDy database (). + - ERG11 + - GCS1 (FKS1) + - FUR1 + - RTA2 - The genes in which there are known resistance-conferring mutations for this pathogen are: + We query `Snippy` results to see if any mutations were identified in those genes. By default, we automatically check for the following loci (which can be overwritten by the user). You will find the mutations next to the locus tag in the `theiaeuk_snippy_variants_hits` column corresponding gene name (see below): - - ERG11 - - GCS1 (FKS1) - - FUR1 - - RTA2 + | **TheiaEuk Search Term** | **Corresponding Gene Name** | + |---|---| + | ERG11 | ERG11 | + | GCS1 | FKS1 | + | FUR1 | FUR1 | + | RTA2 | RTA2 | + + !!! 
techdetails "Snippy Variants Technical Details" + | | Links | + | --- | --- | + | Task | [task_snippy_variants.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_variants.wdl)
[task_snippy_gene_query.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_gene_query.wdl) | + | Software Source Code | [Snippy on GitHub](https://github.com/tseemann/snippy) | + | Software Documentation | [Snippy on GitHub](https://github.com/tseemann/snippy) | ??? toggle "_Aspergillus fumigatus_" + When this species is detected by the taxon ID tool an antifungal resistance detection task is deployed. + + ??? task "Snippy Variants: antifungal resistance detection" + To detect mutations that may confer antifungal resistance, `Snippy` is used to find all variants relative to the clade-specific reference, and these variants are queried for product names associated with resistance. - When this species is detected by the taxon ID tool an antifungal resistance detection task is deployed. To detect mutations that may confer antifungal resistance, `Snippy` is used to find all variants relative to the clade-specific reference, and these variants are queried for product names associated with resistance according to the MARDy database (). + The genes in which there are known resistance-conferring mutations for this pathogen are: - The genes in which there are known resistance-conferring mutations for this pathogen are: + - Cyp51A + - HapE + - COX10 (AFUA_4G08340) + + We query `Snippy` results to see if any mutations were identified in those genes. By default, we automatically check for the following loci (which can be overwritten by the user). You will find the mutations next to the locus tag in the `theiaeuk_snippy_variants_hits` column corresponding gene name (see below): - - Cyp51A - - HapE - - COX10 (AFUA_4G08340) + | **TheiaEuk Search Term** | **Corresponding Gene Name** | + |---|---| + | Cyp51A | Cyp51A | + | HapE | HapE | + | AFUA_4G08340 | COX10 | + + !!! 
techdetails "Snippy Variants Technical Details" + | | Links | + | --- | --- | + | Task | [task_snippy_variants.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_variants.wdl)
[task_snippy_gene_query.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_gene_query.wdl) | + | Software Source Code | [Snippy on GitHub](https://github.com/tseemann/snippy) | + | Software Documentation | [Snippy on GitHub](https://github.com/tseemann/snippy) | ??? toggle "_Cryptococcus neoformans_" + When this species is detected by the taxon ID tool an antifungal resistance detection task is deployed. - When this species is detected by the taxon ID tool an antifungal resistance detection task is deployed. To detect mutations that may confer antifungal resistance, `Snippy` is used to find all variants relative to the clade-specific reference, and these variants are queried for product names associated with resistance according to the MARDy database (). + ??? task "Snippy Variants: antifungal resistance detection" + To detect mutations that may confer antifungal resistance, `Snippy` is used to find all variants relative to the clade-specific reference, and these variants are queried for product names associated with resistance. - The gene in which there are known resistance-conferring mutations for this pathogen is: + The genes in which there are known resistance-conferring mutations for this pathogen are: - - ERG11 (CNA00300) + - ERG11 (CNA00300) + + We query `Snippy` results to see if any mutations were identified in those genes. By default, we automatically check for the following loci (which can be overwritten by the user). You will find the mutations next to the locus tag in the `theiaeuk_snippy_variants_hits` column corresponding gene name (see below): + + | **TheiaEuk Search Term** | **Corresponding Gene Name** | + |---|---| + | CNA00300 | ERG11 | + + !!! 
techdetails "Snippy Variants Technical Details" + | | Links | + | --- | --- | + | Task | [task_snippy_variants.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_variants.wdl)
[task_snippy_gene_query.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_gene_query.wdl) | + | Software Source Code | [Snippy on GitHub](https://github.com/tseemann/snippy) | + | Software Documentation | [Snippy on GitHub](https://github.com/tseemann/snippy) | ### Outputs @@ -540,4 +658,4 @@ The TheiaEuk workflow automatically activates taxa-specific tasks after identifi | theiaeuk_illumina_pe_analysis_date | String | Date of TheiaProk workflow execution | | theiaeuk_illumina_pe_version | String | TheiaProk workflow version used | -
\ No newline at end of file + diff --git a/docs/workflows/genomic_characterization/theiameta.md b/docs/workflows/genomic_characterization/theiameta.md index 6e9147399..fad3c359a 100644 --- a/docs/workflows/genomic_characterization/theiameta.md +++ b/docs/workflows/genomic_characterization/theiameta.md @@ -4,7 +4,7 @@ | **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | |---|---|---|---|---| -| [Genomic Characterization](../../workflows_overview/workflows_type.md/#genomic-characterization) | [Any Taxa](../../workflows_overview/workflows_kingdom.md/#any-taxa) | PHB v2.2.0 | Yes | Sample-level | +| [Genomic Characterization](../../workflows_overview/workflows_type.md/#genomic-characterization) | [Any Taxa](../../workflows_overview/workflows_kingdom.md/#any-taxa) | PHB v2.3.0 | Yes | Sample-level | ## TheiaMeta Workflows @@ -149,7 +149,7 @@ The TheiaMeta_Illumina_PE workflow processes Illumina paired-end (PE) reads ge | | Links | | --- | --- | - | Task | [task_ncbi_scrub.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_ncbi_scrub.wdl) | + | Task | [task_ncbi_scrub.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_ncbi_scrub.wdl) | | Software Source Code | [NCBI Scrub on GitHub](https://github.com/ncbi/sra-human-scrubber) | | Software Documentation | | @@ -214,7 +214,7 @@ The TheiaMeta_Illumina_PE workflow processes Illumina paired-end (PE) reads ge | | Links | | --- | --- | - | Sub-workflow | [wf_read_QC_trim.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim.wdl) | + | Sub-workflow | [wf_read_QC_trim_pe.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim_pe.wdl)
[wf_read_QC_trim_se.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim_se.wdl) | | Tasks | [task_fastp.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_fastp.wdl)
[task_trimmomatic.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_trimmomatic.wdl)
[task_bbduk.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_bbduk.wdl)
[task_fastq_scan.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_fastq_scan.wdl)
[task_midas.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_midas.wdl)
[task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_kraken2.wdl)| | Software Source Code | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](https://github.com/usadellab/Trimmomatic); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2)| | Software Documentation | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic); [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2/wiki) | @@ -233,7 +233,7 @@ The TheiaMeta_Illumina_PE workflow processes Illumina paired-end (PE) reads ge | | Links | | --- | --- | - | Task | [task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/task_kraken2.wdl) | + | Task | [task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_kraken2.wdl) | | Software Source Code | [Kraken2 on GitHub](https://github.com/DerrickWood/kraken2/) | | Software Documentation | | | Original Publication(s) | [Improved metagenomic analysis with Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | @@ -242,21 +242,62 @@ The TheiaMeta_Illumina_PE workflow processes Illumina paired-end (PE) reads ge ??? task "`metaspades`: _De Novo_ Metagenomic Assembly" - While metagenomics has emerged as a technology of choice for analyzing bacterial populations, the assembly of metagenomic data remains challenging. A dedicated metagenomic assembly algorithm is necessary to circumvent the challenge of interpreting variation. 
metaSPAdes addresses various challenges of metagenomic assembly by capitalizing on computational ideas that proved to be useful in assemblies of single cells and highly polymorphic diploid genomes. + While metagenomics has emerged as a technology of choice for analyzing bacterial populations, the assembly of metagenomic data remains challenging. A dedicated metagenomic assembly algorithm is necessary to circumvent the challenge of interpreting variation. metaSPAdes addresses various challenges of metagenomic assembly by capitalizing on computational ideas that proved to be useful in assemblies of single cells and highly polymorphic diploid genomes. + + `metaspades` is a _de novo_ assembler that first constructs a de Bruijn graph of all the reads using the SPAdes algorithm. Through various graph simplification procedures, paths in the assembly graph are reconstructed that correspond to long genomic fragments within the metagenome. For more details, please see the original publication. !!! techdetails "MetaSPAdes Technical Details" | | Links | | --- | --- | | Task | [task_metaspades.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_metaspades.wdl) | - | Software Source Code | [SPAdes on GitHub](https://github.com/ablab/spades) | - | Software Documentation | | - | Original Publication(s) | [metaSPAdes: a new versatile metagenomic assembler](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5411777/) | + | Software Source Code | [SPAdes on GitHub](https://github.com/ablab/spades) | + | Software Documentation | [SPAdes Manual](https://ablab.github.io/spades/index.html) | + | Original Publication(s) | [metaSPAdes: a new versatile metagenomic assembler](http://www.genome.org/cgi/doi/10.1101/gr.213959.116) | -??? task "`minimap2`: Assembly Alignment and Contig Filtering (if a reference is provided)" +??? 
task "`minimap2`: Assembly Alignment and Contig Filtering" If a reference genome is provided through the **`reference`** optional input, the assembly produced with `metaspades` will be mapped to the reference genome with `minimap2`. The contigs which align to the reference are retrieved and returned in the **`assembly_fasta`** output. + `minimap2` is a popular aligner that is used for correcting the assembly produced by metaSPAdes. This is done by aligning the reads back to the generated assembly or a reference genome. + + In minimap2, "modes" are a group of preset options. Two different modes are used in this task depending on whether a reference genome is provided. + + If a reference genome is _not_ provided, the only mode used in this task is `sr` which is intended for "short single-end reads without splicing". The `sr` mode indicates the following parameters should be used: `-k21 -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -b0 -r100 -p.5 -N20 -f1000,5000 -n2 -m20 -s40 -g100 -2K50m --heap-sort=yes --secondary=no`. The output file is in SAM format. + + If a reference genome is provided, then after the draft assembly polishing with `pilon`, this task runs again with the mode set to `asm20` which is intended for "long assembly to reference mapping". The `asm20` mode indicates the following parameters should be used: `-k19 -w10 -U50,500 --rmq -r100k -g10k -A1 -B4 -O6,26 -E2,1 -s200 -z200 -N50`. The output file is in PAF format. + + For more information, please see the [minimap2 manpage](https://lh3.github.io/minimap2/minimap2.html) + + !!! 
techdetails "minimap2 Technical Details" + | | Links | + |---|---| + | Task | [task_minimap2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/alignment/task_minimap2.wdl) | + | Software Source Code | [minimap2 on GitHub](https://github.com/lh3/minimap2) | + | Software Documentation | [minimap2](https://lh3.github.io/minimap2) | + | Original Publication(s) | [Minimap2: pairwise alignment for nucleotide sequences](https://academic.oup.com/bioinformatics/article/34/18/3094/4994778) | + +??? task "`samtools`: SAM File Conversion " + This task converts the output SAM file from minimap2 and converts it to a BAM file. It then sorts the BAM based on the read names, and then generates an index file. + + !!! techdetails "samtools Technical Details" + | | Links | + |---|---| + | Task | [task_samtools.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/utilities/data_handling/task_parse_mapping.wdl) | + | Software Source Code | [samtools on GitHub](https://github.com/samtools/samtools) | + | Software Documentation | [samtools](https://www.htslib.org/doc/samtools.html) | + | Original Publication(s) | [The Sequence Alignment/Map format and SAMtools](https://doi.org/10.1093/bioinformatics/btp352)
[Twelve Years of SAMtools and BCFtools](https://doi.org/10.1093/gigascience/giab008) | + +??? task "`pilon`: Assembly Polishing" + `pilon` is a tool that uses read alignment to correct errors in an assembly. It is used to polish the assembly produced by metaSPAdes. The input to Pilon is the sorted BAM file produced by `samtools`, and the original draft assembly produced by `metaspades`. + + !!! techdetails "pilon Technical Details" + | | Links | + |---|---| + | Task | [task_pilon.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_pilon.wdl) | + | Software Source Code | [Pilon on GitHub](https://github.com/broadinstitute/pilon) | + | Software Documentation | [Pilon Wiki](https://github.com/broadinstitute/pilon/wiki) | + | Original Publication(s) | [Pilon: An Integrated Tool for Comprehensive Microbial Variant Detection and Genome Assembly Improvement](https://doi.org/10.1371/journal.pone.0112963) | #### Assembly QC ??? task "`quast`: Assembly Quality Assessment" @@ -267,7 +308,7 @@ The TheiaMeta_Illumina_PE workflow processes Illumina paired-end (PE) reads ge | | Links | | --- | --- | - | Task | [task_quast.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_quast.wdl) | + | Task | [task_quast.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_quast.wdl) | | Software Source Code | [QUAST on GitHub](https://github.com/ablab/quast) | | Software Documentation | | | Original Publication(s) | [QUAST: quality assessment tool for genome assemblies](https://academic.oup.com/bioinformatics/article/29/8/1072/228832) | diff --git a/docs/workflows/genomic_characterization/theiaprok.md b/docs/workflows/genomic_characterization/theiaprok.md index 6664df6df..41e11c51b 100644 --- a/docs/workflows/genomic_characterization/theiaprok.md +++ b/docs/workflows/genomic_characterization/theiaprok.md @@ -4,7 +4,7 @@ | 
**Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | |---|---|---|---|---| -| [Genomic Characterization](../../workflows_overview/workflows_type.md/#genomic-characterization) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria) | PHB v2.2.0 | Yes, some optional features incompatible | Sample-level | +| [Genomic Characterization](../../workflows_overview/workflows_type.md/#genomic-characterization) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria) | PHB v2.3.0 | Yes, some optional features incompatible | Sample-level | ## TheiaProk Workflows @@ -78,6 +78,12 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | *workflow name | **originating_lab** | String | Will be used in the "originating_lab" column in any taxon-specific tables created in the Export Taxon Tables task | | Optional | FASTA, ONT, PE, SE | | *workflow name | **perform_characterization** | Boolean | Set to "false" if you want to only generate an assembly and relevant QC metrics and skip all characterization tasks | TRUE | Optional | FASTA, ONT, PE, SE | | *workflow name | **qc_check_table** | File | TSV value with taxons for rows and QC values for columns; internal cells represent user-determined QC thresholds; if provided, turns on the QC Check task.
Click on the variable name for an example QC Check table | | Optional | FASTA, ONT, PE, SE | +| *workflow name | **read1_lane2** | File | If provided, the Concatenate_Illumina_Lanes subworkflow will concatenate all files from the same lane before doing any subsequent analysis | | Optional | PE, SE | +| *workflow name | **read1_lane3** | File | If provided, the Concatenate_Illumina_Lanes subworkflow will concatenate all files from the same lane before doing any subsequent analysis | | Optional | PE, SE | +| *workflow name | **read1_lane4** | File | If provided, the Concatenate_Illumina_Lanes subworkflow will concatenate all files from the same lane before doing any subsequent analysis | | Optional | PE, SE | +| *workflow name | **read2_lane2** | File | If provided, the Concatenate_Illumina_Lanes subworkflow will concatenate all files from the same lane before doing any subsequent analysis | | Optional | PE, SE | +| *workflow name | **read2_lane3** | File | If provided, the Concatenate_Illumina_Lanes subworkflow will concatenate all files from the same lane before doing any subsequent analysis | | Optional | PE, SE | +| *workflow name | **read2_lane4** | File | If provided, the Concatenate_Illumina_Lanes subworkflow will concatenate all files from the same lane before doing any subsequent analysis | | Optional | PE, SE | | *workflow name | **run_id** | String | Will be used in the "run_id" column in any taxon-specific tables created in the Export Taxon Tables task | | Optional | FASTA, ONT, PE, SE | | *workflow name | **seq_method** | String | Will be used in the "seq_id" column in any taxon-specific tables created in the Export Taxon Tables task | | Optional | FASTA, ONT, PE, SE | | *workflow name | **skip_mash** | Boolean | If true, skips estimation of genome size and coverage in read screening steps. As a result, providing true also prevents screening using these parameters. 
| TRUE | Optional | ONT, SE | @@ -301,6 +307,7 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | merlin_magic | **call_poppunk** | Boolean | If "true", runs PopPUNK for GPSC cluster designation for S. pneumoniae | TRUE | Optional | FASTA, ONT, PE, SE | | merlin_magic | **call_shigeifinder_reads_input** | Boolean | If set to "true", the ShigEiFinder task will run again but using read files as input instead of the assembly file. Input is shown but not used for TheiaProk_FASTA. | FALSE | Optional | FASTA, ONT, PE, SE | | merlin_magic | **call_stxtyper** | Boolean | If set to "true", the StxTyper task will run on all samples regardless of the `gambit_predicted_taxon` output. Useful if you suspect a non-E.coli or non-Shigella sample contains stx genes. | FALSE | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **call_tbp_parser** | Boolean | If set to "true", activates the tbp_parser module and results in more outputs, including tbp_parser_looker_report_csv, tbp_parser_laboratorian_report_csv, tbp_parser_lims_report_csv, tbp_parser_coverage_report, and tbp_parser_genome_percent_coverage | FALSE | Optional | FASTA, ONT, PE, SE | | merlin_magic | **cauris_cladetyper_docker_image** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | | merlin_magic | **cladetyper_kmer_size** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | | merlin_magic | **cladetyper_ref_clade1** | File | *Provide an empty file if running TheiaProk on the command-line | | Do not modify, Optional | FASTA, ONT, PE, SE | @@ -407,27 +414,33 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | merlin_magic | **stxtyper_enable_debug** | Boolean | When enabled, additional messages are printed and files in `$TMPDIR` are not removed after running | FALSE | Optional | FASTA, ONT, PE, SE | | merlin_magic | **stxtyper_memory** | Int | Amount of memory (in GB) to 
allocate to the task | 4 | Optional | FASTA, ONT, PE, SE | | merlin_magic | **staphopia_sccmec_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/staphopia-sccmec:1.0.0--hdfd78af_0 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbp_parser_add_cs_lims** | Boolean | Set to true to add cycloserine results to the LIMS report | FALSE | Optional | FASTA, ONT, PE, SE | | merlin_magic | **tbp_parser_coverage_regions_bed** | File | A bed file that lists the regions to be considered for QC | | Optional | FASTA, ONT, PE, SE | | merlin_magic | **tbp_parser_coverage_threshold** | Int | The minimum coverage for a region to pass QC in tbp_parser | 100 | Optional | FASTA, ONT, PE, SE | -| merlin_magic | **tbp_parser_debug** | Boolean | Activate the debug mode on tbp_parser; increases logging outputs | FALSE | Optional | FASTA, ONT, PE, SE | -| merlin_magic | **tbp_parser_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:1.6.0 | Optional | FASTA, ONT, PE, SE | -| merlin_magic | **tbp_parser_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:1.4.0 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbp_parser_debug** | Boolean | Activate the debug mode on tbp_parser; increases logging outputs | TRUE | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbp_parser_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.2.2 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbp_parser_etha237_frequency** | Float | Minimum frequency for a mutation in ethA at protein position 237 to pass QC in tbp-parser | 0.1 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbp_parser_expert_rule_regions_bed** | File | A file that contains the regions where R mutations and expert rules are applied | | Optional 
| FASTA, ONT, PE, SE | | merlin_magic | **tbp_parser_min_depth** | Int | Minimum depth for a variant to pass QC in tbp_parser | 10 | Optional | FASTA, ONT, PE, SE | | merlin_magic | **tbp_parser_min_frequency** | Int | The minimum frequency for a mutation to pass QC | 0.1 | Optional | FASTA, ONT, PE, SE | | merlin_magic | **tbp_parser_min_read_support** | Int | The minimum read support for a mutation to pass QC | 10 | Optional | FASTA, ONT, PE, SE | | merlin_magic | **tbp_parser_operator** | String | Fills the "operator" field in the tbp_parser output files | Operator not provided | Optional | FASTA, ONT, PE, SE | | merlin_magic | **tbp_parser_output_seq_method_type** | String | Fills out the "seq_method" field in the tbp_parser output files | Sequencing method not provided | Optional | FASTA, ONT, PE, SE | -| merlin_magic | **tbprofiler_additional_outputs** | Boolean | If set to "true", activates the tbp_parser module and results in more outputs, including tbp_parser_looker_report_csv, tbp_parser_laboratorian_report_csv, tbp_parser_lims_report_csv, tbp_parser_coverage_report, and tbp_parser_genome_percent_coverage | FALSE | Optional | FASTA, ONT, PE, SE | -| merlin_magic | **tbprofiler_cov_frac_threshold** | Int | A cutoff used to calculate the fraction of the region covered by ≀ this value | 1 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbp_parser_rpob449_frequency** | Float | Minimum frequency for a mutation at protein position 449 to pass QC in tbp-parser | 0.1 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbp_parser_rrl_frequency** | Float | Minimum frequency for a mutation in rrl to pass QC in tbp-parser | 0.1 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbp_parser_rrl_read_support** | Int | Minimum read support for a mutation in rrl to pass QC in tbp-parser | 10 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbp_parser_rrs_frequency** | Float | Minimum frequency for a mutation in rrs to pass QC in tbp-parser | 0.1 | Optional 
| FASTA, ONT, PE, SE | +| merlin_magic | **tbp_parser_rrs_read_support** | Int | Minimum read support for a mutation in rrs to pass QC in tbp-parser | 10 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbp_parser_tngs_data** | Boolean | Set to true to enable tNGS-specific parameters and runs in tbp-parser | FALSE | Optional | FASTA, ONT, PE, SE | | merlin_magic | **tbprofiler_custom_db** | File | TBProfiler uses by default the TBDB database; if you have a custom database you wish to use, you must provide a custom database in this field and set tbprofiler_run_custom_db to true | | Optional | FASTA, ONT, PE, SE | | merlin_magic | **tbprofiler_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/tbprofiler:4.4.2 | Optional | FASTA, ONT, PE, SE | | merlin_magic | **tbprofiler_mapper** | String | The mapping tool used in TBProfiler to align the reads to the reference genome; see TBProfiler’s original documentation for available options. | bwa | Optional | FASTA, ONT, PE, SE | | merlin_magic | **tbprofiler_min_af** | Float | The minimum allele frequency to call a variant | 0.1 | Optional | FASTA, ONT, PE, SE | -| merlin_magic | **tbprofiler_min_af_pred** | Float | The minimum allele frequency to use a variant for resistance prediction | 0.1 | Optional | FASTA, ONT, PE, SE | | merlin_magic | **tbprofiler_min_depth** | Int | The minimum depth for a variant to be called. 
| 10 | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbprofiler_run_cdph_db** | Boolean | TBProfiler uses by default the TBDB database; set this value to "true" to use the WHO v2 database with customizations for CDPH | FALSE | Optional | FASTA, ONT, PE, SE | | merlin_magic | **tbprofiler_run_custom_db** | Boolean | TBProfiler uses by default the TBDB database; if you have a custom database you wish to use, you must set this value to true and provide a custom database in the tbprofiler_custom_db field | FALSE | Optional | FASTA, ONT, PE, SE | -| merlin_magic | **tbprofiler_variant_caller** | String | Select a different variant caller for TBProfiler to use by writing it in this block; see TBProfiler’s original documentation for available options. | freebayes | Optional | FASTA, ONT, PE, SE | -| merlin_magic | **tbprofiler_variant_calling_params** | String | Enter additional variant calling parameters in this free text input to customize how the variant caller works in TBProfiler | None | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbprofiler_variant_caller** | String | Select a different variant caller for TBProfiler to use by writing it in this block; see TBProfiler’s original documentation for available options. 
| GATK | Optional | FASTA, ONT, PE, SE | +| merlin_magic | **tbprofiler_variant_calling_params** | String | Enter additional variant calling parameters in this free text input to customize how the variant caller works in TBProfiler | | Optional | FASTA, ONT, PE, SE | | merlin_magic | **theiaeuk** | Boolean | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | | merlin_magic | **virulencefinder_coverage_threshold** | Float | The threshold for minimum coverage | | Optional | FASTA, ONT, PE, SE | | merlin_magic | **virulencefinder_database** | String | The specific database to use | virulence_ecoli | Optional | FASTA, ONT, PE, SE | @@ -594,6 +607,17 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | --- | --- | | Task | [task_versioning.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/task_versioning.wdl) | +??? task "`concatenate_illumina_lanes`: Concatenate Multi-Lane Illumina FASTQs ==_for Illumina only_==" + + The `concatenate_illumina_lanes` task concatenates Illumina FASTQ files from multiple lanes into a single file. This task only runs if the `read1_lane2` input file has been provided. All read1 lanes are concatenated together and are used in subsequent tasks, as are the read2 lanes. These concatenated files are also provided as output. + + !!! techdetails "Concatenate Illumina Lanes Technical Details" + The `concatenate_illumina_lanes` task is run before any downstream steps take place. + + | | Links | + | --- | --- | + | Task | [wf_concatenate_illumina_lanes.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/file_handling/wf_concatenate_illumina_lanes.wdl) + ??? 
task "`screen`: Total Raw Read Quantification and Genome Size Estimation" The [`screen`](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/comparisons/task_screen.wdl) task ensures the quantity of sequence data is sufficient to undertake genomic analysis. It uses [`fastq-scan`](https://github.com/rpetit3/fastq-scan) and bash commands for quantification of reads and base pairs, and [mash](https://mash.readthedocs.io/en/latest/index.html) sketching to estimate the genome size and its coverage. At each step, the results are assessed relative to pass/fail criteria and thresholds that may be defined by optional user inputs. Samples that do not meet these criteria will not be processed further by the workflow: @@ -722,7 +746,7 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | | Links | | --- | --- | - | Sub-workflow | [wf_read_QC_trim.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim.wdl) | + | Sub-workflow | [wf_read_QC_trim_pe.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim_pe.wdl)
[wf_read_QC_trim_se.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim_se.wdl) | | Tasks | [task_fastp.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_fastp.wdl)
[task_trimmomatic.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_trimmomatic.wdl)
[task_bbduk.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_bbduk.wdl)
[task_fastq_scan.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_fastq_scan.wdl)
[task_midas.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_midas.wdl)
[task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_kraken2.wdl)| | Software Source Code | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](https://github.com/usadellab/Trimmomatic); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2)| | Software Documentation | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic); [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2/wiki) | @@ -737,7 +761,7 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | | Links | | --- | --- | - | Task | [task_cg_pipeline.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_cg_pipeline.wdl) | + | Task | [task_cg_pipeline.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_cg_pipeline.wdl) | | Software Source Code | [CG-Pipeline on GitHub](https://github.com/lskatz/CG-Pipeline/) | | Software Documentation | [CG-Pipeline on GitHub](https://github.com/lskatz/CG-Pipeline/) | | Original Publication(s) | [A computational genomics pipeline for prokaryotic sequencing projects](https://academic.oup.com/bioinformatics/article/26/15/1819/188418) | @@ -746,7 +770,7 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al De Novo assembly will be undertaken only for samples that have sufficient read quantity and quality, as determined by the `screen` task assessment of clean reads. - In TheiaEuk, assembly is performed using the [Shovill](https://github.com/tseemann/shovill) pipeline. 
This undertakes the assembly with one of four assemblers ([SKESA](https://github.com/ncbi/SKESA) (default), [SPAdes](https://github.com/ablab/spades), [Velvet](https://github.com/dzerbino/velvet/), [Megahit](https://github.com/voutcn/megahit)), but also performs [a number of pre- and post-processing steps](https://github.com/tseemann/shovill#main-steps) to improve the resulting genome assembly. Shovill uses an estimated genome size (see [here](https://github.com/tseemann/shovill#--gsize)). If this is not provided by the user as an optional input, Shovill will estimate the genome size using [mash](https://mash.readthedocs.io/en/latest/index.html). Adaptor trimming can be undertaken with Shovill by setting the `trim` option to "true", but this is set to "false" by default as [alternative adapter trimming](https://www.notion.so/TheiaProk-Workflow-Series-89b9c08406094ec78d08a578fe861626?pvs=21) is undertaken in the TheiaEuk workflow. + In TheiaProk, assembly is performed using the [Shovill](https://github.com/tseemann/shovill) pipeline. This undertakes the assembly with one of four assemblers ([SKESA](https://github.com/ncbi/SKESA) (default), [SPAdes](https://github.com/ablab/spades), [Velvet](https://github.com/dzerbino/velvet/), [Megahit](https://github.com/voutcn/megahit)), but also performs [a number of pre- and post-processing steps](https://github.com/tseemann/shovill#main-steps) to improve the resulting genome assembly. Shovill uses an estimated genome size (see [here](https://github.com/tseemann/shovill#--gsize)). If this is not provided by the user as an optional input, Shovill will estimate the genome size using [mash](https://mash.readthedocs.io/en/latest/index.html). Adaptor trimming can be undertaken with Shovill by setting the `trim` option to "true", but this is set to "false" by default as [alternative adapter trimming](https://www.notion.so/TheiaProk-Workflow-Series-89b9c08406094ec78d08a578fe861626?pvs=21) is undertaken in the TheiaProk workflow. ??? 
toggle "What is _de novo_ assembly?" _De novo_ assembly is the process or product of attempting to reconstruct a genome from scratch (without prior knowledge of the genome) using sequence reads. Assembly of fungal genomes from short-reads will produce multiple contigs per chromosome rather than a single contiguous sequence for each chromosome. @@ -754,8 +778,9 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al !!! techdetails "Shovill Technical Details" | | Links | | --- | --- | - | TheiaProk WDL Task | [task_shovill.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_shovill.wdl#L3) | - | Software code repository and documentation | [Shovill on GitHub](https://github.com/tseemann/shovill) | + | TheiaProk WDL Task | [task_shovill.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_shovill.wdl#L3) | + | Software Source Code | [Shovill on GitHub](https://github.com/tseemann/shovill) | + | Software Documentation | [Shovill on GitHub](https://github.com/tseemann/shovill) | #### ONT Data Core Tasks @@ -765,7 +790,7 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al **Estimated genome length**: - By default, an estimated genome length is set to 5 Mb, which is around 0.7 Mb higher than the average bacterial genome length, according to the information collated [here](https://github.com/CDCgov/phoenix/blob/717d19c19338373fc0f89eba30757fe5cfb3e18a/assets/databases/NCBI_Assembly_stats_20240124.txt). This estimate can be overwritten by the user, and is used by `RASUSA` and `dragonflye`. + By default, an estimated genome length is set to 5 Mb, which is around 0.7 Mb higher than the average bacterial genome length, according to the information collated [here](https://github.com/CDCgov/phoenix/blob/717d19c19338373fc0f89eba30757fe5cfb3e18a/assets/databases/NCBI_Assembly_stats_20240124.txt). 
This estimate can be overwritten by the user, and is used by `Rasusa` and `dragonflye`. **Plotting and quantifying long-read sequencing data:** `nanoplot` @@ -784,7 +809,7 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | Workflow | **TheiaProk_ONT** | | --- | --- | | Sub-workflow | [wf_read_QC_trim_ont.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim_ont.wdl) | - | Tasks | [task_nanoplot.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_nanoplot.wdl) [task_fastq_scan.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/b481ce48f3d527ab8f31e4ad8171769212cc091a/tasks/quality_control/basic_statistics/task_fastq_scan.wdl) [task_rasusa.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/utilities/task_rasusa.wdl) [task_nanoq.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_nanoq.wdl) [task_tiptoft.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/plasmid_detection/task_tiptoft.wdl) | + | Tasks | [task_nanoplot.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_nanoplot.wdl) [task_fastq_scan.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_fastq_scan.wdl) [task_rasusa.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/utilities/task_rasusa.wdl) [task_nanoq.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_nanoq.wdl) [task_tiptoft.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/plasmid_detection/task_tiptoft.wdl) | | Software Source Code | [fastq-scan](https://github.com/rpetit3/fastq-scan), 
[NanoPlot](https://github.com/wdecoster/NanoPlot), [RASUSA](https://github.com/mbhall88/rasusa), [tiptoft](https://github.com/andrewjpage/tiptoft), [nanoq](https://github.com/esteinig/nanoq) | | Original Publication(s) | [NanoPlot paper](https://academic.oup.com/bioinformatics/article/39/5/btad311/7160911)
[RASUSA paper](https://doi.org/10.21105/joss.03941)
[Nanoq Paper](https://doi.org/10.21105/joss.02991)
[Tiptoft paper](https://doi.org/10.21105/joss.01021) | @@ -808,7 +833,7 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | --- | --- | | Task | [task_quast.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_quast.wdl) | | Software Source Code | [QUAST on GitHub](https://github.com/ablab/quast) | - | Software Documentation | | + | Software Documentation | | | Original Publication(s) | [QUAST: quality assessment tool for genome assemblies](https://academic.oup.com/bioinformatics/article/29/8/1072/228832) | ??? task "`BUSCO`: Assembly Quality Assessment" @@ -892,7 +917,7 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al ??? task "`AMRFinderPlus`: AMR Genotyping (default)" - NCBI's [AMRFinderPlus](https://github.com/ncbi/amr/wiki) is the default antimicrobial resistance (AMR) detection tool used in TheiaProk. [ResFinder](https://www.notion.so/TheiaProk-Workflow-Series-68c34aca2a0240ef94fef0acd33651b9?pvs=21) may be used alternatively and if so, AMRFinderPlus is not run. + NCBI's [AMRFinderPlus](https://github.com/ncbi/amr/wiki) is the default antimicrobial resistance (AMR) detection tool used in TheiaProk. ResFinder may be used alternatively and if so, AMRFinderPlus is not run. AMRFinderPlus identifies acquired antimicrobial resistance (AMR) genes, virulence genes, and stress genes. Such AMR genes confer resistance to antibiotics, metals, biocides, heat, or acid. For some taxa (see [here](https://github.com/ncbi/amr/wiki/Running-AMRFinderPlus#--organism-option)), AMRFinderPlus will provide taxa-specific results including filtering out genes that are almost ubiquitous in the taxa (intrinsic genes) and identifying resistance-associated point mutations. In TheiaProk, the taxon used by AMRFinderPlus is specified based on the `gambit_predicted_taxon` or a user-provided `expected_taxon`. 
@@ -1047,7 +1072,7 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | | Links | | --- | --- | - | Task | [task_plasmidfinder.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/plasmid_typing/task_plasmidfinder.wdl) | + | Task | [task_plasmidfinder.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/plasmid_detection/task_plasmidfinder.wdl) | | Software Source Code | https://bitbucket.org/genomicepidemiology/plasmidfinder/src/master/ | | Software Documentation | https://bitbucket.org/genomicepidemiology/plasmidfinder/src/master/ | | Original Publication(s) | [In Silico Detection and Typing of Plasmids using PlasmidFinder and Plasmid Multilocus Sequence Typing](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4068535/) | @@ -1076,7 +1101,7 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | | Links | | --- | --- | - | Task | [task_qc_check.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/comparisons/task_qc_check.wdl) | + | Task | [task_qc_check_phb.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/comparisons/task_qc_check_phb.wdl) | ??? 
task "`Taxon Tables`: Copy outputs to new data tables based on taxonomic assignment (optional)" @@ -1323,7 +1348,7 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after | | Links | | --- | --- | - | Task | [task_kleborate.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/haemophilus/task_kleborate.wdl) | + | Task | [task_kleborate.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/klebsiella/task_kleborate.wdl) | | Software Source Code | [kleborate on GitHub](https://github.com/katholt/Kleborate) | | Software Documentation | https://github.com/katholt/Kleborate/wiki | | Orginal publication | [A genomic surveillance framework and genotyping tool for Klebsiella pneumoniae and its related species complex](https://www.nature.com/articles/s41467-021-24448-3)
[Identification of Klebsiella capsule synthesis loci from whole genome data](https://www.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000102) | @@ -1534,7 +1559,7 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after ??? task "`PopPUNK`: Global Pneumococcal Sequence Cluster typing" - Global Pneumococcal Sequence Clusters (GPSC) define and name pneumococcal strains. GPSC designation is undertaken using the PopPUNK software and GPSC database as described in the file below, obtained from [here](https://www.pneumogen.net/gps/training_command_line.html). + Global Pneumococcal Sequence Clusters (GPSC) define and name pneumococcal strains. GPSC designation is undertaken using the PopPUNK software and GPSC database as described in the file below, obtained from [here](https://www.pneumogen.net/gps/#/training#command-line). :file: [GPSC_README_PopPUNK2.txt](../../assets/files/GPSC_README_PopPUNK2.txt) @@ -1547,9 +1572,9 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after | | Links | | --- | --- | | Task | [task_poppunk_streppneumo.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/streptococcus/task_poppunk_streppneumo.wdl) | - | GPSC database | https://www.pneumogen.net/gps/training_command_line.html | + | GPSC database | | | Software Source Code | [PopPunk](https://github.com/bacpop/PopPUNK) | - | Software Documentation | https://poppunk.readthedocs.io/en/latest/ | + | Software Documentation | | | Original Publication(s) | [Fast and flexible bacterial genomic epidemiology with PopPUNK](https://genome.cshlp.org/content/29/2/304) | ??? task "`SeroBA`: Serotyping ==_for Illumina_PE only_==" @@ -2021,7 +2046,7 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after | tbp_parser_version | String | Optional output. 
The version of tbp-parser | ONT, PE | | tbprofiler_dr_type | String | Drug resistance type predicted by TB-Profiler (sensitive, Pre-MDR, MDR, Pre-XDR, XDR) | ONT, PE, SE | | tbprofiler_main_lineage | String | Lineage(s) predicted by TBProfiler | ONT, PE, SE | -| tbprofiler_median_coverage | Int | The median coverage of the H37Rv TB reference genome | ONT, PE | +| tbprofiler_median_depth | Int | The median depth of the H37Rv TB reference genome covered by the sample | ONT, PE | | tbprofiler_output_bai | File | Index BAM file generated by mapping sequencing reads to reference genome by TBProfiler | ONT, PE, SE | | tbprofiler_output_bam | File | BAM alignment file produced by TBProfiler | ONT, PE, SE | | tbprofiler_output_file | File | CSV report from TBProfiler | ONT, PE, SE | diff --git a/docs/workflows/genomic_characterization/vadr_update.md b/docs/workflows/genomic_characterization/vadr_update.md index ceaa45fa8..b3d706d72 100644 --- a/docs/workflows/genomic_characterization/vadr_update.md +++ b/docs/workflows/genomic_characterization/vadr_update.md @@ -5,7 +5,7 @@ | **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | |---|---|---|---|---| -| [Genomic Characterization](../../workflows_overview/workflows_type.md/#genomic-characterization) | [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v1.2.1 | Yes | Sample-level | +| [Genomic Characterization](../../workflows_overview/workflows_type.md/#genomic-characterization) | [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.2.0 | Yes | Sample-level | ## Vadr_Update_PHB diff --git a/docs/workflows/phylogenetic_construction/augur.md b/docs/workflows/phylogenetic_construction/augur.md index d8eb10f9f..45d92ad5b 100644 --- a/docs/workflows/phylogenetic_construction/augur.md +++ b/docs/workflows/phylogenetic_construction/augur.md @@ -4,7 +4,7 @@ | **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | 
**Command-line Compatibility** | **Workflow Level** | |---|---|---|---|---| -| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.1.0 | Yes | Sample-level, Set-level | +| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.3.0 | Yes | Sample-level, Set-level | ## Augur Workflows @@ -14,10 +14,10 @@ Two workflows are offered: **Augur_Prep_PHB** and **Augur_PHB**. These must be r !!! dna "**Helpful resources for epidemiological interpretation**" - - [introduction to Nextstrain](https://www.cdc.gov/amd/training/covid-toolkit/module3-1.html) (which includes Auspice) - - guide to Nextstrain [interactive trees](https://www.cdc.gov/amd/training/covid-toolkit/module3-4.html) - - an [introduction to UShER](https://www.cdc.gov/amd/training/covid-toolkit/module3-3.html) - - a video about [how to read trees](https://www.cdc.gov/amd/training/covid-toolkit/module1-3.html) if this is new to you + - [introduction to Nextstrain](https://www.cdc.gov/advanced-molecular-detection/php/training/module-3-1.html) (which includes Auspice) + - guide to Nextstrain [interactive trees](https://www.cdc.gov/advanced-molecular-detection/php/training/module-3-4.html) + - an [introduction to UShER](https://www.cdc.gov/advanced-molecular-detection/php/training/module-3-3.html) + - a video about [how to read trees](https://www.cdc.gov/advanced-molecular-detection/php/training/module-1-3.html) if this is new to you - documentation on [how to identify SARS-CoV-2 recombinants](https://github.com/pha4ge/pipeline-resources/blob/main/docs/sc2-recombinants.md) ### Augur_Prep_PHB @@ -174,7 +174,7 @@ The Augur_PHB workflow takes in a ***set*** of SARS-CoV-2 (or any other viral This workflow runs on the set level. 
Please note that for every task, runtime parameters are modifiable (cpu, disk_size, docker, and memory); most of these values have been excluded from the table below for convenience. -
+
| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | |---|---|---|---|---|---| @@ -198,7 +198,7 @@ This workflow runs on the set level. Please note that for every task, runtime pa | augur_ancestral | **inference** | String | Calculate joint or marginal maximum likelihood ancestral sequence states; options: "joint", "marginal" | joint | Optional | | augur_ancestral | **keep_ambiguous** | Boolean | If true, do not infer nucleotides at ambiguous (N) sides | FALSE | Optional | | augur_ancestral | **keep_overhangs** | Boolean | If true, do not infer nucleotides for gaps on either side of the alignment | FALSE | Optional | -| augur_export | **colors_tsv** | File | Custom color definitions, one per line in the format TRAIT_TYPE \| TRAIT_VALUE\tHEX_CODE | | Optional | +| augur_export | **colors_tsv** | File | Custom color definitions, one per line in TSV format with the following fields: TRAIT_TYPE TRAIT_VALUE HEX_CODE | | Optional | | augur_export | **description_md** | File | Markdown file with description of build and/or acknowledgements | | Optional | | augur_export | **include_root_sequence** | Boolean | Export an additional JSON containing the root sequence used to identify mutations | FALSE | Optional | | augur_export | **title** | String | Title to be displayed by Auspice | | Optional | @@ -284,9 +284,13 @@ The Nextstrain team hosts documentation surrounding the Augur workflow → Auspi | **Variable** | **Type** | **Description** | | --- | --- | --- | | aligned_fastas | File | A FASTA file of the aligned genomes | -| augur_iqtree_model_used | String | The iqtree model used during augur tree | +| augur_fasttree_version | String | The fasttree version used, blank if other tree method used | +| augur_iqtree_model_used | String | The iqtree model used during augur tree, blank if iqtree not used | +| augur_iqtree_version | String | The iqtree version used during augur tree (default), blank if other tree method used | 
+| augur_mafft_version | String | The mafft version used in augur align | | augur_phb_analysis_date | String | The date the analysis was run | | augur_phb_version | String | The version of the Public Health Bioinformatics (PHB) repository used | +| augur_raxml_version | String | The version of raxml used during augur tree, blank if other tree method used | | augur_version | String | Version of Augur used | | auspice_input_json | File | JSON file used as input to Auspice | | combined_assemblies | File | Concatenated FASTA file containing all samples | diff --git a/docs/workflows/phylogenetic_construction/snippy_streamline.md b/docs/workflows/phylogenetic_construction/snippy_streamline.md index c794be4c8..facc3e1c4 100644 --- a/docs/workflows/phylogenetic_construction/snippy_streamline.md +++ b/docs/workflows/phylogenetic_construction/snippy_streamline.md @@ -4,7 +4,7 @@ | **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | |---|---|---|---|---| -| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria) | PHB v2.2.0 | Yes; some optional features incompatible | Set-level | +| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria) | PHB v2.3.0 | Yes; some optional features incompatible | Set-level | ## Snippy_Streamline_PHB @@ -173,11 +173,7 @@ For all cases: `Snippy_Variants` aligns reads for each sample against the reference genome. As part of `Snippy_Streamline`, the only output from this workflow is the `snippy_variants_outdir_tarball` which is provided in the set-level data table. Please see the full documentation for [Snippy_Variants](./snippy_variants.md) for more information. -??? 
task "snippy_variants (qc_metrics output)" - - ##### snippy_variants {#snippy_variants} - - This task runs Snippy to perform SNP analysis on individual samples. It extracts QC metrics from the Snippy output for each sample and saves them in per-sample TSV files (`snippy_variants_qc_metrics`). These per-sample QC metrics include the following columns: + This task also extracts QC metrics from the Snippy output for each sample and saves them in per-sample TSV files (`snippy_variants_qc_metrics`). These per-sample QC metrics include the following columns: - **samplename**: The name of the sample. - **reads_aligned_to_reference**: The number of reads that aligned to the reference genome. @@ -195,9 +191,17 @@ For all cases: - **meanbaseq**: Mean base quality over the reference sequence. - **meanmapq**: Mean mapping quality over the reference sequence. - These per-sample QC metrics are then combined into a single file (`snippy_combined_qc_metrics`) in the downstream `snippy_tree_wf` workflow. The combined QC metrics file includes the same columns as above for all samples. Note that the last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. + These per-sample QC metrics are then combined into a single file (`snippy_combined_qc_metrics`). The combined QC metrics file includes the same columns as above for all samples. Note that the last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. + + !!! tip "QC Metrics for Phylogenetic Analysis" + These QC metrics provide valuable insights into the quality and coverage of your sequencing data relative to the reference genome. Monitoring these metrics can help identify samples with low coverage, poor alignment, or potential issues that may affect downstream analyses - **Note:** The per-sample QC metrics provide valuable insights into the quality and coverage of your sequencing data relative to the reference genome. 
Monitoring these metrics can help identify samples with low coverage, poor alignment, or potential issues that may affect downstream analyses. + !!! techdetails "Snippy Variants Technical Details" + | | Links | + | --- | --- | + | Task | [task_snippy_variants.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_variants.wdl) | + | Software Source Code | [Snippy on GitHub](https://github.com/tseemann/snippy) | + | Software Documentation | [Snippy on GitHub](https://github.com/tseemann/snippy) | ??? task "Snippy_Tree workflow" diff --git a/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md b/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md index 352d5a55c..118c66588 100644 --- a/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md +++ b/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md @@ -4,7 +4,7 @@ | **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | |---|---|---|---|---| -| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria) | PHB v2.2.0 | Yes; some optional features incompatible | Set-level | +| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria) | PHB v2.3.0 | Yes; some optional features incompatible | Set-level | ## Snippy_Streamline_FASTA_PHB @@ -39,11 +39,11 @@ The `Snippy_Streamline_FASTA` workflow is an all-in-one approach to generating a ### Workflow Tasks -??? task "snippy_variants (qc_metrics output)" +??? 
task "Snippy_Variants QC Metrics Concatenation (optional)" - ##### snippy_variants {#snippy_variants} + ##### Snippy_Variants QC Metric Concatenation (optional) {#snippy_variants} - This task runs Snippy to perform SNP analysis on individual samples. It extracts QC metrics from the Snippy output for each sample and saves them in per-sample TSV files (`snippy_variants_qc_metrics`). These per-sample QC metrics include the following columns: + Optionally, the user can provide the `snippy_variants_qc_metrics` file produced by the Snippy_Variants workflow as input to the workflow to concatenate the reports for each sample in the tree. These per-sample QC metrics include the following columns: - **samplename**: The name of the sample. - **reads_aligned_to_reference**: The number of reads that aligned to the reference genome. @@ -61,9 +61,17 @@ The `Snippy_Streamline_FASTA` workflow is an all-in-one approach to generating a - **meanbaseq**: Mean base quality over the reference sequence. - **meanmapq**: Mean mapping quality over the reference sequence. - These per-sample QC metrics are then combined into a single file (`snippy_combined_qc_metrics`) in the downstream `snippy_tree_wf` workflow. The combined QC metrics file includes the same columns as above for all samples. Note that the last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. + The combined QC metrics file includes the same columns as above for all samples. Note that the last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. - **Note:** The per-sample QC metrics provide valuable insights into the quality and coverage of your sequencing data relative to the reference genome. Monitoring these metrics can help identify samples with low coverage, poor alignment, or potential issues that may affect downstream analyses. + !!! 
tip "QC Metrics for Phylogenetic Analysis" + These QC metrics provide valuable insights into the quality and coverage of your sequencing data relative to the reference genome. Monitoring these metrics can help identify samples with low coverage, poor alignment, or potential issues that may affect downstream analyses, and we recommend examining them before proceeding with phylogenetic analysis if performing Snippy_Variants and Snippy_Tree separately. + + !!! techdetails "Snippy Variants Technical Details" + | | Links | + | --- | --- | + | Task | [task_snippy_variants.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_variants.wdl) | + | Software Source Code | [Snippy on GitHub](https://github.com/tseemann/snippy) | + | Software Documentation | [Snippy on GitHub](https://github.com/tseemann/snippy) | ### Inputs diff --git a/docs/workflows/phylogenetic_construction/snippy_tree.md b/docs/workflows/phylogenetic_construction/snippy_tree.md index d6c0a272b..d28160bbb 100644 --- a/docs/workflows/phylogenetic_construction/snippy_tree.md +++ b/docs/workflows/phylogenetic_construction/snippy_tree.md @@ -4,7 +4,7 @@ | **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | |---|---|---|---|---| -| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria) | PHB v2.1.0 | Yes; some optional features incompatible | Set-level | +| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria) | PHB v2.3.0 | Yes; some optional features incompatible | Set-level | ## Snippy_Tree_PHB @@ -266,7 +266,7 @@ Sequencing data used in the Snippy_Tree workflow must: | | Links | | --- | --- | - | Task | 
[task_summarize_data.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/utilities/task_summarize_data.wdl) | + | Task | [task_summarize_data.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/utilities/data_handling/task_summarize_data.wdl) | ??? task "Concatenate Variants (optional)" @@ -310,11 +310,11 @@ Sequencing data used in the Snippy_Tree workflow must: | Task | task_shared_variants.wdl | | Software Source Code | [task_shared_variants.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/phylogenetic_inference/utilities/task_shared_variants.wdl) | -??? task "snippy_variants (qc_metrics output)" +??? task "Snippy_Variants QC Metrics Concatenation (optional)" - ##### snippy_variants {#snippy_variants} + ##### Snippy_Variants QC Metric Concatenation (optional) {#snippy_variants} - This task runs Snippy to perform SNP analysis on individual samples. It extracts QC metrics from the Snippy output for each sample and saves them in per-sample TSV files (`snippy_variants_qc_metrics`). These per-sample QC metrics include the following columns: + Optionally, the user can provide the `snippy_variants_qc_metrics` file produced by the Snippy_Variants workflow as input to the workflow to concatenate the reports for each sample in the tree. These per-sample QC metrics include the following columns: - **samplename**: The name of the sample. - **reads_aligned_to_reference**: The number of reads that aligned to the reference genome. @@ -332,9 +332,17 @@ Sequencing data used in the Snippy_Tree workflow must: - **meanbaseq**: Mean base quality over the reference sequence. - **meanmapq**: Mean mapping quality over the reference sequence. - These per-sample QC metrics are then combined into a single file (`snippy_combined_qc_metrics`) in the downstream `snippy_tree_wf` workflow. The combined QC metrics file includes the same columns as above for all samples. 
Note that the last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. + The combined QC metrics file includes the same columns as above for all samples. Note that the last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. - **Note:** The per-sample QC metrics provide valuable insights into the quality and coverage of your sequencing data relative to the reference genome. Monitoring these metrics can help identify samples with low coverage, poor alignment, or potential issues that may affect downstream analyses. + !!! tip "QC Metrics for Phylogenetic Analysis" + These QC metrics provide valuable insights into the quality and coverage of your sequencing data relative to the reference genome. Monitoring these metrics can help identify samples with low coverage, poor alignment, or potential issues that may affect downstream analyses, and we recommend examining them before proceeding with phylogenetic analysis if performing Snippy_Variants and Snippy_Tree separately. + + !!! 
techdetails "Snippy Variants Technical Details" + | | Links | + | --- | --- | + | Task | [task_snippy_variants.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_variants.wdl) | + | Software Source Code | [Snippy on GitHub](https://github.com/tseemann/snippy) | + | Software Documentation | [Snippy on GitHub](https://github.com/tseemann/snippy) | ### Outputs diff --git a/docs/workflows/phylogenetic_construction/snippy_variants.md b/docs/workflows/phylogenetic_construction/snippy_variants.md index 4ec73569a..f4fc65a37 100644 --- a/docs/workflows/phylogenetic_construction/snippy_variants.md +++ b/docs/workflows/phylogenetic_construction/snippy_variants.md @@ -4,7 +4,7 @@ | **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | |---|---|---|---|---| -| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria), [Mycotics](../../workflows_overview/workflows_kingdom.md#mycotics), [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.2.0 | Yes | Sample-level | +| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria), [Mycotics](../../workflows_overview/workflows_kingdom.md#mycotics), [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.3.0 | Yes | Sample-level | ## Snippy_Variants_PHB @@ -60,14 +60,40 @@ The `Snippy_Variants` workflow aligns single-end or paired-end reads (in FASTQ f ### Workflow Tasks -`Snippy_Variants` uses the snippy tool to align reads to the reference and call SNPs, MNPs and INDELs according to optional input parameters. 
The output includes a file of variants that is then queried using the `grep` bash command to identify any mutations in specified genes or annotations of interest. The query string MUST match the gene name or annotation as specified in the GenBank file and provided in the output variant file in the `snippy_results` column. - -Additionally, `Snippy_Variants` extracts quality control (QC) metrics from the Snippy output for each sample. These per-sample QC metrics are saved in TSV files (`snippy_variants_qc_metrics`). The QC metrics include: - -- **Percentage of reads aligned to the reference genome** (`snippy_variants_percent_reads_aligned`). -- **Percentage of the reference genome covered at or above the specified depth threshold** (`snippy_variants_percent_ref_coverage`). - -These per-sample QC metrics can be combined into a single file (`snippy_combined_qc_metrics`) in downstream workflows, such as `snippy_tree_wf`, providing an overview of QC metrics across all samples. +`Snippy_Variants` uses Snippy to align reads to the reference and call SNPs, MNPs and INDELs according to optional input parameters. The output includes a file of variants that is then queried using the `grep` bash command to identify any mutations in specified genes or annotations of interest. The query string MUST match the gene name or annotation as specified in the GenBank file and provided in the output variant file in the `snippy_results` column. + +!!! info "Quality Control Metrics" + Additionally, `Snippy_Variants` extracts quality control (QC) metrics from the Snippy output for each sample. These per-sample QC metrics are saved in TSV files (`snippy_variants_qc_metrics`). The QC metrics include: + + - **samplename**: The name of the sample. + - **reads_aligned_to_reference**: The number of reads that aligned to the reference genome. + - **total_reads**: The total number of reads in the sample. 
+ - **percent_reads_aligned**: The percentage of reads that aligned to the reference genome; also available in the `snippy_variants_percent_reads_aligned` output column. + - **variants_total**: The total number of variants detected between the sample and the reference genome. + - **percent_ref_coverage**: The percentage of the reference genome covered by reads with a depth greater than or equal to the `min_coverage` threshold (default is 10); also available in the `snippy_variants_percent_ref_coverage` output column. + - **#rname**: Reference sequence name (e.g., chromosome or contig name). + - **startpos**: Starting position of the reference sequence. + - **endpos**: Ending position of the reference sequence. + - **numreads**: Number of reads covering the reference sequence. + - **covbases**: Number of bases with coverage. + - **coverage**: Percentage of the reference sequence covered (depth ≄ 1). + - **meandepth**: Mean depth of coverage over the reference sequence. + - **meanbaseq**: Mean base quality over the reference sequence. + - **meanmapq**: Mean mapping quality over the reference sequence. + + Note that the last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. + +!!! tip "QC Metrics for Phylogenetic Analysis" + These QC metrics provide valuable insights into the quality and coverage of your sequencing data relative to the reference genome. Monitoring these metrics can help identify samples with low coverage, poor alignment, or potential issues that may affect downstream analyses, and we recommend examining them before proceeding with phylogenetic analysis if performing Snippy_Variants and Snippy_Tree separately. + + These per-sample QC metrics can also be combined into a single file (`snippy_combined_qc_metrics`) in downstream workflows, such as `snippy_tree`, providing an overview of QC metrics across all samples. + +!!! 
techdetails "Snippy Variants Technical Details" + | | Links | + | --- | --- | + | Task | [task_snippy_variants.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_variants.wdl)
[task_snippy_gene_query.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_gene_query.wdl) | + | Software Source Code | [Snippy on GitHub](https://github.com/tseemann/snippy) | + | Software Documentation | [Snippy on GitHub](https://github.com/tseemann/snippy) | ### Outputs @@ -92,6 +118,7 @@ These per-sample QC metrics can be combined into a single file (`snippy_combined | snippy_variants_outdir_tarball | File | A compressed file containing the whole directory of snippy output files. This is used when running Snippy_Tree | | snippy_variants_percent_reads_aligned | Float | Percentage of reads aligned to the reference genome | | snippy_variants_percent_ref_coverage| Float | Proportion of the reference genome covered by reads with a depth greater than or equal to the `min_coverage` threshold (default is 10). | +| snippy_variants_qc_metrics | File | TSV file containing quality control metrics for the sample | | snippy_variants_query | String | Query strings specified by the user when running the workflow | | snippy_variants_query_check | String | Verification that query strings are found in the reference genome | | snippy_variants_results | File | CSV file detailing results for all mutations identified in the query sequence relative to the reference | @@ -99,4 +126,4 @@ These per-sample QC metrics can be combined into a single file (`snippy_combined | snippy_variants_version | String | Version of Snippy used | | snippy_variants_wf_version | String | Version of Snippy_Variants used | -
\ No newline at end of file +
diff --git a/docs/workflows/public_data_sharing/fetch_srr_accession.md b/docs/workflows/public_data_sharing/fetch_srr_accession.md new file mode 100644 index 000000000..df432d1ca --- /dev/null +++ b/docs/workflows/public_data_sharing/fetch_srr_accession.md @@ -0,0 +1,52 @@ +# Fetch SRR Accession Workflow + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Data Import](../../workflows_overview/workflows_type.md/#data-import) | [Any Taxa](../../workflows_overview/workflows_kingdom.md/#any-taxa) | PHB v2.3.0 | Yes | Sample-level | + +## Fetch SRR Accession + +This workflow retrieves the Sequence Read Archive (SRA) accession (SRR) associated with a given sample accession. The primary inputs are BioSample IDs (e.g., SAMN00000000) or SRA Experiment IDs (e.g., SRX000000), which link to sequencing data in the SRA repository. + +The workflow uses the fastq-dl tool to fetch metadata from SRA and specifically parses this metadata to extract the associated SRR accession and outputs the SRR accession. + +### Inputs + +| **Terra Task Name** | **Variable** | **Type** | **Description**| **Default Value** | **Terra Status** | +| --- | --- | --- | --- | --- | --- | +| fetch_srr_metadata | **sample_accession** | String | SRA-compatible accession, such as a **BioSample ID** (e.g., "SAMN00000000") or **SRA Experiment ID** (e.g., "SRX000000"), used to retrieve SRR metadata. | | Required | +| fetch_srr_metadata | **cpu** | Int | Number of CPUs allocated for the task. | 2 | Optional | +| fetch_srr_metadata | **disk_size** | Int | Disk space in GB allocated for the task. | 10 | Optional | +| fetch_srr_metadata | **docker**| String | Docker image for metadata retrieval. | `us-docker.pkg.dev/general-theiagen/biocontainers/fastq-dl:2.0.4--pyhdfd78af_0` | Optional | +| fetch_srr_metadata | **memory** | Int | Memory in GB allocated for the task. 
| 8 | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Workflow Tasks + +This workflow has a single task that performs metadata retrieval for the specified sample accession. + +??? task "`fastq-dl`: Fetches SRR metadata for sample accession" + When provided a BioSample accession or SRA experiment ID, `fastq-dl` collects metadata and returns the appropriate SRR accession. + + !!! techdetails "fastq-dl Technical Details" + | | Links | + | --- | --- | + | Task | [task_fetch_srr_metadata.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/utilities/data_handling/task_fetch_srr_metadata.wdl) | + | Software Source Code | [fastq-dl Source](https://github.com/rpetit3/fastq-dl) | + | Software Documentation | [fastq-dl Documentation](https://github.com/rpetit3/fastq-dl#documentation) | + | Original Publication | [fastq-dl: A fast and reliable tool for downloading SRA metadata](https://doi.org/10.1186/s12859-021-04346-3) | + +### Outputs + +| **Variable** | **Type** | **Description**| +|---|---|---| +| srr_accession | String | The SRR accessions associated with the input sample accession. | +| fetch_srr_accession_version | String | The version of the fetch_srr_accession workflow. | +| fetch_srr_accession_analysis_date | String | The date the fetch_srr_accession analysis was run. | + +## References + +> Valieris, R. et al., "fastq-dl: A fast and reliable tool for downloading SRA metadata." Bioinformatics, 2021.
diff --git a/docs/workflows/public_data_sharing/mercury_prep_n_batch.md b/docs/workflows/public_data_sharing/mercury_prep_n_batch.md index 4fcc48d36..56e169e82 100644 --- a/docs/workflows/public_data_sharing/mercury_prep_n_batch.md +++ b/docs/workflows/public_data_sharing/mercury_prep_n_batch.md @@ -4,7 +4,7 @@ | **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | |---|---|---|---|---| -| [Public Data Sharing](../../workflows_overview/workflows_type.md/#public-data-sharing) | [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.2.0 | Yes | Set-level | +| [Public Data Sharing](../../workflows_overview/workflows_type.md/#public-data-sharing) | [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.3.0 | Yes | Set-level | ## Mercury_Prep_N_Batch_PHB diff --git a/docs/workflows/public_data_sharing/terra_2_ncbi.md b/docs/workflows/public_data_sharing/terra_2_ncbi.md index 0fa48e50e..54e17aa9d 100644 --- a/docs/workflows/public_data_sharing/terra_2_ncbi.md +++ b/docs/workflows/public_data_sharing/terra_2_ncbi.md @@ -4,7 +4,7 @@ | **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | |---|---|---|---|---| -| [Public Data Sharing](../../workflows_overview/workflows_type.md/#public-data-sharing) | [Bacteria](../../workflows_overview/workflows_kingdom.md#bacteria), [Mycotics](../../workflows_overview/workflows_kingdom.md#mycotics) [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.1.0 | No | Set-level | +| [Public Data Sharing](../../workflows_overview/workflows_type.md/#public-data-sharing) | [Bacteria](../../workflows_overview/workflows_kingdom.md#bacteria), [Mycotics](../../workflows_overview/workflows_kingdom.md#mycotics) [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.3.0 | No | Set-level | ## Terra_2_NCBI_PHB diff --git 
a/docs/workflows/standalone/concatenate_illumina_lanes.md b/docs/workflows/standalone/concatenate_illumina_lanes.md new file mode 100644 index 000000000..282844fa4 --- /dev/null +++ b/docs/workflows/standalone/concatenate_illumina_lanes.md @@ -0,0 +1,47 @@ +# Concatenate Illumina Lanes + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Standalone](../../workflows_overview/workflows_type.md/#standalone) | [Any Taxa](../../workflows_overview/workflows_kingdom.md/#any-taxa) | PHB 2.3.0 | Yes | Sample-level | + +## Concatenate_Illumina_Lanes_PHB + +Some Illumina machines produce multi-lane FASTQ files for a single sample. This workflow concatenates the multiple lanes into a single FASTQ file per read type (forward or reverse). + +### Inputs + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| concatenate_illumina_lanes | **read1_lane1** | File | The first lane for the forward reads | | Required | +| concatenate_illumina_lanes | **read1_lane2** | File | The second lane for the forward reads | | Required | +| concatenate_illumina_lanes | **samplename** | String | The name of the sample, used to name the output files | | Required | +| cat_lanes | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| cat_lanes | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | +| cat_lanes | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/utility:1.2" | Optional | +| cat_lanes | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | +| concatenate_illumina_lanes | **read1_lane3** | File | The third lane for the forward reads | | Optional | +| concatenate_illumina_lanes | **read1_lane4** | File | The fourth lane for the forward 
reads | | Optional | +| concatenate_illumina_lanes | **read2_lane1** | File | The first lane for the reverse reads | | Optional | +| concatenate_illumina_lanes | **read2_lane2** | File | The second lane for the reverse reads | | Optional | +| concatenate_illumina_lanes | **read2_lane3** | File | The third lane for the reverse reads | | Optional | +| concatenate_illumina_lanes | **read2_lane4** | File | The fourth lane for the reverse reads | | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Workflow Tasks + +This workflow concatenates the Illumina lanes for forward and (if provided) reverse reads. The output files are named as follows: + +- Forward reads: `<samplename>_merged_R1.fastq.gz` +- Reverse reads: `<samplename>_merged_R2.fastq.gz` + +### Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| concatenate_illumina_lanes_analysis_date | String | Date of analysis | +| concatenate_illumina_lanes_version | String | Version of PHB used for the analysis | +| read1_concatenated | File | Concatenated forward reads | +| read2_concatenated | File | Concatenated reverse reads | diff --git a/docs/workflows/standalone/kraken2.md b/docs/workflows/standalone/kraken2.md index df36e56a1..95c86c216 100644 --- a/docs/workflows/standalone/kraken2.md +++ b/docs/workflows/standalone/kraken2.md @@ -4,7 +4,7 @@ | **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | |---|---|---|---|---| -| [Standalone](../../workflows_overview/workflows_type.md/#standalone) | [Any Taxa](../../workflows_overview/workflows_kingdom.md/#any-taxa) | PHB v2.0.0 | Yes | Sample-level | +| [Standalone](../../workflows_overview/workflows_type.md/#standalone) | [Any
Taxa](../../workflows_overview/workflows_kingdom.md/#any-taxa) | PHB v2.3.0 | Yes | Sample-level | ## Kraken2 Workflows diff --git a/docs/workflows/standalone/ncbi_scrub.md b/docs/workflows/standalone/ncbi_scrub.md index 0ae60c49b..65537070d 100644 --- a/docs/workflows/standalone/ncbi_scrub.md +++ b/docs/workflows/standalone/ncbi_scrub.md @@ -23,6 +23,7 @@ There are three Kraken2 workflows: | dehost_pe or dehost_se | **read1** | File | | | Required | PE, SE | | dehost_pe or dehost_se | **read2** | File | | | Required | PE | | dehost_pe or dehost_se | **samplename** | String | | | Required | PE, SE | +| dehost_pe or dehost_se | **target_organism** | String | Target organism for Kraken2 reporting | "Severe acute respiratory syndrome coronavirus 2" | Optional | PE, SE | | kraken2 | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | PE, SE | | kraken2 | **disk_size** | Int | Amount of storage (in GB) to allocate to the task. Increase this when using large (>30GB kraken2 databases such as the "k2_standard" database) | 100 | Optional | PE, SE | | kraken2 | **docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.0.8-beta_hv | Optional | PE, SE | @@ -66,7 +67,7 @@ This workflow is composed of two tasks, one to dehost the input reads and anothe | | Links | | --- | --- | - | Task | [task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/task_kraken2.wdl) | + | Task | [task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_kraken2.wdl) | | Software Source Code | [Kraken2 on GitHub](https://github.com/DerrickWood/kraken2/) | | Software Documentation | | | Original Publication(s) | [Improved metagenomic analysis with Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | diff --git a/docs/workflows/standalone/tbprofiler_tngs.md 
b/docs/workflows/standalone/tbprofiler_tngs.md index d0061fdd7..b9bec3abe 100644 --- a/docs/workflows/standalone/tbprofiler_tngs.md +++ b/docs/workflows/standalone/tbprofiler_tngs.md @@ -4,7 +4,7 @@ | **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | |---|---|---|---|---| -| [Standalone](../../workflows_overview/workflows_type.md/#standalone) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria) | PHB v2.0.0 | Yes | Sample-level | +| [Standalone](../../workflows_overview/workflows_type.md/#standalone) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria) | PHB v2.3.0 | Yes | Sample-level | ## TBProfiler_tNGS_PHB @@ -23,7 +23,7 @@ This workflow is still in experimental research stages. Documentation is minimal | tbp_parser | **coverage_threshold** | Int | The minimum percentage of a region to exceed the minimum depth for a region to pass QC in tbp_parser | 100 | Optional | | tbp_parser | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | | tbp_parser | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | -| tbp_parser | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:1.6.0 | Optional | +| tbp_parser | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.2.2 | Optional | | tbp_parser | **etha237_frequency** | Float | Minimum frequency for a mutation in ethA at protein position 237 to pass QC in tbp-parser | 0.1 | Optional | | tbp_parser | **expert_rule_regions_bed** | File | A file that contains the regions where R mutations and expert rules are applied | | Optional | | tbp_parser | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | diff --git a/docs/workflows_overview/workflows_alphabetically.md 
b/docs/workflows_overview/workflows_alphabetically.md index 6cf7e101d..a6d5f9cb5 100644 --- a/docs/workflows_overview/workflows_alphabetically.md +++ b/docs/workflows_overview/workflows_alphabetically.md @@ -11,44 +11,47 @@ title: Alphabetical Workflows | **Name** | **Description** | **Applicable Kingdom** | **Workflow Level** | **Command-line Compatibility**[^1] | **Last Known Changes** | **Dockstore** | |---|---|---|---|---|---|---| | [**Assembly_Fetch**](../workflows/data_import/assembly_fetch.md) | Download assemblies from NCBI, after optionally identifying the closest RefSeq reference genome to your own draft assembly | Any taxa | Sample-level | Yes | v1.3.0 | [Assembly_Fetch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Assembly_Fetch_PHB:main?tab=info) | -| [**Augur**](../workflows/phylogenetic_construction/augur.md) | Phylogenetic analysis for viral pathogens | Viral | Sample-level, Set-level | Yes | v2.1.0 | [Augur_Prep_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Augur_Prep_PHB:main?tab=info), [Augur_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Augur_PHB:main?tab=info) | +| [**Augur**](../workflows/phylogenetic_construction/augur.md) | Phylogenetic analysis for viral pathogens | Viral | Sample-level, Set-level | Yes | v2.3.0 | [Augur_Prep_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Augur_Prep_PHB:main?tab=info), [Augur_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Augur_PHB:main?tab=info) | | [**BaseSpace_Fetch**](../workflows/data_import/basespace_fetch.md)| Import data from BaseSpace into Terra | Any taxa | Sample-level | Yes | v2.0.0 | [BaseSpace_Fetch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/BaseSpace_Fetch_PHB:main?tab=info) | | [**Cauris_CladeTyper**](../workflows/standalone/cauris_cladetyper.md)| C. 
auris clade assignment | Mycotics | Sample-level | Yes | v1.0.0 | [Cauris_CladeTyper_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Cauris_CladeTyper_PHB:main?tab=info) | | [**Concatenate_Column_Content**](../workflows/data_export/concatenate_column_content.md) | Concatenate contents of a specified Terra data table column for many samples ("entities") | Any taxa | Set-level | Yes | v2.1.0 | [Concatenate_Column_Content_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Concatenate_Column_Content_PHB:main?tab=info) | +| [**Concatenate_Illumina_Lanes**](../workflows/standalone/concatenate_illumina_lanes.md)| Concatenate Illumina lanes for a single sample | Any taxa | Sample-level | Yes | v2.3.0 | [Concatenate_Illumina_Lanes_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Concatenate_Illumina_Lanes_PHB:main?tab=info) | | [**Core_Gene_SNP**](../workflows/phylogenetic_construction/core_gene_snp.md) | Pangenome analysis | Bacteria | Set-level | Some optional features incompatible, Yes | v2.1.0 | [Core_Gene_SNP_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Core_Gene_SNP_PHB:main?tab=info) | | [**Create_Terra_Table**](../workflows/data_import/create_terra_table.md)| Upload data to Terra and then run this workflow to have the table automatically created | Any taxa | | Yes | v2.2.0 | [Create_Terra_Table_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Create_Terra_Table_PHB:main?tab=info) | | [**CZGenEpi_Prep**](../workflows/phylogenetic_construction/czgenepi_prep.md)| Prepare metadata and fasta files for easy upload to the CZ GEN EPI platform. 
| Monkeypox virus, SARS-CoV-2, Viral | Set-level | No | v1.3.0 | [CZGenEpi_Prep_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/CZGenEpi_Prep_PHB:main?tab=info) | | [**Dorado_Basecalling**](../workflows/standalone/dorado_basecalling.md)| GPU-accelerated basecalling of Oxford Nanopore sequencing data | Any taxa | Sample-level | Yes | v2.3.0 | [Dorado_Basecalling_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Dorado_Basecalling_PHB:main?tab=info) | +| [**Fetch_SRR_Accession**](../workflows/public_data_sharing/fetch_srr_accession.md)| Provided a BioSample accession, identify any associated SRR accession(s) | Any taxa | Sample-level | Yes | v2.3.0 | [Fetch_SRR_Accession_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Fetch_SRR_Accession_PHB:main?tab=info) | | [**Find_Shared_Variants**](../workflows/phylogenetic_construction/find_shared_variants.md)| Combines and reshapes variant data from Snippy_Variants to illustrate variants shared across multiple samples | Bacteria, Mycotics | Set-level | Yes | v2.0.0 | [Find_Shared_Variants_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Find_Shared_Variants_PHB:main?tab=info) | | [**Freyja Workflow Series**](../workflows/genomic_characterization/freyja.md)| Recovers relative lineage abundances from mixed sample data and generates visualizations | SARS-CoV-2, Viral | Sample-level, Set-level | Yes | v2.2.0 | [Freyja_FASTQ_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_FASTQ_PHB:main?tab=info), [Freyja_Plot_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Plot_PHB:main?tab=info), [Freyja_Dashboard_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Dashboard_PHB:main?tab=info),
[Freyja_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Update_PHB:main?tab=info) | | [**GAMBIT_Query**](../workflows/standalone/gambit_query.md)| Taxon identification of genome assembly using GAMBIT | Bacteria, Mycotics | Sample-level | Yes | v2.0.0 | [Gambit_Query_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Gambit_Query_PHB:main?tab=info) | -| [**Kraken2**](../workflows/standalone/kraken2.md) | Taxa identification from reads | Any taxa | Sample-level | Yes | v2.0.0 | [Kraken2_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Kraken2_PE_PHB:main?tab=info), [Kraken2_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Kraken2_SE_PHB:main?tab=info) | +| [**Kraken2**](../workflows/standalone/kraken2.md) | Taxa identification from reads | Any taxa | Sample-level | Yes | v2.3.0 | [Kraken2_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Kraken2_PE_PHB:main?tab=info), [Kraken2_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Kraken2_SE_PHB:main?tab=info) | | [**kSNP3**](../workflows/phylogenetic_construction/ksnp3.md)| SNP-based phylogenetic analysis from assemblies | Bacteria, Mycotics, Viral | Set-level | Some optional features incompatible, Yes | v2.1.0 | [kSNP3_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/kSNP3_PHB:main?tab=info) | | [**Lyve_SET**](../workflows/phylogenetic_construction/lyve_set.md)| Alignment of reads to a reference genome, SNP calling, curation of high quality SNPs, phylogenetic analysis | Bacteria | Set-level | Yes | v2.1.0 | [Lyve_SET_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Lyve_SET_PHB:main?tab=info) | | [**MashTree_FASTA**](../workflows/phylogenetic_construction/mashtree_fasta.md)| Mash-distance based phylogenetic 
analysis from assemblies | Bacteria, Mycotics, Viral | Set-level | Some optional features incompatible, Yes | v2.1.0 | [MashTree_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/MashTree_FASTA_PHB:main?tab=info) | -| [**Mercury_Prep_N_Batch**](../workflows/public_data_sharing/mercury_prep_n_batch.md)| Prepare metadata and sequence data for submission to NCBI and GISAID | Influenza, Monkeypox virus, SARS-CoV-2, Viral | Set-level | No | v2.2.0 | [Mercury_Prep_N_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Mercury_Prep_N_Batch_PHB:main?tab=info) | +| [**Mercury_Prep_N_Batch**](../workflows/public_data_sharing/mercury_prep_n_batch.md)| Prepare metadata and sequence data for submission to NCBI and GISAID | Influenza, Monkeypox virus, SARS-CoV-2, Viral | Set-level | No | v2.3.0 | [Mercury_Prep_N_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Mercury_Prep_N_Batch_PHB:main?tab=info) | | [**NCBI-AMRFinderPlus**](../workflows/standalone/ncbi_amrfinderplus.md)| Runs NCBI's AMRFinderPlus on genome assemblies (bacterial and fungal) | Bacteria, Mycotics | Sample-level | Yes | v2.0.0 | [NCBI-AMRFinderPlus_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/NCBI-AMRFinderPlus_PHB:main?tab=info) | | [**NCBI_Scrub**](../workflows/standalone/ncbi_scrub.md)| Runs NCBI's HRRT on Illumina FASTQs | Any taxa | Sample-level | Yes | v2.2.1 | [NCBI_Scrub_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/NCBI_Scrub_PE_PHB:main?tab=info), [NCBI_Scrub_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/NCBI_Scrub_SE_PHB:main?tab=info) | | [**Pangolin_Update**](../workflows/genomic_characterization/pangolin_update.md) | Update Pangolin assignments | SARS-CoV-2, Viral | Sample-level | Yes | v2.0.0 | 
[Pangolin_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Pangolin_Update_PHB:main?tab=info) | | [**RASUSA**](../workflows/standalone/rasusa.md)| Randomly subsample sequencing reads to a specified coverage | Any taxa | Sample-level | Yes | v2.0.0 | [RASUSA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/RASUSA_PHB:main?tab=info) | | [**Rename_FASTQ**](../workflows/standalone/rename_fastq.md)| Rename paired-end or single-end read files in a Terra data table in a non-destructive way | Any taxa | Sample-level | Yes | v2.1.0 | [Rename_FASTQ_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Rename_FASTQ_PHB:im-utilities-rename-files?tab=info) | | [**Samples_to_Ref_Tree**](../workflows/phylogenetic_placement/samples_to_ref_tree.md)| Use Nextclade to rapidly and accurately place your samples on any existing phylogenetic tree | Monkeypox virus, SARS-CoV-2, Viral | Sample-level, Set-level | Yes | v2.1.0 | [Samples_to_Ref_Tree_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Samples_to_Ref_Tree_PHB:main?tab=info) | -| [**Snippy_Streamline**](../workflows/phylogenetic_construction/snippy_streamline.md)| Implementation of Snippy workflows for phylogenetic analysis from reads, with optional dynamic reference selection | Bacteria | Set-level | Yes | v2.2.0 | [Snippy_Streamline_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Streamline_PHB:main?tab=info) | -| [**Snippy_Streamline_FASTA**](../workflows/phylogenetic_construction/snippy_streamline_fasta.md)| Implementation of Snippy workflows for phylogenetic analysis from assembled genomes (in FASTA format), with optional dynamic reference selection | Bacteria | Set-level | Yes | v2.2.0 | 
[Snippy_Streamline_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Streamline_FASTA_PHB:im-snippy-fasta-dev?tab=info) | -| [**Snippy_Tree**](../workflows/phylogenetic_construction/snippy_tree.md)| SNP-based phylogenetic analysis from reads, with option to mask recombination | Bacteria | Set-level | Some optional features incompatible, Yes | v2.1.0 | [Snippy_Tree_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Tree_PHB:main?tab=info) | -| [**Snippy_Variants**](../workflows/phylogenetic_construction/snippy_variants.md)| Alignment of reads to a reference genome, then SNP calling | Bacteria, Mycotics, Viral | Sample-level | Yes | v2.2.0 | [Snippy_Variants_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Variants_PHB:main?tab=info) | +| [**Snippy_Streamline**](../workflows/phylogenetic_construction/snippy_streamline.md)| Implementation of Snippy workflows for phylogenetic analysis from reads, with optional dynamic reference selection | Bacteria | Set-level | Yes | v2.3.0 | [Snippy_Streamline_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Streamline_PHB:main?tab=info) | +| [**Snippy_Streamline_FASTA**](../workflows/phylogenetic_construction/snippy_streamline_fasta.md)| Implementation of Snippy workflows for phylogenetic analysis from assembled genomes (in FASTA format), with optional dynamic reference selection | Bacteria | Set-level | Yes | v2.3.0 | [Snippy_Streamline_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Streamline_FASTA_PHB:im-snippy-fasta-dev?tab=info) | +| [**Snippy_Tree**](../workflows/phylogenetic_construction/snippy_tree.md)| SNP-based phylogenetic analysis from reads, with option to mask recombination | Bacteria | Set-level | Some optional features incompatible, Yes | v2.3.0 | 
[Snippy_Tree_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Tree_PHB:main?tab=info) | +| [**Snippy_Variants**](../workflows/phylogenetic_construction/snippy_variants.md)| Alignment of reads to a reference genome, then SNP calling | Bacteria, Mycotics, Viral | Sample-level | Yes | v2.3.0 | [Snippy_Variants_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Variants_PHB:main?tab=info) | | [**SRA_Fetch**](../workflows/data_import/sra_fetch.md)| Import publicly available reads from SRA using SRR#, ERR# or DRR# | Any taxa | Sample-level | Yes | v2.2.0 | [SRA_Fetch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/SRA_Fetch_PHB:main?tab=info) | -| [**TBProfiler_tNGS**](../workflows/standalone/tbprofiler_tngs.md)| Performs in silico antimicrobial susceptibility testing on Mycobacterium tuberculosis targeted-NGS samples with TBProfiler and tbp-parser | Bacteria, TB | Sample-level | Yes | v2.0.0 | [TBProfiler_tNGS_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TBProfiler_tNGS_PHB:smw-tngs-tbprofiler-dev?tab=info) | +| [**TBProfiler_tNGS**](../workflows/standalone/tbprofiler_tngs.md)| Performs in silico antimicrobial susceptibility testing on Mycobacterium tuberculosis targeted-NGS samples with TBProfiler and tbp-parser | Bacteria, TB | Sample-level | Yes | v2.3.0 | [TBProfiler_tNGS_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TBProfiler_tNGS_PHB:smw-tngs-tbprofiler-dev?tab=info) | | [**Terra_2_GISAID**](../workflows/public_data_sharing/terra_2_gisaid.md)| Upload of assembly data to GISAID | SARS-CoV-2, Viral | Set-level | Yes | v1.2.1 | [Terra_2_GISAID_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_GISAID_PHB:main?tab=info) | -| [**Terra_2_NCBI**](../workflows/public_data_sharing/terra_2_ncbi.md)| Upload of sequence data to 
NCBI | Bacteria, Mycotics, Viral | Set-level | No | v2.1.0 | [Terra_2_NCBI_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_NCBI_PHB:main?tab=info) | -| [**TheiaCov Workflow Series**](../workflows/genomic_characterization/theiacov.md) | Viral genome assembly, QC and characterization from amplicon sequencing | HIV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level, Set-level | Some optional features incompatible, Yes | v2.2.0 | [TheiaCoV_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_PE_PHB:main?tab=info), [TheiaCoV_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_SE_PHB:main?tab=info), [TheiaCoV_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ONT_PHB:main?tab=info), [TheiaCoV_ClearLabs_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ClearLabs_PHB:main?tab=info), [TheiaCoV_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_PHB:main?tab=info), [TheiaCoV_FASTA_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_Batch_PHB:main?tab=info) | -| [**TheiaEuk**](../workflows/genomic_characterization/theiaeuk.md) | Mycotic genome assembly, QC and characterization from WGS data | Mycotics | Sample-level | Some optional features incompatible, Yes | v2.0.1 | [TheiaEuk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaEuk_Illumina_PE_PHB:main?tab=info) | -| [**TheiaMeta**](../workflows/genomic_characterization/theiameta.md) | Genome assembly and QC from metagenomic sequencing | Any taxa | Sample-level | Yes | v2.0.0 | 
[TheiaMeta_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaMeta_Illumina_PE_PHB:main?tab=info) | -| [**TheiaProk Workflow Series**](../workflows/genomic_characterization/theiaprok.md) | Bacterial genome assembly, QC and characterization from WGS data | Bacteria | Sample-level | Some optional features incompatible, Yes | v2.2.0 | [TheiaProk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_PE_PHB:main?tab=info), [TheiaProk_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_SE_PHB:main?tab=info), [TheiaProk_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_ONT_PHB:main?tab=info), [TheiaProk_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_FASTA_PHB:main?tab=info) | +| [**Terra_2_NCBI**](../workflows/public_data_sharing/terra_2_ncbi.md)| Upload of sequence data to NCBI | Bacteria, Mycotics, Viral | Set-level | No | v2.3.0 | [Terra_2_NCBI_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_NCBI_PHB:main?tab=info) | +| [**TheiaCov Workflow Series**](../workflows/genomic_characterization/theiacov.md) | Viral genome assembly, QC and characterization from amplicon sequencing | HIV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level, Set-level | Some optional features incompatible, Yes | v2.3.0 | [TheiaCoV_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_PE_PHB:main?tab=info), [TheiaCoV_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_SE_PHB:main?tab=info), [TheiaCoV_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ONT_PHB:main?tab=info), 
[TheiaCoV_ClearLabs_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ClearLabs_PHB:main?tab=info), [TheiaCoV_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_PHB:main?tab=info), [TheiaCoV_FASTA_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_Batch_PHB:main?tab=info) | +| [**TheiaEuk**](../workflows/genomic_characterization/theiaeuk.md) | Mycotic genome assembly, QC and characterization from WGS data | Mycotics | Sample-level | Some optional features incompatible, Yes | v2.3.0 | [TheiaEuk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaEuk_Illumina_PE_PHB:main?tab=info) | +| [**TheiaMeta**](../workflows/genomic_characterization/theiameta.md) | Genome assembly and QC from metagenomic sequencing | Any taxa | Sample-level | Yes | v2.3.0 | [TheiaMeta_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaMeta_Illumina_PE_PHB:main?tab=info) | +| [**TheiaProk Workflow Series**](../workflows/genomic_characterization/theiaprok.md) | Bacterial genome assembly, QC and characterization from WGS data | Bacteria | Sample-level | Some optional features incompatible, Yes | v2.3.0 | [TheiaProk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_PE_PHB:main?tab=info), [TheiaProk_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_SE_PHB:main?tab=info), [TheiaProk_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_ONT_PHB:main?tab=info), [TheiaProk_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_FASTA_PHB:main?tab=info) | | [**TheiaValidate**](../workflows/standalone/theiavalidate.md)| This 
workflow performs basic comparisons between user-designated columns in two separate tables. | Any taxa | | No | v2.0.0 | [TheiaValidate_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaValidate_PHB:main?tab=info) | | [**Transfer_Column_Content**](../workflows/data_export/transfer_column_content.md)| Transfer contents of a specified Terra data table column for many samples ("entities") to a GCP storage bucket location | Any taxa | Set-level | Yes | v1.3.0 | [Transfer_Column_Content_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Transfer_Column_Content_PHB:main?tab=info) | | [**Samples_to_Ref_Tree**](../workflows/phylogenetic_placement/usher.md)| Use UShER to rapidly and accurately place your samples on any existing phylogenetic tree | Monkeypox virus, SARS-CoV-2, Viral | Sample-level, Set-level | Yes | v2.1.0 | [Usher_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Usher_PHB:main?tab=info) | -| [**Usher_PHB**](../workflows/genomic_characterization/vadr_update.md)| Update VADR assignments | HAV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level | Yes | v1.2.1 | [VADR_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/VADR_Update_PHB:main?tab=info) | +| [**Usher_PHB**](../workflows/genomic_characterization/vadr_update.md)| Update VADR assignments | HAV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level | Yes | v2.1.0 | [VADR_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/VADR_Update_PHB:main?tab=info) | +| [**VADR_Update**](../workflows/genomic_characterization/vadr_update.md)| Update VADR assignments | HAV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level | Yes | v2.2.0 | 
[VADR_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/VADR_Update_PHB:main?tab=info) | | [**Zip_Column_Content**](../workflows/data_export/zip_column_content.md)| Zip contents of a specified Terra data table column for many samples ("entities") | Any taxa | Set-level | Yes | v2.1.0 | [Zip_Column_Content_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Zip_Column_Content_PHB:main?tab=info) | diff --git a/docs/workflows_overview/workflows_kingdom.md b/docs/workflows_overview/workflows_kingdom.md index 906e0acdd..88a8432b6 100644 --- a/docs/workflows_overview/workflows_kingdom.md +++ b/docs/workflows_overview/workflows_kingdom.md @@ -15,14 +15,17 @@ title: Workflows by Kingdom | [**Assembly_Fetch**](../workflows/data_import/assembly_fetch.md) | Download assemblies from NCBI, after optionally identifying the closest RefSeq reference genome to your own draft assembly | Any taxa | Sample-level | Yes | v1.3.0 | [Assembly_Fetch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Assembly_Fetch_PHB:main?tab=info) | | [**BaseSpace_Fetch**](../workflows/data_import/basespace_fetch.md)| Import data from BaseSpace into Terra | Any taxa | Sample-level | Yes | v2.0.0 | [BaseSpace_Fetch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/BaseSpace_Fetch_PHB:main?tab=info) | | [**Concatenate_Column_Content**](../workflows/data_export/concatenate_column_content.md) | Concatenate contents of a specified Terra data table column for many samples ("entities") | Any taxa | Set-level | Yes | v2.1.0 | [Concatenate_Column_Content_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Concatenate_Column_Content_PHB:main?tab=info) | +| [**Concatenate_Illumina_Lanes**](../workflows/standalone/concatenate_illumina_lanes.md)| Concatenate Illumina lanes for a single sample | Any taxa | Sample-level | Yes | v2.3.0 | 
[Concatenate_Illumina_Lanes_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Concatenate_Illumina_Lanes_PHB:main?tab=info) | | [**Create_Terra_Table**](../workflows/data_import/create_terra_table.md)| Upload data to Terra and then run this workflow to have the table automatically created | Any taxa | | Yes | v2.2.0 | [Create_Terra_Table_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Create_Terra_Table_PHB:main?tab=info) | | [**Dorado_Basecalling**](../workflows/standalone/dorado_basecalling.md)| GPU-accelerated basecalling of Oxford Nanopore sequencing data | Any taxa | Sample-level | Yes | v2.3.0 | [Dorado_Basecalling_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Dorado_Basecalling_PHB:main?tab=info) | -| [**Kraken2**](../workflows/standalone/kraken2.md) | Taxa identification from reads | Any taxa | Sample-level | Yes | v2.0.0 | [Kraken2_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Kraken2_PE_PHB:main?tab=info), [Kraken2_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Kraken2_SE_PHB:main?tab=info) | +| [**Fetch_SRR_Accession**](../workflows/public_data_sharing/fetch_srr_accession.md)| Provided a BioSample accession, identify any associated SRR accession(s) | Any taxa | Sample-level | Yes | v2.3.0 | [Fetch_SRR_Accession_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Fetch_SRR_Accession_PHB:main?tab=info) | +| [**Kraken2**](../workflows/standalone/kraken2.md) | Taxa identification from reads | Any taxa | Sample-level | Yes | v2.3.0 | [Kraken2_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Kraken2_PE_PHB:main?tab=info), [Kraken2_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Kraken2_SE_PHB:main?tab=info) | |
[**NCBI_Scrub**](../workflows/standalone/ncbi_scrub.md)| Runs NCBI's HRRT on Illumina FASTQs | Any taxa | Sample-level | Yes | v2.2.1 | [NCBI_Scrub_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/NCBI_Scrub_PE_PHB:main?tab=info), [NCBI_Scrub_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/NCBI_Scrub_SE_PHB:main?tab=info) | | [**RASUSA**](../workflows/standalone/rasusa.md)| Randomly subsample sequencing reads to a specified coverage | Any taxa | Sample-level | Yes | v2.0.0 | [RASUSA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/RASUSA_PHB:main?tab=info) | | [**Rename_FASTQ**](../workflows/standalone/rename_fastq.md)| Rename paired-end or single-end read files in a Terra data table in a non-destructive way | Any taxa | Sample-level | Yes | v2.1.0 | [Rename_FASTQ_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Rename_FASTQ_PHB:im-utilities-rename-files?tab=info) | | [**SRA_Fetch**](../workflows/data_import/sra_fetch.md)| Import publicly available reads from SRA using SRR#, ERR# or DRR# | Any taxa | Sample-level | Yes | v2.2.0 | [SRA_Fetch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/SRA_Fetch_PHB:main?tab=info) | -| [**TheiaMeta**](../workflows/genomic_characterization/theiameta.md) | Genome assembly and QC from metagenomic sequencing | Any taxa | Sample-level | Yes | v2.0.0 | [TheiaMeta_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaMeta_Illumina_PE_PHB:main?tab=info) | +| [**TheiaMeta**](../workflows/genomic_characterization/theiameta.md) | Genome assembly and QC from metagenomic sequencing | Any taxa | Sample-level | Yes | v2.3.0 | [TheiaMeta_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaMeta_Illumina_PE_PHB:main?tab=info) | | 
[**TheiaValidate**](../workflows/standalone/theiavalidate.md)| This workflow performs basic comparisons between user-designated columns in two separate tables. | Any taxa | | No | v2.0.0 | [TheiaValidate_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaValidate_PHB:main?tab=info) | | [**Transfer_Column_Content**](../workflows/data_export/transfer_column_content.md)| Transfer contents of a specified Terra data table column for many samples ("entities") to a GCP storage bucket location | Any taxa | Set-level | Yes | v1.3.0 | [Transfer_Column_Content_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Transfer_Column_Content_PHB:main?tab=info) | | [**Zip_Column_Content**](../workflows/data_export/zip_column_content.md)| Zip contents of a specified Terra data table column for many samples ("entities") | Any taxa | Set-level | Yes | v2.1.0 | [Zip_Column_Content_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Zip_Column_Content_PHB:main?tab=info) | @@ -42,13 +45,13 @@ title: Workflows by Kingdom | [**Lyve_SET**](../workflows/phylogenetic_construction/lyve_set.md)| Alignment of reads to a reference genome, SNP calling, curation of high quality SNPs, phylogenetic analysis | Bacteria | Set-level | Yes | v2.1.0 | [Lyve_SET_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Lyve_SET_PHB:main?tab=info) | | [**MashTree_FASTA**](../workflows/phylogenetic_construction/mashtree_fasta.md)| Mash-distance based phylogenetic analysis from assemblies | Bacteria, Mycotics, Viral | Set-level | Some optional features incompatible, Yes | v2.1.0 | [MashTree_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/MashTree_FASTA_PHB:main?tab=info) | | [**NCBI-AMRFinderPlus**](../workflows/standalone/ncbi_amrfinderplus.md)| Runs NCBI's AMRFinderPlus on genome assemblies (bacterial and fungal) | Bacteria, 
Mycotics | Sample-level | Yes | v2.0.0 | [NCBI-AMRFinderPlus_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/NCBI-AMRFinderPlus_PHB:main?tab=info) | -| [**Snippy_Streamline**](../workflows/phylogenetic_construction/snippy_streamline.md)| Implementation of Snippy workflows for phylogenetic analysis from reads, with optional dynamic reference selection | Bacteria | Set-level | Yes | v2.2.0 | [Snippy_Streamline_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Streamline_PHB:main?tab=info) | -| [**Snippy_Streamline_FASTA**](../workflows/phylogenetic_construction/snippy_streamline_fasta.md)| Implementation of Snippy workflows for phylogenetic analysis from assembled genomes (in FASTA format), with optional dynamic reference selection | Bacteria | Set-level | Yes | v2.2.0 | [Snippy_Streamline_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Streamline_FASTA_PHB:im-snippy-fasta-dev?tab=info) | -| [**Snippy_Tree**](../workflows/phylogenetic_construction/snippy_tree.md)| SNP-based phylogenetic analysis from reads, with option to mask recombination | Bacteria | Set-level | Some optional features incompatible, Yes | v2.1.0 | [Snippy_Tree_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Tree_PHB:main?tab=info) | -| [**Snippy_Variants**](../workflows/phylogenetic_construction/snippy_variants.md)| Alignment of reads to a reference genome, then SNP calling | Bacteria, Mycotics, Viral | Sample-level | Yes | v2.2.0 | [Snippy_Variants_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Variants_PHB:main?tab=info) | -| [**TBProfiler_tNGS**](../workflows/standalone/tbprofiler_tngs.md)| Performs in silico antimicrobial susceptibility testing on Mycobacterium tuberculosis targeted-NGS samples with TBProfiler and tbp-parser | Bacteria, TB | Sample-level | Yes | v2.0.0 | 
[TBProfiler_tNGS_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TBProfiler_tNGS_PHB:smw-tngs-tbprofiler-dev?tab=info) | +| [**Snippy_Streamline**](../workflows/phylogenetic_construction/snippy_streamline.md)| Implementation of Snippy workflows for phylogenetic analysis from reads, with optional dynamic reference selection | Bacteria | Set-level | Yes | v2.3.0 | [Snippy_Streamline_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Streamline_PHB:main?tab=info) | +| [**Snippy_Streamline_FASTA**](../workflows/phylogenetic_construction/snippy_streamline_fasta.md)| Implementation of Snippy workflows for phylogenetic analysis from assembled genomes (in FASTA format), with optional dynamic reference selection | Bacteria | Set-level | Yes | v2.3.0 | [Snippy_Streamline_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Streamline_FASTA_PHB:im-snippy-fasta-dev?tab=info) | +| [**Snippy_Tree**](../workflows/phylogenetic_construction/snippy_tree.md)| SNP-based phylogenetic analysis from reads, with option to mask recombination | Bacteria | Set-level | Some optional features incompatible, Yes | v2.3.0 | [Snippy_Tree_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Tree_PHB:main?tab=info) | +| [**Snippy_Variants**](../workflows/phylogenetic_construction/snippy_variants.md)| Alignment of reads to a reference genome, then SNP calling | Bacteria, Mycotics, Viral | Sample-level | Yes | v2.3.0 | [Snippy_Variants_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Variants_PHB:main?tab=info) | +| [**TBProfiler_tNGS**](../workflows/standalone/tbprofiler_tngs.md)| Performs in silico antimicrobial susceptibility testing on Mycobacterium tuberculosis targeted-NGS samples with TBProfiler and tbp-parser | Bacteria, TB | Sample-level | Yes | v2.3.0 | 
[TBProfiler_tNGS_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TBProfiler_tNGS_PHB:smw-tngs-tbprofiler-dev?tab=info) | | [**Terra_2_NCBI**](../workflows/public_data_sharing/terra_2_ncbi.md)| Upload of sequence data to NCBI | Bacteria, Mycotics, Viral | Set-level | No | v2.1.0 | [Terra_2_NCBI_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_NCBI_PHB:main?tab=info) | -| [**TheiaProk Workflow Series**](../workflows/genomic_characterization/theiaprok.md) | Bacterial genome assembly, QC and characterization from WGS data | Bacteria | Sample-level | Some optional features incompatible, Yes | v2.2.0 | [TheiaProk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_PE_PHB:main?tab=info), [TheiaProk_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_SE_PHB:main?tab=info), [TheiaProk_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_ONT_PHB:main?tab=info), [TheiaProk_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_FASTA_PHB:main?tab=info) | +| [**TheiaProk Workflow Series**](../workflows/genomic_characterization/theiaprok.md) | Bacterial genome assembly, QC and characterization from WGS data | Bacteria | Sample-level | Some optional features incompatible, Yes | v2.3.0 | [TheiaProk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_PE_PHB:main?tab=info), [TheiaProk_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_SE_PHB:main?tab=info), [TheiaProk_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_ONT_PHB:main?tab=info), 
[TheiaProk_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_FASTA_PHB:main?tab=info) | @@ -64,9 +67,9 @@ title: Workflows by Kingdom | [**kSNP3**](../workflows/phylogenetic_construction/ksnp3.md)| SNP-based phylogenetic analysis from assemblies | Bacteria, Mycotics, Viral | Set-level | Some optional features incompatible, Yes | v2.1.0 | [kSNP3_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/kSNP3_PHB:main?tab=info) | | [**MashTree_FASTA**](../workflows/phylogenetic_construction/mashtree_fasta.md)| Mash-distance based phylogenetic analysis from assemblies | Bacteria, Mycotics, Viral | Set-level | Some optional features incompatible, Yes | v2.1.0 | [MashTree_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/MashTree_FASTA_PHB:main?tab=info) | | [**NCBI-AMRFinderPlus**](../workflows/standalone/ncbi_amrfinderplus.md)| Runs NCBI's AMRFinderPlus on genome assemblies (bacterial and fungal) | Bacteria, Mycotics | Sample-level | Yes | v2.0.0 | [NCBI-AMRFinderPlus_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/NCBI-AMRFinderPlus_PHB:main?tab=info) | -| [**Snippy_Variants**](../workflows/phylogenetic_construction/snippy_variants.md)| Alignment of reads to a reference genome, then SNP calling | Bacteria, Mycotics, Viral | Sample-level | Yes | v2.2.0 | [Snippy_Variants_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Variants_PHB:main?tab=info) | -| [**Terra_2_NCBI**](../workflows/public_data_sharing/terra_2_ncbi.md)| Upload of sequence data to NCBI | Bacteria, Mycotics, Viral | Set-level | No | v2.1.0 | [Terra_2_NCBI_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_NCBI_PHB:main?tab=info) | -| [**TheiaEuk**](../workflows/genomic_characterization/theiaeuk.md) | Mycotic genome assembly, QC and characterization from WGS 
data | Mycotics | Sample-level | Some optional features incompatible, Yes | v2.0.1 | [TheiaEuk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaEuk_Illumina_PE_PHB:main?tab=info) | +| [**Snippy_Variants**](../workflows/phylogenetic_construction/snippy_variants.md)| Alignment of reads to a reference genome, then SNP calling | Bacteria, Mycotics, Viral | Sample-level | Yes | v2.3.0 | [Snippy_Variants_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Variants_PHB:main?tab=info) | +| [**Terra_2_NCBI**](../workflows/public_data_sharing/terra_2_ncbi.md)| Upload of sequence data to NCBI | Bacteria, Mycotics, Viral | Set-level | No | v2.3.0 | [Terra_2_NCBI_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_NCBI_PHB:main?tab=info) | +| [**TheiaEuk**](../workflows/genomic_characterization/theiaeuk.md) | Mycotic genome assembly, QC and characterization from WGS data | Mycotics | Sample-level | Some optional features incompatible, Yes | v2.3.0 | [TheiaEuk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaEuk_Illumina_PE_PHB:main?tab=info) | @@ -76,20 +79,20 @@ title: Workflows by Kingdom | **Name** | **Description** | **Applicable Kingdom** | **Workflow Level** | **Command-line Compatibility**[^1] | **Last Known Changes** | **Dockstore** | |---|---|---|---|---|---|---| -| [**Augur**](../workflows/phylogenetic_construction/augur.md) | Phylogenetic analysis for viral pathogens | Viral | Sample-level, Set-level | Yes | v2.1.0 | [Augur_Prep_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Augur_Prep_PHB:main?tab=info), [Augur_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Augur_PHB:main?tab=info) | +| [**Augur**](../workflows/phylogenetic_construction/augur.md) | Phylogenetic analysis for viral pathogens | Viral | 
Sample-level, Set-level | Yes | v2.3.0 | [Augur_Prep_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Augur_Prep_PHB:main?tab=info), [Augur_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Augur_PHB:main?tab=info) | | [**CZGenEpi_Prep**](../workflows/phylogenetic_construction/czgenepi_prep.md)| Prepare metadata and fasta files for easy upload to the CZ GEN EPI platform. | Monkeypox virus, SARS-CoV-2, Viral | Set-level | No | v1.3.0 | [CZGenEpi_Prep_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/CZGenEpi_Prep_PHB:main?tab=info) | -| [**Freyja Workflow Series**](../workflows/genomic_characterization/freyja.md)| Recovers relative lineage abundances from mixed sample data and generates visualizations | SARS-CoV-2, Viral | Sample-level, Set-level | Yes | v2.2.0 | [Freyja_FASTQ_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_FASTQ_PHB:main?tab=info), [Freyja_Plot_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Plot_PHB:main?tab=info), [Freyja_Dashboard_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Dashboard_PHB:main?tab=info), [Freyja_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Update_PHB:main?tab=info) | +| [**Freyja Workflow Series**](../workflows/genomic_characterization/freyja.md)| Recovers relative lineage abundances from mixed sample data and generates visualizations | SARS-CoV-2, Viral | Sample-level, Set-level | Yes | v2.3.0 | [Freyja_FASTQ_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_FASTQ_PHB:main?tab=info), [Freyja_Plot_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Plot_PHB:main?tab=info), 
[Freyja_Dashboard_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Dashboard_PHB:main?tab=info), [Freyja_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Update_PHB:main?tab=info) | | [**kSNP3**](../workflows/phylogenetic_construction/ksnp3.md)| SNP-based phylogenetic analysis from assemblies | Bacteria, Mycotics, Viral | Set-level | Some optional features incompatible, Yes | v2.1.0 | [kSNP3_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/kSNP3_PHB:main?tab=info) | | [**MashTree_FASTA**](../workflows/phylogenetic_construction/mashtree_fasta.md)| Mash-distance based phylogenetic analysis from assemblies | Bacteria, Mycotics, Viral | Set-level | Some optional features incompatible, Yes | v2.1.0 | [MashTree_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/MashTree_FASTA_PHB:main?tab=info) | -| [**Mercury_Prep_N_Batch**](../workflows/public_data_sharing/mercury_prep_n_batch.md)| Prepare metadata and sequence data for submission to NCBI and GISAID | Influenza, Monkeypox virus, SARS-CoV-2, Viral | Set-level | No | v2.2.0 | [Mercury_Prep_N_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Mercury_Prep_N_Batch_PHB:main?tab=info) | +| [**Mercury_Prep_N_Batch**](../workflows/public_data_sharing/mercury_prep_n_batch.md)| Prepare metadata and sequence data for submission to NCBI and GISAID | Influenza, Monkeypox virus, SARS-CoV-2, Viral | Set-level | No | v2.3.0 | [Mercury_Prep_N_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Mercury_Prep_N_Batch_PHB:main?tab=info) | | [**Pangolin_Update**](../workflows/genomic_characterization/pangolin_update.md) | Update Pangolin assignments | SARS-CoV-2, Viral | Sample-level | Yes | v2.0.0 | 
[Pangolin_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Pangolin_Update_PHB:main?tab=info) | | [**Samples_to_Ref_Tree**](../workflows/phylogenetic_placement/samples_to_ref_tree.md)| Use Nextclade to rapidly and accurately place your samples on any existing phylogenetic tree | Monkeypox virus, SARS-CoV-2, Viral | Sample-level, Set-level | Yes | v2.1.0 | [Samples_to_Ref_Tree_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Samples_to_Ref_Tree_PHB:main?tab=info) | -| [**Snippy_Variants**](../workflows/phylogenetic_construction/snippy_variants.md)| Alignment of reads to a reference genome, then SNP calling | Bacteria, Mycotics, Viral | Sample-level | Yes | v2.2.0 | [Snippy_Variants_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Variants_PHB:main?tab=info) | +| [**Snippy_Variants**](../workflows/phylogenetic_construction/snippy_variants.md)| Alignment of reads to a reference genome, then SNP calling | Bacteria, Mycotics, Viral | Sample-level | Yes | v2.3.0 | [Snippy_Variants_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Variants_PHB:main?tab=info) | | [**Terra_2_GISAID**](../workflows/public_data_sharing/terra_2_gisaid.md)| Upload of assembly data to GISAID | SARS-CoV-2, Viral | Set-level | Yes | v1.2.1 | [Terra_2_GISAID_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_GISAID_PHB:main?tab=info) | -| [**Terra_2_NCBI**](../workflows/public_data_sharing/terra_2_ncbi.md)| Upload of sequence data to NCBI | Bacteria, Mycotics, Viral | Set-level | No | v2.1.0 | [Terra_2_NCBI_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_NCBI_PHB:main?tab=info) | -| [**TheiaCov Workflow Series**](../workflows/genomic_characterization/theiacov.md) | Viral genome assembly, QC and characterization from amplicon sequencing | HIV, 
Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level, Set-level | Some optional features incompatible, Yes | v2.2.0 | [TheiaCoV_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_PE_PHB:main?tab=info), [TheiaCoV_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_SE_PHB:main?tab=info), [TheiaCoV_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ONT_PHB:main?tab=info), [TheiaCoV_ClearLabs_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ClearLabs_PHB:main?tab=info), [TheiaCoV_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_PHB:main?tab=info), [TheiaCoV_FASTA_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_Batch_PHB:main?tab=info) | +| [**Terra_2_NCBI**](../workflows/public_data_sharing/terra_2_ncbi.md)| Upload of sequence data to NCBI | Bacteria, Mycotics, Viral | Set-level | No | v2.3.0 | [Terra_2_NCBI_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_NCBI_PHB:main?tab=info) | +| [**TheiaCov Workflow Series**](../workflows/genomic_characterization/theiacov.md) | Viral genome assembly, QC and characterization from amplicon sequencing | HIV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level, Set-level | Some optional features incompatible, Yes | v2.3.0 | [TheiaCoV_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_PE_PHB:main?tab=info), [TheiaCoV_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_SE_PHB:main?tab=info), 
[TheiaCoV_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ONT_PHB:main?tab=info), [TheiaCoV_ClearLabs_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ClearLabs_PHB:main?tab=info), [TheiaCoV_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_PHB:main?tab=info), [TheiaCoV_FASTA_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_Batch_PHB:main?tab=info) | | [**Usher_PHB**](../workflows/phylogenetic_placement/usher.md)| Use UShER to rapidly and accurately place your samples on any existing phylogenetic tree | Monkeypox virus, SARS-CoV-2, Viral | Sample-level, Set-level | Yes | v2.1.0 | [Usher_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Usher_PHB:main?tab=info) | -| [**VADR_Update**](../workflows/genomic_characterization/vadr_update.md)| Update VADR assignments | HAV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level | Yes | v1.2.1 | [VADR_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/VADR_Update_PHB:main?tab=info) | +| [**VADR_Update**](../workflows/genomic_characterization/vadr_update.md)| Update VADR assignments | HAV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level | Yes | v2.2.0 | [VADR_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/VADR_Update_PHB:main?tab=info) | diff --git a/docs/workflows_overview/workflows_type.md b/docs/workflows_overview/workflows_type.md index 979e2a35c..204d960dc 100644 --- a/docs/workflows_overview/workflows_type.md +++ b/docs/workflows_overview/workflows_type.md @@ -15,6 +15,7 @@ title: Workflows by Type | [**Assembly_Fetch**](../workflows/data_import/assembly_fetch.md) | Download assemblies from NCBI, after optionally identifying the 
closest RefSeq reference genome to your own draft assembly | Any taxa | Sample-level | Yes | v1.3.0 | [Assembly_Fetch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Assembly_Fetch_PHB:main?tab=info) | | [**BaseSpace_Fetch**](../workflows/data_import/basespace_fetch.md)| Import data from BaseSpace into Terra | Any taxa | Sample-level | Yes | v2.0.0 | [BaseSpace_Fetch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/BaseSpace_Fetch_PHB:main?tab=info) | | [**Create_Terra_Table**](../workflows/data_import/create_terra_table.md)| Upload data to Terra and then run this workflow to have the table automatically created | Any taxa | | Yes | v2.2.0 | [Create_Terra_Table_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Create_Terra_Table_PHB:main?tab=info) | +| [**Fetch_SRR_Accession**](../workflows/data_import/fetch_srr_accession.md)| Provided a BioSample accession, identify any associated SRR accession(s) | Any taxa | Sample-level | Yes | v2.3.0 | [Fetch_SRR_Accession_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Fetch_SRR_Accession_PHB:main?tab=info) | | [**SRA_Fetch**](../workflows/data_import/sra_fetch.md)| Import publicly available reads from SRA using SRR#, ERR# or DRR# | Any taxa | Sample-level | Yes | v2.2.0 | [SRA_Fetch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/SRA_Fetch_PHB:main?tab=info) | @@ -25,13 +26,13 @@ title: Workflows by Type | **Name** | **Description** | **Applicable Kingdom** | **Workflow Level** | **Command-line Compatibility**[^1] | **Last Known Changes** | **Dockstore** | |---|---|---|---|---|---|---| -| [**Freyja Workflow Series**](../workflows/genomic_characterization/freyja.md)| Recovers relative lineage abundances from mixed sample data and generates visualizations | SARS-CoV-2, Viral | Sample-level, Set-level | Yes | v2.2.0 | 
[Freyja_FASTQ_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_FASTQ_PHB:main?tab=info), [Freyja_Plot_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Plot_PHB:main?tab=info), [Freyja_Dashboard_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Dashboard_PHB:main?tab=info), [Freyja_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Update_PHB:main?tab=info) | +| [**Freyja Workflow Series**](../workflows/genomic_characterization/freyja.md)| Recovers relative lineage abundances from mixed sample data and generates visualizations | SARS-CoV-2, Viral | Sample-level, Set-level | Yes | v2.3.0 | [Freyja_FASTQ_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_FASTQ_PHB:main?tab=info), [Freyja_Plot_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Plot_PHB:main?tab=info), [Freyja_Dashboard_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Dashboard_PHB:main?tab=info), [Freyja_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Freyja_Update_PHB:main?tab=info) | | [**Pangolin_Update**](../workflows/genomic_characterization/pangolin_update.md) | Update Pangolin assignments | SARS-CoV-2, Viral | Sample-level | Yes | v2.0.0 | [Pangolin_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Pangolin_Update_PHB:main?tab=info) | -| [**TheiaCov Workflow Series**](../workflows/genomic_characterization/theiacov.md) | Viral genome assembly, QC and characterization from amplicon sequencing | HIV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level, Set-level | Some optional features incompatible, Yes | v2.2.0 | 
[TheiaCoV_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_PE_PHB:main?tab=info), [TheiaCoV_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_SE_PHB:main?tab=info), [TheiaCoV_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ONT_PHB:main?tab=info), [TheiaCoV_ClearLabs_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ClearLabs_PHB:main?tab=info), [TheiaCoV_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_PHB:main?tab=info), [TheiaCoV_FASTA_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_Batch_PHB:main?tab=info) | -| [**TheiaEuk**](../workflows/genomic_characterization/theiaeuk.md) | Mycotic genome assembly, QC and characterization from WGS data | Mycotics | Sample-level | Some optional features incompatible, Yes | v2.0.1 | [TheiaEuk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaEuk_Illumina_PE_PHB:main?tab=info) | -| [**TheiaMeta**](../workflows/genomic_characterization/theiameta.md) | Genome assembly and QC from metagenomic sequencing | Any taxa | Sample-level | Yes | v2.0.0 | [TheiaMeta_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaMeta_Illumina_PE_PHB:main?tab=info) | -| [**TheiaProk Workflow Series**](../workflows/genomic_characterization/theiaprok.md) | Bacterial genome assembly, QC and characterization from WGS data | Bacteria | Sample-level | Some optional features incompatible, Yes | v2.2.0 | [TheiaProk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_PE_PHB:main?tab=info), 
[TheiaProk_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_SE_PHB:main?tab=info), [TheiaProk_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_ONT_PHB:main?tab=info), [TheiaProk_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_FASTA_PHB:main?tab=info) | -| [**VADR_Update**](../workflows/genomic_characterization/vadr_update.md)| Update VADR assignments | HAV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level | Yes | v1.2.1 | [VADR_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/VADR_Update_PHB:main?tab=info) | +| [**TheiaCov Workflow Series**](../workflows/genomic_characterization/theiacov.md) | Viral genome assembly, QC and characterization from amplicon sequencing | HIV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level, Set-level | Some optional features incompatible, Yes | v2.3.0 | [TheiaCoV_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_PE_PHB:main?tab=info), [TheiaCoV_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_SE_PHB:main?tab=info), [TheiaCoV_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ONT_PHB:main?tab=info), [TheiaCoV_ClearLabs_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ClearLabs_PHB:main?tab=info), [TheiaCoV_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_PHB:main?tab=info), [TheiaCoV_FASTA_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_Batch_PHB:main?tab=info) | +| 
[**TheiaEuk**](../workflows/genomic_characterization/theiaeuk.md) | Mycotic genome assembly, QC and characterization from WGS data | Mycotics | Sample-level | Some optional features incompatible, Yes | v2.3.0 | [TheiaEuk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaEuk_Illumina_PE_PHB:main?tab=info) | +| [**TheiaMeta**](../workflows/genomic_characterization/theiameta.md) | Genome assembly and QC from metagenomic sequencing | Any taxa | Sample-level | Yes | v2.3.0 | [TheiaMeta_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaMeta_Illumina_PE_PHB:main?tab=info) | +| [**TheiaProk Workflow Series**](../workflows/genomic_characterization/theiaprok.md) | Bacterial genome assembly, QC and characterization from WGS data | Bacteria | Sample-level | Some optional features incompatible, Yes | v2.3.0 | [TheiaProk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_PE_PHB:main?tab=info), [TheiaProk_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_SE_PHB:main?tab=info), [TheiaProk_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_ONT_PHB:main?tab=info), [TheiaProk_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_FASTA_PHB:main?tab=info) | +| [**VADR_Update**](../workflows/genomic_characterization/vadr_update.md)| Update VADR assignments | HAV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level | Yes | v2.2.0 | [VADR_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/VADR_Update_PHB:main?tab=info) | @@ -41,17 +42,17 @@ title: Workflows by Type | **Name** | **Description** | **Applicable Kingdom** | **Workflow Level** | **Command-line Compatibility**[^1] | **Last Known 
Changes** | **Dockstore** | |---|---|---|---|---|---|---| -| [**Augur**](../workflows/phylogenetic_construction/augur.md) | Phylogenetic analysis for viral pathogens | Viral | Sample-level, Set-level | Yes | v2.1.0 | [Augur_Prep_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Augur_Prep_PHB:main?tab=info), [Augur_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Augur_PHB:main?tab=info) | +| [**Augur**](../workflows/phylogenetic_construction/augur.md) | Phylogenetic analysis for viral pathogens | Viral | Sample-level, Set-level | Yes | v2.3.0 | [Augur_Prep_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Augur_Prep_PHB:main?tab=info), [Augur_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Augur_PHB:main?tab=info) | | [**Core_Gene_SNP**](../workflows/phylogenetic_construction/core_gene_snp.md) | Pangenome analysis | Bacteria | Set-level | Some optional features incompatible, Yes | v2.1.0 | [Core_Gene_SNP_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Core_Gene_SNP_PHB:main?tab=info) | | [**CZGenEpi_Prep**](../workflows/phylogenetic_construction/czgenepi_prep.md)| Prepare metadata and fasta files for easy upload to the CZ GEN EPI platform. 
| Monkeypox virus, SARS-CoV-2, Viral | Set-level | No | v1.3.0 | [CZGenEpi_Prep_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/CZGenEpi_Prep_PHB:main?tab=info) | | [**Find_Shared_Variants**](../workflows/phylogenetic_construction/find_shared_variants.md)| Combines and reshapes variant data from Snippy_Variants to illustrate variants shared across multiple samples | Bacteria, Mycotics | Set-level | Yes | v2.0.0 | [Find_Shared_Variants_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Find_Shared_Variants_PHB:main?tab=info) | | [**kSNP3**](../workflows/phylogenetic_construction/ksnp3.md)| SNP-based phylogenetic analysis from assemblies | Bacteria, Mycotics, Viral | Set-level | Some optional features incompatible, Yes | v2.1.0 | [kSNP3_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/kSNP3_PHB:main?tab=info) | | [**Lyve_SET**](../workflows/phylogenetic_construction/lyve_set.md)| Alignment of reads to a reference genome, SNP calling, curation of high quality SNPs, phylogenetic analysis | Bacteria | Set-level | Yes | v2.1.0 | [Lyve_SET_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Lyve_SET_PHB:main?tab=info) | | [**MashTree_FASTA**](../workflows/phylogenetic_construction/mashtree_fasta.md)| Mash-distance based phylogenetic analysis from assemblies | Bacteria, Mycotics, Viral | Set-level | Some optional features incompatible, Yes | v2.1.0 | [MashTree_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/MashTree_FASTA_PHB:main?tab=info) | -| [**Snippy_Streamline**](../workflows/phylogenetic_construction/snippy_streamline.md)| Implementation of Snippy workflows for phylogenetic analysis from reads, with optional dynamic reference selection | Bacteria | Set-level | Yes | v2.2.0 | 
[Snippy_Streamline_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Streamline_PHB:main?tab=info) | -| [**Snippy_Streamline_FASTA**](../workflows/phylogenetic_construction/snippy_streamline_fasta.md)| Implementation of Snippy workflows for phylogenetic analysis from assembled genomes (in FASTA format), with optional dynamic reference selection | Bacteria | Set-level | Yes | v2.2.0 | [Snippy_Streamline_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Streamline_FASTA_PHB:im-snippy-fasta-dev?tab=info) | -| [**Snippy_Tree**](../workflows/phylogenetic_construction/snippy_tree.md)| SNP-based phylogenetic analysis from reads, with option to mask recombination | Bacteria | Set-level | Some optional features incompatible, Yes | v2.1.0 | [Snippy_Tree_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Tree_PHB:main?tab=info) | -| [**Snippy_Variants**](../workflows/phylogenetic_construction/snippy_variants.md)| Alignment of reads to a reference genome, then SNP calling | Bacteria, Mycotics, Viral | Sample-level | Yes | v2.2.0 | [Snippy_Variants_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Variants_PHB:main?tab=info) | +| [**Snippy_Streamline**](../workflows/phylogenetic_construction/snippy_streamline.md)| Implementation of Snippy workflows for phylogenetic analysis from reads, with optional dynamic reference selection | Bacteria | Set-level | Yes | v2.3.0 | [Snippy_Streamline_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Streamline_PHB:main?tab=info) | +| [**Snippy_Streamline_FASTA**](../workflows/phylogenetic_construction/snippy_streamline_fasta.md)| Implementation of Snippy workflows for phylogenetic analysis from assembled genomes (in FASTA format), with optional dynamic reference selection | Bacteria | Set-level | Yes | v2.3.0 | 
[Snippy_Streamline_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Streamline_FASTA_PHB:im-snippy-fasta-dev?tab=info) | +| [**Snippy_Tree**](../workflows/phylogenetic_construction/snippy_tree.md)| SNP-based phylogenetic analysis from reads, with option to mask recombination | Bacteria | Set-level | Some optional features incompatible, Yes | v2.3.0 | [Snippy_Tree_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Tree_PHB:main?tab=info) | +| [**Snippy_Variants**](../workflows/phylogenetic_construction/snippy_variants.md)| Alignment of reads to a reference genome, then SNP calling | Bacteria, Mycotics, Viral | Sample-level | Yes | v2.3.0 | [Snippy_Variants_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Snippy_Variants_PHB:main?tab=info) | @@ -72,7 +73,7 @@ title: Workflows by Type | **Name** | **Description** | **Applicable Kingdom** | **Workflow Level** | **Command-line Compatibility**[^1] | **Last Known Changes** | **Dockstore** | |---|---|---|---|---|---|---| -| [**Mercury_Prep_N_Batch**](../workflows/public_data_sharing/mercury_prep_n_batch.md)| Prepare metadata and sequence data for submission to NCBI and GISAID | Influenza, Monkeypox virus, SARS-CoV-2, Viral | Set-level | No | v2.2.0 | [Mercury_Prep_N_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Mercury_Prep_N_Batch_PHB:main?tab=info) | +| [**Mercury_Prep_N_Batch**](../workflows/public_data_sharing/mercury_prep_n_batch.md)| Prepare metadata and sequence data for submission to NCBI and GISAID | Influenza, Monkeypox virus, SARS-CoV-2, Viral | Set-level | No | v2.3.0 | [Mercury_Prep_N_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Mercury_Prep_N_Batch_PHB:main?tab=info) | | [**Terra_2_GISAID**](../workflows/public_data_sharing/terra_2_gisaid.md)| Upload of assembly data to GISAID | SARS-CoV-2, 
Viral | Set-level | Yes | v1.2.1 | [Terra_2_GISAID_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_GISAID_PHB:main?tab=info) | | [**Terra_2_NCBI**](../workflows/public_data_sharing/terra_2_ncbi.md)| Upload of sequence data to NCBI | Bacteria, Mycotics, Viral | Set-level | No | v2.1.0 | [Terra_2_NCBI_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_NCBI_PHB:main?tab=info) | @@ -98,13 +99,14 @@ title: Workflows by Type |---|---|---|---|---|---|---| | [**Cauris_CladeTyper**](../workflows/standalone/cauris_cladetyper.md)| C. auris clade assignment | Mycotics | Sample-level | Yes | v1.0.0 | [Cauris_CladeTyper_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Cauris_CladeTyper_PHB:main?tab=info) | | [**Dorado_Basecalling**](../workflows/standalone/dorado_basecalling.md)| GPU-accelerated basecalling of Oxford Nanopore sequencing data | Any taxa | Sample-level | Yes | v2.3.0 | [Dorado_Basecalling_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Dorado_Basecalling_PHB:main?tab=info) | +| [**Concatenate_Illumina_Lanes**](../workflows/standalone/concatenate_illumina_lanes.md)| Concatenate Illumina lanes for a single sample | Any taxa | Sample-level | Yes | v2.3.0 | [Concatenate_Illumina_Lanes_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Concatenate_Illumina_Lanes_PHB:main?tab=info) | | [**GAMBIT_Query**](../workflows/standalone/gambit_query.md)| Taxon identification of genome assembly using GAMBIT | Bacteria, Mycotics | Sample-level | Yes | v2.0.0 | [Gambit_Query_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Gambit_Query_PHB:main?tab=info) | -| [**Kraken2**](../workflows/standalone/kraken2.md) | Taxa identification from reads | Any taxa | Sample-level | Yes | v2.0.0 | 
[Kraken2_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Kraken2_PE_PHB:main?tab=info), [Kraken2_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Kraken2_SE_PHB:main?tab=info) | +| [**Kraken2**](../workflows/standalone/kraken2.md) | Taxa identification from reads | Any taxa | Sample-level | Yes | v2.3.0 | [Kraken2_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Kraken2_PE_PHB:main?tab=info), [Kraken2_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Kraken2_SE_PHB:main?tab=info) | | [**NCBI-AMRFinderPlus**](../workflows/standalone/ncbi_amrfinderplus.md)| Runs NCBI's AMRFinderPlus on genome assemblies (bacterial and fungal) | Bacteria, Mycotics | Sample-level | Yes | v2.0.0 | [NCBI-AMRFinderPlus_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/NCBI-AMRFinderPlus_PHB:main?tab=info) | | [**NCBI_Scrub**](../workflows/standalone/ncbi_scrub.md)| Runs NCBI's HRRT on Illumina FASTQs | Any taxa | Sample-level | Yes | v2.2.1 | [NCBI_Scrub_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/NCBI_Scrub_PE_PHB:main?tab=info), [NCBI_Scrub_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/NCBI_Scrub_SE_PHB:main?tab=info) | | [**RASUSA**](../workflows/standalone/rasusa.md)| Randomly subsample sequencing reads to a specified coverage | Any taxa | Sample-level | Yes | v2.0.0 | [RASUSA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/RASUSA_PHB:main?tab=info) | | [**Rename_FASTQ**](../workflows/standalone/rename_fastq.md)| Rename paired-end or single-end read files in a Terra data table in a non-destructive way | Any taxa | Sample-level | Yes | v2.1.0 | 
[Rename_FASTQ_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Rename_FASTQ_PHB:im-utilities-rename-files?tab=info) | -| [**TBProfiler_tNGS**](../workflows/standalone/tbprofiler_tngs.md)| Performs in silico antimicrobial susceptibility testing on Mycobacterium tuberculosis targeted-NGS samples with TBProfiler and tbp-parser | Bacteria, TB | Sample-level | Yes | v2.0.0 | [TBProfiler_tNGS_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TBProfiler_tNGS_PHB:smw-tngs-tbprofiler-dev?tab=info) | +| [**TBProfiler_tNGS**](../workflows/standalone/tbprofiler_tngs.md)| Performs in silico antimicrobial susceptibility testing on Mycobacterium tuberculosis targeted-NGS samples with TBProfiler and tbp-parser | Bacteria, TB | Sample-level | Yes | v2.3.0 | [TBProfiler_tNGS_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TBProfiler_tNGS_PHB:smw-tngs-tbprofiler-dev?tab=info) | | [**TheiaValidate**](../workflows/standalone/theiavalidate.md)| This workflow performs basic comparisons between user-designated columns in two separate tables. 
| Any taxa | | No | v2.0.0 | [TheiaValidate_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaValidate_PHB:main?tab=info) | diff --git a/mkdocs.yml b/mkdocs.yml index b750dd21d..30a3c5e30 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -43,6 +43,7 @@ nav: - Samples_to_Ref_Tree: workflows/phylogenetic_placement/samples_to_ref_tree.md - Usher_PHB: workflows/phylogenetic_placement/usher.md - Public Data Sharing: + - Fetch_SRR_Accession: workflows/public_data_sharing/fetch_srr_accession.md - Mercury_Prep_N_Batch: workflows/public_data_sharing/mercury_prep_n_batch.md - Terra_2_GISAID: workflows/public_data_sharing/terra_2_gisaid.md - Terra_2_NCBI: workflows/public_data_sharing/terra_2_ncbi.md @@ -53,6 +54,7 @@ nav: - Standalone: - Cauris_CladeTyper: workflows/standalone/cauris_cladetyper.md - Dorado_Basecalling: workflows/standalone/dorado_basecalling.md + - Concatenate_Illumina_Lanes: workflows/standalone/concatenate_illumina_lanes.md - GAMBIT_Query: workflows/standalone/gambit_query.md - Kraken2: workflows/standalone/kraken2.md - NCBI-AMRFinderPlus: workflows/standalone/ncbi_amrfinderplus.md @@ -66,7 +68,8 @@ nav: - Any Taxa: - Assembly_Fetch: workflows/data_import/assembly_fetch.md - BaseSpace_Fetch: workflows/data_import/basespace_fetch.md - - Concatenate_Column_Content: workflows/data_export/concatenate_column_content.md + - Concatenate_Column_Content: workflows/data_export/concatenate_column_content.md + - Concatenate_Illumina_Lanes: workflows/standalone/concatenate_illumina_lanes.md - Create_Terra_Table: workflows/data_import/create_terra_table.md - Dorado_Basecalling: workflows/standalone/dorado_basecalling.md - Kraken2: workflows/standalone/kraken2.md @@ -125,6 +128,7 @@ nav: - BaseSpace_Fetch: workflows/data_import/basespace_fetch.md - Cauris_CladeTyper: workflows/standalone/cauris_cladetyper.md - Concatenate_Column_Content: workflows/data_export/concatenate_column_content.md + - Concatenate_Illumina_Lanes: 
workflows/standalone/concatenate_illumina_lanes.md - Core_Gene_SNP: workflows/phylogenetic_construction/core_gene_snp.md - Create_Terra_Table: workflows/data_import/create_terra_table.md - CZGenEpi_Prep: workflows/phylogenetic_construction/czgenepi_prep.md diff --git a/tasks/phylogenetic_inference/augur/task_augur_align.wdl b/tasks/phylogenetic_inference/augur/task_augur_align.wdl index 30065c8b2..e8cdca2a0 100644 --- a/tasks/phylogenetic_inference/augur/task_augur_align.wdl +++ b/tasks/phylogenetic_inference/augur/task_augur_align.wdl @@ -12,8 +12,13 @@ task augur_align { String docker = "us-docker.pkg.dev/general-theiagen/biocontainers/augur:22.0.2--pyhdfd78af_0" } command <<< + set -euo pipefail + # capture version information augur version > VERSION + echo + echo "mafft version:" + mafft --version 2>&1 | tee MAFFT_VERSION # run augur align augur align \ @@ -26,6 +31,7 @@ task augur_align { output { File aligned_fasta = "alignment.fasta" String augur_version = read_string("VERSION") + String mafft_version = read_string("MAFFT_VERSION") } runtime { docker: docker diff --git a/tasks/phylogenetic_inference/augur/task_augur_tree.wdl b/tasks/phylogenetic_inference/augur/task_augur_tree.wdl index 22bd469e7..a9a6b7b20 100644 --- a/tasks/phylogenetic_inference/augur/task_augur_tree.wdl +++ b/tasks/phylogenetic_inference/augur/task_augur_tree.wdl @@ -16,8 +16,30 @@ task augur_tree { String docker = "us-docker.pkg.dev/general-theiagen/biocontainers/augur:22.0.2--pyhdfd78af_0" } command <<< + set -euo pipefail + # capture version information augur version > VERSION + echo + + # touch the version files to ensure they exist (so that read_string output function doesn't fail) + touch IQTREE_VERSION FASTTREE_VERSION RAXML_VERSION + + # capture version information only for the method selected by user OR default of iqtree + if [ "~{method}" == "iqtree" ]; then + echo "iqtree version:" + iqtree --version | grep version | sed 's/.*version/version/;s/ for Linux.*//' | tee 
IQTREE_VERSION + elif [ "~{method}" == "fasttree" ]; then + echo "fasttree version:" + # fasttree prints to STDERR, so we need to redirect it to STDOUT, then grep for line with version info, then cut to extract version number (and nothing else) + fasttree -help 2>&1 | grep -m 1 "FastTree" | cut -d ' ' -f 2 | tee FASTTREE_VERSION + elif [ "~{method}" == "raxml" ]; then + echo "raxml version:" + raxmlHPC -v | grep RAxML | sed -e 's/.*RAxML version //' -e 's/released.*//' | tee RAXML_VERSION + fi + + echo + echo "Running augur tree now..." AUGUR_RECURSION_LIMIT=10000 augur tree \ --alignment "~{aligned_fasta}" \ @@ -34,7 +56,7 @@ task augur_tree { if [ "~{substitution_model}" == "auto" ]; then FASTA_BASENAME=$(basename ~{aligned_fasta} .fasta) FASTA_DIR=$(dirname ~{aligned_fasta}) - MODEL=$(grep "Best-fit model:" ${FASTA_DIR}/${FASTA_BASENAME}-delim.iqtree.log | sed 's|Best-fit model: ||g;s|chosen.*||' | tr -d '\n\r') + MODEL=$(grep "Best-fit model:" ${FASTA_DIR}/*${FASTA_BASENAME}-delim.iqtree.log | sed 's|Best-fit model: ||g;s|chosen.*||' | tr -d '\n\r') else MODEL="~{substitution_model}" fi @@ -42,11 +64,17 @@ task augur_tree { else echo "" > FINAL_MODEL.txt fi + + echo + echo "DEBUG: FINAL_MODEL.txt is: $(cat FINAL_MODEL.txt)" >>> output { File aligned_tree = "~{build_name}_~{method}.nwk" String augur_version = read_string("VERSION") + String iqtree_version = read_string("IQTREE_VERSION") + String fasttree_version = read_string("FASTTREE_VERSION") + String raxml_version = read_string("RAXML_VERSION") String iqtree_model_used = read_string("FINAL_MODEL.txt") } runtime { diff --git a/tasks/quality_control/read_filtering/task_trimmomatic.wdl b/tasks/quality_control/read_filtering/task_trimmomatic.wdl index e8a246497..42f62559d 100644 --- a/tasks/quality_control/read_filtering/task_trimmomatic.wdl +++ b/tasks/quality_control/read_filtering/task_trimmomatic.wdl @@ -40,9 +40,9 @@ task trimmomatic_pe { -threads ~{cpu} \ ~{read1} ~{read2} \ -baseout ~{samplename}.fastq.gz 
\ + "${CROPPING_VAR}" \ SLIDINGWINDOW:~{trimmomatic_window_size}:~{trimmomatic_quality_trim_score} \ - MINLEN:~{trimmomatic_min_length} &> ~{samplename}.trim.stats.txt \ - "${CROPPING_VAR}" + MINLEN:~{trimmomatic_min_length} &> ~{samplename}.trim.stats.txt >>> output { diff --git a/tasks/species_typing/mycobacterium/task_tbp_parser.wdl b/tasks/species_typing/mycobacterium/task_tbp_parser.wdl index 310ee61fd..3ccdb6d0d 100644 --- a/tasks/species_typing/mycobacterium/task_tbp_parser.wdl +++ b/tasks/species_typing/mycobacterium/task_tbp_parser.wdl @@ -9,17 +9,18 @@ task tbp_parser { String? sequencing_method String? operator + Int? min_depth # default 10 - Int? coverage_threshold # default 100 (--min_percent_coverage) - File? coverage_regions_bed Float? min_frequency # default 0.1 Int? min_read_support # default 10 + + Int? coverage_threshold # default 100 (--min_percent_coverage) + File? coverage_regions_bed - Boolean tbp_parser_debug = false - Boolean add_cycloserine_lims = false - + Boolean tbp_parser_debug = true Boolean tngs_data = false + Float? rrs_frequency # default 0.1 Int? rrs_read_support # default 10 Float? rrl_frequency # default 0.1 @@ -27,11 +28,11 @@ task tbp_parser { Float? rpob449_frequency # default 0.1 Float? etha237_frequency # default 0.1 File? 
expert_rule_regions_bed - - String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:1.6.0" - Int disk_size = 100 - Int memory = 4 + Int cpu = 1 + Int disk_size = 100 + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.2.2" + Int memory = 4 } command <<< # get version @@ -42,10 +43,10 @@ task tbp_parser { ~{"--sequencing_method " + sequencing_method} \ ~{"--operator " + operator} \ ~{"--min_depth " + min_depth} \ - ~{"--min_percent_coverage " + coverage_threshold} \ - ~{"--coverage_regions " + coverage_regions_bed} \ ~{"--min_frequency " + min_frequency} \ ~{"--min_read_support " + min_read_support} \ + ~{"--min_percent_coverage " + coverage_threshold} \ + ~{"--coverage_regions " + coverage_regions_bed} \ ~{"--tngs_expert_regions " + expert_rule_regions_bed} \ ~{"--rrs_frequency " + rrs_frequency} \ ~{"--rrs_read_support " + rrs_read_support} \ @@ -63,7 +64,7 @@ task tbp_parser { echo 0.0 > AVG_DEPTH # get genome percent coverage for the entire reference genome length over min_depth - genome=$(samtools depth -J ~{tbprofiler_bam} | awk -F "\t" '{if ($3 >= ~{min_depth}) print;}' | wc -l ) + genome=$(samtools depth -J ~{tbprofiler_bam} | awk -F "\t" -v min_depth=~{min_depth} '{if ($3 >= min_depth) print;}' | wc -l ) python3 -c "print ( ($genome / 4411532 ) * 100 )" | tee GENOME_PC # get genome average depth diff --git a/tasks/species_typing/mycobacterium/task_tbprofiler.wdl b/tasks/species_typing/mycobacterium/task_tbprofiler.wdl index 21c4f4b15..bc81ce419 100644 --- a/tasks/species_typing/mycobacterium/task_tbprofiler.wdl +++ b/tasks/species_typing/mycobacterium/task_tbprofiler.wdl @@ -5,84 +5,74 @@ task tbprofiler { File read1 File? read2 String samplename - - # logic Boolean ont_data = false - Boolean tbprofiler_run_custom_db = false - File? 
tbprofiler_custom_db - # minimum thresholds - Int cov_frac_threshold = 1 - Float min_af = 0.1 - Float min_af_pred = 0.1 - Int min_depth = 10 - # tool options within tbprofiler + String mapper = "bwa" - String variant_caller = "freebayes" + String variant_caller = "gatk" String? variant_calling_params - # runtime + + String? additional_parameters # for tbprofiler + + Int min_depth = 10 + Float min_af = 0.1 + + File? tbprofiler_custom_db + Boolean tbprofiler_run_cdph_db = false + Boolean tbprofiler_run_custom_db = false + Int cpu = 8 Int disk_size = 100 - String docker = "us-docker.pkg.dev/general-theiagen/staphb/tbprofiler:4.4.2" + String docker = "us-docker.pkg.dev/general-theiagen/staphb/tbprofiler:6.4.1" Int memory = 16 } command <<< - # Print and save date - date | tee DATE - # Print and save version tb-profiler version > VERSION && sed -i -e 's/TBProfiler version //' VERSION && sed -n -i '$p' VERSION # check if file is non existant or non empty - if [ -z "~{read2}" ] || [ ! -s "~{read2}" ] ; then + if [ -z "~{read2}" ] || [ ! -s "~{read2}" ]; then INPUT_READS="-1 ~{read1}" else INPUT_READS="-1 ~{read1} -2 ~{read2}" fi - - if [ "~{ont_data}" = true ]; then - mode="--platform nanopore" - export ont_data="true" - else - export ont_data="false" - fi # check if new database file is provided and not empty - if [ "~{tbprofiler_run_custom_db}" = true ] ; then - echo "Found new database file ~{tbprofiler_custom_db}" - prefix=$(basename "~{tbprofiler_custom_db}" | sed 's/\.tar\.gz$//') - echo "New database will be created with prefix $prefix" - - echo "Inflating the new database..." - tar xfv ~{tbprofiler_custom_db} + if ~{tbprofiler_run_custom_db}; then + if [ ! 
-s ~{tbprofiler_custom_db} ]; then + echo "Custom database file is empty" + TBDB="" + else + echo "Found new database file ~{tbprofiler_custom_db}" + prefix=$(basename "~{tbprofiler_custom_db}" | sed 's/\.tar\.gz$//') + tar xfv ~{tbprofiler_custom_db} + + tb-profiler load_library ./"$prefix"/"$prefix" - tb-profiler load_library ./"$prefix"/"$prefix" - - TBDB="--db $prefix" - else - TBDB="" + TBDB="--db $prefix" + fi + elif ~{tbprofiler_run_cdph_db}; then + tb-profiler update_tbdb --branch CaliforniaDPH + TBDB="--db CaliforniaDPH" fi # Run tb-profiler on the input reads with samplename prefix tb-profiler profile \ - ${mode} \ ${INPUT_READS} \ --prefix ~{samplename} \ --mapper ~{mapper} \ --caller ~{variant_caller} \ --calling_params "~{variant_calling_params}" \ - --min_depth ~{min_depth} \ + --depth ~{min_depth} \ --af ~{min_af} \ - --reporting_af ~{min_af_pred} \ - --coverage_fraction_threshold ~{cov_frac_threshold} \ + --threads ~{cpu} \ --csv --txt \ - $TBDB + ~{true="--platform nanopore" false="" ont_data} \ + ~{additional_parameters} \ + ${TBDB} # Collate results tb-profiler collate --prefix ~{samplename} - # touch optional output files because wdl - touch GENE_NAME LOCUS_TAG VARIANT_SUBSTITUTIONS OUTPUT_SEQ_METHOD_TYPE - # merge all vcf files if multiple are present bcftools index ./vcf/*bcf bcftools index ./vcf/*gz @@ -97,35 +87,32 @@ task tbprofiler { tsv_reader=csv.reader(tsv_file, delimiter="\t") tsv_data=list(tsv_reader) tsv_dict=dict(zip(tsv_data[0], tsv_data[1])) - with open ("MAIN_LINEAGE", 'wt') as Main_Lineage: - main_lin=tsv_dict['main_lineage'] - Main_Lineage.write(main_lin) - with open ("SUB_LINEAGE", 'wt') as Sub_Lineage: - sub_lin=tsv_dict['sub_lineage'] - Sub_Lineage.write(sub_lin) - with open ("DR_TYPE", 'wt') as DR_Type: - dr_type=tsv_dict['DR_type'] - DR_Type.write(dr_type) - with open ("NUM_DR_VARIANTS", 'wt') as Num_DR_Variants: - num_dr_vars=tsv_dict['num_dr_variants'] - Num_DR_Variants.write(num_dr_vars) - with open 
("NUM_OTHER_VARIANTS", 'wt') as Num_Other_Variants: - num_other_vars=tsv_dict['num_other_variants'] - Num_Other_Variants.write(num_other_vars) - with open ("RESISTANCE_GENES", 'wt') as Resistance_Genes: - res_genes_list=['rifampicin', 'isoniazid', 'pyrazinamide', 'ethambutol', 'streptomycin', 'fluoroquinolones', 'moxifloxacin', 'ofloxacin', 'levofloxacin', 'ciprofloxacin', 'aminoglycosides', 'amikacin', 'kanamycin', 'capreomycin', 'ethionamide', 'para-aminosalicylic_acid', 'cycloserine', 'linezolid', 'bedaquiline', 'clofazimine', 'delamanid'] + + with open ("MAIN_LINEAGE", 'wt') as main_lineage: + main_lineage.write(tsv_dict['main_lineage']) + with open ("SUB_LINEAGE", 'wt') as sublineage: + sublineage.write(tsv_dict['sub_lineage']) + + with open ("DR_TYPE", 'wt') as dr_type: + dr_type.write(tsv_dict['drtype']) + with open ("NUM_DR_VARIANTS", 'wt') as num_dr_variants: + num_dr_variants.write(tsv_dict['num_dr_variants']) + with open ("NUM_OTHER_VARIANTS", 'wt') as num_other_variants: + num_other_variants.write(tsv_dict['num_other_variants']) + + with open ("RESISTANCE_GENES", 'wt') as resistance_genes: + res_genes_list=['rifampicin', 'isoniazid', 'ethambutol', 'pyrazinamide', 'moxifloxacin', 'levofloxacin', 'bedaquiline', 'delamanid', 'pretomanid', 'linezolid', 'streptomycin', 'amikacin', 'kanamycin', 'capreomycin', 'clofazimine', 'ethionamide', 'para-aminosalicylic_acid', 'cycloserine'] res_genes=[] for i in res_genes_list: if tsv_dict[i] != '-': res_genes.append(tsv_dict[i]) res_genes_string=';'.join(res_genes) - Resistance_Genes.write(res_genes_string) - with open ("MEDIAN_COVERAGE", 'wt') as Median_Coverage: - median_coverage=tsv_dict['median_coverage'] - Median_Coverage.write(median_coverage) - with open ("PCT_READS_MAPPED", 'wt') as Pct_Reads_Mapped: - pct_reads_mapped=tsv_dict['pct_reads_mapped'] - Pct_Reads_Mapped.write(pct_reads_mapped) + resistance_genes.write(res_genes_string) + + with open ("MEDIAN_DEPTH", 'wt') as median_depth: + 
median_depth.write(tsv_dict['target_median_depth']) + with open ("PCT_READS_MAPPED", 'wt') as pct_reads_mapped: + pct_reads_mapped.write(tsv_dict['pct_reads_mapped']) CODE >>> output { @@ -134,7 +121,7 @@ task tbprofiler { File tbprofiler_output_json = "./results/~{samplename}.results.json" File tbprofiler_output_bam = "./bam/~{samplename}.bam" File tbprofiler_output_bai = "./bam/~{samplename}.bam.bai" - File tbprofiler_output_vcf = "./vcf/~{samplename}.targets.csq.merged.vcf" + File? tbprofiler_output_vcf = "./vcf/~{samplename}.targets.csq.merged.vcf" String version = read_string("VERSION") String tbprofiler_main_lineage = read_string("MAIN_LINEAGE") String tbprofiler_sub_lineage = read_string("SUB_LINEAGE") @@ -142,7 +129,7 @@ task tbprofiler { String tbprofiler_num_dr_variants = read_string("NUM_DR_VARIANTS") String tbprofiler_num_other_variants = read_string("NUM_OTHER_VARIANTS") String tbprofiler_resistance_genes = read_string("RESISTANCE_GENES") - Int tbprofiler_median_coverage = read_int("MEDIAN_COVERAGE") + Float tbprofiler_median_depth = read_float("MEDIAN_DEPTH") Float tbprofiler_pct_reads_mapped = read_float("PCT_READS_MAPPED") } runtime { diff --git a/tasks/task_versioning.wdl b/tasks/task_versioning.wdl index fab908614..04b9c5b11 100644 --- a/tasks/task_versioning.wdl +++ b/tasks/task_versioning.wdl @@ -9,7 +9,7 @@ task version_capture { volatile: true } command { - PHB_Version="PHB v2.2.1" + PHB_Version="PHB v2.3.0" ~{default='' 'export TZ=' + timezone} date +"%Y-%m-%d" > TODAY echo "$PHB_Version" > PHB_VERSION diff --git a/tasks/taxon_id/contamination/task_kraken2.wdl b/tasks/taxon_id/contamination/task_kraken2.wdl index fb1522c75..4a43106f6 100644 --- a/tasks/taxon_id/contamination/task_kraken2.wdl +++ b/tasks/taxon_id/contamination/task_kraken2.wdl @@ -5,25 +5,39 @@ task kraken2_theiacov { File read1 File? 
read2 String samplename - String kraken2_db = "/kraken2-db" + File kraken2_db = "gs://theiagen-large-public-files-rp/terra/databases/kraken2/kraken2_humanGRCh38_viralRefSeq_20240828.tar.gz" Int cpu = 4 Int memory = 8 String? target_organism Int disk_size = 100 - String docker_image = "us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.0.8-beta_hv" + String docker_image = "us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.1.2-no-db" } command <<< # date and version control date | tee DATE kraken2 --version | head -n1 | tee VERSION num_reads=$(ls *fastq.gz 2> /dev/nul | wc -l) + + # Decompress the Kraken2 database + mkdir db + tar -C ./db/ -xzvf ~{kraken2_db} + if ! [ -z ~{read2} ]; then mode="--paired" fi echo $mode - kraken2 $mode \ + + # determine if reads are compressed + if [[ ~{read1} == *.gz ]]; then + echo "Reads are compressed..." + compressed="--gzip-compressed" + fi + echo $compressed + + # Run Kraken2 + kraken2 $mode $compressed \ --threads ~{cpu} \ - --db ~{kraken2_db} \ + --db ./db/ \ ~{read1} ~{read2} \ --report ~{samplename}_kraken2_report.txt \ --output ~{samplename}.classifiedreads.txt @@ -31,22 +45,29 @@ task kraken2_theiacov { # Compress and cleanup gzip ~{samplename}.classifiedreads.txt + # capture human percentage percentage_human=$(grep "Homo sapiens" ~{samplename}_kraken2_report.txt | cut -f 1) - # | tee PERCENT_HUMAN - percentage_sc2=$(grep "Severe acute respiratory syndrome coronavirus 2" ~{samplename}_kraken2_report.txt | cut -f1 ) - # | tee PERCENT_COV if [ -z "$percentage_human" ] ; then percentage_human="0" ; fi - if [ -z "$percentage_sc2" ] ; then percentage_sc2="0" ; fi echo $percentage_human | tee PERCENT_HUMAN - echo $percentage_sc2 | tee PERCENT_SC2 - # capture target org percentage + + # capture target org percentage if [ ! 
-z "~{target_organism}" ]; then echo "Target org designated: ~{target_organism}" - percent_target_organism=$(grep "~{target_organism}" ~{samplename}_kraken2_report.txt | cut -f1 | head -n1 ) - if [ -z "$percent_target_organism" ] ; then percent_target_organism="0" ; fi - else + # if target organisms is sc2, report it in a special legacy column called PERCENT_SC2 + if [[ "~{target_organism}" == "Severe acute respiratory syndrome coronavirus 2" ]]; then + percentage_sc2=$(grep "Severe acute respiratory syndrome coronavirus 2" ~{samplename}_kraken2_report.txt | cut -f1 ) + percent_target_organism="" + if [ -z "$percentage_sc2" ] ; then percentage_sc2="0" ; fi + else + percentage_sc2="" + percent_target_organism=$(grep "~{target_organism}" ~{samplename}_kraken2_report.txt | cut -f1 | head -n1 ) + if [ -z "$percent_target_organism" ] ; then percent_target_organism="0" ; fi + fi + else percent_target_organism="" + percentage_sc2="" fi + echo $percentage_sc2 | tee PERCENT_SC2 echo $percent_target_organism | tee PERCENT_TARGET_ORGANISM >>> @@ -55,7 +76,7 @@ task kraken2_theiacov { String version = read_string("VERSION") File kraken_report = "~{samplename}_kraken2_report.txt" Float percent_human = read_float("PERCENT_HUMAN") - Float percent_sc2 = read_float("PERCENT_SC2") + String percent_sc2 = read_string("PERCENT_SC2") String percent_target_organism = read_string("PERCENT_TARGET_ORGANISM") String? 
kraken_target_organism = target_organism File kraken2_classified_report = "~{samplename}.classifiedreads.txt.gz" @@ -205,30 +226,37 @@ task kraken2_parse_classified { CODE # theiacov parsing blocks - percent human, sc2 and target organism + # capture human percentage percentage_human=$(grep "Homo sapiens" ~{samplename}.report_parsed.txt | cut -f 1) - percentage_sc2=$(grep "Severe acute respiratory syndrome coronavirus 2" ~{samplename}.report_parsed.txt | cut -f1 ) - if [ -z "$percentage_human" ] ; then percentage_human="0" ; fi - if [ -z "$percentage_sc2" ] ; then percentage_sc2="0" ; fi echo $percentage_human | tee PERCENT_HUMAN - echo $percentage_sc2 | tee PERCENT_SC2 - # capture target org percentage - if [ ! -z "~{target_organism}" ]; then + # capture target org percentage + if [ ! -z "~{target_organism}" ]; then echo "Target org designated: ~{target_organism}" - percent_target_organism=$(grep "~{target_organism}" ~{samplename}.report_parsed.txt | cut -f1 | head -n1 ) - if [ -z "$percent_target_organism" ] ; then percent_target_organism="0" ; fi - else + # if target organisms is sc2, report it in a special legacy column called PERCENT_SC2 + if [[ "~{target_organism}" == "Severe acute respiratory syndrome coronavirus 2" ]]; then + percentage_sc2=$(grep "Severe acute respiratory syndrome coronavirus 2" ~{samplename}.report_parsed.txt | cut -f1 ) + percent_target_organism="" + if [ -z "$percentage_sc2" ] ; then percentage_sc2="0" ; fi + else + percentage_sc2="" + percent_target_organism=$(grep "~{target_organism}" ~{samplename}.report_parsed.txt | cut -f1 | head -n1 ) + if [ -z "$percent_target_organism" ] ; then percent_target_organism="0" ; fi + fi + else percent_target_organism="" + percentage_sc2="" fi - echo $percent_target_organism | tee PERCENT_TARGET_ORG + echo $percentage_sc2 | tee PERCENT_SC2 + echo $percent_target_organism | tee PERCENT_TARGET_ORGANISM >>> output { File kraken_report = "~{samplename}.report_parsed.txt" Float percent_human = 
read_float("PERCENT_HUMAN") - Float percent_sc2 = read_float("PERCENT_SC2") - String percent_target_organism = read_string("PERCENT_TARGET_ORG") + String percent_sc2 = read_string("PERCENT_SC2") + String percent_target_organism = read_string("PERCENT_TARGET_ORGANISM") String? kraken_target_organism = target_organism } runtime { diff --git a/tasks/taxon_id/freyja/task_freyja.wdl b/tasks/taxon_id/freyja/task_freyja.wdl index a0894e55e..b3a7ed2c2 100644 --- a/tasks/taxon_id/freyja/task_freyja.wdl +++ b/tasks/taxon_id/freyja/task_freyja.wdl @@ -5,7 +5,8 @@ task freyja_one_sample { File primer_trimmed_bam String samplename File reference_genome - File? freyja_usher_barcodes + String? freyja_pathogen + File? freyja_barcodes File? freyja_lineage_metadata Float? eps Float? adapt @@ -16,7 +17,7 @@ task freyja_one_sample { Int? depth_cutoff Int memory = 8 Int cpu = 2 - String docker = "us-docker.pkg.dev/general-theiagen/staphb/freyja:1.5.1-07_02_2024-01-27-2024-07-22" + String docker = "us-docker.pkg.dev/general-theiagen/staphb/freyja:1.5.2-11_30_2024-02-00-2024-12-02" Int disk_size = 100 } command <<< @@ -44,9 +45,9 @@ task freyja_one_sample { freyja_metadata_version="freyja update: $(date +"%Y-%m-%d")" else # configure barcode - if [[ ! -z "~{freyja_usher_barcodes}" ]]; then - echo "User freyja usher barcodes identified; ~{freyja_usher_barcodes} will be utilized for freyja demixing" - freyja_usher_barcode_version=$(basename -- "~{freyja_usher_barcodes}") + if [[ ! 
-z "~{freyja_barcodes}" ]]; then + echo "User freyja usher barcodes identified; ~{freyja_barcodes} will be utilized for freyja demixing" + freyja_usher_barcode_version=$(basename -- "~{freyja_barcodes}") else freyja_usher_barcode_version="unmodified from freyja container: ~{docker}" fi @@ -74,9 +75,10 @@ task freyja_one_sample { # Calculate Boostraps, if specified if ~{bootstrap}; then freyja boot \ + ~{"--pathogen " + freyja_pathogen} \ ~{"--eps " + eps} \ ~{"--meta " + freyja_lineage_metadata} \ - ~{"--barcodes " + freyja_usher_barcodes} \ + ~{"--barcodes " + freyja_barcodes} \ ~{"--depthcutoff " + depth_cutoff} \ ~{"--nb " + number_bootstraps } \ ~{true='--confirmedonly' false='' confirmed_only} \ @@ -91,7 +93,7 @@ task freyja_one_sample { freyja demix \ ~{'--eps ' + eps} \ ~{'--meta ' + freyja_lineage_metadata} \ - ~{'--barcodes ' + freyja_usher_barcodes} \ + ~{'--barcodes ' + freyja_barcodes} \ ~{'--depthcutoff ' + depth_cutoff} \ ~{true='--confirmedonly' false='' confirmed_only} \ ~{'--adapt ' + adapt} \ @@ -144,7 +146,7 @@ task freyja_one_sample { File? freyja_bootstrap_summary = "~{samplename}_summarized.csv" File? 
freyja_bootstrap_summary_pdf = "~{samplename}_summarized.pdf" # capture barcode file - first is user supplied, second appears if the user did not supply a barcode file - File freyja_usher_barcode_file = select_first([freyja_usher_barcodes, "usher_barcodes.feather"]) + File freyja_barcode_file = select_first([freyja_barcodes, "usher_barcodes.feather"]) File freyja_lineage_metadata_file = select_first([freyja_lineage_metadata, "curated_lineages.json"]) String freyja_barcode_version = read_string("FREYJA_BARCODES") String freyja_metadata_version = read_string("FREYJA_METADATA") diff --git a/tasks/taxon_id/freyja/task_freyja_dashboard.wdl b/tasks/taxon_id/freyja/task_freyja_dashboard.wdl index a463a4cf6..24be429a9 100644 --- a/tasks/taxon_id/freyja/task_freyja_dashboard.wdl +++ b/tasks/taxon_id/freyja/task_freyja_dashboard.wdl @@ -13,7 +13,7 @@ task freyja_dashboard_task { Boolean scale_by_viral_load = false String freyja_dashboard_title File? dashboard_intro_text - String docker = "us-docker.pkg.dev/general-theiagen/staphb/freyja:1.5.1-07_02_2024-01-27-2024-07-22" + String docker = "us-docker.pkg.dev/general-theiagen/staphb/freyja:1.5.2-11_30_2024-02-00-2024-12-02" Int disk_size = 100 Int memory = 4 Int cpu = 2 diff --git a/tasks/taxon_id/freyja/task_freyja_plot.wdl b/tasks/taxon_id/freyja/task_freyja_plot.wdl index 82735e1a4..7c02572cb 100644 --- a/tasks/taxon_id/freyja/task_freyja_plot.wdl +++ b/tasks/taxon_id/freyja/task_freyja_plot.wdl @@ -10,7 +10,7 @@ task freyja_plot_task { String plot_time_interval="MS" Int plot_day_window=14 String freyja_plot_name - String docker = "us-docker.pkg.dev/general-theiagen/staphb/freyja:1.5.1-07_02_2024-01-27-2024-07-22" + String docker = "us-docker.pkg.dev/general-theiagen/staphb/freyja:1.5.2-11_30_2024-02-00-2024-12-02" Int disk_size = 100 Int mincov = 60 Int memory = 4 diff --git a/tasks/taxon_id/freyja/task_freyja_update.wdl b/tasks/taxon_id/freyja/task_freyja_update.wdl index d877ba282..14bf716b2 100644 --- 
a/tasks/taxon_id/freyja/task_freyja_update.wdl +++ b/tasks/taxon_id/freyja/task_freyja_update.wdl @@ -2,7 +2,7 @@ version 1.0 task freyja_update_refs { input { - String docker = "us-docker.pkg.dev/general-theiagen/staphb/freyja:1.5.1-07_02_2024-01-27-2024-07-22" + String docker = "us-docker.pkg.dev/general-theiagen/staphb/freyja:1.5.2-11_30_2024-02-00-2024-12-02" Int disk_size = 100 Int memory = 16 Int cpu = 4 diff --git a/tasks/utilities/data_handling/task_fetch_srr_accession.wdl b/tasks/utilities/data_handling/task_fetch_srr_accession.wdl new file mode 100644 index 000000000..ab8f98440 --- /dev/null +++ b/tasks/utilities/data_handling/task_fetch_srr_accession.wdl @@ -0,0 +1,62 @@ +version 1.0 + +task fetch_srr_accession { + input { + String sample_accession + String docker = "us-docker.pkg.dev/general-theiagen/biocontainers/fastq-dl:2.0.4--pyhdfd78af_0" + Int disk_size = 10 + Int cpu = 2 + Int memory = 8 + } + meta { + volatile: true + } + command <<< + set -euo pipefail + + # Output the current date and fastq-dl version for debugging + date -u | tee DATE + fastq-dl --version | tee VERSION + + echo "Fetching metadata for accession: ~{sample_accession}" + + # Run fastq-dl and capture stderr + fastq-dl --accession ~{sample_accession} --only-download-metadata -m 2 --verbose 2> stderr.log || true + + # Handle whether the ID/accession is valid and contains SRR metadata based on stderr + if grep -q "No results found for" stderr.log; then + echo "No SRR accession found" > srr_accession.txt + echo "No SRR accession found for accession: ~{sample_accession}" + elif grep -q "received an empty response" stderr.log; then + echo "No SRR accession found" > srr_accession.txt + echo "No SRR accession found for accession: ~{sample_accession}" + elif grep -q "is not a Study, Sample, Experiment, or Run accession" stderr.log; then + echo "Invalid accession: ~{sample_accession}" >&2 + exit 1 + elif [[ ! 
-f fastq-run-info.tsv ]]; then + echo "No metadata file found for accession: ~{sample_accession}" >&2 + exit 1 + else + # Extract SRR accessions from the TSV file if it exists + SRR_accessions=$(awk -F'\t' 'NR>1 {print $1}' fastq-run-info.tsv | paste -sd ',' -) + if [[ -z "${SRR_accessions}" ]]; then + echo "No SRR accession found" > srr_accession.txt + else + echo "Extracted SRR accessions: ${SRR_accessions}" + echo "${SRR_accessions}" > srr_accession.txt + fi + fi + >>> + output { + String srr_accession = read_string("srr_accession.txt") + String fastq_dl_version = read_string("VERSION") + } + runtime { + docker: docker + memory: "~{memory} GB" + cpu: cpu + disks: "local-disk " + disk_size + " SSD" + disk: disk_size + " GB" + preemptible: 1 + } +} diff --git a/tasks/utilities/file_handling/task_cat_lanes.wdl b/tasks/utilities/file_handling/task_cat_lanes.wdl new file mode 100644 index 000000000..d3755eba9 --- /dev/null +++ b/tasks/utilities/file_handling/task_cat_lanes.wdl @@ -0,0 +1,52 @@ +version 1.0 + +task cat_lanes { + input { + String samplename + + File read1_lane1 + File read1_lane2 + File? read1_lane3 + File? read1_lane4 + + File? read2_lane1 + File? read2_lane2 + File? read2_lane3 + File? read2_lane4 + + Int cpu = 2 + Int disk_size = 50 + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/utility:1.2" + Int memory = 4 + } + meta { + volatile: true + } + command <<< + # exit task if anything throws an error (important for proper gzip format) + set -euo pipefail + + exists() { [[ -f $1 ]]; } + + cat ~{read1_lane1} ~{read1_lane2} ~{read1_lane3} ~{read1_lane4} > "~{samplename}_merged_R1.fastq.gz" + + if exists "~{read2_lane1}" ; then + cat ~{read2_lane1} ~{read2_lane2} ~{read2_lane3} ~{read2_lane4} > "~{samplename}_merged_R2.fastq.gz" + fi + + # ensure newly merged FASTQs are valid gzipped format + gzip -t *merged*.gz + >>> + output { + File read1_concatenated = "~{samplename}_merged_R1.fastq.gz" + File? 
read2_concatenated = "~{samplename}_merged_R2.fastq.gz" + } + runtime { + docker: "~{docker}" + memory: memory + " GB" + cpu: cpu + disks: "local-disk " + disk_size + " SSD" + disk: disk_size + " GB" + preemptible: 1 + } +} \ No newline at end of file diff --git a/tasks/utilities/submission/task_submission.wdl b/tasks/utilities/submission/task_submission.wdl index ab384c86b..effa28619 100644 --- a/tasks/utilities/submission/task_submission.wdl +++ b/tasks/utilities/submission/task_submission.wdl @@ -56,7 +56,7 @@ task prune_table { # read export table into pandas tablename = "~{table_name}-data.tsv" - table = pd.read_csv(tablename, delimiter='\t', header=0, dtype={"~{table_name}_id": 'str'}) # ensure sample_id is always a string) + table = pd.read_csv(tablename, delimiter='\t', header=0, dtype={"~{table_name}_id": 'str', "collection_date": 'str'}) # ensure sample_id is always a string) # extract the samples for upload from the entire table table = table[table["~{table_name}_id"].isin("~{sep='*' sample_names}".split("*"))] diff --git a/tests/data/theiacov/databases/github_kraken2_test_db.tar.gz b/tests/data/theiacov/databases/github_kraken2_test_db.tar.gz new file mode 100644 index 000000000..4dc2a5ec2 Binary files /dev/null and b/tests/data/theiacov/databases/github_kraken2_test_db.tar.gz differ diff --git a/tests/inputs/theiacov/wf_theiacov_clearlabs.json b/tests/inputs/theiacov/wf_theiacov_clearlabs.json index 10351330a..ff8983dd2 100644 --- a/tests/inputs/theiacov/wf_theiacov_clearlabs.json +++ b/tests/inputs/theiacov/wf_theiacov_clearlabs.json @@ -3,5 +3,7 @@ "theiacov_clearlabs.read1": "tests/data/theiacov/fastqs/clearlabs/clearlabs.fastq.gz", "theiacov_clearlabs.primer_bed": "tests/data/theiacov/primers/artic-v3.primers.bed", "theiacov_clearlabs.reference_genome": "tests/data/theiacov/reference/MN908947.fasta", - "theiacov_clearlabs.organism_parameters.gene_locations_bed_file": "tests/inputs/sc2_gene_locations.bed" + 
"theiacov_clearlabs.organism_parameters.gene_locations_bed_file": "tests/inputs/sc2_gene_locations.bed", + "theiacov_clearlabs.kraken2_raw.kraken2_db": "tests/data/theiacov/databases/github_kraken2_test_db.tar.gz", + "theiacov_clearlabs.kraken2_dehosted.kraken2_db": "tests/data/theiacov/databases/github_kraken2_test_db.tar.gz" } diff --git a/tests/inputs/theiacov/wf_theiacov_illumina_pe.json b/tests/inputs/theiacov/wf_theiacov_illumina_pe.json index 467bcf94d..d57d12ad4 100644 --- a/tests/inputs/theiacov/wf_theiacov_illumina_pe.json +++ b/tests/inputs/theiacov/wf_theiacov_illumina_pe.json @@ -5,5 +5,6 @@ "theiacov_illumina_pe.primer_bed": "tests/data/theiacov/primers/artic-v3.primers.bed", "theiacov_illumina_pe.reference_genome": "tests/data/theiacov/reference/MN908947.fasta", "theiacov_illumina_pe.reference_gff": "tests/inputs/completely-empty-for-test.txt", - "theiacov_illumina_pe.reference_gene_locations_bed": "tests/inputs/sc2_gene_locations.bed" + "theiacov_illumina_pe.reference_gene_locations_bed": "tests/inputs/sc2_gene_locations.bed", + "theiacov_illumina_pe.read_QC_trim.kraken_db": "tests/data/theiacov/databases/github_kraken2_test_db.tar.gz" } diff --git a/tests/inputs/theiacov/wf_theiacov_illumina_se.json b/tests/inputs/theiacov/wf_theiacov_illumina_se.json index b9b4381de..7bc27de4b 100644 --- a/tests/inputs/theiacov/wf_theiacov_illumina_se.json +++ b/tests/inputs/theiacov/wf_theiacov_illumina_se.json @@ -4,5 +4,6 @@ "theiacov_illumina_se.primer_bed": "tests/data/theiacov/primers/artic-v3.primers.bed", "theiacov_illumina_se.reference_genome": "tests/data/theiacov/reference/MN908947.fasta", "theiacov_illumina_se.reference_gff": "tests/inputs/completely-empty-for-test.txt", - "theiacov_illumina_se.reference_gene_locations_bed": "tests/inputs/sc2_gene_locations.bed" + "theiacov_illumina_se.reference_gene_locations_bed": "tests/inputs/sc2_gene_locations.bed", + "theiacov_illumina_se.read_QC_trim.kraken_db": 
"tests/data/theiacov/databases/github_kraken2_test_db.tar.gz" } diff --git a/tests/inputs/theiacov/wf_theiacov_ont.json b/tests/inputs/theiacov/wf_theiacov_ont.json index 4c551d73b..055ca29d0 100644 --- a/tests/inputs/theiacov/wf_theiacov_ont.json +++ b/tests/inputs/theiacov/wf_theiacov_ont.json @@ -3,5 +3,6 @@ "theiacov_ont.read1": "tests/data/theiacov/fastqs/ont/ont.fastq.gz", "theiacov_ont.primer_bed": "tests/data/theiacov/primers/artic-v3.primers.bed", "theiacov_ont.reference_genome": "tests/data/theiacov/reference/MN908947.fasta", - "theiacov_ont.reference_gene_locations_bed": "tests/inputs/sc2_gene_locations.bed" + "theiacov_ont.reference_gene_locations_bed": "tests/inputs/sc2_gene_locations.bed", + "theiacov_ont.read_qc_trim.kraken_db": "tests/data/theiacov/databases/github_kraken2_test_db.tar.gz" } diff --git a/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml b/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml index a7108f67f..83d78611b 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml @@ -147,7 +147,7 @@ - path: miniwdl_run/call-fastq_scan_raw_reads/work/clearlabs_fastq-scan.json md5sum: 869dd2e934c600bba35f30f08e2da7c9 - path: miniwdl_run/call-kraken2_dehosted/command - md5sum: 0f9db3341b5f58fb8d145d6d94222827 + md5sum: 4306699c67306b103561adf31c3754e3 - path: miniwdl_run/call-kraken2_dehosted/inputs.json contains: ["read1", "samplename"] - path: miniwdl_run/call-kraken2_dehosted/outputs.json @@ -159,18 +159,18 @@ contains: ["wdl", "theiacov_clearlabs", "kraken2_dehosted", "done"] - path: miniwdl_run/call-kraken2_dehosted/work/DATE - path: miniwdl_run/call-kraken2_dehosted/work/PERCENT_HUMAN - md5sum: 4fd4dcef994592f9865e9bc8807f32f4 + md5sum: 897316929176464ebc9ad085f31e7284 - path: miniwdl_run/call-kraken2_dehosted/work/PERCENT_SC2 - md5sum: 9fc4759d176a0e0d240c418dbaaafeb2 + md5sum: 86b6b8aa9ad17f169f04c02b0e2bf1b1 - path: 
miniwdl_run/call-kraken2_dehosted/work/PERCENT_TARGET_ORGANISM md5sum: 68b329da9893e34099c7d8ad5cb9c940 - path: miniwdl_run/call-kraken2_dehosted/work/VERSION - md5sum: 379b99c23325315c502e74614c035e7d + md5sum: 7ad46f90cd0ffa94f32a6e06299ed05c - path: miniwdl_run/call-kraken2_dehosted/work/_miniwdl_inputs/0/clearlabs_R1_dehosted.fastq.gz - path: miniwdl_run/call-kraken2_dehosted/work/clearlabs_kraken2_report.txt - md5sum: 35841fa2d77ec202c275b1de548b8d98 + md5sum: b66dbcf8d229c1b6fcfff4dd786068bd - path: miniwdl_run/call-kraken2_raw/command - md5sum: a9dabf08bff8e183fd792901ce24fc57 + md5sum: d6e217901b67290466eec97f13564022 - path: miniwdl_run/call-kraken2_raw/inputs.json contains: ["read1", "samplename"] - path: miniwdl_run/call-kraken2_raw/outputs.json @@ -182,16 +182,16 @@ contains: ["wdl", "theiacov_clearlabs", "kraken2_raw", "done"] - path: miniwdl_run/call-kraken2_raw/work/DATE - path: miniwdl_run/call-kraken2_raw/work/PERCENT_HUMAN - md5sum: 4fd4dcef994592f9865e9bc8807f32f4 + md5sum: 897316929176464ebc9ad085f31e7284 - path: miniwdl_run/call-kraken2_raw/work/PERCENT_SC2 - md5sum: 9fc4759d176a0e0d240c418dbaaafeb2 + md5sum: 86b6b8aa9ad17f169f04c02b0e2bf1b1 - path: miniwdl_run/call-kraken2_raw/work/PERCENT_TARGET_ORGANISM md5sum: 68b329da9893e34099c7d8ad5cb9c940 - path: miniwdl_run/call-kraken2_raw/work/VERSION - md5sum: 379b99c23325315c502e74614c035e7d + md5sum: 7ad46f90cd0ffa94f32a6e06299ed05c - path: miniwdl_run/call-kraken2_raw/work/_miniwdl_inputs/0/clearlabs.fastq.gz - path: miniwdl_run/call-kraken2_raw/work/clearlabs_kraken2_report.txt - md5sum: 35841fa2d77ec202c275b1de548b8d98 + md5sum: b66dbcf8d229c1b6fcfff4dd786068bd - path: miniwdl_run/call-ncbi_scrub_se/command contains: ["read1", "scrubber", "gzip"] - path: miniwdl_run/call-ncbi_scrub_se/inputs.json @@ -234,7 +234,7 @@ - path: miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/genome_annotation.gff3 md5sum: 4dff84d2d6ada820e0e3a8bc6798d402 - path: 
miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/pathogen.json - md5sum: a51a91e0b5e16590c1afd0c7897ad071 + md5sum: 32f20640f926d5b59fed6b954541792d - path: miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/reference.fasta md5sum: c7ce05f28e4ec0322c96f24e064ef55c - path: miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/sequences.fasta @@ -308,13 +308,13 @@ - path: miniwdl_run/call-pangolin4/work/PANGOLIN_NOTES md5sum: 59478efddde2191ead1b46b1f121bbc9 - path: miniwdl_run/call-pangolin4/work/PANGO_ASSIGNMENT_VERSION - md5sum: 0803245359027bd3017d2bd9a9c9c8e3 + md5sum: 36f64a1cd7c6844309e8ad2121358088 - path: miniwdl_run/call-pangolin4/work/VERSION_PANGOLIN_ALL - md5sum: b5dbf2ba7480effea8c656099df0e78e + md5sum: dfd90750c8776f46bad1de214c1d1a57 - path: miniwdl_run/call-pangolin4/work/_miniwdl_inputs/0/clearlabs.medaka.consensus.fasta md5sum: d41d8cd98f00b204e9800998ecf8427e - path: miniwdl_run/call-pangolin4/work/clearlabs.pangolin_report.csv - md5sum: 151390c419b00ca44eb83e2bbfb96996 + md5sum: 0370f24c270c44f6023dd98af79501e7 - path: miniwdl_run/call-stats_n_coverage/command md5sum: ac020678f99ac145b11d3dbc7b9fe9ba - path: miniwdl_run/call-stats_n_coverage/inputs.json diff --git a/tests/workflows/theiacov/test_wf_theiacov_fasta.yml b/tests/workflows/theiacov/test_wf_theiacov_fasta.yml index e688eb726..df82166e4 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_fasta.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_fasta.yml @@ -37,7 +37,7 @@ md5sum: 6808ca805661622ad65ae014a4b2a094 - path: miniwdl_run/call-consensus_qc/work/_miniwdl_inputs/0/clearlabs.fasta.gz - path: miniwdl_run/call-nextclade_v3/command - md5sum: 59868097729a0dac73f93a62d57ecd4c + md5sum: 5f142285394dd5432eeda69c8db06444 - path: miniwdl_run/call-nextclade_v3/inputs.json - path: miniwdl_run/call-nextclade_v3/outputs.json - path: miniwdl_run/call-nextclade_v3/stderr.txt @@ -50,22 +50,22 @@ - path: 
miniwdl_run/call-nextclade_v3/work/clearlabs.fasta.gz.nextclade.auspice.json - path: miniwdl_run/call-nextclade_v3/work/clearlabs.fasta.gz.nextclade.json - path: miniwdl_run/call-nextclade_v3/work/clearlabs.fasta.gz.nextclade.tsv - md5sum: 3aeae954ba64b8ad7db55e08f9c7131c + md5sum: 6f73969f56007a50f230d9768d95daf1 - path: miniwdl_run/call-nextclade_v3/work/nextclade.aligned.fasta md5sum: bf487271d506418ea23fe30fc033e44d - path: miniwdl_run/call-nextclade_v3/work/nextclade.csv - md5sum: 50ca5404982b62cbdf077c5d16543e6f + md5sum: d03e4ca908ab966f2a5c4e6a2a346c74 - path: miniwdl_run/call-nextclade_v3/work/nextclade.ndjson - path: miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/genome_annotation.gff3 md5sum: 4dff84d2d6ada820e0e3a8bc6798d402 - path: miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/pathogen.json - md5sum: a51a91e0b5e16590c1afd0c7897ad071 + md5sum: 32f20640f926d5b59fed6b954541792d - path: miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/reference.fasta md5sum: c7ce05f28e4ec0322c96f24e064ef55c - path: miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/sequences.fasta md5sum: c2a4d6cbb837dce22d81f9c36dd0629e - path: miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/tree.json - md5sum: f5a645741d65a60de34373e9e912b8a1 + md5sum: 82d588f58ef37c713bdc1eb8d2c5c22d - path: miniwdl_run/call-nextclade_v3/work/nextclade.cds_translation.E.fasta md5sum: dc43b1e98245a25c142aec52b29a07df - path: miniwdl_run/call-nextclade_v3/work/nextclade.cds_translation.M.fasta @@ -111,7 +111,7 @@ - path: miniwdl_run/call-nextclade_output_parser/work/_miniwdl_inputs/0/clearlabs.fasta.gz.nextclade.tsv md5sum: d41d8cd98f00b204e9800998ecf8427e - path: miniwdl_run/call-nextclade_output_parser/work/input.tsv - md5sum: 3aeae954ba64b8ad7db55e08f9c7131c + md5sum: 6f73969f56007a50f230d9768d95daf1 - path: miniwdl_run/call-pangolin4/command md5sum: b9c36681b77c5e007bf7e890265d70eb - path: miniwdl_run/call-pangolin4/inputs.json @@ -130,12 +130,12 @@ - path: 
miniwdl_run/call-pangolin4/work/PANGOLIN_NOTES md5sum: 71eba5c871bca955ab2a69dbd2c3c62e - path: miniwdl_run/call-pangolin4/work/PANGO_ASSIGNMENT_VERSION - md5sum: e01f9468a9a5490f5743cc0ca76286a7 + md5sum: e5d1adcf421ec6306f35626a6f7c9961 - path: miniwdl_run/call-pangolin4/work/VERSION_PANGOLIN_ALL - md5sum: b5dbf2ba7480effea8c656099df0e78e + md5sum: dfd90750c8776f46bad1de214c1d1a57 - path: miniwdl_run/call-pangolin4/work/_miniwdl_inputs/0/clearlabs.fasta.gz - path: miniwdl_run/call-pangolin4/work/fasta.pangolin_report.csv - md5sum: 163d8390eb18b50c7d871edf815d029f + md5sum: 87c7b2dbd5d507949ff6cfddfee22766 - path: miniwdl_run/call-vadr/command md5sum: 9e4318eb5b452da239723882bbcfe352 - path: miniwdl_run/call-vadr/inputs.json diff --git a/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml b/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml index 1ff0b33b8..444c41b5d 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml @@ -22,7 +22,7 @@ - path: miniwdl_run/call-raw_check_reads/task.log # trimmomatic - path: miniwdl_run/call-read_QC_trim/call-trimmomatic_pe/command - md5sum: fcc0e853c3719e41f3d169c291dc3927 + md5sum: e16dcea259c3bf45ec374c92da0bf2dd - path: miniwdl_run/call-read_QC_trim/call-trimmomatic_pe/inputs.json contains: ["read1", "read2", "samplename"] - path: miniwdl_run/call-read_QC_trim/call-trimmomatic_pe/outputs.json @@ -83,7 +83,7 @@ - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/VERSION # kraken2 dehosted - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_dehosted/command - md5sum: 2031501aaf268d2987b6dbc3b8b32dfa + md5sum: 24a53d050f62bf377558e76cce42ca71 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_dehosted/inputs.json contains: ["read1", "read2", "samplename"] - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_dehosted/outputs.json @@ -96,14 +96,14 @@ - path: 
miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_dehosted/work/PERCENT_HUMAN md5sum: 897316929176464ebc9ad085f31e7284 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_dehosted/work/PERCENT_SC2 - md5sum: 494a4bf9ab740c0a0fab64f670549883 + md5sum: 6baf8bb11094b9011d8dc34e66743712 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_dehosted/work/PERCENT_TARGET_ORGANISM md5sum: 68b329da9893e34099c7d8ad5cb9c940 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_dehosted/work/SRR13687078_kraken2_report.txt - md5sum: 2ccc036a9a93b3cf096a5c4dda49a579 + md5sum: 565954ac2bb6ef427754de3b43430728 # kraken2 raw - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/command - md5sum: a16205bdb8cf133a112c4552e8f67f97 + md5sum: 717f1ade3930083c4ca023b999c3bdff - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/inputs.json contains: ["read1", "samplename"] - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/outputs.json @@ -114,13 +114,13 @@ - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/task.log contains: ["wdl", "theiacov_illumina_pe", "kraken2_theiacov_raw", "done"] - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/PERCENT_HUMAN - md5sum: 414f4efa514540a2527a4f27124575f2 + md5sum: 897316929176464ebc9ad085f31e7284 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/PERCENT_SC2 - md5sum: 2bf2d20f083d8fa09abf6c25f8970e2e + md5sum: cfefab882d84cf0f2a1bde9c19eec318 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/PERCENT_TARGET_ORGANISM md5sum: 68b329da9893e34099c7d8ad5cb9c940 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/SRR13687078_kraken2_report.txt - md5sum: 3544d9ca35d45093c03cdead46677765 + md5sum: 8ea92e13d401e1c955336edfdcd4f1ba # ncbi scrub - path: miniwdl_run/call-read_QC_trim/call-ncbi_scrub_pe/command md5sum: 8c7ca800fa98305009cfb9116a4b60b8 @@ -364,14 +364,14 @@ - path: 
miniwdl_run/call-pangolin4/work/PANGOLIN_NOTES md5sum: e98d2fc28664c0622f6b490433286e32 - path: miniwdl_run/call-pangolin4/work/PANGO_ASSIGNMENT_VERSION - md5sum: 0803245359027bd3017d2bd9a9c9c8e3 + md5sum: 36f64a1cd7c6844309e8ad2121358088 - path: miniwdl_run/call-pangolin4/work/VERSION_PANGOLIN_ALL - md5sum: b5dbf2ba7480effea8c656099df0e78e + md5sum: dfd90750c8776f46bad1de214c1d1a57 - path: miniwdl_run/call-pangolin4/work/_miniwdl_inputs/0/SRR13687078.ivar.consensus.fasta md5sum: d41d8cd98f00b204e9800998ecf8427e # nextclade - path: miniwdl_run/call-nextclade_v3/command - md5sum: 75c10b0cc6a7c826b84f6b3fa8be5a26 + md5sum: 113378e9114fde0abcf359fda49de568 - path: miniwdl_run/call-nextclade_v3/inputs.json contains: ["dataset_name", "dataset_tag", "genome_fasta"] - path: miniwdl_run/call-nextclade_v3/outputs.json diff --git a/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml b/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml index 99668f641..362fa45d0 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml @@ -73,7 +73,7 @@ - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/VERSION # kraken2 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/command - md5sum: ca22e45a62c5c26c4447cdafe75a26ab + md5sum: 3478232c364dc1cf01b6b0300400c26c - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/inputs.json contains: ["read1", "samplename"] - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/outputs.json @@ -84,13 +84,13 @@ - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/task.log contains: ["wdl", "theiacov_illumina_se", "kraken2_theiacov_raw", "done"] - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/PERCENT_HUMAN - md5sum: 1576d5d341223ea9d44b0b8a213bb9da + md5sum: 4fd4dcef994592f9865e9bc8807f32f4 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/PERCENT_SC2 - md5sum: 
7cc2eb659e21f15fa902b11812eae1f6 + md5sum: adbe14d7547234f3743f80907ed33179 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/PERCENT_TARGET_ORGANISM md5sum: 68b329da9893e34099c7d8ad5cb9c940 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/ERR6319327_kraken2_report.txt - md5sum: 9a089b8920e55c9cc7bc8cd7d18f9a8e + md5sum: cb58af9eb139d109b55ce65d6d2344d6 # clean read screen - path: miniwdl_run/call-clean_check_reads/command md5sum: 80a361915a627e86743baacfc383b2b5 @@ -316,14 +316,14 @@ - path: miniwdl_run/call-pangolin4/work/PANGOLIN_NOTES md5sum: 0b1f8fb5b938fe71631f61234cbf7ab3 - path: miniwdl_run/call-pangolin4/work/PANGO_ASSIGNMENT_VERSION - md5sum: 0803245359027bd3017d2bd9a9c9c8e3 + md5sum: 36f64a1cd7c6844309e8ad2121358088 - path: miniwdl_run/call-pangolin4/work/VERSION_PANGOLIN_ALL - md5sum: b5dbf2ba7480effea8c656099df0e78e + md5sum: dfd90750c8776f46bad1de214c1d1a57 - path: miniwdl_run/call-pangolin4/work/_miniwdl_inputs/0/ERR6319327.ivar.consensus.fasta md5sum: d41d8cd98f00b204e9800998ecf8427e # nextclade - path: miniwdl_run/call-nextclade_v3/command - md5sum: a98129345713c75ac2e51ffa465c1703 + md5sum: c5d644127d8eae3f8fb3e3eaecb7fd2e - path: miniwdl_run/call-nextclade_v3/inputs.json contains: ["dataset_name", "dataset_tag", "genome_fasta"] - path: miniwdl_run/call-nextclade_v3/outputs.json @@ -372,7 +372,7 @@ - path: miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/sequences.fasta - path: miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/tree.json - path: miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/pathogen.json - md5sum: a51a91e0b5e16590c1afd0c7897ad071 + md5sum: 32f20640f926d5b59fed6b954541792d - path: miniwdl_run/call-nextclade_v3/work/_miniwdl_inputs/0/ERR6319327.ivar.consensus.fasta md5sum: d41d8cd98f00b204e9800998ecf8427e # nextclade output parsing diff --git a/tests/workflows/theiacov/test_wf_theiacov_ont.yml b/tests/workflows/theiacov/test_wf_theiacov_ont.yml index 
333f39b90..1348bce94 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_ont.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_ont.yml @@ -205,9 +205,9 @@ - path: miniwdl_run/call-pangolin4/work/PANGOLIN_NOTES md5sum: 35aa27af5fb90d54561ee9d45a3163d5 - path: miniwdl_run/call-pangolin4/work/PANGO_ASSIGNMENT_VERSION - md5sum: 0803245359027bd3017d2bd9a9c9c8e3 + md5sum: 36f64a1cd7c6844309e8ad2121358088 - path: miniwdl_run/call-pangolin4/work/VERSION_PANGOLIN_ALL - md5sum: b5dbf2ba7480effea8c656099df0e78e + md5sum: dfd90750c8776f46bad1de214c1d1a57 - path: miniwdl_run/call-pangolin4/work/_miniwdl_inputs/0/ont.medaka.consensus.fasta md5sum: d41d8cd98f00b204e9800998ecf8427e - path: miniwdl_run/call-pangolin4/work/ont.pangolin_report.csv diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml index aad099a4e..d9dfb5218 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml @@ -70,81 +70,6 @@ md5sum: 3cfdda0096f0689c9829ed27bdef6b1a - path: miniwdl_run/call-busco/work/_miniwdl_inputs/0/test_contigs.fasta md5sum: d41d8cd98f00b204e9800998ecf8427e - - path: miniwdl_run/call-busco/work/busco_downloads/file_versions.tsv - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/ancestral - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/ancestral_variants - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/dataset.cfg - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/hmms/101957at2157.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/hmms/102178at2157.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/hmms/102360at2157.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/hmms/98221at2157.hmm - - path: 
miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/hmms/98657at2157.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/hmms/99236at2157.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/hmms/99734at2157.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/hmms/99842at2157.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/info/ogs.id.info - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/info/species.info - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/lengths_cutoff - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/links_to_ODB10.txt - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/scores_cutoff - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/ancestral - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/ancestral_variants - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/dataset.cfg - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/hmms/1009041at2.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/hmms/1024388at2.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/hmms/1036075at2.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/hmms/1043239at2.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/hmms/961486at2.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/hmms/981870at2.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/hmms/984717at2.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/info/ogs.id.info - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/info/species.info 
- - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/lengths_cutoff - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/links_to_ODB10.txt - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/scores_cutoff - - path: miniwdl_run/call-busco/work/busco_downloads/placement_files/list_of_reference_markers.bacteria_odb10.2019-12-16.txt - - path: miniwdl_run/call-busco/work/busco_downloads/placement_files/mapping_taxid-lineage.bacteria_odb10.2019-12-16.txt - - path: miniwdl_run/call-busco/work/busco_downloads/placement_files/mapping_taxids-busco_dataset_name.bacteria_odb10.2019-12-16.txt - - path: miniwdl_run/call-busco/work/busco_downloads/placement_files/supermatrix.aln.bacteria_odb10.2019-12-16.faa - - path: miniwdl_run/call-busco/work/busco_downloads/placement_files/tree.bacteria_odb10.2019-12-16.nwk - - path: miniwdl_run/call-busco/work/busco_downloads/placement_files/tree_metadata.bacteria_odb10.2019-12-16.txt - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/busco_sequences/fragmented_busco_sequences/108145at2157.faa - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/busco_sequences/fragmented_busco_sequences/108145at2157.fna - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/busco_sequences/single_copy_busco_sequences/84219at2157.faa - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/busco_sequences/single_copy_busco_sequences/84219at2157.fna - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/full_table.tsv - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/hmmer_output/101957at2157.out - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/hmmer_output/102178at2157.out - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/hmmer_output/102360at2157.out - - path: 
miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/hmmer_output/99734at2157.out - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/hmmer_output/99842at2157.out - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/missing_busco_list.tsv - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/short_summary.json - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/short_summary.txt - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/busco_sequences/fragmented_busco_sequences/1540940at2.faa - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/busco_sequences/fragmented_busco_sequences/1540940at2.fna - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/busco_sequences/fragmented_busco_sequences/1827334at2.faa - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/busco_sequences/single_copy_busco_sequences/1132353at2.faa - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/busco_sequences/single_copy_busco_sequences/1132353at2.fna - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/busco_sequences/single_copy_busco_sequences/1211060at2.faa - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/full_table.tsv - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/hmmer_output/1009041at2.out - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/hmmer_output/1024388at2.out - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/hmmer_output/1036075at2.out - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/hmmer_output/961486at2.out - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/hmmer_output/981870at2.out - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/hmmer_output/984717at2.out - - path: 
miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/missing_busco_list.tsv - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/placement_files/marker_genes.fasta - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/short_summary.json - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/short_summary.txt - - path: miniwdl_run/call-busco/work/test/logs/busco.log - - path: miniwdl_run/call-busco/work/test/logs/hmmsearch_err.log - - path: miniwdl_run/call-busco/work/test/logs/hmmsearch_out.log - - path: miniwdl_run/call-busco/work/test/logs/prodigal_err.log - - path: miniwdl_run/call-busco/work/test/logs/prodigal_out.log - - path: miniwdl_run/call-busco/work/test/logs/sepp_err.log - - path: miniwdl_run/call-busco/work/test/logs/sepp_out.log - path: miniwdl_run/call-busco/work/test/prodigal_output/predicted_genes/predicted.faa - path: miniwdl_run/call-busco/work/test/prodigal_output/predicted_genes/predicted.fna - path: miniwdl_run/call-busco/work/test/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11.faa @@ -464,7 +389,7 @@ - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/_miniwdl_inputs/0/SRR2838702_R1.fastq.gz - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/_miniwdl_inputs/0/SRR2838702_R2.fastq.gz - path: miniwdl_run/call-read_QC_trim/call-trimmomatic_pe/command - md5sum: cc137a029d5143592b40edf01d53735f + md5sum: cc961dbda52c70200555ffb34e5ba62d - path: miniwdl_run/call-read_QC_trim/call-trimmomatic_pe/inputs.json contains: ["read", "fastq", "test", "trimmomatic_min_length"] - path: miniwdl_run/call-read_QC_trim/call-trimmomatic_pe/outputs.json @@ -615,23 +540,23 @@ - path: miniwdl_run/wdl/tasks/species_typing/escherichia_shigella/task_sonneityping.wdl md5sum: 3357a36f11992a0ca00c61d7bfccb44b - path: miniwdl_run/wdl/tasks/species_typing/mycobacterium/task_tbprofiler.wdl - md5sum: 3b37e6bf7f4773e12afe1fa15920acd9 + md5sum: 
a4d6d24a04a453227b4fa320ff79e45f - path: miniwdl_run/wdl/tasks/species_typing/multi/task_ts_mlst.wdl md5sum: ff8070a06eca94264ad6a7d91cb03bf0 - path: miniwdl_run/wdl/tasks/task_versioning.wdl - md5sum: 72d7f4462417909d85f692615e3a658b + md5sum: 92516e789845ffe5d883e03b3e767857 - path: miniwdl_run/wdl/tasks/taxon_id/task_gambit.wdl md5sum: 2aa70eab24868920f6c28843dd3b5613 - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_kraken2.wdl - md5sum: 0ea83681884800bda1e3c4e116f2b19d + md5sum: 43dd0613df879f91a2f3144e27b38a71 - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_midas.wdl md5sum: 64caaaff5910ac0036e2659434500962 - path: miniwdl_run/wdl/tasks/utilities/data_export/task_broad_terra_tools.wdl - md5sum: 850ad97598aca5c28eb36e6a5c13c2fc + md5sum: 8c97c5bd65e2787239f12ef425d479ae - path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl - md5sum: d8db687487a45536d4837a540ed2a135 + md5sum: ac49217c129add7c000eedf38acee8f3 - path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl - md5sum: ea5cff6eff8c2c42046cf2eae6f16b6f + md5sum: f3b18a0b4c2bdeb0896176e8f9c8247d - path: miniwdl_run/wdl/workflows/utilities/wf_read_QC_trim_pe.wdl contains: ["version", "QC", "output"] - path: miniwdl_run/workflow.log diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml index 6a7e2a86a..2b6580c82 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml @@ -70,76 +70,6 @@ md5sum: 3cfdda0096f0689c9829ed27bdef6b1a - path: miniwdl_run/call-busco/work/_miniwdl_inputs/0/test_contigs.fasta md5sum: d41d8cd98f00b204e9800998ecf8427e - - path: miniwdl_run/call-busco/work/busco_downloads/file_versions.tsv - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/ancestral - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/ancestral_variants - - path: 
miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/dataset.cfg - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/hmms/101957at2157.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/hmms/102178at2157.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/hmms/102360at2157.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/hmms/98221at2157.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/hmms/98657at2157.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/hmms/99236at2157.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/hmms/99734at2157.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/hmms/99842at2157.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/info/ogs.id.info - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/info/species.info - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/lengths_cutoff - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/links_to_ODB10.txt - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/archaea_odb10/scores_cutoff - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/ancestral - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/ancestral_variants - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/dataset.cfg - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/hmms/1009041at2.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/hmms/1024388at2.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/hmms/1036075at2.hmm - - path: 
miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/hmms/1043239at2.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/hmms/961486at2.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/hmms/981870at2.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/hmms/984717at2.hmm - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/info/ogs.id.info - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/info/species.info - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/lengths_cutoff - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/links_to_ODB10.txt - - path: miniwdl_run/call-busco/work/busco_downloads/lineages/bacteria_odb10/scores_cutoff - - path: miniwdl_run/call-busco/work/busco_downloads/placement_files/list_of_reference_markers.bacteria_odb10.2019-12-16.txt - - path: miniwdl_run/call-busco/work/busco_downloads/placement_files/mapping_taxid-lineage.bacteria_odb10.2019-12-16.txt - - path: miniwdl_run/call-busco/work/busco_downloads/placement_files/mapping_taxids-busco_dataset_name.bacteria_odb10.2019-12-16.txt - - path: miniwdl_run/call-busco/work/busco_downloads/placement_files/supermatrix.aln.bacteria_odb10.2019-12-16.faa - - path: miniwdl_run/call-busco/work/busco_downloads/placement_files/tree.bacteria_odb10.2019-12-16.nwk - - path: miniwdl_run/call-busco/work/busco_downloads/placement_files/tree_metadata.bacteria_odb10.2019-12-16.txt - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/full_table.tsv - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/hmmer_output/101957at2157.out - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/hmmer_output/102178at2157.out - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/hmmer_output/102360at2157.out - - path: 
miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/hmmer_output/99734at2157.out - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/hmmer_output/99842at2157.out - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/missing_busco_list.tsv - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/short_summary.json - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_archaea_odb10/short_summary.txt - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/busco_sequences/fragmented_busco_sequences/1540940at2.faa - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/busco_sequences/fragmented_busco_sequences/1540940at2.fna - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/busco_sequences/single_copy_busco_sequences/1132353at2.faa - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/busco_sequences/single_copy_busco_sequences/1132353at2.fna - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/busco_sequences/single_copy_busco_sequences/1505038at2.faa - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/full_table.tsv - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/hmmer_output/1009041at2.out - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/hmmer_output/1024388at2.out - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/hmmer_output/1036075at2.out - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/hmmer_output/961486at2.out - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/hmmer_output/981870at2.out - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/hmmer_output/984717at2.out - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/missing_busco_list.tsv - - path: 
miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/placement_files/marker_genes.fasta - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/short_summary.json - - path: miniwdl_run/call-busco/work/test/auto_lineage/run_bacteria_odb10/short_summary.txt - - path: miniwdl_run/call-busco/work/test/logs/busco.log - - path: miniwdl_run/call-busco/work/test/logs/hmmsearch_err.log - - path: miniwdl_run/call-busco/work/test/logs/hmmsearch_out.log - - path: miniwdl_run/call-busco/work/test/logs/prodigal_err.log - - path: miniwdl_run/call-busco/work/test/logs/prodigal_out.log - - path: miniwdl_run/call-busco/work/test/logs/sepp_err.log - - path: miniwdl_run/call-busco/work/test/logs/sepp_out.log - path: miniwdl_run/call-busco/work/test/prodigal_output/predicted_genes/predicted.faa - path: miniwdl_run/call-busco/work/test/prodigal_output/predicted_genes/predicted.fna - path: miniwdl_run/call-busco/work/test/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11.faa @@ -578,24 +508,24 @@ - path: miniwdl_run/wdl/tasks/species_typing/escherichia_shigella/task_sonneityping.wdl md5sum: 3357a36f11992a0ca00c61d7bfccb44b - path: miniwdl_run/wdl/tasks/species_typing/mycobacterium/task_tbprofiler.wdl - md5sum: 3b37e6bf7f4773e12afe1fa15920acd9 + md5sum: a4d6d24a04a453227b4fa320ff79e45f - path: miniwdl_run/wdl/tasks/species_typing/multi/task_ts_mlst.wdl md5sum: ff8070a06eca94264ad6a7d91cb03bf0 - path: miniwdl_run/wdl/tasks/task_versioning.wdl - md5sum: 72d7f4462417909d85f692615e3a658b + md5sum: 92516e789845ffe5d883e03b3e767857 - path: miniwdl_run/wdl/tasks/taxon_id/task_gambit.wdl md5sum: 2aa70eab24868920f6c28843dd3b5613 - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_kraken2.wdl - md5sum: 0ea83681884800bda1e3c4e116f2b19d + md5sum: 43dd0613df879f91a2f3144e27b38a71 - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_midas.wdl md5sum: 64caaaff5910ac0036e2659434500962 - path: 
miniwdl_run/wdl/tasks/utilities/data_export/task_broad_terra_tools.wdl - md5sum: 850ad97598aca5c28eb36e6a5c13c2fc + md5sum: 8c97c5bd65e2787239f12ef425d479ae - path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_se.wdl - md5sum: 4111a758490174325ae8ea52a95319e9 + md5sum: 5e735ae6cb60f86ec7983274f3baf9f8 - path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl - md5sum: ea5cff6eff8c2c42046cf2eae6f16b6f + md5sum: f3b18a0b4c2bdeb0896176e8f9c8247d - path: miniwdl_run/wdl/workflows/utilities/wf_read_QC_trim_se.wdl - md5sum: a7ef5a7a38dd60ff2edf699ae6808ebb + md5sum: 09d9f68b9ca8bf94b6145ff9bed2edd1 - path: miniwdl_run/workflow.log contains: ["wdl", "theiaprok_illumina_se", "NOTICE", "done"] diff --git a/workflows/freyja/wf_freyja_fastq.wdl b/workflows/freyja/wf_freyja_fastq.wdl index 7b46a204c..b758d3ca4 100644 --- a/workflows/freyja/wf_freyja_fastq.wdl +++ b/workflows/freyja/wf_freyja_fastq.wdl @@ -22,6 +22,7 @@ workflow freyja_fastq { String samplename Int? depth_cutoff Boolean ont = false + String kraken2_target_organism = "Severe acute respiratory syndrome coronavirus 2" } if (defined(read2)) { call read_qc_pe.read_QC_trim_pe as read_QC_trim_pe { @@ -30,7 +31,8 @@ workflow freyja_fastq { read1 = read1, read2 = select_first([read2]), trim_min_length = trimmomatic_min_length, - workflow_series = "theiacov" + workflow_series = "theiacov", + target_organism = kraken2_target_organism } } if (! defined(read2) && ! 
ont) { @@ -39,7 +41,8 @@ workflow freyja_fastq { samplename = samplename, read1 = read1, trim_min_length = trimmomatic_min_length, - workflow_series = "theiacov" + workflow_series = "theiacov", + target_organism = kraken2_target_organism } } if (ont) { @@ -57,7 +60,8 @@ workflow freyja_fastq { input: samplename = samplename, read1 = read1, - workflow_series = "theiacov" + workflow_series = "theiacov", + target_organism = kraken2_target_organism } call nanoplot_task.nanoplot as nanoplot_clean { input: @@ -177,10 +181,10 @@ workflow freyja_fastq { # Read QC - kraken outputs - all String kraken_version = select_first([read_QC_trim_pe.kraken_version, read_QC_trim_se.kraken_version, read_QC_trim_ont.kraken_version]) Float kraken_human = select_first([read_QC_trim_pe.kraken_human, read_QC_trim_se.kraken_human, read_QC_trim_ont.kraken_human]) - Float kraken_sc2 = select_first([read_QC_trim_pe.kraken_sc2, read_QC_trim_se.kraken_sc2, read_QC_trim_ont.kraken_sc2]) + String kraken_sc2 = select_first([read_QC_trim_pe.kraken_sc2, read_QC_trim_se.kraken_sc2, read_QC_trim_ont.kraken_sc2]) String kraken_report = select_first([read_QC_trim_pe.kraken_report, read_QC_trim_se.kraken_report, read_QC_trim_ont.kraken_report]) Float kraken_human_dehosted = select_first([read_QC_trim_pe.kraken_human_dehosted, read_QC_trim_se.kraken_human_dehosted, read_QC_trim_ont.kraken_human_dehosted]) - Float kraken_sc2_dehosted = select_first([read_QC_trim_pe.kraken_sc2_dehosted, read_QC_trim_se.kraken_sc2_dehosted, read_QC_trim_ont.kraken_sc2_dehosted]) + String kraken_sc2_dehosted = select_first([read_QC_trim_pe.kraken_sc2_dehosted, read_QC_trim_se.kraken_sc2_dehosted, read_QC_trim_ont.kraken_sc2_dehosted]) File kraken_report_dehosted = select_first([read_QC_trim_pe.kraken_report_dehosted, read_QC_trim_se.kraken_report_dehosted, read_QC_trim_ont.kraken_report_dehosted]) # Read Alignment - bwa outputs String? 
bwa_version = bwa.bwa_version @@ -204,7 +208,7 @@ workflow freyja_fastq { File freyja_depths = freyja.freyja_depths File freyja_demixed = freyja.freyja_demixed Float freyja_coverage = freyja.freyja_coverage - File freyja_usher_barcode_file = freyja.freyja_usher_barcode_file + File freyja_barcode_file = freyja.freyja_barcode_file File freyja_lineage_metadata_file = freyja.freyja_lineage_metadata_file String freyja_barcode_version = freyja.freyja_barcode_version String freyja_metadata_version = freyja.freyja_metadata_version diff --git a/workflows/phylogenetics/wf_augur.wdl b/workflows/phylogenetics/wf_augur.wdl index bb003b705..7c76b02b1 100644 --- a/workflows/phylogenetics/wf_augur.wdl +++ b/workflows/phylogenetics/wf_augur.wdl @@ -200,10 +200,14 @@ workflow augur { String augur_version = augur_tree.augur_version # augur outputs + String? augur_mafft_version = augur_align.mafft_version File? auspice_input_json = augur_export.auspice_json File? time_tree = augur_refine.refined_tree File distance_tree = augur_tree.aligned_tree String augur_iqtree_model_used = augur_tree.iqtree_model_used + String augur_iqtree_version = augur_tree.iqtree_version + String augur_fasttree_version = augur_tree.fasttree_version + String augur_raxml_version = augur_tree.raxml_version File aligned_fastas = select_first([augur_align.aligned_fasta, alignment_fasta]) File combined_assemblies = filter_sequences_by_length.filtered_fasta File? 
metadata_merged = tsv_join.out_tsv diff --git a/workflows/standalone_modules/wf_tbprofiler_tngs.wdl b/workflows/standalone_modules/wf_tbprofiler_tngs.wdl index 85f75a665..2d84d91ed 100644 --- a/workflows/standalone_modules/wf_tbprofiler_tngs.wdl +++ b/workflows/standalone_modules/wf_tbprofiler_tngs.wdl @@ -71,7 +71,7 @@ workflow tbprofiler_tngs { String tbprofiler_num_dr_variants = tbprofiler.tbprofiler_num_dr_variants String tbprofiler_num_other_variants = tbprofiler.tbprofiler_num_other_variants String tbprofiler_resistance_genes = tbprofiler.tbprofiler_resistance_genes - Int tbprofiler_median_coverage = tbprofiler.tbprofiler_median_coverage + Float tbprofiler_median_depth = tbprofiler.tbprofiler_median_depth Float tbprofiler_pct_reads_mapped = tbprofiler.tbprofiler_pct_reads_mapped # tbp_parser outputs File tbp_parser_looker_report_csv = tbp_parser.tbp_parser_looker_report_csv diff --git a/workflows/theiacov/updates/wf_ncbi_scrub_pe.wdl b/workflows/theiacov/updates/wf_ncbi_scrub_pe.wdl index 3cbedd30a..6d4acd8ca 100644 --- a/workflows/theiacov/updates/wf_ncbi_scrub_pe.wdl +++ b/workflows/theiacov/updates/wf_ncbi_scrub_pe.wdl @@ -9,6 +9,7 @@ workflow dehost_pe { String samplename File read1 File read2 + String target_organism = "Severe acute respiratory syndrome coronavirus 2" } call ncbi_scrub.ncbi_scrub_pe { input: @@ -20,7 +21,8 @@ workflow dehost_pe { input: samplename = samplename, read1 = ncbi_scrub_pe.read1_dehosted, - read2 = ncbi_scrub_pe.read2_dehosted + read2 = ncbi_scrub_pe.read2_dehosted, + target_organism = target_organism } call versioning.version_capture { input: @@ -33,7 +35,7 @@ workflow dehost_pe { Int ncbi_scrub_human_spots_removed = ncbi_scrub_pe.human_spots_removed String ncbi_scrub_docker = ncbi_scrub_pe.ncbi_scrub_docker Float kraken_human_dehosted = kraken2.percent_human - Float kraken_sc2_dehosted = kraken2.percent_sc2 + String kraken_sc2_dehosted = kraken2.percent_sc2 File kraken_report_dehosted = kraken2.kraken_report String 
kraken_version_dehosted = kraken2.version } diff --git a/workflows/theiacov/updates/wf_ncbi_scrub_se.wdl b/workflows/theiacov/updates/wf_ncbi_scrub_se.wdl index 7e0a25d88..23a8d707d 100644 --- a/workflows/theiacov/updates/wf_ncbi_scrub_se.wdl +++ b/workflows/theiacov/updates/wf_ncbi_scrub_se.wdl @@ -8,6 +8,7 @@ workflow dehost_se { input { String samplename File read1 + String target_organism = "Severe acute respiratory syndrome coronavirus 2" } call ncbi_scrub.ncbi_scrub_se { input: @@ -17,7 +18,8 @@ workflow dehost_se { call kraken.kraken2_theiacov as kraken2 { input: samplename = samplename, - read1 = ncbi_scrub_se.read1_dehosted + read1 = ncbi_scrub_se.read1_dehosted, + target_organism = target_organism } call versioning.version_capture { input: @@ -29,7 +31,7 @@ workflow dehost_se { String ncbi_scrub_docker = ncbi_scrub_se.ncbi_scrub_docker Int ncbi_scrub_human_spots_removed = ncbi_scrub_se.human_spots_removed Float kraken_human_dehosted = kraken2.percent_human - Float kraken_sc2_dehosted = kraken2.percent_sc2 + String kraken_sc2_dehosted = kraken2.percent_sc2 String kraken_version_dehosted = kraken2.version File kraken_report_dehosted = kraken2.kraken_report } diff --git a/workflows/theiacov/wf_theiacov_clearlabs.wdl b/workflows/theiacov/wf_theiacov_clearlabs.wdl index 5774e02f7..d63f61c0f 100644 --- a/workflows/theiacov/wf_theiacov_clearlabs.wdl +++ b/workflows/theiacov/wf_theiacov_clearlabs.wdl @@ -176,12 +176,12 @@ workflow theiacov_clearlabs { # Read QC - kraken outputs String kraken_version = kraken2_raw.version Float kraken_human = kraken2_raw.percent_human - Float kraken_sc2 = kraken2_raw.percent_sc2 + String kraken_sc2 = kraken2_raw.percent_sc2 String kraken_target_organism = kraken2_raw.percent_target_organism String kraken_target_organism_name = organism_parameters.kraken_target_organism File kraken_report = kraken2_raw.kraken_report Float kraken_human_dehosted = kraken2_dehosted.percent_human - Float kraken_sc2_dehosted = 
kraken2_dehosted.percent_sc2 + String kraken_sc2_dehosted = kraken2_dehosted.percent_sc2 String kraken_target_organism_dehosted = kraken2_dehosted.percent_target_organism File kraken_report_dehosted = kraken2_dehosted.kraken_report # Read Alignment - Artic consensus outputs diff --git a/workflows/theiacov/wf_theiacov_illumina_pe.wdl b/workflows/theiacov/wf_theiacov_illumina_pe.wdl index 29585659e..7bf1fc36a 100644 --- a/workflows/theiacov/wf_theiacov_illumina_pe.wdl +++ b/workflows/theiacov/wf_theiacov_illumina_pe.wdl @@ -293,12 +293,12 @@ workflow theiacov_illumina_pe { # Read QC - kraken outputs String? kraken_version = read_QC_trim.kraken_version Float? kraken_human = read_QC_trim.kraken_human - Float? kraken_sc2 = read_QC_trim.kraken_sc2 + String? kraken_sc2 = read_QC_trim.kraken_sc2 String? kraken_target_organism = read_QC_trim.kraken_target_organism String? kraken_target_organism_name = read_QC_trim.kraken_target_organism_name File? kraken_report = read_QC_trim.kraken_report Float? kraken_human_dehosted = read_QC_trim.kraken_human_dehosted - Float? kraken_sc2_dehosted = read_QC_trim.kraken_sc2_dehosted + String? kraken_sc2_dehosted = read_QC_trim.kraken_sc2_dehosted String? kraken_target_organism_dehosted = read_QC_trim.kraken_target_organism_dehosted File? kraken_report_dehosted = read_QC_trim.kraken_report_dehosted # Read Alignment - bwa outputs diff --git a/workflows/theiacov/wf_theiacov_illumina_se.wdl b/workflows/theiacov/wf_theiacov_illumina_se.wdl index 0de516664..0a92ef2fc 100644 --- a/workflows/theiacov/wf_theiacov_illumina_se.wdl +++ b/workflows/theiacov/wf_theiacov_illumina_se.wdl @@ -236,12 +236,12 @@ workflow theiacov_illumina_se { # Read QC - kraken outputs String? kraken_version = read_QC_trim.kraken_version Float? kraken_human = read_QC_trim.kraken_human - Float? kraken_sc2 = read_QC_trim.kraken_sc2 + String? kraken_sc2 = read_QC_trim.kraken_sc2 String? kraken_target_organism = read_QC_trim.kraken_target_organism String? 
kraken_target_organism_name = read_QC_trim.kraken_target_organism_name File? kraken_report = read_QC_trim.kraken_report Float? kraken_human_dehosted = read_QC_trim.kraken_human_dehosted - Float? kraken_sc2_dehosted = read_QC_trim.kraken_sc2_dehosted + String? kraken_sc2_dehosted = read_QC_trim.kraken_sc2_dehosted String? kraken_target_organism_dehosted = read_QC_trim.kraken_target_organism_dehosted File? kraken_report_dehosted = read_QC_trim.kraken_report_dehosted # Read Alignment - bwa outputs diff --git a/workflows/theiacov/wf_theiacov_ont.wdl b/workflows/theiacov/wf_theiacov_ont.wdl index 7d8d29ad4..d2bab2ad7 100644 --- a/workflows/theiacov/wf_theiacov_ont.wdl +++ b/workflows/theiacov/wf_theiacov_ont.wdl @@ -288,12 +288,12 @@ workflow theiacov_ont { String? kraken_target_organism_name = read_qc_trim.kraken_target_organism_name # Read QC - kraken outputs raw Float? kraken_human = read_qc_trim.kraken_human - Float? kraken_sc2 = read_qc_trim.kraken_sc2 + String? kraken_sc2 = read_qc_trim.kraken_sc2 String? kraken_target_organism = read_qc_trim.kraken_target_organism File? kraken_report = read_qc_trim.kraken_report # Read QC - kraken outputs dehosted Float? kraken_human_dehosted = read_qc_trim.kraken_human_dehosted - Float? kraken_sc2_dehosted = read_qc_trim.kraken_sc2_dehosted + String? kraken_sc2_dehosted = read_qc_trim.kraken_sc2_dehosted String? kraken_target_organism_dehosted = read_qc_trim.kraken_target_organism_dehosted File? 
kraken_report_dehosted = read_qc_trim.kraken_report_dehosted # Read Alignment - Artic consensus outputs diff --git a/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl b/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl index 32271224e..88f0ef066 100644 --- a/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl +++ b/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl @@ -18,6 +18,7 @@ import "../../tasks/task_versioning.wdl" as versioning import "../../tasks/taxon_id/contamination/task_kmerfinder.wdl" as kmerfinder_task import "../../tasks/taxon_id/task_gambit.wdl" as gambit_task import "../../tasks/utilities/data_export/task_broad_terra_tools.wdl" as terra_tools +import "../utilities/file_handling/wf_concatenate_illumina_lanes.wdl" as concatenate_lanes_workflow import "../utilities/wf_merlin_magic.wdl" as merlin_magic_workflow import "../utilities/wf_read_QC_trim_pe.wdl" as read_qc @@ -30,6 +31,15 @@ workflow theiaprok_illumina_pe { String seq_method = "ILLUMINA" File read1 File read2 + + # optional additional lanes + File? read1_lane2 + File? read1_lane3 + File? read1_lane4 + File? read2_lane2 + File? read2_lane3 + File? read2_lane4 + Int? genome_length # export taxon table parameters String? 
run_id @@ -68,10 +78,24 @@ workflow theiaprok_illumina_pe { call versioning.version_capture { input: } + if (defined(read1_lane2)) { + call concatenate_lanes_workflow.concatenate_illumina_lanes { + input: + samplename = samplename, + read1_lane1 = read1, + read1_lane2 = select_first([read1_lane2]), + read1_lane3 = read1_lane3, + read1_lane4 = read1_lane4, + read2_lane1 = read2, + read2_lane2 = read2_lane2, + read2_lane3 = read2_lane3, + read2_lane4 = read2_lane4 + } + } call screen.check_reads as raw_check_reads { input: - read1 = read1, - read2 = read2, + read1 = select_first([concatenate_illumina_lanes.read1_concatenated, read1]), + read2 = select_first([concatenate_illumina_lanes.read2_concatenated, read2]), min_reads = min_reads, min_basepairs = min_basepairs, min_genome_length = min_genome_length, @@ -85,8 +109,8 @@ workflow theiaprok_illumina_pe { call read_qc.read_QC_trim_pe as read_QC_trim { input: samplename = samplename, - read1 = read1, - read2 = read2, + read1 = select_first([concatenate_illumina_lanes.read1_concatenated, read1]), + read2 = select_first([concatenate_illumina_lanes.read2_concatenated, read2]), trim_min_length = trim_min_length, trim_quality_min_score = trim_quality_min_score, trim_window_size = trim_window_size, @@ -121,8 +145,8 @@ workflow theiaprok_illumina_pe { } call cg_pipeline.cg_pipeline as cg_pipeline_raw { input: - read1 = read1, - read2 = read2, + read1 = select_first([concatenate_illumina_lanes.read1_concatenated, read1]), + read2 = select_first([concatenate_illumina_lanes.read2_concatenated, read2]), samplename = samplename, genome_length = select_first([genome_length, quast.genome_length]) } @@ -257,8 +281,8 @@ workflow theiaprok_illumina_pe { sample_taxon = gambit.gambit_predicted_taxon, taxon_tables = taxon_tables, samplename = samplename, - read1 = read1, - read2 = read2, + read1 = select_first([concatenate_illumina_lanes.read1_concatenated, read1]), + read2 = select_first([concatenate_illumina_lanes.read2_concatenated, 
read2]), read1_clean = read_QC_trim.read1_clean, read2_clean = read_QC_trim.read2_clean, run_id = run_id, @@ -608,6 +632,9 @@ workflow theiaprok_illumina_pe { String theiaprok_illumina_pe_analysis_date = version_capture.date # Read Metadata String seq_platform = seq_method + # Concatenated Illumina Reads + File? read1_concatenated = concatenate_illumina_lanes.read1_concatenated + File? read2_concatenated = concatenate_illumina_lanes.read2_concatenated # Sample Screening String read_screen_raw = raw_check_reads.read_screen String? read_screen_clean = clean_check_reads.read_screen @@ -945,7 +972,7 @@ workflow theiaprok_illumina_pe { String? tbprofiler_sub_lineage = merlin_magic.tbprofiler_sub_lineage String? tbprofiler_dr_type = merlin_magic.tbprofiler_dr_type String? tbprofiler_resistance_genes = merlin_magic.tbprofiler_resistance_genes - Int? tbprofiler_median_coverage = merlin_magic.tbprofiler_median_coverage + Float? tbprofiler_median_depth = merlin_magic.tbprofiler_median_depth Float? tbprofiler_pct_reads_mapped = merlin_magic.tbprofiler_pct_reads_mapped String? tbp_parser_version = merlin_magic.tbp_parser_version String? 
tbp_parser_docker = merlin_magic.tbp_parser_docker diff --git a/workflows/theiaprok/wf_theiaprok_illumina_se.wdl b/workflows/theiaprok/wf_theiaprok_illumina_se.wdl index e743ecbce..9005766a4 100644 --- a/workflows/theiaprok/wf_theiaprok_illumina_se.wdl +++ b/workflows/theiaprok/wf_theiaprok_illumina_se.wdl @@ -18,6 +18,7 @@ import "../../tasks/task_versioning.wdl" as versioning import "../../tasks/taxon_id/contamination/task_kmerfinder.wdl" as kmerfinder_task import "../../tasks/taxon_id/task_gambit.wdl" as gambit_task import "../../tasks/utilities/data_export/task_broad_terra_tools.wdl" as terra_tools +import "../utilities/file_handling/wf_concatenate_illumina_lanes.wdl" as concatenate_lanes_workflow import "../utilities/wf_merlin_magic.wdl" as merlin_magic_workflow import "../utilities/wf_read_QC_trim_se.wdl" as read_qc @@ -29,6 +30,12 @@ workflow theiaprok_illumina_se { String samplename String seq_method = "ILLUMINA" File read1 + + # optional additional lanes + File? read1_lane2 + File? read1_lane3 + File? read1_lane4 + Int? genome_length # export taxon table parameters String? 
run_id @@ -68,9 +75,19 @@ workflow theiaprok_illumina_se { call versioning.version_capture { input: } + if (defined(read1_lane2)) { + call concatenate_lanes_workflow.concatenate_illumina_lanes { + input: + samplename = samplename, + read1_lane1 = read1, + read1_lane2 = select_first([read1_lane2]), + read1_lane3 = read1_lane3, + read1_lane4 = read1_lane4 + } + } call screen.check_reads_se as raw_check_reads { input: - read1 = read1, + read1 = select_first([concatenate_illumina_lanes.read1_concatenated, read1]), min_reads = min_reads, min_basepairs = min_basepairs, min_genome_length = min_genome_length, @@ -84,7 +101,7 @@ workflow theiaprok_illumina_se { call read_qc.read_QC_trim_se as read_QC_trim { input: samplename = samplename, - read1 = read1, + read1 = select_first([concatenate_illumina_lanes.read1_concatenated, read1]), trim_min_length = trim_min_length, trim_quality_min_score = trim_quality_min_score, trim_window_size = trim_window_size, @@ -116,7 +133,7 @@ workflow theiaprok_illumina_se { } call cg_pipeline.cg_pipeline as cg_pipeline_raw { input: - read1 = read1, + read1 = select_first([concatenate_illumina_lanes.read1_concatenated, read1]), samplename = samplename, genome_length = select_first([genome_length, quast.genome_length]) } @@ -240,7 +257,7 @@ workflow theiaprok_illumina_se { sample_taxon = gambit.gambit_predicted_taxon, taxon_tables = taxon_tables, samplename = samplename, - read1 = read1, + read1 = select_first([concatenate_illumina_lanes.read1_concatenated, read1]), read1_clean = read_QC_trim.read1_clean, run_id = run_id, collection_date = collection_date, @@ -566,6 +583,8 @@ workflow theiaprok_illumina_se { String theiaprok_illumina_se_analysis_date = version_capture.date # Read Metadata String seq_platform = seq_method + # Concatenated Illumina Reads + File? read1_concatenated = concatenate_illumina_lanes.read1_concatenated # Sample Screening String read_screen_raw = raw_check_reads.read_screen String? 
read_screen_clean = clean_check_reads.read_screen diff --git a/workflows/theiaprok/wf_theiaprok_ont.wdl b/workflows/theiaprok/wf_theiaprok_ont.wdl index a7eb9143e..867e67e0b 100644 --- a/workflows/theiaprok/wf_theiaprok_ont.wdl +++ b/workflows/theiaprok/wf_theiaprok_ont.wdl @@ -848,7 +848,7 @@ workflow theiaprok_ont { String? tbprofiler_sub_lineage = merlin_magic.tbprofiler_sub_lineage String? tbprofiler_dr_type = merlin_magic.tbprofiler_dr_type String? tbprofiler_resistance_genes = merlin_magic.tbprofiler_resistance_genes - Int? tbprofiler_median_coverage = merlin_magic.tbprofiler_median_coverage + Float? tbprofiler_median_depth = merlin_magic.tbprofiler_median_depth Float? tbprofiler_pct_reads_mapped = merlin_magic.tbprofiler_pct_reads_mapped String? tbp_parser_version = merlin_magic.tbp_parser_version String? tbp_parser_docker = merlin_magic.tbp_parser_docker diff --git a/workflows/utilities/data_import/wf_fetch_srr_accession.wdl b/workflows/utilities/data_import/wf_fetch_srr_accession.wdl new file mode 100644 index 000000000..e40e54a0f --- /dev/null +++ b/workflows/utilities/data_import/wf_fetch_srr_accession.wdl @@ -0,0 +1,26 @@ +version 1.0 + +import "../../../tasks/utilities/data_handling/task_fetch_srr_accession.wdl" as srr_task +import "../../../tasks/task_versioning.wdl" as versioning_task + +workflow fetch_srr_accession { + meta { + description: "This workflow retrieves the Sequence Read Archive (SRA) accession (SRR) associated with a given sample accession. It uses the fastq-dl tool to fetch metadata from SRA and outputs the SRR accession." 
+ } + input { + String sample_accession + } + call versioning_task.version_capture { + input: + } + call srr_task.fetch_srr_accession as fetch_srr { + input: + sample_accession = sample_accession + } + output { + String srr_accession = fetch_srr.srr_accession + # Version Captures + String fetch_srr_accession_version = version_capture.phb_version + String fetch_srr_accession_analysis_date = version_capture.date + } +} diff --git a/workflows/utilities/file_handling/wf_concatenate_illumina_lanes.wdl b/workflows/utilities/file_handling/wf_concatenate_illumina_lanes.wdl new file mode 100644 index 000000000..f2a5a9ad9 --- /dev/null +++ b/workflows/utilities/file_handling/wf_concatenate_illumina_lanes.wdl @@ -0,0 +1,42 @@ +version 1.0 + +import "../../../tasks/utilities/file_handling/task_cat_lanes.wdl" as concatenate_lanes +import "../../../tasks/task_versioning.wdl" as versioning + +workflow concatenate_illumina_lanes { + input { + String samplename + + File read1_lane1 + File read1_lane2 + File? read1_lane3 + File? read1_lane4 + + File? read2_lane1 + File? read2_lane2 + File? read2_lane3 + File? read2_lane4 + } + call concatenate_lanes.cat_lanes { + input: + samplename = samplename, + read1_lane1 = read1_lane1, + read2_lane1 = read2_lane1, + read1_lane2 = read1_lane2, + read2_lane2 = read2_lane2, + read1_lane3 = read1_lane3, + read2_lane3 = read2_lane3, + read1_lane4 = read1_lane4, + read2_lane4 = read2_lane4 + } + call versioning.version_capture { + input: + } + output { + String concatenate_illumina_lanes_version = version_capture.phb_version + String concatenate_illumina_lanes_analysis_date = version_capture.date + + File read1_concatenated = cat_lanes.read1_concatenated + File? 
read2_concatenated = cat_lanes.read2_concatenated + } +} \ No newline at end of file diff --git a/workflows/utilities/wf_merlin_magic.wdl b/workflows/utilities/wf_merlin_magic.wdl index f10060851..1e19184bd 100644 --- a/workflows/utilities/wf_merlin_magic.wdl +++ b/workflows/utilities/wf_merlin_magic.wdl @@ -57,7 +57,7 @@ workflow merlin_magic { # activating tool logic Boolean call_poppunk = true Boolean call_shigeifinder_reads_input = false - Boolean tbprofiler_additional_outputs = false # set to true to run tbp-parser + Boolean call_tbp_parser = false # docker options String? abricate_abaum_docker_image String? abricate_vibrio_docker_image @@ -197,14 +197,14 @@ workflow merlin_magic { Int srst2_gene_max_mismatch = 2000 # tbprofiler options Boolean tbprofiler_run_custom_db = false + Boolean tbprofiler_run_cdph_db = false File? tbprofiler_custom_db - Int? tbprofiler_cov_frac_threshold Float? tbprofiler_min_af - Float? tbprofiler_min_af_pred Int? tbprofiler_min_depth String? tbprofiler_mapper String? tbprofiler_variant_caller String? tbprofiler_variant_calling_params + String? tbprofiler_additional_parameters # tbp-parser options String tbp_parser_output_seq_method_type = "WGS" String? tbp_parser_operator @@ -215,6 +215,14 @@ workflow merlin_magic { File? tbp_parser_coverage_regions_bed Boolean? tbp_parser_debug Boolean? tbp_parser_add_cs_lims + Boolean? tbp_parser_tngs_data + Float? tbp_parser_rrs_frequency + Int? tbp_parser_rrs_read_support + Float? tbp_parser_rrl_frequency + Int? tbp_parser_rrl_read_support + Float? tbp_parser_rpob449_frequency + Float? tbp_parser_etha237_frequency + File? tbp_parser_expert_rule_regions_bed # virulencefinder options Float? virulencefinder_coverage_threshold Float? 
virulencefinder_identity_threshold @@ -448,18 +456,18 @@ workflow merlin_magic { read2 = select_first([clockwork_decon_reads.clockwork_cleaned_read2, read2, "gs://theiagen-public-files/terra/theiaprok-files/no-read2.txt"]), samplename = samplename, ont_data = ont_data, - tbprofiler_run_custom_db = tbprofiler_run_custom_db, - tbprofiler_custom_db = tbprofiler_custom_db, - cov_frac_threshold = tbprofiler_cov_frac_threshold, - min_af = tbprofiler_min_af, - min_af_pred = tbprofiler_min_af_pred, - min_depth = tbprofiler_min_depth, mapper = tbprofiler_mapper, variant_caller = tbprofiler_variant_caller, variant_calling_params = tbprofiler_variant_calling_params, + additional_parameters = tbprofiler_additional_parameters, + min_depth = tbprofiler_min_depth, + min_af = tbprofiler_min_af, + tbprofiler_custom_db = tbprofiler_custom_db, + tbprofiler_run_custom_db = tbprofiler_run_custom_db, + tbprofiler_run_cdph_db = tbprofiler_run_cdph_db, docker = tbprofiler_docker_image } - if (tbprofiler_additional_outputs) { + if (call_tbp_parser) { call tbp_parser_task.tbp_parser { input: tbprofiler_json = tbprofiler.tbprofiler_output_json, @@ -468,13 +476,21 @@ workflow merlin_magic { samplename = samplename, sequencing_method = tbp_parser_output_seq_method_type, operator = tbp_parser_operator, - coverage_threshold = tbp_parser_coverage_threshold, - coverage_regions_bed = tbp_parser_coverage_regions_bed, min_depth = tbp_parser_min_depth, min_frequency = tbp_parser_min_frequency, min_read_support = tbp_parser_min_read_support, - tbp_parser_debug = tbp_parser_debug, + coverage_threshold = tbp_parser_coverage_threshold, + coverage_regions_bed = tbp_parser_coverage_regions_bed, add_cycloserine_lims = tbp_parser_add_cs_lims, + tbp_parser_debug = tbp_parser_debug, + tngs_data = tbp_parser_tngs_data, + rrs_frequency = tbp_parser_rrs_frequency, + rrs_read_support = tbp_parser_rrs_read_support, + rrl_frequency = tbp_parser_rrl_frequency, + rrl_read_support = tbp_parser_rrl_read_support, + 
rpob449_frequency = tbp_parser_rpob449_frequency, + etha237_frequency = tbp_parser_etha237_frequency, + expert_rule_regions_bed = tbp_parser_expert_rule_regions_bed, docker = tbp_parser_docker_image } } @@ -896,7 +912,7 @@ workflow merlin_magic { String? tbprofiler_sub_lineage = tbprofiler.tbprofiler_sub_lineage String? tbprofiler_dr_type = tbprofiler.tbprofiler_dr_type String? tbprofiler_resistance_genes = tbprofiler.tbprofiler_resistance_genes - Int? tbprofiler_median_coverage = tbprofiler.tbprofiler_median_coverage + Float? tbprofiler_median_depth = tbprofiler.tbprofiler_median_depth Float? tbprofiler_pct_reads_mapped = tbprofiler.tbprofiler_pct_reads_mapped String? tbp_parser_version = tbp_parser.tbp_parser_version String? tbp_parser_docker = tbp_parser.tbp_parser_docker diff --git a/workflows/utilities/wf_organism_parameters.wdl b/workflows/utilities/wf_organism_parameters.wdl index 3c88ccb0f..9e2b648fc 100644 --- a/workflows/utilities/wf_organism_parameters.wdl +++ b/workflows/utilities/wf_organism_parameters.wdl @@ -52,9 +52,10 @@ workflow organism_parameters { String sc2_org_name = "sars-cov-2" String sc2_reference_genome = "gs://theiagen-public-files-rp/terra/augur-sars-cov-2-references/MN908947.fasta" String sc2_gene_locations_bed = "gs://theiagen-public-files-rp/terra/sars-cov-2-files/sc2_gene_locations.bed" - String sc2_nextclade_ds_tag = "2024-07-17--12-57-03Z" + String sc2_nextclade_ds_tag = "2024-11-19--14-18-53Z" String sc2_nextclade_ds_name = "nextstrain/sars-cov-2/wuhan-hu-1/orfs" - String sc2_pangolin_docker = "us-docker.pkg.dev/general-theiagen/staphb/pangolin:4.3.1-pdata-1.29" + String sc2_kraken_target_organism = "Severe acute respiratory syndrome coronavirus 2" + String sc2_pangolin_docker = "us-docker.pkg.dev/general-theiagen/staphb/pangolin:4.3.1-pdata-1.31" Int sc2_genome_len = 29903 Int sc2_vadr_max_length = 30000 Int sc2_vadr_skip_length = 10000 @@ -65,7 +66,7 @@ workflow organism_parameters { String mpox_org_name = "MPXV" String 
mpox_reference_genome = "gs://theiagen-public-files/terra/mpxv-files/MPXV.MT903345.reference.fasta" String mpox_gene_locations_bed = "gs://theiagen-public-files/terra/mpxv-files/mpox_gene_locations.bed" - String mpox_nextclade_ds_tag = "2024-04-19--07-50-39Z" + String mpox_nextclade_ds_tag = "2024-11-19--14-18-53Z" String mpox_nextclade_ds_name = "nextstrain/mpox/lineage-b.1" String mpox_kraken_target_organism = "Monkeypox virus" String mpox_primer_bed_file = "gs://theiagen-public-files/terra/mpxv-files/MPXV.primer.bed" @@ -124,7 +125,7 @@ workflow organism_parameters { if (flu_subtype == "H1N1") { String h1n1_ha_reference = "gs://theiagen-public-files-rp/terra/flu-references/reference_h1n1pdm_ha.fasta" String h1n1_ha_reference_gbk = "gs://theiagen-public-files-rp/terra/flu-references/reference_h1n1pdm_ha.gb" - String h1n1_ha_nextclade_ds_tag = "2024-07-03--08-29-55Z" + String h1n1_ha_nextclade_ds_tag = "2024-11-27--02-51-00Z" String h1n1_ha_nextclade_ds_name = "nextstrain/flu/h1n1pdm/ha/MW626062" String h1n1_ha_clades_tsv = "gs://theiagen-public-files-rp/terra/flu-references/clades_h1n1pdm_ha.tsv" String h1n1_ha_auspice_config = "gs://theiagen-public-files-rp/terra/flu-references/auspice_config_h1n1pdm.json" @@ -132,7 +133,7 @@ workflow organism_parameters { if (flu_subtype == "H3N2") { String h3n2_ha_reference = "gs://theiagen-public-files-rp/terra/flu-references/reference_h3n2_ha.fasta" String h3n2_ha_reference_gbk = "gs://theiagen-public-files-rp/terra/flu-references/reference_h3n2_ha.gb" - String h3n2_ha_nextclade_ds_tag = "2024-08-08--05-08-21Z" + String h3n2_ha_nextclade_ds_tag = "2024-11-27--02-51-00Z" String h3n2_ha_nextclade_ds_name = "nextstrain/flu/h3n2/ha/EPI1857216" String h3n2_ha_clades_tsv = "gs://theiagen-public-files-rp/terra/flu-references/clades_h3n2_ha.tsv" String h3n2_ha_auspice_config = "gs://theiagen-public-files-rp/terra/flu-references/auspice_config_h3n2.json" @@ -140,7 +141,7 @@ workflow organism_parameters { if (flu_subtype == 
"Victoria") { String vic_ha_reference = "gs://theiagen-public-files-rp/terra/flu-references/reference_vic_ha.fasta" String vic_ha_reference_gbk = "gs://theiagen-public-files-rp/terra/flu-references/reference_vic_ha.gb" - String vic_ha_nextclade_ds_tag = "2024-07-03--08-29-55Z" + String vic_ha_nextclade_ds_tag = "2024-11-05--09-19-52Z" String vic_ha_nextclade_ds_name = "nextstrain/flu/vic/ha/KX058884" String vic_ha_clades_tsv = "gs://theiagen-public-files-rp/terra/flu-references/clades_vic_ha.tsv" String vic_ha_auspice_config = "gs://theiagen-public-files-rp/terra/flu-references/auspice_config_vic.json" @@ -157,7 +158,7 @@ workflow organism_parameters { # H5N1 is a special case where the dataset used is the h5nx all clades dataset String h5n1_ha_reference = "gs://theiagen-public-files-rp/terra/flu-references/reference_h5n1_ha.fasta" String h5n1_ha_reference_gbk = "gs://theiagen-public-files-rp/terra/flu-references/reference_h5n1_ha.gb" - String h5n1_ha_nextclade_ds_tag = "2024-05-08--11-39-52Z" + String h5n1_ha_nextclade_ds_tag = "2024-12-04--17-05-31Z" String h5n1_ha_nextclade_ds_name = "community/moncla-lab/iav-h5/ha/all-clades" String h5n1_ha_clades_tsv = "gs://theiagen-public-files-rp/terra/flu-references/h5nx-clades.tsv" String h5n1_ha_auspice_config = "gs://theiagen-public-files-rp/terra/flu-references/auspice_config_h5n1.json" @@ -167,21 +168,21 @@ workflow organism_parameters { if (flu_subtype == "H1N1") { String h1n1_na_reference = "gs://theiagen-public-files-rp/terra/flu-references/reference_h1n1pdm_na.fasta" String h1n1_na_reference_gbk = "gs://theiagen-public-files-rp/terra/flu-references/reference_h1n1pdm_na.gb" - String h1n1_na_nextclade_ds_tag = "2024-07-03--08-29-55Z" + String h1n1_na_nextclade_ds_tag = "2024-11-05--09-19-52Z" String h1n1_na_nextclade_ds_name = "nextstrain/flu/h1n1pdm/na/MW626056" String h1n1_na_auspice_config = "gs://theiagen-public-files-rp/terra/flu-references/auspice_config_h1n1pdm.json" } if (flu_subtype == "H3N2") { String 
h3n2_na_reference = "gs://theiagen-public-files-rp/terra/flu-references/reference_h3n2_na.fasta" String h3n2_na_reference_gbk = "gs://theiagen-public-files-rp/terra/flu-references/reference_h3n2_na.gb" - String h3n2_na_nextclade_ds_tag = "2024-04-19--07-50-39Z" + String h3n2_na_nextclade_ds_tag = "2024-11-05--09-19-52Z" String h3n2_na_nextclade_ds_name = "nextstrain/flu/h3n2/na/EPI1857215" String h3n2_na_auspice_config = "gs://theiagen-public-files-rp/terra/flu-references/auspice_config_h3n2.json" } if (flu_subtype == "Victoria") { String vic_na_reference = "gs://theiagen-public-files-rp/terra/flu-references/reference_vic_na.fasta" String vic_na_reference_gbk = "gs://theiagen-public-files-rp/terra/flu-references/reference_yam_na.gb" - String vic_na_nextclade_ds_tag = "2024-04-19--07-50-39Z" + String vic_na_nextclade_ds_tag = "2024-11-05--09-19-52Z" String vic_na_nextclade_ds_name = "nextstrain/flu/vic/na/CY073894" String vic_na_auspice_config = "gs://theiagen-public-files-rp/terra/flu-references/auspice_config_vic.json" } @@ -197,10 +198,10 @@ workflow organism_parameters { if (organism == "rsv_a" || organism == "rsv-a" || organism == "RSV-A" || organism == "RSV_A") { String rsv_a_org_name = "rsv_a" String rsv_a_reference_genome = "gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_a.fasta" - String rsv_a_nextclade_ds_tag = "2024-08-01--22-31-31Z" + String rsv_a_nextclade_ds_tag = "2024-11-27--02-51-00Z" String rsv_a_nextclade_ds_name = "nextstrain/rsv/a/EPI_ISL_412866" Int rsv_a_genome_len = 15500 - String rsv_a_kraken_target_organism = "Respiratory syncytial virus" + String rsv_a_kraken_target_organism = "Human respiratory syncytial virus A" String rsv_a_vadr_options = "-r --mkey rsv --xnocomp" Int rsv_a_vadr_max_length = 15500 Int rsv_a_vadr_skip_length = 5000 @@ -221,10 +222,10 @@ workflow organism_parameters { if (organism == "rsv_b" || organism == "rsv-b" || organism == "RSV-B" || organism == "RSV_B") { String rsv_b_org_name = "rsv_b" String 
rsv_b_reference_genome = "gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_b.fasta" - String rsv_b_nextclade_ds_tag = "2024-08-01--22-31-31Z" + String rsv_b_nextclade_ds_tag = "2024-11-27--02-51-00Z" String rsv_b_nextclade_ds_name = "nextstrain/rsv/b/EPI_ISL_1653999" Int rsv_b_genome_len = 15500 - String rsv_b_kraken_target_organism = "Human orthopneumovirus" + String rsv_b_kraken_target_organism = "human respiratory syncytial virus" String rsv_b_vadr_options = "-r --mkey rsv --xnocomp" Int rsv_b_vadr_max_length = 15500 Int rsv_b_vadr_skip_length = 5000 @@ -279,7 +280,7 @@ workflow organism_parameters { Int vadr_memory = select_first([vadr_mem, sc2_vadr_memory, mpox_vadr_memory, wnv_vadr_memory, flu_vadr_memory, rsv_a_vadr_memory, rsv_b_vadr_memory, 0]) Int vadr_skiplength = select_first([vadr_skip_length, sc2_vadr_skip_length, mpox_vadr_skip_length, wnv_vadr_skip_length, flu_vadr_skip_length, rsv_a_vadr_skip_length, rsv_b_vadr_skip_length, 0]) # kraken options - String kraken_target_organism = select_first([kraken_target_organism_input, mpox_kraken_target_organism, wnv_kraken_target_organism, hiv_v1_target_organism, hiv_v2_target_organism, rsv_a_kraken_target_organism, rsv_b_kraken_target_organism, ""]) + String kraken_target_organism = select_first([kraken_target_organism_input, sc2_kraken_target_organism, mpox_kraken_target_organism, wnv_kraken_target_organism, hiv_v1_target_organism, hiv_v2_target_organism, rsv_a_kraken_target_organism, rsv_b_kraken_target_organism, ""]) # augur options Int augur_min_num_unambig = select_first([min_num_unambig, mpox_min_num_unambig, flu_min_num_unambig, rsv_a_min_num_unambig, rsv_b_min_num_unambig, 0]) File augur_clades_tsv = select_first([clades_tsv, h1n1_ha_clades_tsv, h3n2_ha_clades_tsv, vic_ha_clades_tsv, yam_ha_clades_tsv, h5n1_ha_clades_tsv, rsv_a_clades_tsv, rsv_b_clades_tsv, mpox_clades_tsv, "gs://theiagen-public-files-rp/terra/augur-defaults/minimal-clades.tsv"]) diff --git 
a/workflows/utilities/wf_read_QC_trim_ont.wdl b/workflows/utilities/wf_read_QC_trim_ont.wdl index 5b84562aa..8cc609346 100644 --- a/workflows/utilities/wf_read_QC_trim_ont.wdl +++ b/workflows/utilities/wf_read_QC_trim_ont.wdl @@ -56,7 +56,11 @@ workflow read_QC_trim_ont { input: samplename = samplename, read1 = read1, - target_organism = target_organism + target_organism = target_organism, + kraken2_db = kraken_db, + disk_size = kraken_disk_size, + memory = kraken_memory, + cpu = kraken_cpu } call kraken2.kraken2_parse_classified as kraken2_recalculate_abundances_raw { input: @@ -69,7 +73,11 @@ workflow read_QC_trim_ont { input: samplename = samplename, read1 = ncbi_scrub_se.read1_dehosted, - target_organism = target_organism + target_organism = target_organism, + kraken2_db = kraken_db, + disk_size = kraken_disk_size, + memory = kraken_memory, + cpu = kraken_cpu } call kraken2.kraken2_parse_classified as kraken2_recalculate_abundances_dehosted { input: @@ -126,16 +134,16 @@ workflow read_QC_trim_ont { # ncbi scrub outputs File? read1_dehosted = ncbi_scrub_se.read1_dehosted - # kraken2 - theiacov and theiapro + # kraken2 - theiacov and theiaprok String kraken_version = select_first([kraken2_raw.version, kraken2_se.kraken2_version, ""]) String kraken_docker = select_first([kraken2_raw.docker, kraken2_se.kraken2_docker, ""]) Float? kraken_human = kraken2_recalculate_abundances_raw.percent_human - Float? kraken_sc2 = kraken2_recalculate_abundances_raw.percent_sc2 + String? kraken_sc2 = kraken2_recalculate_abundances_raw.percent_sc2 String? kraken_target_organism = kraken2_recalculate_abundances_raw.percent_target_organism String? kraken_target_organism_name = kraken2_raw.kraken_target_organism String kraken_report = select_first([kraken2_recalculate_abundances_raw.kraken_report, kraken2_recalculate_abundances.kraken_report, ""]) Float? kraken_human_dehosted = kraken2_recalculate_abundances_dehosted.percent_human - Float? 
kraken_sc2_dehosted = kraken2_recalculate_abundances_dehosted.percent_sc2 + String? kraken_sc2_dehosted = kraken2_recalculate_abundances_dehosted.percent_sc2 String? kraken_target_organism_dehosted = kraken2_recalculate_abundances_dehosted.percent_target_organism File? kraken_report_dehosted = kraken2_recalculate_abundances_dehosted.kraken_report String kraken_database = select_first([kraken2_raw.database, kraken2_se.kraken2_database, kraken_db_warning, ""]) diff --git a/workflows/utilities/wf_read_QC_trim_pe.wdl b/workflows/utilities/wf_read_QC_trim_pe.wdl index 0d6090036..ee921bc12 100644 --- a/workflows/utilities/wf_read_QC_trim_pe.wdl +++ b/workflows/utilities/wf_read_QC_trim_pe.wdl @@ -52,14 +52,22 @@ workflow read_QC_trim_pe { samplename = samplename, read1 = read1, read2 = read2, - target_organism = target_organism + target_organism = target_organism, + kraken2_db = kraken_db, + disk_size = kraken_disk_size, + memory = kraken_memory, + cpu = kraken_cpu } call kraken.kraken2_theiacov as kraken2_theiacov_dehosted { input: samplename = samplename, read1 = select_first([ncbi_scrub_pe.read1_dehosted]), read2 = ncbi_scrub_pe.read2_dehosted, - target_organism = target_organism + target_organism = target_organism, + kraken2_db = kraken_db, + disk_size = kraken_disk_size, + memory = kraken_memory, + cpu = kraken_cpu } } if (read_processing == "trimmomatic") { @@ -196,11 +204,11 @@ workflow read_QC_trim_pe { # kraken2 - theiacov and theiaprok String kraken_version = select_first([kraken2_theiacov_raw.version, kraken2_standalone.kraken2_version, ""]) Float? kraken_human = kraken2_theiacov_raw.percent_human - Float? kraken_sc2 = kraken2_theiacov_raw.percent_sc2 + String? kraken_sc2 = kraken2_theiacov_raw.percent_sc2 String? kraken_target_organism = kraken2_theiacov_raw.percent_target_organism String kraken_report = select_first([kraken2_theiacov_raw.kraken_report, kraken2_standalone.kraken2_report, ""]) Float? 
kraken_human_dehosted = kraken2_theiacov_dehosted.percent_human - Float? kraken_sc2_dehosted = kraken2_theiacov_dehosted.percent_sc2 + String? kraken_sc2_dehosted = kraken2_theiacov_dehosted.percent_sc2 String? kraken_target_organism_dehosted = kraken2_theiacov_dehosted.percent_target_organism String? kraken_target_organism_name = target_organism File? kraken_report_dehosted = kraken2_theiacov_dehosted.kraken_report diff --git a/workflows/utilities/wf_read_QC_trim_se.wdl b/workflows/utilities/wf_read_QC_trim_se.wdl index af147b512..f82d3aae3 100644 --- a/workflows/utilities/wf_read_QC_trim_se.wdl +++ b/workflows/utilities/wf_read_QC_trim_se.wdl @@ -100,13 +100,21 @@ workflow read_QC_trim_se { input: samplename = samplename, read1 = read1, - target_organism = target_organism + target_organism = target_organism, + kraken2_db = kraken_db, + disk_size = kraken_disk_size, + memory = kraken_memory, + cpu = kraken_cpu } call kraken.kraken2_theiacov as kraken2_theiacov_dehosted { input: samplename = samplename, read1 = select_first([ncbi_scrub_se.read1_dehosted]), - target_organism = target_organism + target_organism = target_organism, + kraken2_db = kraken_db, + disk_size = kraken_disk_size, + memory = kraken_memory, + cpu = kraken_cpu } } if ("~{workflow_series}" == "theiaprok") { @@ -163,11 +171,11 @@ workflow read_QC_trim_se { # kraken2 - raw and dehosted String kraken_version = select_first([kraken2_theiacov_raw.version, kraken2_standalone.kraken2_version, ""]) Float? kraken_human = kraken2_theiacov_raw.percent_human - Float? kraken_sc2 = kraken2_theiacov_raw.percent_sc2 + String? kraken_sc2 = kraken2_theiacov_raw.percent_sc2 String? kraken_target_organism = kraken2_theiacov_raw.percent_target_organism String kraken_report = select_first([kraken2_theiacov_raw.kraken_report, kraken2_standalone.kraken2_report, ""]) Float? kraken_human_dehosted = kraken2_theiacov_dehosted.percent_human - Float? kraken_sc2_dehosted = kraken2_theiacov_dehosted.percent_sc2 + String? 
kraken_sc2_dehosted = kraken2_theiacov_dehosted.percent_sc2 String? kraken_target_organism_dehosted = kraken2_theiacov_dehosted.percent_target_organism String? kraken_target_organism_name = target_organism File? kraken_report_dehosted = kraken2_theiacov_dehosted.kraken_report