From 848763786e8374c6d54e69a1a0f45fffa70a943a Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Thu, 16 Nov 2023 17:01:46 -0800
Subject: [PATCH 01/13] ingest/fetch_from_ncbi: Fix optional config param

---
 ingest/rules/fetch_from_ncbi.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk
index 3eec150..ee1d667 100644
--- a/ingest/rules/fetch_from_ncbi.smk
+++ b/ingest/rules/fetch_from_ncbi.smk
@@ -129,7 +129,7 @@ rule format_ncbi_dataset_report:
         ncbi_dataset_tsv=temp("data/ncbi_dataset_report.tsv"),
     params:
         fields_to_include=_get_ncbi_dataset_field_mnemonics(
-            config["ncbi_dataset_fields"]
+            config.get("ncbi_dataset_fields", [])
        ),
    benchmark:
        "benchmarks/format_ncbi_dataset_report.txt"
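
Note: the `config.get()` fix above is what makes the parameter truly optional. A plain
subscript fails while Snakemake parses the workflow whenever the key is missing from
the config. A minimal sketch of the difference in plain Python (the empty dict stands
in for a defaults.yaml that omits the key):

    config = {}                                       # a config without ncbi_dataset_fields
    # config["ncbi_dataset_fields"]                   # would raise KeyError at parse time
    fields = config.get("ncbi_dataset_fields", [])    # falls back to an empty list
    print(fields)                                     # -> []
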
From 8acb4b6a9fd29eb7ec63d55e207cde4fa4542fd9 Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Thu, 16 Nov 2023 16:56:57 -0800
Subject: [PATCH 02/13] ingest/Snakefile: Add documentation

Clarify the setup of the main ingest Snakefile and explain why the
Nextclade rules are optional.
---
 ingest/Snakefile | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/ingest/Snakefile b/ingest/Snakefile
index 121fe54..55caeeb 100644
--- a/ingest/Snakefile
+++ b/ingest/Snakefile
@@ -1,6 +1,18 @@
+"""
+This is the main ingest Snakefile that orchestrates the full ingest workflow
+and defines its default outputs.
+"""
+
 # Use default configuration values. Override with Snakemake's --configfile/--config options.
 configfile: "config/defaults.yaml"
 
+# This is the default rule that Snakemake will run when there are no specified targets.
+# The default output of the ingest workflow is usually the curated metadata and sequences.
+# Nextstrain maintained ingest workflows will produce metadata files with the
+# standard Nextstrain fields and additional fields that are pathogen specific.
+# We recommend using these standard fields in custom ingests as well to minimize
+# the customizations you will need for the downstream phylogenetic workflow.
+# TODO: Add link to centralized docs on standard Nextstrain metadata fields
 rule all:
     input:
         "results/sequences.fasta",
         "results/metadata.tsv",
@@ -11,11 +23,17 @@ include: "rules/fetch_from_ncbi.smk"
 include: "rules/curate.smk"
 
 
-# If included, the nextclade rules will create the final metadata TSV by
-# joining the Nextclade output with the metadata.
-# However, if not including nextclade, we have to rename the subset metadata TSV
+# We are pushing to standardize ingest workflows with Nextclade runs to include
+# Nextclade outputs in our publicly hosted data. However, if a Nextclade dataset
+# does not already exist, it requires curated data as input, so we are making
+# Nextclade steps optional here.
+#
+# If Nextclade config values are included, the nextclade rules will create the
+# final metadata TSV by joining the Nextclade output with the metadata.
+# If Nextclade configs are not included, we rename the subset metadata TSV
 # to the final metadata TSV.
 if "nextclade" in config:
+
     include: "rules/nextclade.smk"
 
 else:
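
Note: because the Nextclade steps are gated on `"nextclade" in config`, a pathogen
repo opts in simply by defining that section in its config file. A hypothetical
sketch (`dataset_name` is the key the guide's rules/nextclade.smk reads via
`config["nextclade"]["dataset_name"]`; the value is a placeholder):

    # config/defaults.yaml (sketch) -- presence of this block enables the
    # optional rules in rules/nextclade.smk; omit the block to skip Nextclade.
    nextclade:
      dataset_name: "measles"    # placeholder dataset name
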
From 760bb9cf075104e3fd92304fafe6e3094ff332cc Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Mon, 13 Nov 2023 19:01:12 -0800
Subject: [PATCH 03/13] phylogenetic/Snakefile: Add documentation

Clarify the setup of the main Snakefile and link to Nextstrain docs for the
expected data formats of Auspice JSONs.
---
 phylogenetic/Snakefile | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index 5145372..5d45c59 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -1,13 +1,28 @@
+"""
+This is the main phylogenetic Snakefile that orchestrates the full phylogenetic
+workflow and defines its default output(s).
+"""
+
 # Use default configuration values. Override with Snakemake's --configfile/--config options.
 configfile: "config/defaults.yaml"
 
+# This is the default rule that Snakemake will run when there are no specified targets.
+# The default output of the phylogenetic workflow is usually the final
+# Nextstrain dataset(s) or Auspice JSON(s) that are output from `rules/export.smk`.
+# See the Nextstrain docs on expected naming conventions of dataset files:
+# https://docs.nextstrain.org/page/reference/data-formats.html
 rule all:
     input:
-        # Fill in path to the final exported Auspice JSON
+        # Fill in path(s) to the final exported Auspice JSON(s)
         auspice_json="",
 
+# These rules are imported in the order that they are expected to run.
+# Each Snakefile will have documented inputs and outputs that should be kept as
+# consistent interfaces across pathogen repos. This allows us to define typical
+# steps that are required for a phylogenetic workflow, but still allow pathogen
+# specific customizations within each step.
 include: "rules/prepare_sequences.smk"
 include: "rules/construct_phylogeny.smk"
 include: "rules/annotate_phylogeny.smk"
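
Note: a pathogen repo is expected to replace the empty `auspice_json` above with its
real export path(s). A hypothetical fill-in for a repo that exports one Auspice JSON
per segment (the pathogen and segment names are placeholders):

    rule all:
        input:
            auspice_json=expand("auspice/yourpathogen_{segment}.json", segment=["ha", "na"]),
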
From ad6ace86a0b85de2cc827ba33954f00a75d45558 Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Thu, 16 Nov 2023 17:46:22 -0800
Subject: [PATCH 04/13] nextclade/Snakefile: Add documentation

Clarify the setup of the main Snakefile and link to Nextclade docs for the
expected Nextclade dataset files.
---
 nextclade/Snakefile | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index f87d384..18e3b6d 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -1,13 +1,25 @@
+"""
+This is the main Nextclade Snakefile that orchestrates the workflow to produce
+a Nextclade dataset.
+"""
+
 # Use default configuration values. Override with Snakemake's --configfile/--config options.
 configfile: "config/defaults.yaml"
 
-
+# This is the default rule that Snakemake will run when there are no specified targets.
+# The default output of the Nextclade workflow is usually the produced Nextclade dataset.
+# See the Nextclade docs on expected naming conventions of dataset files:
+# https://docs.nextstrain.org/projects/nextclade/page/user/datasets.html
 rule all:
     input:
-        # Fill in path to the final exported Auspice JSON
-        auspice_json="",
+        # Fill in path(s) to the final exported Nextclade dataset.
 
+# These rules are imported in the order that they are expected to run.
+# Each Snakefile will have documented inputs and outputs that should be kept as
+# consistent interfaces across pathogen repos. This allows us to define typical
+# steps that are required for a Nextclade workflow, but still allow pathogen
+# specific customizations within each step.
 include: "rules/preprocess.smk"
 include: "rules/prepare_sequences.smk"
 include: "rules/construct_phylogeny.smk"
From 5419cfc6492f2edf486b576106ae89837f3206dd Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Mon, 13 Nov 2023 19:02:29 -0800
Subject: [PATCH 05/13] Allow `custom_rules` in all workflows

Add a documented way for users to include custom rules for all workflows.

Offer the same flexibility across all workflows so it's easy for builds to
extend the core workflows with custom rules.
---
 ingest/Snakefile       | 20 ++++++++++++++++++++
 nextclade/Snakefile    | 19 +++++++++++++++++++
 phylogenetic/Snakefile | 20 ++++++++++++++++++++
 3 files changed, 59 insertions(+)

diff --git a/ingest/Snakefile b/ingest/Snakefile
index 55caeeb..6bf5b5b 100644
--- a/ingest/Snakefile
+++ b/ingest/Snakefile
@@ -19,6 +19,11 @@ rule all:
         "results/metadata.tsv",
 
 
+# Note that only PATHOGEN level customizations should be added to these
+# core steps, meaning they are custom rules necessary for all builds of the pathogen.
+# If there are build specific customizations, they should be added with the
+# custom_rules imported below to ensure that the core workflow is not complicated
+# by build specific rules.
 include: "rules/fetch_from_ncbi.smk"
 include: "rules/curate.smk"
@@ -47,3 +52,18 @@
         """
         mv {input.metadata} {output.metadata}
         """
+
+# Allow users to import custom rules provided via the config.
+# This allows users to run custom rules that can extend or override the workflow.
+# A concrete example of using custom rules is the extension of the workflow with
+# rules to support the Nextstrain automation that uploads files and sends internal
+# Slack notifications.
+# For extensions, the user will have to specify the custom rule targets when
+# running the workflow.
+# For overrides, the custom Snakefile will have to use the `ruleorder` directive
+# to allow Snakemake to handle ambiguous rules:
+# https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#handling-ambiguous-rules
+if "custom_rules" in config:
+    for rule_file in config["custom_rules"]:
+
+        include: rule_file
diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index 18e3b6d..0dce54c 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -20,8 +20,27 @@ rule all:
 # consistent interfaces across pathogen repos. This allows us to define typical
 # steps that are required for a Nextclade workflow, but still allow pathogen
 # specific customizations within each step.
+# Note that only PATHOGEN level customizations should be added to these
+# core steps, meaning they are custom rules necessary for all builds of the pathogen.
+# If there are build specific customizations, they should be added with the
+# custom_rules imported below to ensure that the core workflow is not complicated
+# by build specific rules.
 include: "rules/preprocess.smk"
 include: "rules/prepare_sequences.smk"
 include: "rules/construct_phylogeny.smk"
 include: "rules/annotate_phylogeny.smk"
 include: "rules/export.smk"
+
+# Allow users to import custom rules provided via the config.
+# This allows users to run custom rules that can extend or override the workflow.
+# A concrete example of using custom rules is the extension of the workflow with
+# rules to do a test run of `nextclade run` with the produced Nextclade dataset.
+# For extensions, the user will have to specify the custom rule targets when
+# running the workflow.
+# For overrides, the custom Snakefile will have to use the `ruleorder` directive
+# to allow Snakemake to handle ambiguous rules:
+# https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#handling-ambiguous-rules
+if "custom_rules" in config:
+    for rule_file in config["custom_rules"]:
+
+        include: rule_file
diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index 5d45c59..2b33d2f 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -23,7 +23,27 @@ rule all:
 # consistent interfaces across pathogen repos. This allows us to define typical
 # steps that are required for a phylogenetic workflow, but still allow pathogen
 # specific customizations within each step.
+# Note that only PATHOGEN level customizations should be added to these
+# core steps, meaning they are custom rules necessary for all builds of the pathogen.
+# If there are build specific customizations, they should be added with the
+# custom_rules imported below to ensure that the core workflow is not complicated
+# by build specific rules.
 include: "rules/prepare_sequences.smk"
 include: "rules/construct_phylogeny.smk"
 include: "rules/annotate_phylogeny.smk"
 include: "rules/export.smk"
+
+# Allow users to import custom rules provided via the config.
+# This allows users to run custom rules that can extend or override the workflow.
+# A concrete example of using custom rules is the extension of the workflow with
+# rules to support the Nextstrain automation that uploads files and sends internal
+# Slack notifications.
+# For extensions, the user will have to specify the custom rule targets when
+# running the workflow.
+# For overrides, the custom Snakefile will have to use the `ruleorder` directive
+# to allow Snakemake to handle ambiguous rules:
+# https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#handling-ambiguous-rules
+if "custom_rules" in config:
+    for rule_file in config["custom_rules"]:
+
+        include: rule_file
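
Note: to make the custom-rules mechanics concrete, a build-specific config could
point at its own rules file like this (the file path and target name below are
hypothetical):

    # build-config.yaml (sketch)
    custom_rules:
      - build_configs/my_lab/extra_rules.smk

Since the default `all` rule knows nothing about the extension, the extra target is
named on the command line, e.g. `snakemake --configfile build-config.yaml my_extra_target`.
An override in extra_rules.smk would instead declare something like
`ruleorder: my_filter > filter` so Snakemake prefers the custom rule when both rules
can produce the same output.
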
From bc2a1489c93f3bd03c8a586cda751988e56a4d86 Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Thu, 16 Nov 2023 16:07:56 -0800
Subject: [PATCH 06/13] ingest/fetch_from_ncbi: Update docs and add ruleorder

Prioritize the NCBI Datasets route in the docs since that is the recommended
route. I added the ruleorder directive to make sure that the ambiguous rules
do not cause an error.
---
 ingest/rules/fetch_from_ncbi.smk | 91 +++++++++++++++++---------------
 1 file changed, 48 insertions(+), 43 deletions(-)

diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk
index ee1d667..13eae05 100644
--- a/ingest/rules/fetch_from_ncbi.smk
+++ b/ingest/rules/fetch_from_ncbi.smk
@@ -4,57 +4,27 @@
 and outputs them as a single NDJSON file that can be directly fed into the
 curation pipeline.
 
 There are two different approaches for fetching data from NCBI.
-Choose the one that works best for the pathogen data and remove the rules related
-to the other approaches.
+Choose the one that works best for the pathogen data and edit the rule order at
+the top of the file to set the preferred approach.
 
-1. Fetch from Entrez (https://www.ncbi.nlm.nih.gov/books/NBK25501/)
-   - Returns all available data via a GenBank file
-   - Requires a custom script to parse the necessary fields from the GenBank file
-
-2. Fetch with NCBI Datasets (https://www.ncbi.nlm.nih.gov/datasets/)
+1. Fetch with NCBI Datasets (https://www.ncbi.nlm.nih.gov/datasets/)
    - Directly returns NDJSON without custom parsing
    - Fastest option for large datasets (e.g. SARS-CoV-2)
    - Only returns metadata fields that are available through NCBI Datasets
-   - Example is written for viral data, please see offical NCBI Datasets docs for other genomes
-"""
-
-###########################################################################
-########################## 1. Fetch from Entrez ###########################
-###########################################################################
-
-
-rule fetch_from_ncbi_entrez:
-    params:
-        term=config["entrez_search_term"],
-    output:
-        genbank="data/genbank.gb",
-    # Allow retries in case of network errors
-    retries: 5
-    benchmark:
-        "benchmarks/fetch_from_ncbi_entrez.txt"
-    shell:
-        """
-        vendored/fetch-from-ncbi-entrez \
-            --term {params.term:q} \
-            --output {output.genbank}
-        """
-
-
-rule parse_genbank_to_ndjson:
-    input:
-        genbank="data/genbank.gb",
-    output:
-        ndjson="data/ncbi.ndjson",
-    benchmark:
-        "benchmarks/parse_genbank_to_ndjson.txt"
-    shell:
-        """
-        # Add in custom script to parse needed fields from GenBank file to NDJSON file
-        """
+   - Only works for viral genomes
+
+2. Fetch from Entrez (https://www.ncbi.nlm.nih.gov/books/NBK25501/)
+   - Returns all available data via a GenBank file
+   - Requires a custom script to parse the necessary fields from the GenBank file
+"""
+
+# This ruleorder determines which rule to use to produce the final NCBI NDJSON file.
+# The default is set to use NCBI Datasets since it does not require a custom script.
+# Switch the rule order if you plan to use Entrez.
+ruleorder: format_ncbi_datasets_ndjson > parse_genbank_to_ndjson
 
 ###########################################################################
-####################### 2. Fetch from NCBI Datasets #######################
+####################### 1. Fetch from NCBI Datasets #######################
 ###########################################################################
@@ -167,3 +137,38 @@ rule format_ncbi_datasets_ndjson:
             --duplicate-reporting warn \
             2> {log} > {output.ndjson}
         """
+
+
+###########################################################################
+########################## 2. Fetch from Entrez ###########################
+###########################################################################
+
+
+rule fetch_from_ncbi_entrez:
+    params:
+        term=config["entrez_search_term"],
+    output:
+        genbank="data/genbank.gb",
+    # Allow retries in case of network errors
+    retries: 5
+    benchmark:
+        "benchmarks/fetch_from_ncbi_entrez.txt"
+    shell:
+        """
+        vendored/fetch-from-ncbi-entrez \
+            --term {params.term:q} \
+            --output {output.genbank}
+        """
+
+
+rule parse_genbank_to_ndjson:
+    input:
+        genbank="data/genbank.gb",
+    output:
+        ndjson="data/ncbi.ndjson",
+    benchmark:
+        "benchmarks/parse_genbank_to_ndjson.txt"
+    shell:
+        """
+        # Add in custom script to parse needed fields from GenBank file to NDJSON file
+        """
From 453ff1d311ed2b8394eb0f16dcfcd6d23d3486a0 Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Thu, 16 Nov 2023 16:59:38 -0800
Subject: [PATCH 07/13] ingest: Update input/output docs for rules

Clearly document the inputs and outputs that should be kept as consistent
interfaces across pathogen repos. The current rules follow these guidelines,
but users may want to add customizations to the workflows that should also
adhere to these interfaces.
---
 ingest/rules/curate.smk          | 14 ++++++++++----
 ingest/rules/fetch_from_ncbi.smk | 18 +++++++++++++-----
 ingest/rules/nextclade.smk       | 14 +++++++++++++-
 3 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
index b9e2543..6a7168a 100644
--- a/ingest/rules/curate.smk
+++ b/ingest/rules/curate.smk
@@ -1,9 +1,15 @@
 """
-This part of the workflow handles the curation of metadata for sequences
-from NCBI and outputs the clean data as two separate files:
+This part of the workflow handles the curation of data from NCBI.
+
+REQUIRED INPUTS:
+
+    ndjson = data/ncbi.ndjson
+
+OUTPUTS:
+
+    metadata = results/subset_metadata.tsv
+    sequences = results/sequences.fasta
 
-    - results/subset_metadata.tsv
-    - results/sequences.fasta
 """
 
diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk
index 13eae05..8b8c064 100644
--- a/ingest/rules/fetch_from_ncbi.smk
+++ b/ingest/rules/fetch_from_ncbi.smk
@@ -1,19 +1,27 @@
 """
-This part of the workflow handles fetching sequences and metadata from NCBI
-and outputs them as a single NDJSON file that can be directly fed into the
-curation pipeline.
+This part of the workflow handles fetching sequences and metadata from NCBI.
+
+REQUIRED INPUTS:
+
+    None
+
+OUTPUTS:
+
+    ndjson = data/ncbi.ndjson
 
 There are two different approaches for fetching data from NCBI.
-Choose the one that works best for the pathogen data and edit the rule order at
-the top of the file to set the preferred approach.
+Choose the one that works best for the pathogen data and edit the workflow config
+to provide the correct parameter.
 
 1. Fetch with NCBI Datasets (https://www.ncbi.nlm.nih.gov/datasets/)
+   - requires `ncbi_taxon_id` config
    - Directly returns NDJSON without custom parsing
    - Fastest option for large datasets (e.g. SARS-CoV-2)
    - Only returns metadata fields that are available through NCBI Datasets
    - Only works for viral genomes
 
 2. Fetch from Entrez (https://www.ncbi.nlm.nih.gov/books/NBK25501/)
+   - requires `entrez_search_term` config
    - Returns all available data via a GenBank file
    - Requires a custom script to parse the necessary fields from the GenBank file
 """
diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
index 4d69cd7..d29de9a 100644
--- a/ingest/rules/nextclade.smk
+++ b/ingest/rules/nextclade.smk
@@ -2,9 +2,21 @@
 This part of the workflow handles running Nextclade on the curated metadata
 and sequences.
 
+REQUIRED INPUTS:
+
+    metadata = results/subset_metadata.tsv
+    sequences = results/sequences.fasta
+
+OUTPUTS:
+
+    metadata = results/metadata.tsv
+    nextclade = results/nextclade.tsv
+    alignment = results/alignment.fasta
+    translations = results/translations.zip
+
 See Nextclade docs for more details on usage, inputs, and outputs if you would
 like to customize the rules:
-https://docs.nextstrain.org/projects/nextclade/en/stable/user/nextclade-cli.html
+https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html
 """
 
 DATASET_NAME = config["nextclade"]["dataset_name"]
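
Note: both fetch approaches documented above are driven entirely by config values. A
hypothetical ingest config selecting between them (the values shown are placeholders):

    # config/defaults.yaml (sketch)
    ncbi_taxon_id: "1234"                        # used by the NCBI Datasets route
    # entrez_search_term: "yourpathogen[orgn]"   # only needed for the Entrez route

With the ruleorder from patch 06 left at its default, only `ncbi_taxon_id` is
required; switching the rule order makes `entrez_search_term` the relevant parameter.
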
From b8df91a117b496fd7dacfdf5d13392a9f2c6f2eb Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Thu, 16 Nov 2023 13:50:34 -0800
Subject: [PATCH 08/13] ingest: Add profile for Nextstrain automation

Adds a profile for Nextstrain automation that includes the config and rules
required to upload files to AWS S3.

I consider this a consolidation of all the different rules for uploading to
AWS S3 from the following pathogen workflows:
- https://github.com/nextstrain/ncov-ingest/blob/5aed3ed8b8e1bd0a8360f7135f1602df4e01d139/workflow/snakemake_rules/upload.smk
- https://github.com/nextstrain/monkeypox/blob/afb5513d9dea10e1993973a154844bf30cdbd335/ingest/workflow/snakemake_rules/upload.smk
- https://github.com/nextstrain/seasonal-flu/blob/06865c26a2d972252fde15735d001eaec261126f/profiles/upload/upload.smk

The files to upload are provided via the config, where the key is the remote
file and the value is the local file. This allows us to upload a local file to
multiple remotes, such as when we need to support multiple compression formats.

The S3 URL and CloudFront domain are defined as top level configs to allow for
easy overrides via the Snakemake `--config` option.
---
 ingest/README.md                               |  8 ++++
 .../nextstrain_automation/defaults.yaml        | 23 +++++++++
 .../profiles/nextstrain_automation/upload.smk  | 47 +++++++++++++++++++
 3 files changed, 78 insertions(+)
 create mode 100644 ingest/profiles/nextstrain_automation/defaults.yaml
 create mode 100644 ingest/profiles/nextstrain_automation/upload.smk

diff --git a/ingest/README.md b/ingest/README.md
index ff14a96..1713cea 100644
--- a/ingest/README.md
+++ b/ingest/README.md
@@ -34,6 +34,14 @@ The modules of the workflow are in separate files to keep the main ingest [Snake
 Modules are all [included](https://snakemake.readthedocs.io/en/stable/snakefiles/modularization.html#includes)
 in the main Snakefile in the order that they are expected to run.
 
+## Profiles
+
+The profiles directory contains custom configs and rules that override and/or
+extend the default workflow.
+
+- [nextstrain_automation](profiles/nextstrain_automation/) - profile for the internal automated Nextstrain builds.
+
+
 ## Vendored
 
 This repository uses [`git subrepo`](https://github.com/ingydotnet/git-subrepo)
diff --git a/ingest/profiles/nextstrain_automation/defaults.yaml b/ingest/profiles/nextstrain_automation/defaults.yaml
new file mode 100644
index 0000000..4288452
--- /dev/null
+++ b/ingest/profiles/nextstrain_automation/defaults.yaml
@@ -0,0 +1,23 @@
+# This configuration file should contain all required configuration parameters
+# for the ingest workflow to run with additional Nextstrain automation rules.
+
+# Custom rules to run as part of the Nextstrain automated workflow
+# The paths should be relative to the ingest directory.
+custom_rules:
+  - profiles/nextstrain_automation/upload.smk
+
+# Nextstrain CloudFront domain to ensure that we invalidate CloudFront after the S3 uploads
+# This is required as long as we are using the AWS CLI for uploads
+cloudfront_domain: "data.nextstrain.org"
+
+# Nextstrain AWS S3 Bucket with pathogen prefix
+# Replace <pathogen> with the pathogen repo name.
+s3_dst: "s3://nextstrain-data/files/workflows/<pathogen>"
+
+# Mapping of files to upload
+files_to_upload:
+  ncbi.ndjson.zst: data/ncbi.ndjson
+  metadata.tsv.zst: results/metadata.tsv
+  sequences.fasta.zst: results/sequences.fasta
+  alignments.fasta.zst: results/alignment.fasta
+  translations.zip: results/translations.zip
diff --git a/ingest/profiles/nextstrain_automation/upload.smk b/ingest/profiles/nextstrain_automation/upload.smk
new file mode 100644
index 0000000..1ecec4b
--- /dev/null
+++ b/ingest/profiles/nextstrain_automation/upload.smk
@@ -0,0 +1,47 @@
+"""
+This part of the workflow handles uploading files to AWS S3.
+
+Files to upload must be defined in the `files_to_upload` config param, where
+the keys are the remote files and the values are the local filepaths
+relative to the ingest directory.
+
+Produces a single file for each uploaded file:
+    "results/upload/{remote_file}.upload"
+
+The rule `upload_all` can be used as a target to upload all files.
+"""
+import os
+
+slack_envvars_defined = "SLACK_CHANNELS" in os.environ and "SLACK_TOKEN" in os.environ
+send_notifications = (
+    config.get("send_slack_notifications", False) and slack_envvars_defined
+)
+
+
+rule upload_to_s3:
+    input:
+        file_to_upload=lambda wildcards: config["files_to_upload"][wildcards.remote_file],
+    output:
+        "results/upload/{remote_file}.upload",
+    params:
+        quiet="" if send_notifications else "--quiet",
+        s3_dst=config["s3_dst"],
+        cloudfront_domain=config["cloudfront_domain"],
+    shell:
+        """
+        ./vendored/upload-to-s3 \
+            {params.quiet} \
+            {input.file_to_upload:q} \
+            {params.s3_dst:q}/{wildcards.remote_file:q} \
+            {params.cloudfront_domain} 2>&1 | tee {output}
+        """
+
+
+rule upload_all:
+    input:
+        uploads=[
+            f"results/upload/{remote_file}.upload"
+            for remote_file in config["files_to_upload"].keys()
+        ],
+    output:
+        touch("results/upload_all.done")
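
Note: since the uploads are an extension of the workflow, the `upload_all` target has
to be requested explicitly. A sketch of the invocation from the ingest directory
(assuming AWS credentials are already available in the environment):

    snakemake upload_all --configfile profiles/nextstrain_automation/defaults.yaml

The workflow's own `configfile: "config/defaults.yaml"` still loads first; values from
`--configfile` override it and pull in the profile's `custom_rules`.
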
From 5e9d49ec00b64876eb64776c9278f13a5b481e5d Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Thu, 9 Nov 2023 14:51:13 -0800
Subject: [PATCH 09/13] Add top level `shared` directory

See the included shared/README.md for the use and reasoning behind this
directory.
---
 shared/README.md | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 shared/README.md

diff --git a/shared/README.md b/shared/README.md
new file mode 100644
index 0000000..332320c
--- /dev/null
+++ b/shared/README.md
@@ -0,0 +1,17 @@
+# Shared
+
+> **Warning**
+> Please be aware of the multiple workflows that will be affected when editing files in this directory!
+
+This is expected to be the directory that holds files that are shared across multiple workflows.
+
+Instead of cross-referencing between workflows when needing to share files,
+just have all workflows use this top level directory. This allows us to be
+abundantly clear that updating files in this `shared` directory will affect multiple workflows.
+
+Potential files that could be in this directory:
+- `clades.tsv` - clade definitions
+- `outliers.txt` - list of outliers to exclude
+- `lat_longs.tsv` - location coordinates
+- `mask.bed` - specific coordinates to mask in sequences
+- `reference.fasta` - reference sequence
From aa0b2cacbf648165c35f65c0e200682c6c40af49 Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Wed, 15 Nov 2023 16:58:31 -0800
Subject: [PATCH 10/13] phylogenetic: Add input/output docs to rules

Document the inputs and outputs that should be kept as consistent interfaces
across pathogen repos. This acts as a guide for the typical steps that are
required for the phylogenetic workflow, but still allows pathogen specific
customizations within each step.
---
 phylogenetic/rules/annotate_phylogeny.smk  | 30 +++++++++++++++++++++---
 phylogenetic/rules/construct_phylogeny.smk | 19 +++++++++++++++---
 phylogenetic/rules/export.smk              | 23 +++++++++++++++++++++--
 phylogenetic/rules/prepare_sequences.smk   | 25 ++++++++++++++++++-------
 4 files changed, 80 insertions(+), 17 deletions(-)

diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk
index 9399b01..facc35f 100644
--- a/phylogenetic/rules/annotate_phylogeny.smk
+++ b/phylogenetic/rules/annotate_phylogeny.smk
@@ -1,8 +1,32 @@
 """
 This part of the workflow creates additional annotations for the phylogenetic tree.
 
-This part of the workflow expects a single Newick tree and any additional files
-needed to create the annotations such as the aligned FASTA and metadata file.
+REQUIRED INPUTS:
+
+    metadata = data/metadata.tsv
+    prepared_sequences = results/prepared_sequences.fasta
+    tree = results/tree.nwk
+
+OUTPUTS:
+
+    node_data = results/*.json
+
+    There are no required outputs for this part of the workflow as it depends
+    on which annotations are created. All outputs are expected to be node data
+    JSON files that can be fed into `augur export`.
 
-This will produce one or more node data JSONs.
+    See Nextstrain's data format docs for more details on node data JSONs:
+    https://docs.nextstrain.org/page/reference/data-formats.html
+
+This part of the workflow usually includes the following steps:
+
+    - augur traits
+    - augur ancestral
+    - augur translate
+    - augur clades
+
+See Augur's usage docs for these commands for more details.
+
+Custom node data files can also be produced by build-specific scripts in addition
+to the ones produced by Augur commands.
 """
diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk
index 41b0cac..7652005 100644
--- a/phylogenetic/rules/construct_phylogeny.smk
+++ b/phylogenetic/rules/construct_phylogeny.smk
@@ -1,9 +1,20 @@
 """
 This part of the workflow constructs the phylogenetic tree.
 
-This part of the workflow expects a single aligned FASTA file.
-If constructing a time-resolved tree, it will also require a metadata file
-that includes a sample date for each sequence.
+REQUIRED INPUTS:
+
+    metadata = data/metadata.tsv
+    prepared_sequences = results/prepared_sequences.fasta
+
+OUTPUTS:
+
+    tree = results/tree.nwk
+    branch_lengths = results/branch_lengths.json
 
-This will produce a Newick tree and a branch lengths JSON file.
+This part of the workflow usually includes the following steps:
+
+    - augur tree
+    - augur refine
+
+See Augur's usage docs for these commands for more details.
 """
diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk
index f6d26c0..a273659 100644
--- a/phylogenetic/rules/export.smk
+++ b/phylogenetic/rules/export.smk
@@ -2,6 +2,25 @@
 This part of the workflow collects the phylogenetic tree and annotations to
 export a Nextstrain dataset.
 
-This part of the workflow expects a single Newick tree and at least one
-node data JSON.
+REQUIRED INPUTS:
+
+    metadata = data/metadata.tsv
+    tree = results/tree.nwk
+    branch_lengths = results/branch_lengths.json
+    node_data = results/*.json
+
+OUTPUTS:
+
+    auspice_json = auspice/${build_name}.json
+
+    There are optional sidecar JSON files that can be exported as part of the dataset.
+    See Nextstrain's data format docs for more details on sidecar files:
+    https://docs.nextstrain.org/page/reference/data-formats.html
+
+This part of the workflow usually includes the following steps:
+
+    - augur export v2
+    - augur frequencies
+
+See Augur's usage docs for these commands for more details.
 """
diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk
index d1276b4..c1c9e22 100644
--- a/phylogenetic/rules/prepare_sequences.smk
+++ b/phylogenetic/rules/prepare_sequences.smk
@@ -1,13 +1,22 @@
 """
 This part of the workflow prepares sequences for constructing the phylogenetic tree.
 
-This usually includes the following steps:
-  - filtering
-  - subsampling
-  - indexing
-  - aligning
-  - masking
+REQUIRED INPUTS:
 
-This part of the workflow expects a metadata and FASTA file as inputs
-and will produce a FASTA file of prepared sequences as an output.
+    metadata = data/metadata.tsv
+    sequences = data/sequences.fasta
+    reference = ../shared/reference.fasta
+
+OUTPUTS:
+
+    prepared_sequences = results/prepared_sequences.fasta
+
+This part of the workflow usually includes the following steps:
+
+    - augur index
+    - augur filter
+    - augur align
+    - augur mask
+
+See Augur's usage docs for these commands for more details.
 """
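
Note: the interfaces above intentionally leave rule bodies to each pathogen repo. As
a sketch of how a repo might satisfy the construct-phylogeny contract with the
documented Augur commands (the intermediate file name and layout are illustrative,
not prescribed by the guide):

    rule tree:
        input:
            alignment="results/prepared_sequences.fasta",
        output:
            tree="results/tree_raw.nwk",
        shell:
            """
            augur tree \
                --alignment {input.alignment} \
                --output {output.tree}
            """

`augur refine` would then combine the raw tree with data/metadata.tsv to produce the
interface outputs results/tree.nwk and results/branch_lengths.json.
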
From ae7e1b141184d37ecf88a5a60f277cf3ae4c1533 Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Thu, 16 Nov 2023 15:05:06 -0800
Subject: [PATCH 11/13] phylogenetic: Add profile for CI build

Adds a profile for the CI build that starts with the example data that is
hosted in the pathogen repo. This is useful both for the CI test via GitHub
Actions and as an easy way for users to run a small example build.
---
 phylogenetic/README.md                         |  6 ++++++
 phylogenetic/example_data/metadata.tsv         |  0
 phylogenetic/example_data/sequences.fasta      |  0
 phylogenetic/profiles/ci/copy_example_data.smk | 17 +++++++++++++++++
 phylogenetic/profiles/ci/defaults.yaml         |  7 +++++++
 5 files changed, 30 insertions(+)
 create mode 100644 phylogenetic/example_data/metadata.tsv
 create mode 100644 phylogenetic/example_data/sequences.fasta
 create mode 100644 phylogenetic/profiles/ci/copy_example_data.smk
 create mode 100644 phylogenetic/profiles/ci/defaults.yaml

diff --git a/phylogenetic/README.md b/phylogenetic/README.md
index 867d024..b1c7d5a 100644
--- a/phylogenetic/README.md
+++ b/phylogenetic/README.md
@@ -28,5 +28,11 @@ The modules of the workflow are in separate files to keep the main ingest [Snake
 Modules are all [included](https://snakemake.readthedocs.io/en/stable/snakefiles/modularization.html#includes)
 in the main Snakefile in the order that they are expected to run.
 
+## Profiles
+
+The profiles directory contains custom configs and rules that override and/or
+extend the default workflow.
+
+- [ci](profiles/ci/) - profile for the CI build that runs with example data
 
 [Nextstrain datasets]: https://docs.nextstrain.org/en/latest/reference/glossary.html#term-dataset
diff --git a/phylogenetic/example_data/metadata.tsv b/phylogenetic/example_data/metadata.tsv
new file mode 100644
index 0000000..e69de29
diff --git a/phylogenetic/example_data/sequences.fasta b/phylogenetic/example_data/sequences.fasta
new file mode 100644
index 0000000..e69de29
diff --git a/phylogenetic/profiles/ci/copy_example_data.smk b/phylogenetic/profiles/ci/copy_example_data.smk
new file mode 100644
index 0000000..4e47ee4
--- /dev/null
+++ b/phylogenetic/profiles/ci/copy_example_data.smk
@@ -0,0 +1,17 @@
+rule copy_example_data:
+    input:
+        sequences="example_data/sequences.fasta",
+        metadata="example_data/metadata.tsv",
+    output:
+        sequences="data/sequences.fasta",
+        metadata="data/metadata.tsv",
+    shell:
+        """
+        cp -f {input.sequences} {output.sequences}
+        cp -f {input.metadata} {output.metadata}
+        """
+
+# Add a Snakemake ruleorder directive here if you need to resolve ambiguous rules
+# that have the same output as the copy_example_data rule.
+
+# ruleorder: copy_example_data > ...
diff --git a/phylogenetic/profiles/ci/defaults.yaml b/phylogenetic/profiles/ci/defaults.yaml
new file mode 100644
index 0000000..8542d43
--- /dev/null
+++ b/phylogenetic/profiles/ci/defaults.yaml
@@ -0,0 +1,7 @@
+# This configuration file contains the custom configuration parameters
+# for the CI workflow to run with the example data.
+
+# Custom rules to run as part of the CI automated workflow
+# The paths should be relative to the phylogenetic directory.
+custom_rules:
+  - profiles/ci/copy_example_data.smk
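
Note: a sketch of how the CI profile would be invoked from the phylogenetic directory
(the example data files above are intentionally empty placeholders, so a real repo
fills them in before this run produces anything meaningful):

    snakemake --configfile profiles/ci/defaults.yaml

The profile's `custom_rules` pull in copy_example_data.smk, whose outputs
data/sequences.fasta and data/metadata.tsv feed the normal prepare-sequences step.
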
""" diff --git a/nextclade/rules/construct_phylogeny.smk b/nextclade/rules/construct_phylogeny.smk index acdf4f6..cae9e7a 100644 --- a/nextclade/rules/construct_phylogeny.smk +++ b/nextclade/rules/construct_phylogeny.smk @@ -1,6 +1,20 @@ """ This part of the workflow constructs the reference tree for the Nextclade dataset -This part of the workflow expects a single aligned FASTA file and -will produce a Newick tree and a branch lengths JSON file. +REQUIRED INPUTS: + + metadata = data/metadata.tsv + prepared_sequences = results/prepared_sequences.fasta + +OUTPUTS: + + tree = results/tree.nwk + branch_lengths = results/branch_lengths.json + +This part of the workflow usually includes the following steps: + + - augur tree + - augur refine + +See Augur's usage docs for these commands for more details. """ diff --git a/nextclade/rules/export.smk b/nextclade/rules/export.smk index d8557ef..aec5acb 100644 --- a/nextclade/rules/export.smk +++ b/nextclade/rules/export.smk @@ -1,7 +1,34 @@ """ This part of the workflow collects the phylogenetic tree and annotations to -export a reference tree for the Nextclade dataset. +export a reference tree and create the Nextclade dataset. -This part of the workflow expects a single Newick tree and at least one -node data JSON. +REQUIRED INPUTS: + + TODO: Confirm inputs for Nextclade v3 + reference = ../shared/reference.fasta + pathogen = config/pathogen.json + genome_annotation = config/genome_annotation.gff3 + readme = config/README.md + changelog = config/CHANGELOG.md + metadata = data/metadata.tsv + tree = results/tree.nwk + branch_lengths = results/branch_lengths.json + nt_muts = results/nt_muts.json + aa_muts = results/aa_muts.json + clades = results/clades.json + + +OUTPUTS: + + nextclade_dataset = datasets/${build_name}/* + + See Nextclade docs on expected naming conventions of dataset files + https://docs.nextstrain.org/projects/nextclade/page/user/datasets.html + +This part of the workflow usually includes the following steps: + + - augur export v2 + - cp Nextclade datasets files to new datasets directory + +See Augur's usage docs for these commands for more details. """ diff --git a/nextclade/rules/prepare_sequences.smk b/nextclade/rules/prepare_sequences.smk index f4e4cbd..0f24d55 100644 --- a/nextclade/rules/prepare_sequences.smk +++ b/nextclade/rules/prepare_sequences.smk @@ -2,14 +2,22 @@ This part of the workflow prepares sequences for constructing the reference tree of the Nextclade dataset. -This usually includes the following steps: +REQUIRED INPUTS: - - filtering - - subsampling - - indexing - - aligning - - masking + metadata = data/metadata.tsv + sequences = data/sequences.fasta + reference = ../shared/reference.fasta -This part of the workflow expects a metadata and FASTA file as inputs -and will produce a FASTA file of prepared sequences as an output. +OUTPUTS: + + prepared_sequences = results/prepared_sequences.fasta + +This part of the workflow usually includes the following steps: + + - augur index + - augur filter + - nextclade run + - augur mask + +See Nextclade's and Augur's usage docs for these commands for more details. """ diff --git a/nextclade/rules/preprocess.smk b/nextclade/rules/preprocess.smk index d7bac40..b0dca6c 100644 --- a/nextclade/rules/preprocess.smk +++ b/nextclade/rules/preprocess.smk @@ -1,4 +1,18 @@ """ This part of the workflow preprocesses any data and files related to the lineages/clades designations of the pathogen. 
From e52f4ac888531a0363b6a30f9873ec508626d139 Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Thu, 16 Nov 2023 18:46:54 -0800
Subject: [PATCH 13/13] nextclade: Add profile for testing dataset

Adds a profile for testing a dataset with `nextclade run`.
---
 nextclade/profiles/test_dataset/defaults.yaml |  7 +++++++
 .../profiles/test_dataset/test_dataset.smk    | 19 +++++++++++++++++++
 2 files changed, 26 insertions(+)
 create mode 100644 nextclade/profiles/test_dataset/defaults.yaml
 create mode 100644 nextclade/profiles/test_dataset/test_dataset.smk

diff --git a/nextclade/profiles/test_dataset/defaults.yaml b/nextclade/profiles/test_dataset/defaults.yaml
new file mode 100644
index 0000000..ce8a6f4
--- /dev/null
+++ b/nextclade/profiles/test_dataset/defaults.yaml
@@ -0,0 +1,7 @@
+# This configuration file should contain all the required configuration parameters
+# for the Nextclade workflow to do a test run with a created dataset.
+
+# Custom rules to run as part of the testing workflow
+# The paths should be relative to the nextclade directory.
+custom_rules:
+  - profiles/test_dataset/test_dataset.smk
diff --git a/nextclade/profiles/test_dataset/test_dataset.smk b/nextclade/profiles/test_dataset/test_dataset.smk
new file mode 100644
index 0000000..b34b22c
--- /dev/null
+++ b/nextclade/profiles/test_dataset/test_dataset.smk
@@ -0,0 +1,19 @@
+rule test_dataset:
+    input:
+        tree="datasets/{build_name}/tree.json",
+        pathogen_json="datasets/{build_name}/pathogen.json",
+        sequences="datasets/{build_name}/sequences.fasta",
+        annotation="datasets/{build_name}/genome_annotation.gff3",
+        readme="datasets/{build_name}/README.md",
+        changelog="datasets/{build_name}/CHANGELOG.md",
+    output:
+        outdir=directory("test_output/{build_name}"),
+    params:
+        dataset_dir="datasets/{build_name}",
+    shell:
+        """
+        nextclade run \
+            {input.sequences} \
+            --input-dataset {params.dataset_dir} \
+            --output-all {output.outdir}
+        """
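
Note: a sketch of how this test profile would be used once a dataset has been built
(the build name is a placeholder):

    snakemake test_output/yourpathogen --configfile profiles/test_dataset/defaults.yaml

Requesting the `test_dataset` rule's output directory triggers `nextclade run` against
the freshly produced dataset under datasets/yourpathogen/.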