diff --git a/ingest/README.md b/ingest/README.md
index ff14a96..1713cea 100644
--- a/ingest/README.md
+++ b/ingest/README.md
@@ -34,6 +34,14 @@ The modules of the workflow are in separate files to keep the main ingest [Snake
 Modules are all [included](https://snakemake.readthedocs.io/en/stable/snakefiles/modularization.html#includes)
 in the main Snakefile in the order that they are expected to run.
 
+## Profiles
+
+The profiles directory contains custom configs and rules that override and/or
+extend the default workflow.
+
+- [nextstrain_automation](profiles/nextstrain_automation/) - profile for the internal automated Nextstrain builds.
+
+
 ## Vendored
 
 This repository uses [`git subrepo`](https://github.com/ingydotnet/git-subrepo)
diff --git a/ingest/Snakefile b/ingest/Snakefile
index 121fe54..6bf5b5b 100644
--- a/ingest/Snakefile
+++ b/ingest/Snakefile
@@ -1,21 +1,44 @@
+"""
+This is the main ingest Snakefile that orchestrates the full ingest workflow
+and defines its default outputs.
+"""
+
 # Use default configuration values. Override with Snakemake's --configfile/--config options.
 configfile: "config/defaults.yaml"
 
+# This is the default rule that Snakemake will run when there are no specified targets.
+# The default output of the ingest workflow is usually the curated metadata and sequences.
+# Nextstrain-maintained ingest workflows will produce metadata files with the
+# standard Nextstrain fields plus additional fields that are pathogen specific.
+# We recommend using these standard fields in custom ingests as well to minimize
+# the customizations you will need for the downstream phylogenetic workflow.
+# TODO: Add link to centralized docs on standard Nextstrain metadata fields
 rule all:
     input:
         "results/sequences.fasta",
         "results/metadata.tsv",
 
+# Note that only PATHOGEN-level customizations should be added to these
+# core steps, meaning they are custom rules necessary for all builds of the pathogen.
+# If there are build-specific customizations, they should be added with the
+# custom_rules imported below to ensure that the core workflow is not complicated
+# by build-specific rules.
 include: "rules/fetch_from_ncbi.smk"
 include: "rules/curate.smk"
 
-# If included, the nextclade rules will create the final metadata TSV by
-# joining the Nextclade output with the metadata.
-# However, if not including nextclade, we have to rename the subset metadata TSV
+# We are pushing to standardize ingest workflows with Nextclade runs to include
+# Nextclade outputs in our publicly hosted data. However, creating a Nextclade
+# dataset requires curated data as input, so we make the Nextclade steps
+# optional until a dataset exists for the pathogen.
+#
+# If Nextclade config values are included, the nextclade rules will create the
+# final metadata TSV by joining the Nextclade output with the metadata.
+# If Nextclade configs are not included, we rename the subset metadata TSV
 # to the final metadata TSV.
 if "nextclade" in config:
+
     include: "rules/nextclade.smk"
 
 else:
@@ -29,3 +52,18 @@ else:
         """
         mv {input.metadata} {output.metadata}
        """
+
+# Allow users to import custom rules provided via the config.
+# This allows users to run custom rules that can extend or override the workflow.
+# A concrete example of using custom rules is the extension of the workflow with
+# rules that upload files and send internal Slack notifications for the automated
+# Nextstrain builds.
+# For extensions, the user will have to specify the custom rule targets when
+# running the workflow.
+# For overrides, the custom Snakefile will have to use the `ruleorder` directive
+# to allow Snakemake to handle ambiguous rules:
+# https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#handling-ambiguous-rules
+if "custom_rules" in config:
+    for rule_file in config["custom_rules"]:
+
+        include: rule_file
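
A sketch of the override mechanism described above: a file listed under `custom_rules` can shadow a core rule's output and declare a `ruleorder` so Snakemake prefers it. All names here are hypothetical (the template ships no `override_metadata.smk`, and `rename_metadata` stands in for whichever core rule is being overridden):

```snakemake
# profiles/example/override_metadata.smk -- hypothetical custom rule file,
# enabled by listing it under `custom_rules` in a config file.

# Both rules can produce results/metadata.tsv, so tell Snakemake which wins.
ruleorder: custom_final_metadata > rename_metadata

rule custom_final_metadata:
    input:
        metadata="results/subset_metadata.tsv",
    output:
        metadata="results/metadata.tsv",
    shell:
        """
        # Placeholder for whatever build-specific transformation is needed.
        cp {input.metadata} {output.metadata}
        """
```
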
diff --git a/ingest/profiles/nextstrain_automation/defaults.yaml b/ingest/profiles/nextstrain_automation/defaults.yaml
new file mode 100644
index 0000000..4288452
--- /dev/null
+++ b/ingest/profiles/nextstrain_automation/defaults.yaml
@@ -0,0 +1,23 @@
+# This configuration file should contain all required configuration parameters
+# for the ingest workflow to run with the additional Nextstrain automation rules.
+
+# Custom rules to run as part of the Nextstrain automated workflow
+# The paths should be relative to the ingest directory.
+custom_rules:
+  - profiles/nextstrain_automation/upload.smk
+
+# Nextstrain CloudFront domain to ensure we invalidate CloudFront after the S3 uploads.
+# This is required as long as we are using the AWS CLI for uploads.
+cloudfront_domain: "data.nextstrain.org"
+
+# Nextstrain AWS S3 bucket with the pathogen prefix.
+# Complete the prefix with the pathogen repo name.
+s3_dst: "s3://nextstrain-data/files/workflows/"
+
+# Mapping of remote file names to the local filepaths of the files to upload
+files_to_upload:
+  ncbi.ndjson.zst: data/ncbi.ndjson
+  metadata.tsv.zst: results/metadata.tsv
+  sequences.fasta.zst: results/sequences.fasta
+  alignments.fasta.zst: results/alignment.fasta
+  translations.zip: results/translations.zip
diff --git a/ingest/profiles/nextstrain_automation/upload.smk b/ingest/profiles/nextstrain_automation/upload.smk
new file mode 100644
index 0000000..1ecec4b
--- /dev/null
+++ b/ingest/profiles/nextstrain_automation/upload.smk
@@ -0,0 +1,49 @@
+"""
+This part of the workflow handles uploading files to AWS S3.
+
+Files to upload must be defined in the `files_to_upload` config param, where
+the keys are the remote files and the values are the local filepaths
+relative to the ingest directory.
+
+Produces a single file for each uploaded file:
+    "results/upload/{remote_file}.upload"
+
+The rule `upload_all` can be used as a target to upload all files.
+"""
+import os
+
+slack_envvars_defined = "SLACK_CHANNELS" in os.environ and "SLACK_TOKEN" in os.environ
+send_notifications = (
+    config.get("send_slack_notifications", False) and slack_envvars_defined
+)
+
+
+rule upload_to_s3:
+    input:
+        # Use an input function here: `wildcards` is not defined at rule
+        # parse time, only inside input functions and other rule callables.
+        file_to_upload=lambda wildcards: config["files_to_upload"][wildcards.remote_file],
+    output:
+        "results/upload/{remote_file}.upload",
+    params:
+        quiet="" if send_notifications else "--quiet",
+        s3_dst=config["s3_dst"],
+        cloudfront_domain=config["cloudfront_domain"],
+    shell:
+        """
+        ./vendored/upload-to-s3 \
+            {params.quiet} \
+            {input.file_to_upload:q} \
+            {params.s3_dst:q}/{wildcards.remote_file:q} \
+            {params.cloudfront_domain} 2>&1 | tee {output}
+        """
+
+
+rule upload_all:
+    input:
+        uploads=[
+            f"results/upload/{remote_file}.upload"
+            for remote_file in config["files_to_upload"].keys()
+        ],
+    output:
+        touch("results/upload_all.done")
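
With this profile, uploads are triggered by targeting `upload_all`. One plausible invocation from the `ingest` directory (an illustration, not a documented entry point; it assumes AWS credentials, and optionally `SLACK_TOKEN`/`SLACK_CHANNELS`, are set in the environment):

```bash
# config/defaults.yaml is already loaded by the Snakefile's configfile
# directive; the automation profile is layered on top of it.
snakemake --cores 4 \
    --configfile profiles/nextstrain_automation/defaults.yaml \
    upload_all
```
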
+""" +import os + +slack_envvars_defined = "SLACK_CHANNELS" in os.environ and "SLACK_TOKEN" in os.environ +send_notifications = ( + config.get("send_slack_notifications", False) and slack_envvars_defined +) + + +rule upload_to_s3: + input: + file_to_upload=config["files_to_upload"][wildcards.remote_file], + output: + "results/upload/{remote_file}.upload", + params: + quiet="" if send_notifications else "--quiet", + s3_dst=config["s3_dst"], + cloudfront_domain=config["cloudfront_domain"], + shell: + """ + ./vendored/upload-to-s3 \ + {params.quiet} \ + {input.file_to_upload:q} \ + {params.s3_dst:q}/{wildcards.remote_file:q} \ + {params.cloudfront_domain} 2>&1 | tee {output} + """ + + +rule upload_all: + input: + uploads=[ + f"results/upload/{remote_file}.upload" + for remote_file in config["files_to_upload"].keys() + ], + output: + touch("results/upload_all.done") diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index b9e2543..6a7168a 100644 --- a/ingest/rules/curate.smk +++ b/ingest/rules/curate.smk @@ -1,9 +1,15 @@ """ -This part of the workflow handles the curation of metadata for sequences -from NCBI and outputs the clean data as two separate files: +This part of the workflow handles the curation of data from NCBI + +REQUIRED INPUTS: + + ndjson = data/ncbi.ndjson + +OUTPUTS: + + metadata = results/subset_metadata.tsv + seuqences = results/sequences.fasta - - results/subset_metadata.tsv - - results/sequences.fasta """ diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk index 3eec150..8b8c064 100644 --- a/ingest/rules/fetch_from_ncbi.smk +++ b/ingest/rules/fetch_from_ncbi.smk @@ -1,60 +1,38 @@ """ -This part of the workflow handles fetching sequences and metadata from NCBI -and outputs them as a single NDJSON file that can be directly fed into the -curation pipeline. +This part of the workflow handles fetching sequences and metadata from NCBI. -There are two different approaches for fetching data from NCBI. -Choose the one that works best for the pathogen data and remove the rules related -to the other approaches. +REQUIRED INPUTS: -1. Fetch from Entrez (https://www.ncbi.nlm.nih.gov/books/NBK25501/) - - Returns all available data via a GenBank file - - Requires a custom script to parse the necessary fields from the GenBank file - -2. Fetch with NCBI Datasets (https://www.ncbi.nlm.nih.gov/datasets/) - - Directly returns NDJSON without custom parsing - - Fastest option for large datasets (e.g. SARS-CoV-2) - - Only returns metadata fields that are available through NCBI Datasets - - Example is written for viral data, please see offical NCBI Datasets docs for other genomes -""" + None -########################################################################### -########################## 1. Fetch from Entrez ########################### -########################################################################### +OUTPUTS: + ndjson = data/ncbi.ndjson -rule fetch_from_ncbi_entrez: - params: - term=config["entrez_search_term"], - output: - genbank="data/genbank.gb", - # Allow retries in case of network errors - retries: 5 - benchmark: - "benchmarks/fetch_from_ncbi_entrez.txt" - shell: - """ - vendored/fetch-from-ncbi-entrez \ - --term {params.term:q} \ - --output {output.genbank} - """ +There are two different approaches for fetching data from NCBI. +Choose the one that works best for the pathogen data and edit the workflow config +to provide the correct parameter. +1. 
diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk
index 3eec150..8b8c064 100644
--- a/ingest/rules/fetch_from_ncbi.smk
+++ b/ingest/rules/fetch_from_ncbi.smk
@@ -1,60 +1,38 @@
 """
-This part of the workflow handles fetching sequences and metadata from NCBI
-and outputs them as a single NDJSON file that can be directly fed into the
-curation pipeline.
+This part of the workflow handles fetching sequences and metadata from NCBI.
 
-There are two different approaches for fetching data from NCBI.
-Choose the one that works best for the pathogen data and remove the rules related
-to the other approaches.
+REQUIRED INPUTS:
 
-1. Fetch from Entrez (https://www.ncbi.nlm.nih.gov/books/NBK25501/)
-   - Returns all available data via a GenBank file
-   - Requires a custom script to parse the necessary fields from the GenBank file
-
-2. Fetch with NCBI Datasets (https://www.ncbi.nlm.nih.gov/datasets/)
-   - Directly returns NDJSON without custom parsing
-   - Fastest option for large datasets (e.g. SARS-CoV-2)
-   - Only returns metadata fields that are available through NCBI Datasets
-   - Example is written for viral data, please see offical NCBI Datasets docs for other genomes
-"""
+    None
 
-###########################################################################
-########################## 1. Fetch from Entrez ###########################
-###########################################################################
+OUTPUTS:
 
+    ndjson = data/ncbi.ndjson
 
-rule fetch_from_ncbi_entrez:
-    params:
-        term=config["entrez_search_term"],
-    output:
-        genbank="data/genbank.gb",
-    # Allow retries in case of network errors
-    retries: 5
-    benchmark:
-        "benchmarks/fetch_from_ncbi_entrez.txt"
-    shell:
-        """
-        vendored/fetch-from-ncbi-entrez \
-            --term {params.term:q} \
-            --output {output.genbank}
-        """
+There are two different approaches for fetching data from NCBI.
+Choose the one that works best for the pathogen data and edit the workflow config
+to provide the correct parameter.
 
+1. Fetch with NCBI Datasets (https://www.ncbi.nlm.nih.gov/datasets/)
+   - requires `ncbi_taxon_id` config
+   - Directly returns NDJSON without custom parsing
+   - Fastest option for large datasets (e.g. SARS-CoV-2)
+   - Only returns metadata fields that are available through NCBI Datasets
+   - Only works for viral genomes
 
-rule parse_genbank_to_ndjson:
-    input:
-        genbank="data/genbank.gb",
-    output:
-        ndjson="data/ncbi.ndjson",
-    benchmark:
-        "benchmarks/parse_genbank_to_ndjson.txt"
-    shell:
-        """
-        # Add in custom script to parse needed fields from GenBank file to NDJSON file
-        """
+2. Fetch from Entrez (https://www.ncbi.nlm.nih.gov/books/NBK25501/)
+   - requires `entrez_search_term` config
+   - Returns all available data via a GenBank file
+   - Requires a custom script to parse the necessary fields from the GenBank file
+"""
 
+# This ruleorder determines which rule produces the final NCBI NDJSON file.
+# The default is set to use NCBI Datasets since it does not require a custom script.
+# Switch the rule order if you plan to use Entrez.
+ruleorder: format_ncbi_datasets_ndjson > parse_genbank_to_ndjson
 
 ###########################################################################
-####################### 2. Fetch from NCBI Datasets #######################
+####################### 1. Fetch from NCBI Datasets #######################
 ###########################################################################
@@ -129,7 +107,7 @@ rule format_ncbi_dataset_report:
         ncbi_dataset_tsv=temp("data/ncbi_dataset_report.tsv"),
     params:
         fields_to_include=_get_ncbi_dataset_field_mnemonics(
-            config["ncbi_dataset_fields"]
+            config.get("ncbi_dataset_fields", [])
        ),
     benchmark:
         "benchmarks/format_ncbi_dataset_report.txt"
@@ -167,3 +145,38 @@ rule format_ncbi_datasets_ndjson:
         --duplicate-reporting warn \
         2> {log} > {output.ndjson}
         """
+
+
+###########################################################################
+########################## 2. Fetch from Entrez ###########################
+###########################################################################
+
+
+rule fetch_from_ncbi_entrez:
+    params:
+        term=config["entrez_search_term"],
+    output:
+        genbank="data/genbank.gb",
+    # Allow retries in case of network errors
+    retries: 5
+    benchmark:
+        "benchmarks/fetch_from_ncbi_entrez.txt"
+    shell:
+        """
+        vendored/fetch-from-ncbi-entrez \
+            --term {params.term:q} \
+            --output {output.genbank}
+        """
+
+
+rule parse_genbank_to_ndjson:
+    input:
+        genbank="data/genbank.gb",
+    output:
+        ndjson="data/ncbi.ndjson",
+    benchmark:
+        "benchmarks/parse_genbank_to_ndjson.txt"
+    shell:
+        """
+        # Add in custom script to parse needed fields from GenBank file to NDJSON file
+        """
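
If the pathogen needs the Entrez path instead, the switch described in the comment above is a one-line change in the Snakefile, plus providing `entrez_search_term` (rather than `ncbi_taxon_id`) in the config:

```snakemake
# Prefer Entrez over NCBI Datasets when both rules could produce data/ncbi.ndjson.
ruleorder: parse_genbank_to_ndjson > format_ncbi_datasets_ndjson
```
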
diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
index 4d69cd7..d29de9a 100644
--- a/ingest/rules/nextclade.smk
+++ b/ingest/rules/nextclade.smk
@@ -2,9 +2,21 @@
 This part of the workflow handles running Nextclade on the curated metadata
 and sequences.
 
+REQUIRED INPUTS:
+
+    metadata = results/subset_metadata.tsv
+    sequences = results/sequences.fasta
+
+OUTPUTS:
+
+    metadata = results/metadata.tsv
+    nextclade = results/nextclade.tsv
+    alignment = results/alignment.fasta
+    translations = results/translations.zip
+
 See Nextclade docs for more details on usage, inputs, and outputs if you would
 like to customize the rules:
-https://docs.nextstrain.org/projects/nextclade/en/stable/user/nextclade-cli.html
+https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html
 """
 
 DATASET_NAME = config["nextclade"]["dataset_name"]
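
For reference, the optional Nextclade steps are switched on by the mere presence of a top-level `nextclade` key in the config. `dataset_name` is the only key this diff demonstrably reads (via `config["nextclade"]["dataset_name"]`); the value below is a placeholder:

```yaml
# In config/defaults.yaml, or a config file layered on via --configfile:
nextclade:
  dataset_name: "nextstrain/example-pathogen"  # placeholder dataset name
```
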
diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index f87d384..0dce54c 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -1,15 +1,46 @@
+"""
+This is the main Nextclade Snakefile that orchestrates the workflow to produce
+a Nextclade dataset.
+"""
+
 # Use default configuration values. Override with Snakemake's --configfile/--config options.
 configfile: "config/defaults.yaml"
 
-
+# This is the default rule that Snakemake will run when there are no specified targets.
+# The default output of the Nextclade workflow is usually the produced Nextclade dataset.
+# See Nextclade docs on the expected naming conventions of dataset files:
+# https://docs.nextstrain.org/projects/nextclade/page/user/datasets.html
 rule all:
     input:
-        # Fill in path to the final exported Auspice JSON
-        auspice_json="",
+        # Fill in paths to the final exported Nextclade dataset.
 
+# These rules are imported in the order that they are expected to run.
+# Each Snakefile will have documented inputs and outputs that should be kept as
+# consistent interfaces across pathogen repos. This allows us to define typical
+# steps that are required for a phylogenetic workflow, but still allow pathogen
+# specific customizations within each step.
+# Note that only PATHOGEN-level customizations should be added to these
+# core steps, meaning they are custom rules necessary for all builds of the pathogen.
+# If there are build-specific customizations, they should be added with the
+# custom_rules imported below to ensure that the core workflow is not complicated
+# by build-specific rules.
 include: "rules/preprocess.smk"
 include: "rules/prepare_sequences.smk"
 include: "rules/construct_phylogeny.smk"
 include: "rules/annotate_phylogeny.smk"
 include: "rules/export.smk"
+
+# Allow users to import custom rules provided via the config.
+# This allows users to run custom rules that can extend or override the workflow.
+# A concrete example of using custom rules is the extension of the workflow with
+# rules to do a test run of `nextclade run` with the produced Nextclade dataset.
+# For extensions, the user will have to specify the custom rule targets when
+# running the workflow.
+# For overrides, the custom Snakefile will have to use the `ruleorder` directive
+# to allow Snakemake to handle ambiguous rules:
+# https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#handling-ambiguous-rules
+if "custom_rules" in config:
+    for rule_file in config["custom_rules"]:
+
+        include: rule_file
diff --git a/nextclade/profiles/test_dataset/defaults.yaml b/nextclade/profiles/test_dataset/defaults.yaml
new file mode 100644
index 0000000..ce8a6f4
--- /dev/null
+++ b/nextclade/profiles/test_dataset/defaults.yaml
@@ -0,0 +1,7 @@
+# This configuration file should contain all the required configuration parameters
+# for the Nextclade workflow to do a test run with a created dataset.
+
+# Custom rules to run as part of the testing workflow
+# The paths should be relative to the nextclade directory.
+custom_rules:
+  - profiles/test_dataset/test_dataset.smk
diff --git a/nextclade/profiles/test_dataset/test_dataset.smk b/nextclade/profiles/test_dataset/test_dataset.smk
new file mode 100644
index 0000000..b34b22c
--- /dev/null
+++ b/nextclade/profiles/test_dataset/test_dataset.smk
@@ -0,0 +1,19 @@
+rule test_dataset:
+    input:
+        tree="datasets/{build_name}/tree.json",
+        pathogen_json="datasets/{build_name}/pathogen.json",
+        sequences="datasets/{build_name}/sequences.fasta",
+        annotation="datasets/{build_name}/genome_annotation.gff3",
+        readme="datasets/{build_name}/README.md",
+        changelog="datasets/{build_name}/CHANGELOG.md",
+    output:
+        outdir=directory("test_output/{build_name}"),
+    params:
+        dataset_dir="datasets/{build_name}",
+    shell:
+        """
+        nextclade run \
+            {input.sequences} \
+            --input-dataset {params.dataset_dir} \
+            --output-all {output.outdir}
+        """
diff --git a/nextclade/rules/annotate_phylogeny.smk b/nextclade/rules/annotate_phylogeny.smk
index 66951d5..f23a9e9 100644
--- a/nextclade/rules/annotate_phylogeny.smk
+++ b/nextclade/rules/annotate_phylogeny.smk
@@ -2,9 +2,23 @@
 This part of the workflow creates additonal annotations for the reference tree
 of the Nextclade dataset.
 
-This part of the workflow expects a single Newick tree and any additional files
-needed to create the annotations such as the aligned FASTA and lineage/clade
-designation files.
+REQUIRED INPUTS:
 
-This will produce one or node data JSONs.
+    metadata = data/metadata.tsv
+    prepared_sequences = results/prepared_sequences.fasta
+    tree = results/tree.nwk
+
+OUTPUTS:
+
+    nt_muts = results/nt_muts.json
+    aa_muts = results/aa_muts.json
+    clades = results/clades.json
+
+This part of the workflow usually includes the following steps:
+
+    - augur ancestral
+    - augur translate
+    - augur clades
+
+See Augur's usage docs for these commands for more details.
 """
diff --git a/nextclade/rules/construct_phylogeny.smk b/nextclade/rules/construct_phylogeny.smk
index acdf4f6..cae9e7a 100644
--- a/nextclade/rules/construct_phylogeny.smk
+++ b/nextclade/rules/construct_phylogeny.smk
@@ -1,6 +1,20 @@
 """
 This part of the workflow constructs the reference tree for the Nextclade dataset
 
-This part of the workflow expects a single aligned FASTA file and
-will produce a Newick tree and a branch lengths JSON file.
+REQUIRED INPUTS:
+
+    metadata = data/metadata.tsv
+    prepared_sequences = results/prepared_sequences.fasta
+
+OUTPUTS:
+
+    tree = results/tree.nwk
+    branch_lengths = results/branch_lengths.json
+
+This part of the workflow usually includes the following steps:
+
+    - augur tree
+    - augur refine
+
+See Augur's usage docs for these commands for more details.
 """
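
A minimal sketch of the `augur tree`/`augur refine` pair listed in the docstring above, wired to the documented input/output paths. The intermediate `results/tree_raw.nwk` and the omission of method and clock options are assumptions; real builds will tune these per pathogen:

```snakemake
rule tree:
    input:
        alignment="results/prepared_sequences.fasta",
    output:
        tree="results/tree_raw.nwk",  # assumed intermediate name
    shell:
        "augur tree --alignment {input.alignment} --output {output.tree}"


rule refine:
    input:
        tree="results/tree_raw.nwk",
        alignment="results/prepared_sequences.fasta",
        metadata="data/metadata.tsv",
    output:
        tree="results/tree.nwk",
        node_data="results/branch_lengths.json",
    shell:
        """
        augur refine \
            --tree {input.tree} \
            --alignment {input.alignment} \
            --metadata {input.metadata} \
            --output-tree {output.tree} \
            --output-node-data {output.node_data}
        """
```
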
""" diff --git a/nextclade/rules/export.smk b/nextclade/rules/export.smk index d8557ef..aec5acb 100644 --- a/nextclade/rules/export.smk +++ b/nextclade/rules/export.smk @@ -1,7 +1,34 @@ """ This part of the workflow collects the phylogenetic tree and annotations to -export a reference tree for the Nextclade dataset. +export a reference tree and create the Nextclade dataset. -This part of the workflow expects a single Newick tree and at least one -node data JSON. +REQUIRED INPUTS: + + TODO: Confirm inputs for Nextclade v3 + reference = ../shared/reference.fasta + pathogen = config/pathogen.json + genome_annotation = config/genome_annotation.gff3 + readme = config/README.md + changelog = config/CHANGELOG.md + metadata = data/metadata.tsv + tree = results/tree.nwk + branch_lengths = results/branch_lengths.json + nt_muts = results/nt_muts.json + aa_muts = results/aa_muts.json + clades = results/clades.json + + +OUTPUTS: + + nextclade_dataset = datasets/${build_name}/* + + See Nextclade docs on expected naming conventions of dataset files + https://docs.nextstrain.org/projects/nextclade/page/user/datasets.html + +This part of the workflow usually includes the following steps: + + - augur export v2 + - cp Nextclade datasets files to new datasets directory + +See Augur's usage docs for these commands for more details. """ diff --git a/nextclade/rules/prepare_sequences.smk b/nextclade/rules/prepare_sequences.smk index f4e4cbd..0f24d55 100644 --- a/nextclade/rules/prepare_sequences.smk +++ b/nextclade/rules/prepare_sequences.smk @@ -2,14 +2,22 @@ This part of the workflow prepares sequences for constructing the reference tree of the Nextclade dataset. -This usually includes the following steps: +REQUIRED INPUTS: - - filtering - - subsampling - - indexing - - aligning - - masking + metadata = data/metadata.tsv + sequences = data/sequences.fasta + reference = ../shared/reference.fasta -This part of the workflow expects a metadata and FASTA file as inputs -and will produce a FASTA file of prepared sequences as an output. +OUTPUTS: + + prepared_sequences = results/prepared_sequences.fasta + +This part of the workflow usually includes the following steps: + + - augur index + - augur filter + - nextclade run + - augur mask + +See Nextclade's and Augur's usage docs for these commands for more details. """ diff --git a/nextclade/rules/preprocess.smk b/nextclade/rules/preprocess.smk index d7bac40..b0dca6c 100644 --- a/nextclade/rules/preprocess.smk +++ b/nextclade/rules/preprocess.smk @@ -1,4 +1,18 @@ """ This part of the workflow preprocesses any data and files related to the lineages/clades designations of the pathogen. + +REQUIRED INPUTS: + + None + +OUTPUTS: + + metadata = data/metadata.tsv + sequences = data/sequences.fasta + + There will be many pathogen specific outputs from this part of the workflow + due to the many ways lineages and/or clades are maintained and defined. + +This part of the workflow usually includes steps to download and curate the required files. """ diff --git a/phylogenetic/README.md b/phylogenetic/README.md index 867d024..b1c7d5a 100644 --- a/phylogenetic/README.md +++ b/phylogenetic/README.md @@ -28,5 +28,11 @@ The modules of the workflow are in separate files to keep the main ingest [Snake Modules are all [included](https://snakemake.readthedocs.io/en/stable/snakefiles/modularization.html#includes) in the main Snakefile in the order that they are expected to run. 
diff --git a/phylogenetic/README.md b/phylogenetic/README.md
index 867d024..b1c7d5a 100644
--- a/phylogenetic/README.md
+++ b/phylogenetic/README.md
@@ -28,5 +28,11 @@ The modules of the workflow are in separate files to keep the main ingest [Snake
 Modules are all [included](https://snakemake.readthedocs.io/en/stable/snakefiles/modularization.html#includes)
 in the main Snakefile in the order that they are expected to run.
 
+## Profiles
+
+The profiles directory contains custom configs and rules that override and/or
+extend the default workflow.
+
+- [ci](profiles/ci/) - profile for the CI build that runs with example data
 
 [Nextstrain datasets]: https://docs.nextstrain.org/en/latest/reference/glossary.html#term-dataset
diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index 5145372..2b33d2f 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -1,14 +1,49 @@
+"""
+This is the main phylogenetic Snakefile that orchestrates the full phylogenetic
+workflow and defines its default output(s).
+"""
+
 # Use default configuration values. Override with Snakemake's --configfile/--config options.
 configfile: "config/defaults.yaml"
 
+# This is the default rule that Snakemake will run when there are no specified targets.
+# The default output of the phylogenetic workflow is usually the final
+# Nextstrain dataset(s) or Auspice JSON(s) output from `rules/export.smk`.
+# See Nextstrain docs on the expected naming conventions of dataset files:
+# https://docs.nextstrain.org/page/reference/data-formats.html
 rule all:
     input:
-        # Fill in path to the final exported Auspice JSON
+        # Fill in path(s) to the final exported Auspice JSON(s)
         auspice_json="",
 
+# These rules are imported in the order that they are expected to run.
+# Each Snakefile will have documented inputs and outputs that should be kept as
+# consistent interfaces across pathogen repos. This allows us to define typical
+# steps that are required for a phylogenetic workflow, but still allow pathogen
+# specific customizations within each step.
+# Note that only PATHOGEN-level customizations should be added to these
+# core steps, meaning they are custom rules necessary for all builds of the pathogen.
+# If there are build-specific customizations, they should be added with the
+# custom_rules imported below to ensure that the core workflow is not complicated
+# by build-specific rules.
 include: "rules/prepare_sequences.smk"
 include: "rules/construct_phylogeny.smk"
 include: "rules/annotate_phylogeny.smk"
 include: "rules/export.smk"
+
+# Allow users to import custom rules provided via the config.
+# This allows users to run custom rules that can extend or override the workflow.
+# A concrete example of using custom rules is the extension of the workflow with
+# rules that upload files and send internal Slack notifications for the automated
+# Nextstrain builds.
+# For extensions, the user will have to specify the custom rule targets when
+# running the workflow.
+# For overrides, the custom Snakefile will have to use the `ruleorder` directive
+# to allow Snakemake to handle ambiguous rules:
+# https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#handling-ambiguous-rules
+if "custom_rules" in config:
+    for rule_file in config["custom_rules"]:
+
+        include: rule_file
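
Tying this together, the CI profile described in the README above (and defined in the files below) would plausibly be invoked from the `phylogenetic` directory like this; the exact entry command varies by setup:

```bash
# config/defaults.yaml is loaded by the Snakefile's configfile directive;
# the CI profile layers the custom copy_example_data rule on top.
snakemake --cores 2 --configfile profiles/ci/defaults.yaml
```
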
diff --git a/phylogenetic/example_data/metadata.tsv b/phylogenetic/example_data/metadata.tsv
new file mode 100644
index 0000000..e69de29
diff --git a/phylogenetic/example_data/sequences.fasta b/phylogenetic/example_data/sequences.fasta
new file mode 100644
index 0000000..e69de29
diff --git a/phylogenetic/profiles/ci/copy_example_data.smk b/phylogenetic/profiles/ci/copy_example_data.smk
new file mode 100644
index 0000000..4e47ee4
--- /dev/null
+++ b/phylogenetic/profiles/ci/copy_example_data.smk
@@ -0,0 +1,17 @@
+rule copy_example_data:
+    input:
+        sequences="example_data/sequences.fasta",
+        metadata="example_data/metadata.tsv",
+    output:
+        sequences="data/sequences.fasta",
+        metadata="data/metadata.tsv",
+    shell:
+        """
+        cp -f {input.sequences} {output.sequences}
+        cp -f {input.metadata} {output.metadata}
+        """
+
+# Add a Snakemake ruleorder directive here if you need to resolve ambiguous rules
+# that have the same output as the copy_example_data rule.
+
+# ruleorder: copy_example_data > ...
diff --git a/phylogenetic/profiles/ci/defaults.yaml b/phylogenetic/profiles/ci/defaults.yaml
new file mode 100644
index 0000000..8542d43
--- /dev/null
+++ b/phylogenetic/profiles/ci/defaults.yaml
@@ -0,0 +1,7 @@
+# This configuration file contains the custom configuration parameters
+# for the CI workflow to run with the example data.
+
+# Custom rules to run as part of the CI automated workflow
+# The paths should be relative to the phylogenetic directory.
+custom_rules:
+  - profiles/ci/copy_example_data.smk
diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk
index 9399b01..facc35f 100644
--- a/phylogenetic/rules/annotate_phylogeny.smk
+++ b/phylogenetic/rules/annotate_phylogeny.smk
@@ -1,8 +1,32 @@
 """
 This part of the workflow creates additonal annotations for the phylogenetic tree.
 
-This part of the workflow expects a single Newick tree and any additional files
-needed to create the annotations such as the aligned FASTA and metadata file.
+REQUIRED INPUTS:
 
-This will produce one or more node data JSONs.
+    metadata = data/metadata.tsv
+    prepared_sequences = results/prepared_sequences.fasta
+    tree = results/tree.nwk
+
+OUTPUTS:
+
+    node_data = results/*.json
+
+    There are no required outputs for this part of the workflow as it depends
+    on which annotations are created. All outputs are expected to be node data
+    JSON files that can be fed into `augur export`.
+
+    See Nextstrain's data format docs for more details on node data JSONs:
+    https://docs.nextstrain.org/page/reference/data-formats.html
+
+This part of the workflow usually includes the following steps:
+
+    - augur traits
+    - augur ancestral
+    - augur translate
+    - augur clades
+
+See Augur's usage docs for these commands for more details.
+
+Custom node data files can also be produced by build-specific scripts in addition
+to the ones produced by Augur commands.
 """
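
A sketch of the first two node-data-producing steps listed above, wired to the documented paths. The annotated reference path `../shared/reference.gb` is an assumption (the shared directory examples elsewhere in this diff only mention `reference.fasta`):

```snakemake
rule ancestral:
    input:
        tree="results/tree.nwk",
        alignment="results/prepared_sequences.fasta",
    output:
        node_data="results/nt_muts.json",
    shell:
        """
        augur ancestral \
            --tree {input.tree} \
            --alignment {input.alignment} \
            --output-node-data {output.node_data}
        """


rule translate:
    input:
        tree="results/tree.nwk",
        node_data="results/nt_muts.json",
        reference="../shared/reference.gb",  # assumed annotation file
    output:
        node_data="results/aa_muts.json",
    shell:
        """
        augur translate \
            --tree {input.tree} \
            --ancestral-sequences {input.node_data} \
            --reference-sequence {input.reference} \
            --output-node-data {output.node_data}
        """
```
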
diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk
index 41b0cac..7652005 100644
--- a/phylogenetic/rules/construct_phylogeny.smk
+++ b/phylogenetic/rules/construct_phylogeny.smk
@@ -1,9 +1,20 @@
 """
 This part of the workflow constructs the phylogenetic tree.
 
-This part of the workflow expects a single aligned FASTA file.
-If constructing a time-resolved tree, it will also require a metadata file
-that includes a sample date for each sequence.
+REQUIRED INPUTS:
 
-This will produce a Newick tree and a branch lengths JSON file.
+    metadata = data/metadata.tsv
+    prepared_sequences = results/prepared_sequences.fasta
+
+OUTPUTS:
+
+    tree = results/tree.nwk
+    branch_lengths = results/branch_lengths.json
+
+This part of the workflow usually includes the following steps:
+
+    - augur tree
+    - augur refine
+
+See Augur's usage docs for these commands for more details.
 """
diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk
index f6d26c0..a273659 100644
--- a/phylogenetic/rules/export.smk
+++ b/phylogenetic/rules/export.smk
@@ -2,6 +2,25 @@
 This part of the workflow collects the phylogenetic tree and annotations to
 export a Nextstrain dataset.
 
-This part of the workflow expects a single Newick tree and at least one
-node data JSON.
+REQUIRED INPUTS:
+
+    metadata = data/metadata.tsv
+    tree = results/tree.nwk
+    branch_lengths = results/branch_lengths.json
+    node_data = results/*.json
+
+OUTPUTS:
+
+    auspice_json = auspice/${build_name}.json
+
+    There are optional sidecar JSON files that can be exported as part of the dataset.
+    See Nextstrain's data format docs for more details on sidecar files:
+    https://docs.nextstrain.org/page/reference/data-formats.html
+
+This part of the workflow usually includes the following steps:
+
+    - augur export v2
+    - augur frequencies
+
+See Augur's usage docs for these commands for more details.
 """
diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk
index d1276b4..c1c9e22 100644
--- a/phylogenetic/rules/prepare_sequences.smk
+++ b/phylogenetic/rules/prepare_sequences.smk
@@ -1,13 +1,22 @@
 """
 This part of the workflow prepares sequences for constructing the phylogenetic tree.
 
-This usually includes the following steps:
+REQUIRED INPUTS:
 
-  - filtering
-  - subsampling
-  - indexing
-  - aligning
-  - masking
+    metadata = data/metadata.tsv
+    sequences = data/sequences.fasta
+    reference = ../shared/reference.fasta
+
+OUTPUTS:
+
+    prepared_sequences = results/prepared_sequences.fasta
+
+This part of the workflow usually includes the following steps:
+
+    - augur index
+    - augur filter
+    - augur align
+    - augur mask
+
+See Augur's usage docs for these commands for more details.
 """
diff --git a/shared/README.md b/shared/README.md
new file mode 100644
index 0000000..332320c
--- /dev/null
+++ b/shared/README.md
@@ -0,0 +1,17 @@
+# Shared
+
+> **Warning**
+> Please be aware of the multiple workflows that will be affected when editing files in this directory!
+
+This directory is expected to hold files that are shared across multiple workflows.
+
+Instead of cross-referencing between workflows when files need to be shared,
+have all workflows use this top-level directory. This makes it abundantly
+clear that updating files in this `shared` directory will affect multiple workflows.
+
+Potential files that could live in this directory:
+- `clades.tsv` - clade definitions (format sketched below)
+- `outliers.txt` - list of outliers to exclude
+- `lat_longs.tsv` - location coordinates
+- `mask.bed` - specific coordinates to mask in sequences
+- `reference.fasta` - reference sequence
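
For the `clades.tsv` entry above: as consumed by `augur clades`, it is a tab-separated file with `clade`, `gene`, `site`, and `alt` columns, where `nuc` denotes a nucleotide position. The rows below are placeholders, not real clade definitions:

```tsv
clade	gene	site	alt
clade-A	nuc	1024	T
clade-A	GP	42	K
clade-B	nuc	2301	G
```
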