Merge pull request #15 from nextstrain/additional-docs
Additional docs
joverlee521 authored Nov 17, 2023
2 parents 3f1c6e7 + e52f4ac commit 5ac694b
Showing 26 changed files with 519 additions and 93 deletions.
8 changes: 8 additions & 0 deletions ingest/README.md
@@ -34,6 +34,14 @@ The modules of the workflow are in separate files to keep the main ingest [Snake
Modules are all [included](https://snakemake.readthedocs.io/en/stable/snakefiles/modularization.html#includes)
in the main Snakefile in the order that they are expected to run.

## Profiles

The profiles directory contains custom configs and rules that override and/or
extend the default workflow.

- [nextstrain_automation](profiles/nextstrain_automation/) - profile for the internal automated Nextstrain builds.


## Vendored

This repository uses [`git subrepo`](https://github.com/ingydotnet/git-subrepo)
44 changes: 41 additions & 3 deletions ingest/Snakefile
@@ -1,21 +1,44 @@
"""
This is the main ingest Snakefile that orchestrates the full ingest workflow
and defines its default outputs.
"""

# Use default configuration values. Override with Snakemake's --configfile/--config options.
configfile: "config/defaults.yaml"

# This is the default rule that Snakemake will run when there are no specified targets.
# The default output of the ingest workflow is usually the curated metadata and sequences.
# Nextstrain maintained ingest workflows will produce metadata files with the
# standard Nextstrain fields and additional fields that are pathogen specific.
# We recommend using these standard fields in custom ingests as well to minimize
# the customizations you will need for the downstream phylogenetic workflow.
# TODO: Add link to centralized docs on standard Nextstrain metadata fields
rule all:
    input:
        "results/sequences.fasta",
        "results/metadata.tsv",


# Note that only PATHOGEN level customizations should be added to these
# core steps, meaning they are custom rules necessary for all builds of the pathogen.
# If there are build specific customizations, they should be added with the
# custom_rules imported below to ensure that the core workflow is not complicated
# by build specific rules.
include: "rules/fetch_from_ncbi.smk"
include: "rules/curate.smk"


-# If included, the nextclade rules will create the final metadata TSV by
-# joining the Nextclade output with the metadata.
-# However, if not including nextclade, we have to rename the subset metadata TSV
+# We are pushing to standardize ingest workflows with Nextclade runs to include
+# Nextclade outputs in our publicly hosted data. However, if a Nextclade dataset
+# does not already exist, it requires curated data as input, so we are making
+# Nextclade steps optional here.
+#
+# If Nextclade config values are included, the nextclade rules will create the
+# final metadata TSV by joining the Nextclade output with the metadata.
+# If Nextclade configs are not included, we rename the subset metadata TSV
+# to the final metadata TSV.
if "nextclade" in config:

include: "rules/nextclade.smk"

else:
@@ -29,3 +52,18 @@ else:
"""
mv {input.metadata} {output.metadata}
"""

# Allow users to import custom rules provided via the config.
# This allows users to run custom rules that can extend or override the workflow.
# A concrete example of using custom rules is the extension of the workflow with
# rules to support the Nextstrain automation that upload files and send internal
# Slack notifications.
# For extensions, the user will have to specify the custom rule targets when
# running the workflow.
# For overrides, the custom Snakefile will have to use the `ruleorder` directive
# to allow Snakemake to handle ambiguous rules
# https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#handling-ambiguous-rules
if "custom_rules" in config:
for rule_file in config["custom_rules"]:

include: rule_file
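For example, a custom rules file supplied via the `custom_rules` config could override one of the core rules. A minimal sketch (the file path and both rule names are hypothetical, not rules defined in this repo):

```python
# profiles/example/custom.smk (hypothetical)
# Prefer the custom rule whenever both rules can produce results/metadata.tsv.
ruleorder: create_metadata_with_extras > create_final_metadata


rule create_metadata_with_extras:
    input:
        metadata="results/subset_metadata.tsv",
    output:
        metadata="results/metadata.tsv",
    shell:
        """
        # Build-specific curation steps would go here.
        cp {input.metadata} {output.metadata}
        """
```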
23 changes: 23 additions & 0 deletions ingest/profiles/nextstrain_automation/defaults.yaml
@@ -0,0 +1,23 @@
# This configuration file should contain all required configuration parameters
# for the ingest workflow to run with additional Nextstrain automation rules.

# Custom rules to run as part of the Nextstrain automated workflow
# The paths should be relative to the ingest directory.
custom_rules:
- profiles/nextstrain_automation/upload.smk

# Nextstrain CloudFront domain to ensure that we invalidate CloudFront after the S3 uploads
# This is required as long as we are using the AWS CLI for uploads
cloudfront_domain: "data.nextstrain.org"

# Nextstrain AWS S3 Bucket with pathogen prefix
# Replace <pathogen> with the pathogen repo name.
s3_dst: "s3://nextstrain-data/files/workflows/<pathogen>"

# Mapping of files to upload
files_to_upload:
  ncbi.ndjson.zst: data/ncbi.ndjson
  metadata.tsv.zst: results/metadata.tsv
  sequences.fasta.zst: results/sequences.fasta
  alignments.fasta.zst: results/alignment.fasta
  translations.zip: results/translations.zip
47 changes: 47 additions & 0 deletions ingest/profiles/nextstrain_automation/upload.smk
@@ -0,0 +1,47 @@
"""
This part of the workflow handles uploading files to AWS S3.
Files to upload must be defined in the `files_to_upload` config param, where
the keys are the remote files and the values are the local filepaths
relative to the ingest directory.
Produces a single file for each uploaded file:
"results/upload/{remote_file}.upload"
The rule `upload_all` can be used as a target to upload all files.
"""
import os

slack_envvars_defined = "SLACK_CHANNELS" in os.environ and "SLACK_TOKEN" in os.environ
send_notifications = (
    config.get("send_slack_notifications", False) and slack_envvars_defined
)


rule upload_to_s3:
    input:
        # Resolve the local filepath with a function since `wildcards` is only
        # available once the rule is applied to a specific remote file.
        file_to_upload=lambda wildcards: config["files_to_upload"][wildcards.remote_file],
    output:
        "results/upload/{remote_file}.upload",
    params:
        quiet="" if send_notifications else "--quiet",
        s3_dst=config["s3_dst"],
        cloudfront_domain=config["cloudfront_domain"],
    shell:
        """
        ./vendored/upload-to-s3 \
            {params.quiet} \
            {input.file_to_upload:q} \
            {params.s3_dst:q}/{wildcards.remote_file:q} \
            {params.cloudfront_domain} 2>&1 | tee {output}
        """


rule upload_all:
    input:
        uploads=[
            f"results/upload/{remote_file}.upload"
            for remote_file in config["files_to_upload"].keys()
        ],
    output:
        touch("results/upload_all.done")
14 changes: 10 additions & 4 deletions ingest/rules/curate.smk
@@ -1,9 +1,15 @@
"""
-This part of the workflow handles the curation of metadata for sequences
-from NCBI and outputs the clean data as two separate files:
+This part of the workflow handles the curation of data from NCBI
+REQUIRED INPUTS:
+    ndjson = data/ncbi.ndjson
+OUTPUTS:
+    metadata  = results/subset_metadata.tsv
+    sequences = results/sequences.fasta
-    - results/subset_metadata.tsv
-    - results/sequences.fasta
"""


107 changes: 60 additions & 47 deletions ingest/rules/fetch_from_ncbi.smk
@@ -1,60 +1,38 @@
"""
-This part of the workflow handles fetching sequences and metadata from NCBI
-and outputs them as a single NDJSON file that can be directly fed into the
-curation pipeline.
+This part of the workflow handles fetching sequences and metadata from NCBI.
-There are two different approaches for fetching data from NCBI.
-Choose the one that works best for the pathogen data and remove the rules related
-to the other approaches.
+REQUIRED INPUTS:
-1. Fetch from Entrez (https://www.ncbi.nlm.nih.gov/books/NBK25501/)
-    - Returns all available data via a GenBank file
-    - Requires a custom script to parse the necessary fields from the GenBank file
-2. Fetch with NCBI Datasets (https://www.ncbi.nlm.nih.gov/datasets/)
-    - Directly returns NDJSON without custom parsing
-    - Fastest option for large datasets (e.g. SARS-CoV-2)
-    - Only returns metadata fields that are available through NCBI Datasets
-    - Example is written for viral data, please see official NCBI Datasets docs for other genomes
-"""
+    None
-###########################################################################
-########################## 1. Fetch from Entrez ###########################
-###########################################################################
+OUTPUTS:
+    ndjson = data/ncbi.ndjson
-rule fetch_from_ncbi_entrez:
-    params:
-        term=config["entrez_search_term"],
-    output:
-        genbank="data/genbank.gb",
-    # Allow retries in case of network errors
-    retries: 5
-    benchmark:
-        "benchmarks/fetch_from_ncbi_entrez.txt"
-    shell:
-        """
-        vendored/fetch-from-ncbi-entrez \
-            --term {params.term:q} \
-            --output {output.genbank}
-        """
+There are two different approaches for fetching data from NCBI.
+Choose the one that works best for the pathogen data and edit the workflow config
+to provide the correct parameter.
+1. Fetch with NCBI Datasets (https://www.ncbi.nlm.nih.gov/datasets/)
+    - requires `ncbi_taxon_id` config
+    - Directly returns NDJSON without custom parsing
+    - Fastest option for large datasets (e.g. SARS-CoV-2)
+    - Only returns metadata fields that are available through NCBI Datasets
+    - Only works for viral genomes
-rule parse_genbank_to_ndjson:
-    input:
-        genbank="data/genbank.gb",
-    output:
-        ndjson="data/ncbi.ndjson",
-    benchmark:
-        "benchmarks/parse_genbank_to_ndjson.txt"
-    shell:
-        """
-        # Add in custom script to parse needed fields from GenBank file to NDJSON file
-        """
+2. Fetch from Entrez (https://www.ncbi.nlm.nih.gov/books/NBK25501/)
+    - requires `entrez_search_term` config
+    - Returns all available data via a GenBank file
+    - Requires a custom script to parse the necessary fields from the GenBank file
+"""

# This ruleorder determines which rule to use to produce the final NCBI NDJSON file.
# The default is set to use NCBI Datasets since it does not require a custom script.
# Switch the rule order if you plan to use Entrez.
ruleorder: format_ncbi_datasets_ndjson > parse_genbank_to_ndjson
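Concretely, an Entrez-based build would flip the directive (a sketch mirroring the comment above):

```python
# Prefer the GenBank parsing rule when both rules can produce data/ncbi.ndjson.
ruleorder: parse_genbank_to_ndjson > format_ncbi_datasets_ndjson
```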

###########################################################################
-####################### 2. Fetch from NCBI Datasets #######################
+####################### 1. Fetch from NCBI Datasets #######################
###########################################################################


@@ -129,7 +107,7 @@ rule format_ncbi_dataset_report:
        ncbi_dataset_tsv=temp("data/ncbi_dataset_report.tsv"),
    params:
        fields_to_include=_get_ncbi_dataset_field_mnemonics(
-            config["ncbi_dataset_fields"]
+            config.get("ncbi_dataset_fields", [])
        ),
    benchmark:
        "benchmarks/format_ncbi_dataset_report.txt"
@@ -167,3 +145,38 @@ rule format_ncbi_datasets_ndjson:
            --duplicate-reporting warn \
            2> {log} > {output.ndjson}
        """


###########################################################################
########################## 2. Fetch from Entrez ###########################
###########################################################################


rule fetch_from_ncbi_entrez:
    params:
        term=config["entrez_search_term"],
    output:
        genbank="data/genbank.gb",
    # Allow retries in case of network errors
    retries: 5
    benchmark:
        "benchmarks/fetch_from_ncbi_entrez.txt"
    shell:
        """
        vendored/fetch-from-ncbi-entrez \
            --term {params.term:q} \
            --output {output.genbank}
        """


rule parse_genbank_to_ndjson:
    input:
        genbank="data/genbank.gb",
    output:
        ndjson="data/ncbi.ndjson",
    benchmark:
        "benchmarks/parse_genbank_to_ndjson.txt"
    shell:
        """
        # Add in custom script to parse needed fields from GenBank file to NDJSON file
        """
14 changes: 13 additions & 1 deletion ingest/rules/nextclade.smk
@@ -2,9 +2,21 @@
This part of the workflow handles running Nextclade on the curated metadata
and sequences.
REQUIRED INPUTS:
    metadata  = results/subset_metadata.tsv
    sequences = results/sequences.fasta
OUTPUTS:
    metadata     = results/metadata.tsv
    nextclade    = results/nextclade.tsv
    alignment    = results/alignment.fasta
    translations = results/translations.zip
See Nextclade docs for more details on usage, inputs, and outputs if you would
like to customize the rules:
-https://docs.nextstrain.org/projects/nextclade/en/stable/user/nextclade-cli.html
+https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html
"""
DATASET_NAME = config["nextclade"]["dataset_name"]

37 changes: 34 additions & 3 deletions nextclade/Snakefile
@@ -1,15 +1,46 @@
"""
This is the main Nextclade Snakefile that orchestrates the workflow to produce
a Nextclade dataset.
"""

# Use default configuration values. Override with Snakemake's --configfile/--config options.
configfile: "config/defaults.yaml"


# This is the default rule that Snakemake will run when there are no specified targets.
# The default output of the Nextclade workflow is usually the produced Nextclade dataset.
# See Nextclade docs on expected naming conventions of dataset files
# https://docs.nextstrain.org/projects/nextclade/page/user/datasets.html
rule all:
    input:
-        # Fill in path to the final exported Auspice JSON
-        auspice_json="",
+        # Fill in paths to the final exported Nextclade dataset.
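As a sketch, the filled-in targets might follow the standard Nextclade dataset file layout described at the link above (the `dataset/` output directory here is an assumption, not this repo's actual path):

```python
rule all:
    input:
        # Core files of a Nextclade dataset (illustrative paths).
        "dataset/pathogen.json",
        "dataset/reference.fasta",
        "dataset/genome_annotation.gff3",
        "dataset/tree.json",
        "dataset/sequences.fasta",
        "dataset/README.md",
        "dataset/CHANGELOG.md",
```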


# These rules are imported in the order that they are expected to run.
# Each Snakefile will have documented inputs and outputs that should be kept as
# consistent interfaces across pathogen repos. This allows us to define typical
# steps that are required for a phylogenetic workflow, but still allow pathogen
# specific customizations within each step.
# Note that only PATHOGEN level customizations should be added to these
# core steps, meaning they are custom rules necessary for all builds of the pathogen.
# If there are build specific customizations, they should be added with the
# custom_rules imported below to ensure that the core workflow is not complicated
# by build specific rules.
include: "rules/preprocess.smk"
include: "rules/prepare_sequences.smk"
include: "rules/construct_phylogeny.smk"
include: "rules/annotate_phylogeny.smk"
include: "rules/export.smk"

# Allow users to import custom rules provided via the config.
# This allows users to run custom rules that can extend or override the workflow.
# A concrete example of using custom rules is the extension of the workflow with
# rules to do a test run of `nextclade run` with the produced Nextclade dataset.
# For extensions, the user will have to specify the custom rule targets when
# running the workflow.
# For overrides, the custom Snakefile will have to use the `ruleorder` directive
# to allow Snakemake to handle ambiguous rules
# https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#handling-ambiguous-rules
if "custom_rules" in config:
for rule_file in config["custom_rules"]:

include: rule_file
7 changes: 7 additions & 0 deletions nextclade/profiles/test_dataset/defaults.yaml
@@ -0,0 +1,7 @@
# This configuration file should contain all the required configuration parameters
# for the Nextclade workflow to do a test run with a created dataset

# Custom rules to run as part of the testing workflow
# The paths should be relative to the nextclade directory.
custom_rules:
- profiles/test_dataset/test_dataset.smk
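Such a test rule might look roughly like this sketch (the rule body and paths are assumptions, not the actual contents of test_dataset.smk):

```python
rule test_dataset:
    input:
        # Dataset directory produced by the workflow (illustrative path).
        dataset="dataset/",
        sequences="dataset/sequences.fasta",
    output:
        outdir=directory("test_output/"),
    shell:
        """
        nextclade run \
            --input-dataset {input.dataset} \
            --output-all {output.outdir} \
            {input.sequences}
        """
```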