Move rules for annotating phylogeny to its own smk file

Part of work to update this repo to match the pathogen-repo-template.
nextstrain · Dec 18, 2023 · 2ed2103 · 2ed2103
1 parent a87ad7e
commit 2ed2103
Show file tree

Hide file tree

Showing 2 changed files with 94 additions and 61 deletions.
diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
@@ -19,67 +19,7 @@ files = rules.files.params
 include: "workflow/snakemake_rules/usvi.smk"
 include: "workflow/snakemake_rules/prepare_sequences.smk"
 include: "workflow/snakemake_rules/construct_phylogeny.smk"
-
-rule ancestral:
-    """Reconstructing ancestral sequences and mutations"""
-    input:
-        tree = "results/tree.nwk",
-        alignment = "results/aligned.fasta"
-    output:
-        node_data = "results/nt_muts.json"
-    params:
-        inference = "joint"
-    shell:
-        """
-        augur ancestral \
-            --tree {input.tree} \
-            --alignment {input.alignment} \
-            --output-node-data {output.node_data} \
-            --inference {params.inference}
-        """
-
-rule translate:
-    """Translating amino acid sequences"""
-    input:
-        tree = "results/tree.nwk",
-        node_data = "results/nt_muts.json",
-        reference = files.reference
-    output:
-        node_data = "results/aa_muts.json"
-    shell:
-        """
-        augur translate \
-            --tree {input.tree} \
-            --ancestral-sequences {input.node_data} \
-            --reference-sequence {input.reference} \
-            --output {output.node_data} \
-        """
-
-rule traits:
-    """
-    Inferring ancestral traits for {params.columns!s}
-      - increase uncertainty of reconstruction by {params.sampling_bias_correction} to partially account for sampling bias
-    """
-    input:
-        tree = "results/tree.nwk",
-        metadata = "data/metadata_all.tsv"
-    output:
-        node_data = "results/traits.json",
-    params:
-        columns = "region country",
-        sampling_bias_correction = 3,
-        strain_id = config.get("strain_id_field", "strain"),
-    shell:
-        """
-        augur traits \
-            --tree {input.tree} \
-            --metadata {input.metadata} \
-            --metadata-id-columns {params.strain_id} \
-            --output {output.node_data} \
-            --columns {params.columns} \
-            --confidence \
-            --sampling-bias-correction {params.sampling_bias_correction}
-        """
+include: "workflow/snakemake_rules/annotate_phylogeny.smk"
 
 rule export:
     """Exporting data files for for auspice"""

diff --git a/phylogenetic/workflow/snakemake_rules/annotate_phylogeny.smk b/phylogenetic/workflow/snakemake_rules/annotate_phylogeny.smk
@@ -0,0 +1,93 @@
+"""
+This part of the workflow creates additional annotations for the phylogenetic tree.
+
+REQUIRED INPUTS:
+
+    metadata            = data/metadata_all.tsv
+    prepared_sequences  = results/aligned.fasta
+    tree                = results/tree.nwk
+
+OUTPUTS:
+
+    node_data = results/*.json
+
+    There are no required outputs for this part of the workflow as it depends
+    on which annotations are created. All outputs are expected to be node data
+    JSON files that can be fed into `augur export`.
+
+    See Nextstrain's data format docs for more details on node data JSONs:
+    https://docs.nextstrain.org/page/reference/data-formats.html
+
+This part of the workflow usually includes the following steps:
+
+    - augur traits
+    - augur ancestral
+    - augur translate
+    - augur clades
+
+See Augur's usage docs for these commands for more details.
+
+Custom node data files can also be produced by build-specific scripts in addition
+to the ones produced by Augur commands.
+"""
+
+rule ancestral:
+    """Reconstruct ancestral sequences and mutations for the tree with `augur ancestral`, using the inference mode set in params (joint)"""
+    input:
+        tree = "results/tree.nwk",
+        alignment = "results/aligned.fasta"
+    output:
+        node_data = "results/nt_muts.json"
+    params:
+        inference = "joint"
+    shell:
+        """
+        augur ancestral \
+            --tree {input.tree} \
+            --alignment {input.alignment} \
+            --output-node-data {output.node_data} \
+            --inference {params.inference}
+        """
+
+rule translate:
+    """Translate the ancestral sequences in results/nt_muts.json into amino acid node data with `augur translate`"""
+    input:
+        tree = "results/tree.nwk",
+        node_data = "results/nt_muts.json",
+        reference = files.reference
+    output:
+        node_data = "results/aa_muts.json"
+    shell:
+        """
+        augur translate \
+            --tree {input.tree} \
+            --ancestral-sequences {input.node_data} \
+            --reference-sequence {input.reference} \
+            --output {output.node_data}
+        """
+
+rule traits:
+    """
+    Infer ancestral traits with `augur traits` for the metadata columns listed in params.columns (region, country), with confidence values
+      - params.sampling_bias_correction is passed to augur to increase the uncertainty of the reconstruction and partially account for sampling bias
+    """
+    input:
+        tree = "results/tree.nwk",
+        metadata = "data/metadata_all.tsv"
+    output:
+        node_data = "results/traits.json",
+    params:
+        columns = "region country",
+        sampling_bias_correction = 3,
+        strain_id = config.get("strain_id_field", "strain"),
+    shell:
+        """
+        augur traits \
+            --tree {input.tree} \
+            --metadata {input.metadata} \
+            --metadata-id-columns {params.strain_id} \
+            --output {output.node_data} \
+            --columns {params.columns} \
+            --confidence \
+            --sampling-bias-correction {params.sampling_bias_correction}
+        """