ingest: Add rules to run Nextclade on curated data

Use `community/moncla-lab/iav-h5/ha/all-clades` as the default Nextclade dataset, since it works across both fauna and NCBI data. Subsequent commits will join these rules with the full ingest workflows.
nextstrain · Jun 21, 2024 · b37234e · b37234e
1 parent f864158
commit b37234e
Show file tree

Hide file tree

Showing 3 changed files with 48 additions and 0 deletions.
diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -23,6 +23,7 @@ rule upload_all:
 
 include: "rules/ingest_fauna.smk"
 include: "rules/merge_segment_metadata.smk"
+include: "rules/nextclade.smk"
 include: "rules/upload_to_s3.smk"
 
 # Allow users to import custom rules provided via the config.

diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
@@ -10,3 +10,6 @@ segments:
 
 s3_dst:
   fauna: s3://nextstrain-data-private/files/workflows/avian-flu
+
+nextclade:
+  dataset_name: "community/moncla-lab/iav-h5/ha/all-clades"
diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
@@ -0,0 +1,44 @@
+"""
+This part of the workflow downloads a Nextclade dataset and runs Nextclade
+on the curated HA segment sequences.
+"""
+
+
+DATASET_NAME = config["nextclade"]["dataset_name"]
+
+
+rule get_nextclade_dataset:
+    """Download the Nextclade dataset named by DATASET_NAME as a zip archive"""
+    output:
+        dataset=f"data/nextclade/{DATASET_NAME}.zip",
+    benchmark:
+        "benchmarks/get_nextclade_dataset.txt"
+    params:
+        dataset_name=DATASET_NAME
+    shell:
+        """
+        nextclade3 dataset get \
+            --name={params.dataset_name:q} \
+            --output-zip={output.dataset:q} \
+            --verbose
+        """
+
+
+rule run_nextclade:
+    """Run Nextclade on the HA sequences, writing a results TSV and an alignment"""
+    input:
+        dataset=f"data/nextclade/{DATASET_NAME}.zip",
+        # The H5NX datasets should only be for the HA segment
+        sequences="{data_source}/results/sequences_ha.fasta",
+    output:
+        nextclade="{data_source}/results/nextclade.tsv",
+        alignment="{data_source}/results/alignment.fasta",
+    benchmark:
+        "{data_source}/benchmarks/run_nextclade.txt"
+    shell:
+        """
+        nextclade3 run {input.sequences:q} \
+            --input-dataset {input.dataset:q} \
+            --output-tsv {output.nextclade:q} \
+            --output-fasta {output.alignment:q}
+        """