Skip to content

Commit

Permalink
ingest: Add rules to run Nextclade on curated data
Browse files Browse the repository at this point in the history
Use `community/moncla-lab/iav-h5/ha/all-clades` as the default
Nextclade dataset, since it works across both fauna and NCBI data.

Subsequent commits will join these rules with the full ingest
workflows.
  • Loading branch information
joverlee521 committed Jun 21, 2024
1 parent f864158 commit b37234e
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 0 deletions.
1 change: 1 addition & 0 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ rule upload_all:

include: "rules/ingest_fauna.smk"
include: "rules/merge_segment_metadata.smk"
include: "rules/nextclade.smk"
include: "rules/upload_to_s3.smk"

# Allow users to import custom rules provided via the config.
Expand Down
3 changes: 3 additions & 0 deletions ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,6 @@ segments:

s3_dst:
fauna: s3://nextstrain-data-private/files/workflows/avian-flu

# Nextclade dataset used by the ingest workflow's Nextclade rules. This
# community H5 dataset is the default because it works across both fauna and
# NCBI data.
nextclade:
  dataset_name: "community/moncla-lab/iav-h5/ha/all-clades"
44 changes: 44 additions & 0 deletions ingest/rules/nextclade.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""
This part of the workflow handles running Nextclade on the curated metadata
and sequences.
"""


# Name of the Nextclade dataset, read from the `nextclade.dataset_name` config
# entry. It is passed to `nextclade3 dataset get --name` and also forms part of
# the path of the downloaded dataset ZIP.
DATASET_NAME = config["nextclade"]["dataset_name"]


rule get_nextclade_dataset:
    """
    Download the Nextclade dataset named in the config as a ZIP archive.

    The dataset name is passed to `nextclade3 dataset get --name` and is also
    embedded in the output path (data/nextclade/<dataset name>.zip). Both
    values are shell-quoted with `:q` because they derive from the same
    config-supplied string.
    """
    output:
        dataset=f"data/nextclade/{DATASET_NAME}.zip",
    benchmark:
        "benchmarks/get_nextclade_dataset.txt"
    params:
        dataset_name=DATASET_NAME,
    shell:
        """
        nextclade3 dataset get \
            --name={params.dataset_name:q} \
            --output-zip={output.dataset:q} \
            --verbose
        """


rule run_nextclade:
    """
    Run Nextclade on the curated HA sequences of a data source.

    Inputs:
        dataset: Nextclade dataset ZIP at data/nextclade/<dataset name>.zip
        sequences: HA segment FASTA under <data_source>/results
    Outputs:
        nextclade: Nextclade results TSV
        alignment: aligned sequences FASTA

    All paths are shell-quoted with `:q` since the data_source wildcard and
    the config-supplied dataset name both end up in the command line.
    """
    input:
        dataset=f"data/nextclade/{DATASET_NAME}.zip",
        # The H5Nx datasets are only for the HA segment, so only the HA
        # sequences are passed to Nextclade.
        sequences="{data_source}/results/sequences_ha.fasta",
    output:
        nextclade="{data_source}/results/nextclade.tsv",
        alignment="{data_source}/results/alignment.fasta",
    benchmark:
        "{data_source}/benchmarks/run_nextclade.txt"
    shell:
        """
        nextclade3 run \
            {input.sequences:q} \
            --input-dataset {input.dataset:q} \
            --output-tsv {output.nextclade:q} \
            --output-fasta {output.alignment:q}
        """

0 comments on commit b37234e

Please sign in to comment.