Skip to content

Commit

Permalink
Simplify metadata modifications
Browse files Browse the repository at this point in the history
This shifts the modification of metadata which is specific to the
h5n1-cattle-outbreak/genome/default build to be downstream of the
subsetted-metadata file, which is much simpler to reason about.

We could consider doing something similar for the `add_h5_clade` rule;
however, this wouldn't allow us to use that data as a filtering criterion,
and it seems plausible we'd want to do that one day.
  • Loading branch information
jameshadfield committed Oct 31, 2024
1 parent 94ef772 commit a84e1cd
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 33 deletions.
8 changes: 1 addition & 7 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -161,14 +161,8 @@ def metadata_by_wildcards(wildcards):
# H5 builds have extra clade-level metadata added to the metadata TSV.
# We may move this to a node-data JSON which would simplify the snakemake logic
# a bit -- see <https://github.com/nextstrain/avian-flu/issues/25>
if wildcards.subtype in ("h5n1", "h5nx"):
if wildcards.subtype in ("h5n1", "h5nx", "h5n1-cattle-outbreak"):
return "results/{subtype}/metadata-with-clade.tsv"
# cattle-flu.smk will make its own modifications as needed
elif wildcards.subtype=="h5n1-cattle-outbreak":
if wildcards.segment=="genome":
return "results/{subtype}/{segment}/default/metadata-with-clade-and-non-inferred-values.tsv"
else:
return "results/{subtype}/metadata-with-clade.tsv"
else:
return "results/{subtype}/metadata.tsv",

Expand Down
50 changes: 24 additions & 26 deletions rules/cattle-flu.smk
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ rule genome_metadata:
sequences = "results/{subtype}/{segment}/{time}/aligned.fasta",
metadata = "results/{subtype}/{segment}/{time}/metadata-with-clade-and-non-inferred-values.tsv",
output:
metadata = "results/{subtype}/{segment}/{time}/metadata.tsv"
metadata = temp("results/{subtype}/{segment}/{time}/metadata_intermediate.tsv")
wildcard_constraints:
subtype = 'h5n1-cattle-outbreak',
segment = 'genome',
Expand All @@ -93,29 +93,6 @@ rule genome_metadata:
augur filter --metadata {input.metadata} --sequences {input.sequences} --output-metadata {output.metadata}
"""

ruleorder: genome_metadata > filter
# Note: I tried to avoid the above ruleorder by instead adding a wildcard constraint on the `filter` rule (as it can also produce
# rules.genome_metadata.output.metadata) telling it to use any segment that's _not_ "genome"; however, I couldn't get this
# working. I thought `segment = "^(?!genome)[^_/]+"` should work, but it doesn't.

# Runs scripts/restrict-via-common-ancestor.py on a build's tree.nwk, passing the
# genome Auspice JSON as --strains, and writes a second tree file plus a JSON file.
# NOTE(review): presumably the script keeps only the clade descending from the
# common ancestor of the listed strains -- confirm against the script itself.
rule prune_tree:
input:
tree = "results/{subtype}/{segment}/{time}/tree.nwk",
# Fixed path (no wildcards): always the genome build's Auspice JSON.
strains = "auspice/avian-flu_h5n1-cattle-outbreak_genome.json",
output:
tree = "results/{subtype}/{segment}/{time}/tree_outbreak-clade.nwk",
# Passed to the script as --output-metadata.
node_data = "results/{subtype}/{segment}/{time}/outbreak-clade-strains-in-genome-tree.json",
# Only subtype and time are constrained here; segment is not constrained by this rule.
wildcard_constraints:
subtype="h5n1-cattle-outbreak",
time="default",
shell:
"""
python3 scripts/restrict-via-common-ancestor.py \
--tree {input.tree} \
--strains {input.strains} \
--output-tree {output.tree} \
--output-metadata {output.node_data}
"""

def assert_expected_config(w):
try:
Expand All @@ -135,9 +112,9 @@ rule add_metadata_columns_to_show_non_inferred_values:
that function's not visible to this .smk file so would require deeper refactoring.
"""
input:
metadata = "results/{subtype}/metadata-with-clade.tsv",
metadata = "results/{subtype}/{segment}/{time}/metadata_intermediate.tsv"
output:
metadata = "results/{subtype}/{segment}/{time}/metadata-with-clade-and-non-inferred-values.tsv",
metadata = "results/{subtype}/{segment}/{time}/metadata.tsv"
wildcard_constraints:
subtype="h5n1-cattle-outbreak",
segment="genome",
Expand All @@ -150,3 +127,24 @@ rule add_metadata_columns_to_show_non_inferred_values:
"""
cat {input.metadata} | csvtk mutate -t -f {params.old_column} -n {params.new_column} > {output.metadata}
"""

ruleorder: add_metadata_columns_to_show_non_inferred_values > filter

# Runs scripts/restrict-via-common-ancestor.py on a build's tree.nwk, passing the
# genome Auspice JSON as --strains, and writes a second tree file plus a JSON file.
# NOTE(review): presumably the script keeps only the clade descending from the
# common ancestor of the listed strains -- confirm against the script itself.
rule prune_tree:
input:
tree = "results/{subtype}/{segment}/{time}/tree.nwk",
# Fixed path (no wildcards): always the genome build's Auspice JSON.
strains = "auspice/avian-flu_h5n1-cattle-outbreak_genome.json",
output:
tree = "results/{subtype}/{segment}/{time}/tree_outbreak-clade.nwk",
# Passed to the script as --output-metadata.
node_data = "results/{subtype}/{segment}/{time}/outbreak-clade-strains-in-genome-tree.json",
# Only subtype and time are constrained here; segment is not constrained by this rule.
wildcard_constraints:
subtype="h5n1-cattle-outbreak",
time="default",
shell:
"""
python3 scripts/restrict-via-common-ancestor.py \
--tree {input.tree} \
--strains {input.strains} \
--output-tree {output.tree} \
--output-metadata {output.node_data}
"""

0 comments on commit a84e1cd

Please sign in to comment.