From a84e1cd4907dc8726ab732c3441920fc906bcdd1 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Thu, 31 Oct 2024 15:26:22 +1300 Subject: [PATCH] Simplify metadata modifications This shifts the modification of metadata which is specific to the h5n1-cattle-outbreak/genome/default build to be downstream of the subsetted-metadata file, which is much simpler to reason about. We could consider doing something similar for the `add_h5_clade` rule; however, this wouldn't allow us to use that data as a filtering criterion, and it seems plausible we'd want to do that one day. --- Snakefile | 8 +------ rules/cattle-flu.smk | 50 +++++++++++++++++++++----------------------- 2 files changed, 25 insertions(+), 33 deletions(-) diff --git a/Snakefile b/Snakefile index 9b076d3..c1b0924 100755 --- a/Snakefile +++ b/Snakefile @@ -161,14 +161,8 @@ def metadata_by_wildcards(wildcards): # H5 builds have extra clade-level metadata added to the metadata TSV. # We may move this to a node-data JSON which would simplify the snakemake logic # a bit -- see - if wildcards.subtype in ("h5n1", "h5nx"): + if wildcards.subtype in ("h5n1", "h5nx", "h5n1-cattle-outbreak"): return "results/{subtype}/metadata-with-clade.tsv" - # cattle-flu.smk will make its own modifications as needed - elif wildcards.subtype=="h5n1-cattle-outbreak": - if wildcards.segment=="genome": - return "results/{subtype}/{segment}/default/metadata-with-clade-and-non-inferred-values.tsv" - else: - return "results/{subtype}/metadata-with-clade.tsv" else: return "results/{subtype}/metadata.tsv", diff --git a/rules/cattle-flu.smk b/rules/cattle-flu.smk index a9ada15..8f20a90 100644 --- a/rules/cattle-flu.smk +++ b/rules/cattle-flu.smk @@ -83,7 +83,7 @@ rule genome_metadata: sequences = "results/{subtype}/{segment}/{time}/aligned.fasta", metadata = "results/{subtype}/{segment}/{time}/metadata-with-clade-and-non-inferred-values.tsv", output: - metadata = "results/{subtype}/{segment}/{time}/metadata.tsv" + metadata = 
temp("results/{subtype}/{segment}/{time}/metadata_intermediate.tsv") wildcard_constraints: subtype = 'h5n1-cattle-outbreak', segment = 'genome', @@ -93,29 +93,6 @@ rule genome_metadata: augur filter --metadata {input.metadata} --sequences {input.sequences} --output-metadata {output.metadata} """ -ruleorder: genome_metadata > filter -# Note: I tried to avoid the above ruleorder and instead add a wildcard constraint on the `filter` rule (as it can also produce -# rules.genome_metadata.output.metadata) telling it to use any segment that's _not_ "genome" however I couldn't get this -# working. I thought `segment = "^(?!genome)[^_/]+"` should work but it doesn't. - -rule prune_tree: - input: - tree = "results/{subtype}/{segment}/{time}/tree.nwk", - strains = "auspice/avian-flu_h5n1-cattle-outbreak_genome.json", - output: - tree = "results/{subtype}/{segment}/{time}/tree_outbreak-clade.nwk", - node_data = "results/{subtype}/{segment}/{time}/outbreak-clade-strains-in-genome-tree.json", - wildcard_constraints: - subtype="h5n1-cattle-outbreak", - time="default", - shell: - """ - python3 scripts/restrict-via-common-ancestor.py \ - --tree {input.tree} \ - --strains {input.strains} \ - --output-tree {output.tree} \ - --output-metadata {output.node_data} - """ def assert_expected_config(w): try: @@ -135,9 +112,9 @@ rule add_metadata_columns_to_show_non_inferred_values: that function's not visible to this .smk file so would require deeper refactoring. 
""" input: - metadata = "results/{subtype}/metadata-with-clade.tsv", + metadata = "results/{subtype}/{segment}/{time}/metadata_intermediate.tsv" output: - metadata = "results/{subtype}/{segment}/{time}/metadata-with-clade-and-non-inferred-values.tsv", + metadata = "results/{subtype}/{segment}/{time}/metadata.tsv" wildcard_constraints: subtype="h5n1-cattle-outbreak", segment="genome", @@ -150,3 +127,24 @@ rule add_metadata_columns_to_show_non_inferred_values: """ cat {input.metadata} | csvtk mutate -t -f {params.old_column} -n {params.new_column} > {output.metadata} """ + +ruleorder: add_metadata_columns_to_show_non_inferred_values > filter + +rule prune_tree: + input: + tree = "results/{subtype}/{segment}/{time}/tree.nwk", + strains = "auspice/avian-flu_h5n1-cattle-outbreak_genome.json", + output: + tree = "results/{subtype}/{segment}/{time}/tree_outbreak-clade.nwk", + node_data = "results/{subtype}/{segment}/{time}/outbreak-clade-strains-in-genome-tree.json", + wildcard_constraints: + subtype="h5n1-cattle-outbreak", + time="default", + shell: + """ + python3 scripts/restrict-via-common-ancestor.py \ + --tree {input.tree} \ + --strains {input.strains} \ + --output-tree {output.tree} \ + --output-metadata {output.node_data} + """