From a519284f39498834c821af5dab4492f3f32c5d9d Mon Sep 17 00:00:00 2001 From: james hadfield Date: Thu, 28 Nov 2024 12:31:57 +1300 Subject: [PATCH] WIP: prototype separate workflows as entrypoints See for context. Able to be run via a number of different ways: - From the 'avian-flu' repo: - `snakemake -s gisaid/Snakefile ...` - `cd gisaid && snakemake ...` - `snakemake --configfile gisaid/config.yaml` - From a separate analysis directory, where ${AVIAN_FLU} is the path to the (locally checked out) avian-flu repo - without any config overlays: `snakemake -s ${AVIAN_FLU}/gisaid/Snakefile` - with a `config.yaml` overlay: (same as above) - with a `foo.yaml` overlay: `snakemake -s ${AVIAN_FLU}/gisaid/Snakefile --configfile foo.yaml` --- Snakefile | 154 +++++++++--------- gisaid/Snakefile | 4 + config/gisaid.yaml => gisaid/config.yaml | 0 h5n1-cattle-outbreak/Snakefile | 4 + .../config.yaml | 4 + rules/cattle-flu.smk | 12 +- 6 files changed, 96 insertions(+), 82 deletions(-) create mode 100644 gisaid/Snakefile rename config/gisaid.yaml => gisaid/config.yaml (100%) create mode 100644 h5n1-cattle-outbreak/Snakefile rename config/h5n1-cattle-outbreak.yaml => h5n1-cattle-outbreak/config.yaml (93%) diff --git a/Snakefile b/Snakefile index d7c987a..ead0573 100755 --- a/Snakefile +++ b/Snakefile @@ -8,81 +8,74 @@ wildcard_constraints: SEGMENTS = ["pb2", "pb1", "pa", "ha","np", "na", "mp", "ns"] #SUBTYPES = ["h5n1", "h5nx", "h7n9", "h9n2"] -# ---------------------------------------------------------------------------- -# Allow this to work from a separate workdir by using a config in that workdir -# which extends one of our base configs -# ---------------------------------------------------------------------------- -if os.path.exists("config.yaml"): - configfile: "config.yaml" - # See commentary below - # print("This doesn't work as expected! See commentary in Snakefile", file=sys.stderr) - # exit(2) - -if config.get('extends', False): - extend_path = os.path.join(workflow.basedir, "config", config['extends']) - if not os.path.isfile(extend_path): - sys.exit(f"Your config tried to extend {config['extends']!r} but this doesn't exist. It must be relative to {os.path.join(workflow.basedir, 'config')}") - configfile: extend_path - -# NOTE: -# In the situation where we're running outside of the repo, and we have a custom config YAML -# such as `foo.yaml`: -# extends: h5n1-cattle-outbreak.yaml -# segments: ['pb2'] -# If we run with `--configfile foo.yaml` then the merging behaviour is strange (to me!) -# We've clearly parsed the --configfile, as we have config['extends']="h5n1-cattle-outbreak.yaml", -# and we do merge in all the config values of `h5n1-cattle-outbreak.yaml` (via the above code) -# so I expected we'd therefore have config['segments']=['genome', 'pb2', 'pb1', ...] -# as defined in 'h5n1-cattle-outbreak.yaml', however we end up with only config['segments']=['pb2']. -# So it seems like the `--configfile` definitions are being re-applied a second time?!? -# -# This _is not the case_ when we use the `os.path.exists("config.yaml")` approach, -# which is why it's not going to work without the following additional update_config -# step (or something else?) +CURRENT_BASEDIR = workflow.current_basedir # TODO XXX store this value here - can't access within functions because workflow.included_stack is empty +# Load the base config.yaml relative to the entry snakefile (i.e. 
+# not this snakefile)
+if os.path.exists(os.path.join(workflow.basedir, 'config.yaml')):
+    configfile: os.path.join(workflow.basedir, 'config.yaml')
+
+# load a config.yaml file if it exists in the current working directory
 if os.path.exists("config.yaml"):
-    # Following 
-    import yte
-    with open("config.yaml", encoding='utf-8') as f:
-        overwrite_config = yte.process_yaml(f, require_use_yte=True)
-    snakemake.utils.update_config(config, overwrite_config)
+    configfile: "config.yaml"
 
 from pprint import pp; pp(config, stream=sys.stderr) # TODO XXX remove
 
+class InvalidConfigError(Exception):
+    pass
 
-def resolve_config_path(original_path, wildcards=None):
+def resolve_config_path(path):
     """
-    Resolve a relative *path* given in a configuration value.
-    Resolves *path* as relative to the workflow's ``config/`` directory (i.e.
-    ``os.path.join(workflow.basedir, "config", path)``) if it doesn't exist
-    in the workflow's analysis directory (i.e. the current working
-    directory, or workdir, usually given by ``--directory`` (``-d``)).
-    This behaviour allows a default configuration value to point to a default
-    auxiliary file while also letting the file used be overridden either by
-    setting an alternate file path in the configuration or by creating a file
-    with the conventional name in the workflow's analysis directory.
+    Resolve a relative *path* given in a configuration value. Returns a
+    function which takes a single argument *wildcards* and returns the resolved
+    path, with any '{x}' substrings replaced by the values of the corresponding
+    wildcards.
+
+    Search order (first match returned):
+    1. Relative to the analysis directory
+    2. Relative to the directory the entry snakefile was in. Typically this is
+       not the Snakefile you are looking at now but (e.g.) the one in
+       avian-flu/gisaid
+    3. Relative to where this Snakefile is (i.e. `avian-flu/`)
     """
-    path = original_path.format(**wildcards) if wildcards else original_path
-
-    if re.search(r'\{.+\}', path):
-        if wildcards:
-            print(f"The call to `resolve_config_path({original_path!r}, wildcards)` still includes wildcard-like placeholders afrer resolving: {path!r}.", file=sys.stderr)
-        else:
-            print(f"The call to `resolve_config_path({original_path!r})` includes unresolved wildcards - please include the wildcards as the second argument to `resolve_config_path`.", file=sys.stderr)
-        exit(2)
-
-    if not os.path.exists(path):
-        # Check if the path exists relative to the basedir. This catches things like "config/…"
-        # as well as "clade-labeling/h5n1-clades.tsv"
-        basepath = os.path.join(workflow.basedir, path)
+    if not isinstance(path, str):
+        raise InvalidConfigError(f"Config path provided to resolve_config_path must be a string. Provided value: {str(path)}")
+
+    def resolve(wildcards):
+        try:
+            path_expanded = expand(path, **wildcards)[0]
+        except snakemake.exceptions.WildcardError as e:
+            # str(e) looks like "No values given for wildcard 'subtypes'."
+            raise InvalidConfigError(f"resolve_config_path called with path {path!r} however {str(e)}")
+
+        # check if the path exists relative to the working directory
+        if os.path.exists(path_expanded):
+            # return an absolute path so that we can use it in different contexts within snakemake
+            # e.g. snakemake interprets paths in rules as relative to the working directory but
+            # the "include" directive treats them as relative to the calling snakefile
+            return os.path.join(os.getcwd(), path_expanded)
+
+        # Check if the path exists relative to the subdir where the entry snakefile is
+        # (e.g. avian-flu/gisaid). If you want to use further subdirectories (e.g. avian-flu/gisaid/config/x.tsv)
+        # you're expected to supply the 'config/x.tsv' as the value in the config YAML
+        # NOTE: this means analysis directory overrides have to use that same 'config/x.tsv' structure, but
+        # given the different directories avian-flu uses that's acceptable. In other words, if we standardised
+        # avian-flu then we could add subdirectories to the search order here
         basepath = os.path.join(workflow.basedir, path_expanded)
         if os.path.exists(basepath):
             return basepath
-    print(f"Unable to resolve the path {path!r} either within the working directory or within {workflow.basedir!r}", file=sys.stderr)
-    exit(2)
-
-    return path
+        # Check if the path exists relative to where _this_ snakefile is, i.e. relative to `avian-flu/`.
+        if workflow.basedir != CURRENT_BASEDIR:
+            basepath = os.path.join(CURRENT_BASEDIR, path_expanded)
+            if os.path.exists(basepath):
+                return basepath
+        raise InvalidConfigError(f"Unable to resolve the config-provided path {path!r}, expanded to {path_expanded!r} after filling in wildcards. "
+            f"The following directories were searched:\n"
+            f"\t1. {os.path.abspath(os.curdir)} (current working directory)\n"
+            f"\t2. {workflow.basedir} (where the entry snakefile is)\n"
+            f"\t3. {CURRENT_BASEDIR} (where the main avian-flu snakefile is)\n")
+    return resolve
 
 # The config option `same_strains_per_segment=True'` (e.g. supplied to snakemake via --config command line argument)
 # will change the behaviour of the workflow to use the same strains for each segment. This is achieved via these steps:
@@ -95,6 +88,14 @@ S3_SRC = config.get('s3_src', {})
 LOCAL_INGEST = config.get('local_ingest', None)
 
 def sanity_check_config():
+    if not len(config.keys()):
+        print("-"*80 + "\nNo config loaded!", file=sys.stderr)
+        print("Avian-flu is intended to be run from the snakefile inside a subdir "
+            "(e.g. gisaid/Snakefile) which will pick up the default configfile for that workflow. "
+            "Alternatively you can pass in the config via `--configfile`", file=sys.stderr)
+        print("-"*80, file=sys.stderr)
+        raise InvalidConfigError("No config")
+
     assert LOCAL_INGEST or S3_SRC, "The config must define either 's3_src' or 'local_ingest'"
     # NOTE: we could relax the following exclusivity of S3_SRC and LOCAL_INGEST
     # if we want to use `--config local_ingest=gisaid` overrides.
@@ -128,6 +129,7 @@ rule all: # This must be after the `all` rule above since it depends on its inputs +# Note this is relative to the `workflow.current_basedir` include: "rules/deploy.smk" rule test_target: @@ -289,11 +291,11 @@ rule add_h5_clade: message: "Adding in a column for h5 clade numbering" input: metadata = "results/{subtype}/metadata.tsv", - clades_file = lambda w: resolve_config_path(files.clades_file, w) + clades_file = resolve_config_path(files.clades_file) output: metadata= "results/{subtype}/metadata-with-clade.tsv" params: - script = os.path.join(workflow.basedir, "clade-labeling/add-clades.py") + script = os.path.join(workflow.current_basedir, "clade-labeling/add-clades.py") shell: r""" python {params.script} \ @@ -349,8 +351,8 @@ rule filter: input: sequences = "results/{subtype}/{segment}/sequences.fasta", metadata = metadata_by_wildcards, - exclude = lambda w: resolve_config_path(files.dropped_strains, w), - include = lambda w: resolve_config_path(files.include_strains, w), + exclude = resolve_config_path(files.dropped_strains), + include = resolve_config_path(files.include_strains), strains = lambda w: f"results/{w.subtype}/ha/{w.time}/filtered.txt" if (SAME_STRAINS and w.segment!='ha') else [], output: sequences = "results/{subtype}/{segment}/{time}/filtered.fasta", @@ -381,7 +383,7 @@ rule align: """ input: sequences = rules.filter.output.sequences, - reference = lambda w: resolve_config_path(files.reference, w) + reference = resolve_config_path(files.reference) output: alignment = "results/{subtype}/{segment}/{time}/aligned.fasta" wildcard_constraints: @@ -482,7 +484,7 @@ def ancestral_root_seq(wildcards): else get_config('ancestral', 'root_seq', wildcards) if not root_seq: return "" - return f"--root-sequence {resolve_config_path(root_seq, wildcards)}" + return f"--root-sequence {resolve_config_path(root_seq)(wildcards)}" rule ancestral: message: "Reconstructing ancestral sequences and mutations" @@ -510,7 +512,7 @@ rule translate: input: tree = refined_tree, node_data = rules.ancestral.output.node_data, - reference = lambda w: resolve_config_path(config['genome_reference'] if w.segment=='genome' else files.reference, w) + reference = lambda w: resolve_config_path(config['genome_reference'] if w.segment=='genome' else files.reference)(w) output: node_data = "results/{subtype}/{segment}/{time}/aa-muts.json" shell: @@ -570,7 +572,7 @@ rule cleavage_site: cleavage_site_annotations = "results/{subtype}/ha/{time}/cleavage-site.json", cleavage_site_sequences = "results/{subtype}/ha/{time}/cleavage-site-sequences.json" params: - script = os.path.join(workflow.basedir, "scripts/annotate-ha-cleavage-site.py") + script = os.path.join(workflow.current_basedir, "scripts/annotate-ha-cleavage-site.py") shell: """ python {params.script} \ @@ -613,7 +615,7 @@ rule auspice_config: If we implement config overlays in augur this rule will become unnecessary. 
""" input: - auspice_config = lambda w: resolve_config_path(files.auspice_config, w), + auspice_config = resolve_config_path(files.auspice_config), output: auspice_config = "results/{subtype}/{segment}/{time}/auspice-config.json", run: @@ -638,7 +640,7 @@ rule auspice_config: rule colors: input: - colors = lambda w: resolve_config_path(files.colors, w), + colors = resolve_config_path(files.colors), output: colors = "results/{subtype}/{segment}/{time}/colors.tsv", shell: @@ -656,9 +658,9 @@ rule export: metadata = rules.filter.output.metadata, node_data = export_node_data_files, colors = "results/{subtype}/{segment}/{time}/colors.tsv", - lat_longs = lambda w: resolve_config_path(files.lat_longs, w), + lat_longs = resolve_config_path(files.lat_longs), auspice_config = rules.auspice_config.output.auspice_config, - description = lambda w: resolve_config_path(files.description, w), + description = resolve_config_path(files.description), output: auspice_json = "results/{subtype}/{segment}/{time}/auspice-dataset.json" params: @@ -728,4 +730,4 @@ rule clean: for rule_file in config.get('custom_rules', []): - include: rule_file + include: resolve_config_path(rule_file)({}) diff --git a/gisaid/Snakefile b/gisaid/Snakefile new file mode 100644 index 0000000..d5db20f --- /dev/null +++ b/gisaid/Snakefile @@ -0,0 +1,4 @@ +include: "../Snakefile" + +rule _all: + input: rules.all.input \ No newline at end of file diff --git a/config/gisaid.yaml b/gisaid/config.yaml similarity index 100% rename from config/gisaid.yaml rename to gisaid/config.yaml diff --git a/h5n1-cattle-outbreak/Snakefile b/h5n1-cattle-outbreak/Snakefile new file mode 100644 index 0000000..d5db20f --- /dev/null +++ b/h5n1-cattle-outbreak/Snakefile @@ -0,0 +1,4 @@ +include: "../Snakefile" + +rule _all: + input: rules.all.input \ No newline at end of file diff --git a/config/h5n1-cattle-outbreak.yaml b/h5n1-cattle-outbreak/config.yaml similarity index 93% rename from config/h5n1-cattle-outbreak.yaml rename to h5n1-cattle-outbreak/config.yaml index ba78421..2432a7f 100644 --- a/config/h5n1-cattle-outbreak.yaml +++ b/h5n1-cattle-outbreak/config.yaml @@ -1,5 +1,8 @@ # NOTE: The h5n1-cattle-outbreak builds use a specific rule-file (specified within this config) # and that rule file may have some config-like parameters within it. +# If you are extending this workflow via a --configfile overlay you and you want to add +# your own custom rule you will need to include "rules/cattle-flu.smk" as an element in the +# list in your overlay, as lists are not merged when configs are combined. 
custom_rules: - "rules/cattle-flu.smk" @@ -36,6 +39,7 @@ target_sequences_per_tree: 10_000 #### Config files #### + reference: config/h5n1/reference_h5n1_{segment}.gb # use H5N1 references genome_reference: config/{subtype}/h5_cattle_genome_root.gb auspice_config: config/{subtype}/auspice_config_{subtype}.json diff --git a/rules/cattle-flu.smk b/rules/cattle-flu.smk index a6097af..63ed6fa 100644 --- a/rules/cattle-flu.smk +++ b/rules/cattle-flu.smk @@ -10,8 +10,8 @@ rule filter_segments_for_genome: input: sequences = "results/{subtype}/{genome_seg}/sequences.fasta", metadata = "results/{subtype}/metadata-with-clade.tsv", # TODO: use a function here instead of hardcoding - include = lambda w: resolve_config_path(config['include_strains'], w), - exclude = lambda w: resolve_config_path(config['dropped_strains'], w), + include = resolve_config_path(config['include_strains']), + exclude = resolve_config_path(config['dropped_strains']), output: sequences = "results/{subtype}/{segment}/{time}/filtered_{genome_seg}.fasta" params: @@ -40,7 +40,7 @@ rule align_segments_for_genome: sequences = "results/{subtype}/{segment}/{time}/filtered_{genome_seg}.fasta", # Use the H5N1 reference sequences for alignment reference = lambda w: [ - resolve_config_path(expanded, w) + resolve_config_path(expanded)(w) for expanded in expand(config['reference'], subtype='h5n1', segment=w.genome_seg) ] @@ -76,7 +76,7 @@ rule join_segments: segment = 'genome', time = 'default', params: - script = os.path.join(workflow.basedir, "scripts/join-segments.py") + script = os.path.join(workflow.current_basedir, "../scripts/join-segments.py") shell: """ python {params.script} \ @@ -147,7 +147,7 @@ rule prune_tree: subtype="h5n1-cattle-outbreak", time="default", params: - script = os.path.join(workflow.basedir, "scripts/restrict-via-common-ancestor.py") + script = os.path.join(workflow.current_basedir, "../scripts/restrict-via-common-ancestor.py") shell: r""" python3 {params.script} \ @@ -170,7 +170,7 @@ rule colors_genome: colors = "results/{subtype}/{segment}/{time}/colors.tsv", params: duplications = "division=division_metadata", - script = os.path.join(workflow.basedir, "scripts/assign-colors.py") + script = os.path.join(workflow.current_basedir, "../scripts/assign-colors.py") wildcard_constraints: subtype="h5n1-cattle-outbreak", time="default",
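
Reviewer note (not part of the patch): the call-site changes above, i.e. passing `resolve_config_path(files.colors)` directly as a rule input versus calling `resolve_config_path(root_seq)(wildcards)` inside helper functions, both rely on the new behaviour where `resolve_config_path` returns a resolver function rather than a string. A minimal, self-contained sketch of that pattern follows. The hard-coded search directories and the `str.format`-based expansion are illustrative placeholders only; the real implementation uses Snakemake's `expand()` and the `workflow.basedir` / `CURRENT_BASEDIR` values set at the top of the Snakefile.

    import os

    # Stand-ins for the three directories the patch searches, in order; in the Snakefile
    # these come from os.getcwd(), workflow.basedir and CURRENT_BASEDIR respectively.
    SEARCH_DIRS = ["/path/to/analysis-dir", "/path/to/avian-flu/gisaid", "/path/to/avian-flu"]

    def resolve_config_path(path):
        """Return a function of *wildcards* which lazily expands and resolves *path*."""
        def resolve(wildcards):
            # Simplified expansion; the patch itself uses snakemake's expand(path, **wildcards)[0]
            path_expanded = path.format(**wildcards)
            for base in SEARCH_DIRS:
                candidate = os.path.join(base, path_expanded)
                if os.path.exists(candidate):
                    return candidate
            raise ValueError(f"Unable to resolve config path {path!r} (expanded to {path_expanded!r})")
        return resolve

    # Both call styles seen in the diff produce the same result:
    #     exclude = resolve_config_path(files.dropped_strains)               # Snakemake supplies the wildcards
    #     reference = lambda w: resolve_config_path(config['reference'])(w)  # wildcards passed explicitly
    # e.g. resolve_config_path("config/{subtype}/reference_{segment}.gb") returns a resolver which,
    # given wildcards {"subtype": "h5n1", "segment": "pb2"}, looks for config/h5n1/reference_pb2.gb
    # under each search directory in turn and returns the first match.

Returning a function keeps the resolution lazy, so the config path is only expanded once Snakemake knows the concrete wildcard values for each job.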