WIP: prototype separate workflows as entrypoints

See <https://bedfordlab.slack.com/archives/C01LCTT7JNN/p1732568407123369> for context. Able to be run via a number of different ways: - From the 'avian-flu' repo: - `snakemake -s gisaid/Snakefile ...` - `cd gisaid && snakemake ...` - `snakemake --configfile gisaid/config.yaml` - From a separate analysis directory, where ${AVIAN_FLU} is the path to the (locally checked out) avian-flu repo - without any config overlays: `snakemake -s ${AVIAN_FLU}/gisaid/Snakefile` - with a `config.yaml` overlay: (same as above) - with a `foo.yaml` overlay: `snakemake -s ${AVIAN_FLU}/gisaid/Snakefile --configfile foo.yaml`
nextstrain · Dec 8, 2024 · a519284 · a519284
1 parent 96a0ded
commit a519284
Show file tree

Hide file tree

Showing 6 changed files with 96 additions and 82 deletions.
diff --git a/Snakefile b/Snakefile
@@ -8,81 +8,74 @@ wildcard_constraints:
 SEGMENTS = ["pb2", "pb1", "pa", "ha","np", "na", "mp", "ns"]
 #SUBTYPES = ["h5n1", "h5nx", "h7n9", "h9n2"]
 
-# ----------------------------------------------------------------------------
-# Allow this to work from a separate workdir by using a config in that workdir
-# which extends one of our base configs
-# ----------------------------------------------------------------------------
-if os.path.exists("config.yaml"):
-    configfile: "config.yaml"
-    # See commentary below
-    # print("This doesn't work as expected! See commentary in Snakefile", file=sys.stderr)
-    # exit(2)
-
-if config.get('extends', False):
-    extend_path = os.path.join(workflow.basedir, "config", config['extends'])
-    if not os.path.isfile(extend_path):
-        sys.exit(f"Your config tried to extend {config['extends']!r} but this doesn't exist. It must be relative to {os.path.join(workflow.basedir, 'config')}")
-    configfile: extend_path
-
-# NOTE:
-# In the situation where we're running outside of the repo, and we have a custom config YAML
-# such as `foo.yaml`:
-#        extends: h5n1-cattle-outbreak.yaml
-#        segments: ['pb2']
-# If we run with `--configfile foo.yaml` then the merging behaviour is strange (to me!)
-# We've clearly parsed the --configfile, as we have config['extends']="h5n1-cattle-outbreak.yaml",
-# and we do merge in all the config values of `h5n1-cattle-outbreak.yaml` (via the above code)
-# so I expected we'd therefore have config['segments']=['genome', 'pb2', 'pb1', ...]
-# as defined in 'h5n1-cattle-outbreak.yaml', however we end up with only config['segments']=['pb2'].
-# So it seems like the `--configfile` definitions are being re-applied a second time?!?
-#
-# This _is not the case_ when we use the `os.path.exists("config.yaml")` approach,
-# which is why it's not going to work without the following additional update_config
-# step (or something else?)
+CURRENT_BASEDIR = workflow.current_basedir # TODO XXX store this value here - can't access within functions because workflow.included_stack is empty
 
+# Load the base config.yaml relative to the entry snakefile (i.e. not this snakefile)
+if os.path.exists(os.path.join(workflow.basedir, 'config.yaml')):
+    configfile: os.path.join(workflow.basedir, 'config.yaml')
+
+# load a config.yaml file if it exists in the current working directory
 if os.path.exists("config.yaml"):
-    # Following <https://github.com/snakemake/snakemake/blob/76d53290a003891c5ee41f81e8eb4821c406255d/snakemake/common/configfile.py#L7-L33>
-    import yte
-    with open("config.yaml", encoding='utf-8') as f:
-        overwrite_config = yte.process_yaml(f, require_use_yte=True)
-    snakemake.utils.update_config(config, overwrite_config)
+    configfile: "config.yaml"
 
 from pprint import pp; pp(config, stream=sys.stderr) # TODO XXX remove
 
+class InvalidConfigError(Exception):
+    pass
 
-def resolve_config_path(original_path, wildcards=None):
+def resolve_config_path(path):
     """
-    Resolve a relative *path* given in a configuration value.
-    Resolves *path* as relative to the workflow's ``config/`` directory (i.e.
-    ``os.path.join(workflow.basedir, "config", path)``) if it doesn't exist
-    in the workflow's analysis directory (i.e. the current working
-    directory, or workdir, usually given by ``--directory`` (``-d``)).
-    This behaviour allows a default configuration value to point to a default
-    auxiliary file while also letting the file used be overridden either by
-    setting an alternate file path in the configuration or by creating a file
-    with the conventional name in the workflow's analysis directory.
+    Resolve a relative *path* given in a configuration value. Returns a
+    function which takes a single argument *wildcards* and returns the resolved
+    path with any '{x}' substrings are replaced by their corresponding wildcards
+    filled in
+
+    Search order (first match returned):
+    1. Relative to the analysis directory
+    2. Relative to the directory the entry snakefile was in. Typically this is
+       not the Snakefile you are looking at now but (e.g.) the one in
+       avian-flu/gisaid
+    3. Relative to where this Snakefile is (i.e. `avian-flu/`)
     """
-    path = original_path.format(**wildcards) if wildcards else original_path
-
-    if re.search(r'\{.+\}', path):
-        if wildcards:
-            print(f"The call to `resolve_config_path({original_path!r}, wildcards)` still includes wildcard-like placeholders afrer resolving: {path!r}.", file=sys.stderr)
-        else:
-            print(f"The call to `resolve_config_path({original_path!r})` includes unresolved wildcards - please include the wildcards as the second argument to `resolve_config_path`.", file=sys.stderr)
-        exit(2)
-
-    if not os.path.exists(path):
-        # Check if the path exists relative to the basedir. This catches things like "config/…"
-        # as well as "clade-labeling/h5n1-clades.tsv"
-        basepath = os.path.join(workflow.basedir, path)
+    if not isinstance(path, str):
+        raise InvalidConfigError(f"Config path provided to resolve_config_path must be a string. Provided value: {str(path)}")
+
+    def resolve(wildcards):
+        try:
+            path_expanded = expand(path, **wildcards)[0]
+        except snakemake.exceptions.WildcardError as e:
+            # str(e) looks like "No values given for wildcard 'subtypes'."
+            raise InvalidConfigError(f"resolve_config_path called with path {path!r} however {str(e)}")
+
+        # check if the path exists relative to the working directory
+        if os.path.exists(path_expanded):
+            # return an absolute path so that we can use it in different contexts within snakemake
+            # e.g. snakemake interprets paths in rules as relative to the working directory but
+            # the "include" directive treats them as relative to the calling snakefile
+            return os.path.join(os.getcwd(), path_expanded)
+
+        # Check if the path exists relative to the subdir where the entry snakefile is
+        # (e.g. avian-flu/gisaid). If you want to use further subdirectories (e.g. avian-flu/gisaid/config/x.tsv)
+        # you're expected to supply the 'config/x.tsv' as the value in the config YAML
+        # NOTE: this means analysis directory overrides have to use that same 'config/x.tsv' structure, but
+        # given the different directories avian-flu uses that's acceptable. In other words, if we standardised
+        # avian-flu then we could add subdirectories to the search order here
+        basepath = os.path.join(workflow.basedir, path_expanded)
         if os.path.exists(basepath):
             return basepath
 
-        print(f"Unable to resolve the path {path!r} either within the working directory or within {workflow.basedir!r}", file=sys.stderr)
-        exit(2)
-
-    return path
+        # Check if the path exists relative to where _this_ snakefile is, i.e. relative to `avian-flu/`.
+        if workflow.basedir != CURRENT_BASEDIR:
+            basepath = os.path.join(CURRENT_BASEDIR, path_expanded)
+            if os.path.exists(basepath):
+                return basepath
 
+        raise InvalidConfigError(f"Unable to resolve the config-provided path {path!r}, expanded to {path_expanded!r} after filling in wildcards. "
+            f"The following directories were searched:\n"
+            f"\t1. {os.path.abspath(os.curdir)} (current working directory)\n"
+            f"\t2. {workflow.basedir} (where the entry snakefile is)\n"
+            f"\t3. {CURRENT_BASEDIR} (where the main avian-flu snakefile is)\n")
+    return resolve
 
 # The config option `same_strains_per_segment=True'` (e.g. supplied to snakemake via --config command line argument)
 # will change the behaviour of the workflow to use the same strains for each segment. This is achieved via these steps:
@@ -95,6 +88,14 @@ S3_SRC = config.get('s3_src', {})
 LOCAL_INGEST = config.get('local_ingest', None)
 
 def sanity_check_config():
+    if not len(config.keys()):
+        print("-"*80 + "\nNo config loaded!", file=sys.stderr)
+        print("Avian-flu is indented to be run from the snakefile inside a subdir " 
+            "(e.g. gisaid/Snakefile) which will pick up the default configfile for that workflow. " 
+            "Alternatively you can pass in the config via `--configfile`", file=sys.stderr)
+        print("-"*80, file=sys.stderr)
+        raise InvalidConfigError("No config")
+
     assert LOCAL_INGEST or S3_SRC, "The config must define either 's3_src' or 'local_ingest'"
     # NOTE: we could relax the following exclusivity of S3_SRC and LOCAL_INGEST
     # if we want to use `--config local_ingest=gisaid` overrides.
@@ -128,6 +129,7 @@ rule all:
 
 
 # This must be after the `all` rule above since it depends on its inputs
+# Note this is relative to the `workflow.current_basedir`
 include: "rules/deploy.smk"
 
 rule test_target:
@@ -289,11 +291,11 @@ rule add_h5_clade:
     message: "Adding in a column for h5 clade numbering"
     input:
         metadata = "results/{subtype}/metadata.tsv",
-        clades_file = lambda w: resolve_config_path(files.clades_file, w)
+        clades_file = resolve_config_path(files.clades_file)
     output:
         metadata= "results/{subtype}/metadata-with-clade.tsv"
     params:
-        script = os.path.join(workflow.basedir, "clade-labeling/add-clades.py")
+        script = os.path.join(workflow.current_basedir, "clade-labeling/add-clades.py")
     shell:
         r"""
         python {params.script} \
@@ -349,8 +351,8 @@ rule filter:
     input:
         sequences = "results/{subtype}/{segment}/sequences.fasta",
         metadata = metadata_by_wildcards,
-        exclude = lambda w: resolve_config_path(files.dropped_strains, w),
-        include = lambda w: resolve_config_path(files.include_strains, w),
+        exclude = resolve_config_path(files.dropped_strains),
+        include = resolve_config_path(files.include_strains),
         strains = lambda w: f"results/{w.subtype}/ha/{w.time}/filtered.txt" if (SAME_STRAINS and w.segment!='ha') else [],
     output:
         sequences = "results/{subtype}/{segment}/{time}/filtered.fasta",
@@ -381,7 +383,7 @@ rule align:
         """
     input:
         sequences = rules.filter.output.sequences,
-        reference = lambda w: resolve_config_path(files.reference, w)
+        reference = resolve_config_path(files.reference)
     output:
         alignment = "results/{subtype}/{segment}/{time}/aligned.fasta"
     wildcard_constraints:
@@ -482,7 +484,7 @@ def ancestral_root_seq(wildcards):
         else get_config('ancestral', 'root_seq', wildcards)
     if not root_seq:
         return ""
-    return f"--root-sequence {resolve_config_path(root_seq, wildcards)}"
+    return f"--root-sequence {resolve_config_path(root_seq)(wildcards)}"
 
 rule ancestral:
     message: "Reconstructing ancestral sequences and mutations"
@@ -510,7 +512,7 @@ rule translate:
     input:
         tree = refined_tree,
         node_data = rules.ancestral.output.node_data,
-        reference = lambda w: resolve_config_path(config['genome_reference'] if w.segment=='genome' else files.reference, w)
+        reference = lambda w: resolve_config_path(config['genome_reference'] if w.segment=='genome' else files.reference)(w)
     output:
         node_data = "results/{subtype}/{segment}/{time}/aa-muts.json"
     shell:
@@ -570,7 +572,7 @@ rule cleavage_site:
         cleavage_site_annotations = "results/{subtype}/ha/{time}/cleavage-site.json",
         cleavage_site_sequences = "results/{subtype}/ha/{time}/cleavage-site-sequences.json"
     params:
-        script = os.path.join(workflow.basedir, "scripts/annotate-ha-cleavage-site.py")
+        script = os.path.join(workflow.current_basedir, "scripts/annotate-ha-cleavage-site.py")
     shell:
         """
         python {params.script} \
@@ -613,7 +615,7 @@ rule auspice_config:
     If we implement config overlays in augur this rule will become unnecessary.
     """
     input:
-        auspice_config = lambda w: resolve_config_path(files.auspice_config, w),
+        auspice_config = resolve_config_path(files.auspice_config),
     output:
         auspice_config = "results/{subtype}/{segment}/{time}/auspice-config.json",
     run:
@@ -638,7 +640,7 @@ rule auspice_config:
 
 rule colors:
     input:
-        colors = lambda w: resolve_config_path(files.colors, w),
+        colors = resolve_config_path(files.colors),
     output:
         colors = "results/{subtype}/{segment}/{time}/colors.tsv",
     shell:
@@ -656,9 +658,9 @@ rule export:
         metadata = rules.filter.output.metadata,
         node_data = export_node_data_files,
         colors = "results/{subtype}/{segment}/{time}/colors.tsv",
-        lat_longs = lambda w: resolve_config_path(files.lat_longs, w),
+        lat_longs = resolve_config_path(files.lat_longs),
         auspice_config = rules.auspice_config.output.auspice_config,
-        description = lambda w: resolve_config_path(files.description, w),
+        description = resolve_config_path(files.description),
     output:
         auspice_json = "results/{subtype}/{segment}/{time}/auspice-dataset.json"
     params:
@@ -728,4 +730,4 @@ rule clean:
 
 
 for rule_file in config.get('custom_rules', []):
-    include: rule_file
+    include: resolve_config_path(rule_file)({})
diff --git a/gisaid/Snakefile b/gisaid/Snakefile
@@ -0,0 +1,4 @@
+include: "../Snakefile"
+
+rule _all:
+    input: rules.all.input
diff --git a/config/gisaid.yaml → gisaid/config.yaml b/config/gisaid.yaml → gisaid/config.yaml
diff --git a/h5n1-cattle-outbreak/Snakefile b/h5n1-cattle-outbreak/Snakefile
@@ -0,0 +1,4 @@
+include: "../Snakefile"
+
+rule _all:
+    input: rules.all.input
diff --git a/config/h5n1-cattle-outbreak.yaml → h5n1-cattle-outbreak/config.yaml b/config/h5n1-cattle-outbreak.yaml → h5n1-cattle-outbreak/config.yaml
@@ -1,5 +1,8 @@
 # NOTE: The h5n1-cattle-outbreak builds use a specific rule-file (specified within this config)
 # and that rule file may have some config-like parameters within it.
+# If you are extending this workflow via a --configfile overlay you and you want to add
+# your own custom rule you will need to include "rules/cattle-flu.smk" as an element in the
+# list in your overlay, as lists are not merged when configs are combined.
 custom_rules:
   - "rules/cattle-flu.smk"
 
@@ -36,6 +39,7 @@ target_sequences_per_tree: 10_000
 
 
 #### Config files ####
+
 reference: config/h5n1/reference_h5n1_{segment}.gb  # use H5N1 references
 genome_reference: config/{subtype}/h5_cattle_genome_root.gb
 auspice_config: config/{subtype}/auspice_config_{subtype}.json

diff --git a/rules/cattle-flu.smk b/rules/cattle-flu.smk
@@ -10,8 +10,8 @@ rule filter_segments_for_genome:
     input:
         sequences = "results/{subtype}/{genome_seg}/sequences.fasta",
         metadata = "results/{subtype}/metadata-with-clade.tsv", # TODO: use a function here instead of hardcoding
-        include = lambda w: resolve_config_path(config['include_strains'], w),
-        exclude = lambda w: resolve_config_path(config['dropped_strains'], w),
+        include = resolve_config_path(config['include_strains']),
+        exclude = resolve_config_path(config['dropped_strains']),
     output:
         sequences = "results/{subtype}/{segment}/{time}/filtered_{genome_seg}.fasta"
     params:
@@ -40,7 +40,7 @@ rule align_segments_for_genome:
         sequences = "results/{subtype}/{segment}/{time}/filtered_{genome_seg}.fasta",
         # Use the H5N1 reference sequences for alignment
         reference = lambda w: [
-            resolve_config_path(expanded, w)
+            resolve_config_path(expanded)(w)
             for expanded in
             expand(config['reference'], subtype='h5n1', segment=w.genome_seg)
         ]
@@ -76,7 +76,7 @@ rule join_segments:
         segment = 'genome',
         time = 'default',
     params:
-        script = os.path.join(workflow.basedir, "scripts/join-segments.py")
+        script = os.path.join(workflow.current_basedir, "../scripts/join-segments.py")
     shell:
         """
         python {params.script} \
@@ -147,7 +147,7 @@ rule prune_tree:
         subtype="h5n1-cattle-outbreak",
         time="default",
     params:
-        script = os.path.join(workflow.basedir, "scripts/restrict-via-common-ancestor.py")
+        script = os.path.join(workflow.current_basedir, "../scripts/restrict-via-common-ancestor.py")
     shell:
         r"""
         python3 {params.script} \
@@ -170,7 +170,7 @@ rule colors_genome:
         colors = "results/{subtype}/{segment}/{time}/colors.tsv",
     params:
         duplications = "division=division_metadata",
-        script = os.path.join(workflow.basedir, "scripts/assign-colors.py")
+        script = os.path.join(workflow.current_basedir, "../scripts/assign-colors.py")
     wildcard_constraints:
         subtype="h5n1-cattle-outbreak",
         time="default",