Skip to content

Commit

Permalink
WIP: prototype separate workflows as entrypoints
Browse files Browse the repository at this point in the history
See <https://bedfordlab.slack.com/archives/C01LCTT7JNN/p1732568407123369>
for context.

Able to be run via a number of different ways:

- From the 'avian-flu' repo:
    - `snakemake -s gisaid/Snakefile ...`
    - `cd gisaid && snakemake ...`
    - `snakemake --configfile gisaid/config.yaml`
- From a separate analysis directory, where ${AVIAN_FLU} is the path
  to the (locally checked out) avian-flu repo
    - without any config overlays: `snakemake -s ${AVIAN_FLU}/gisaid/Snakefile`
    - with a `config.yaml` overlay: (same as above)
    - with a `foo.yaml` overlay: `snakemake -s ${AVIAN_FLU}/gisaid/Snakefile --configfile foo.yaml`
  • Loading branch information
jameshadfield committed Dec 8, 2024
1 parent 96a0ded commit a519284
Show file tree
Hide file tree
Showing 6 changed files with 96 additions and 82 deletions.
154 changes: 78 additions & 76 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,81 +8,74 @@ wildcard_constraints:
SEGMENTS = ["pb2", "pb1", "pa", "ha","np", "na", "mp", "ns"]
#SUBTYPES = ["h5n1", "h5nx", "h7n9", "h9n2"]

# ----------------------------------------------------------------------------
# Allow this to work from a separate workdir by using a config in that workdir
# which extends one of our base configs
# ----------------------------------------------------------------------------
if os.path.exists("config.yaml"):
configfile: "config.yaml"
# See commentary below
# print("This doesn't work as expected! See commentary in Snakefile", file=sys.stderr)
# exit(2)

if config.get('extends', False):
extend_path = os.path.join(workflow.basedir, "config", config['extends'])
if not os.path.isfile(extend_path):
sys.exit(f"Your config tried to extend {config['extends']!r} but this doesn't exist. It must be relative to {os.path.join(workflow.basedir, 'config')}")
configfile: extend_path

# NOTE:
# In the situation where we're running outside of the repo, and we have a custom config YAML
# such as `foo.yaml`:
# extends: h5n1-cattle-outbreak.yaml
# segments: ['pb2']
# If we run with `--configfile foo.yaml` then the merging behaviour is strange (to me!)
# We've clearly parsed the --configfile, as we have config['extends']="h5n1-cattle-outbreak.yaml",
# and we do merge in all the config values of `h5n1-cattle-outbreak.yaml` (via the above code)
# so I expected we'd therefore have config['segments']=['genome', 'pb2', 'pb1', ...]
# as defined in 'h5n1-cattle-outbreak.yaml', however we end up with only config['segments']=['pb2'].
# So it seems like the `--configfile` definitions are being re-applied a second time?!?
#
# This _is not the case_ when we use the `os.path.exists("config.yaml")` approach,
# which is why it's not going to work without the following additional update_config
# step (or something else?)
CURRENT_BASEDIR = workflow.current_basedir # TODO XXX store this value here - can't access within functions because workflow.included_stack is empty

# Load the base config.yaml relative to the entry snakefile (i.e. not this snakefile)
if os.path.exists(os.path.join(workflow.basedir, 'config.yaml')):
configfile: os.path.join(workflow.basedir, 'config.yaml')

# load a config.yaml file if it exists in the current working directory
if os.path.exists("config.yaml"):
# Following <https://github.com/snakemake/snakemake/blob/76d53290a003891c5ee41f81e8eb4821c406255d/snakemake/common/configfile.py#L7-L33>
import yte
with open("config.yaml", encoding='utf-8') as f:
overwrite_config = yte.process_yaml(f, require_use_yte=True)
snakemake.utils.update_config(config, overwrite_config)
configfile: "config.yaml"

from pprint import pp; pp(config, stream=sys.stderr) # TODO XXX remove

class InvalidConfigError(Exception):
pass

def resolve_config_path(original_path, wildcards=None):
def resolve_config_path(path):
"""
Resolve a relative *path* given in a configuration value.
Resolves *path* as relative to the workflow's ``config/`` directory (i.e.
``os.path.join(workflow.basedir, "config", path)``) if it doesn't exist
in the workflow's analysis directory (i.e. the current working
directory, or workdir, usually given by ``--directory`` (``-d``)).
This behaviour allows a default configuration value to point to a default
auxiliary file while also letting the file used be overridden either by
setting an alternate file path in the configuration or by creating a file
with the conventional name in the workflow's analysis directory.
Resolve a relative *path* given in a configuration value. Returns a
function which takes a single argument *wildcards* and returns the resolved
path with any '{x}' substrings are replaced by their corresponding wildcards
filled in
Search order (first match returned):
1. Relative to the analysis directory
2. Relative to the directory the entry snakefile was in. Typically this is
not the Snakefile you are looking at now but (e.g.) the one in
avian-flu/gisaid
3. Relative to where this Snakefile is (i.e. `avian-flu/`)
"""
path = original_path.format(**wildcards) if wildcards else original_path

if re.search(r'\{.+\}', path):
if wildcards:
print(f"The call to `resolve_config_path({original_path!r}, wildcards)` still includes wildcard-like placeholders afrer resolving: {path!r}.", file=sys.stderr)
else:
print(f"The call to `resolve_config_path({original_path!r})` includes unresolved wildcards - please include the wildcards as the second argument to `resolve_config_path`.", file=sys.stderr)
exit(2)

if not os.path.exists(path):
# Check if the path exists relative to the basedir. This catches things like "config/…"
# as well as "clade-labeling/h5n1-clades.tsv"
basepath = os.path.join(workflow.basedir, path)
if not isinstance(path, str):
raise InvalidConfigError(f"Config path provided to resolve_config_path must be a string. Provided value: {str(path)}")

def resolve(wildcards):
try:
path_expanded = expand(path, **wildcards)[0]
except snakemake.exceptions.WildcardError as e:
# str(e) looks like "No values given for wildcard 'subtypes'."
raise InvalidConfigError(f"resolve_config_path called with path {path!r} however {str(e)}")

# check if the path exists relative to the working directory
if os.path.exists(path_expanded):
# return an absolute path so that we can use it in different contexts within snakemake
# e.g. snakemake interprets paths in rules as relative to the working directory but
# the "include" directive treats them as relative to the calling snakefile
return os.path.join(os.getcwd(), path_expanded)

# Check if the path exists relative to the subdir where the entry snakefile is
# (e.g. avian-flu/gisaid). If you want to use further subdirectories (e.g. avian-flu/gisaid/config/x.tsv)
# you're expected to supply the 'config/x.tsv' as the value in the config YAML
# NOTE: this means analysis directory overrides have to use that same 'config/x.tsv' structure, but
# given the different directories avian-flu uses that's acceptable. In other words, if we standardised
# avian-flu then we could add subdirectories to the search order here
basepath = os.path.join(workflow.basedir, path_expanded)
if os.path.exists(basepath):
return basepath

print(f"Unable to resolve the path {path!r} either within the working directory or within {workflow.basedir!r}", file=sys.stderr)
exit(2)

return path
# Check if the path exists relative to where _this_ snakefile is, i.e. relative to `avian-flu/`.
if workflow.basedir != CURRENT_BASEDIR:
basepath = os.path.join(CURRENT_BASEDIR, path_expanded)
if os.path.exists(basepath):
return basepath

raise InvalidConfigError(f"Unable to resolve the config-provided path {path!r}, expanded to {path_expanded!r} after filling in wildcards. "
f"The following directories were searched:\n"
f"\t1. {os.path.abspath(os.curdir)} (current working directory)\n"
f"\t2. {workflow.basedir} (where the entry snakefile is)\n"
f"\t3. {CURRENT_BASEDIR} (where the main avian-flu snakefile is)\n")
return resolve

# The config option `same_strains_per_segment=True'` (e.g. supplied to snakemake via --config command line argument)
# will change the behaviour of the workflow to use the same strains for each segment. This is achieved via these steps:
Expand All @@ -95,6 +88,14 @@ S3_SRC = config.get('s3_src', {})
LOCAL_INGEST = config.get('local_ingest', None)

def sanity_check_config():
if not len(config.keys()):
print("-"*80 + "\nNo config loaded!", file=sys.stderr)
print("Avian-flu is indented to be run from the snakefile inside a subdir "
"(e.g. gisaid/Snakefile) which will pick up the default configfile for that workflow. "
"Alternatively you can pass in the config via `--configfile`", file=sys.stderr)
print("-"*80, file=sys.stderr)
raise InvalidConfigError("No config")

assert LOCAL_INGEST or S3_SRC, "The config must define either 's3_src' or 'local_ingest'"
# NOTE: we could relax the following exclusivity of S3_SRC and LOCAL_INGEST
# if we want to use `--config local_ingest=gisaid` overrides.
Expand Down Expand Up @@ -128,6 +129,7 @@ rule all:


# This must be after the `all` rule above since it depends on its inputs
# Note this is relative to the `workflow.current_basedir`
include: "rules/deploy.smk"

rule test_target:
Expand Down Expand Up @@ -289,11 +291,11 @@ rule add_h5_clade:
message: "Adding in a column for h5 clade numbering"
input:
metadata = "results/{subtype}/metadata.tsv",
clades_file = lambda w: resolve_config_path(files.clades_file, w)
clades_file = resolve_config_path(files.clades_file)
output:
metadata= "results/{subtype}/metadata-with-clade.tsv"
params:
script = os.path.join(workflow.basedir, "clade-labeling/add-clades.py")
script = os.path.join(workflow.current_basedir, "clade-labeling/add-clades.py")
shell:
r"""
python {params.script} \
Expand Down Expand Up @@ -349,8 +351,8 @@ rule filter:
input:
sequences = "results/{subtype}/{segment}/sequences.fasta",
metadata = metadata_by_wildcards,
exclude = lambda w: resolve_config_path(files.dropped_strains, w),
include = lambda w: resolve_config_path(files.include_strains, w),
exclude = resolve_config_path(files.dropped_strains),
include = resolve_config_path(files.include_strains),
strains = lambda w: f"results/{w.subtype}/ha/{w.time}/filtered.txt" if (SAME_STRAINS and w.segment!='ha') else [],
output:
sequences = "results/{subtype}/{segment}/{time}/filtered.fasta",
Expand Down Expand Up @@ -381,7 +383,7 @@ rule align:
"""
input:
sequences = rules.filter.output.sequences,
reference = lambda w: resolve_config_path(files.reference, w)
reference = resolve_config_path(files.reference)
output:
alignment = "results/{subtype}/{segment}/{time}/aligned.fasta"
wildcard_constraints:
Expand Down Expand Up @@ -482,7 +484,7 @@ def ancestral_root_seq(wildcards):
else get_config('ancestral', 'root_seq', wildcards)
if not root_seq:
return ""
return f"--root-sequence {resolve_config_path(root_seq, wildcards)}"
return f"--root-sequence {resolve_config_path(root_seq)(wildcards)}"

rule ancestral:
message: "Reconstructing ancestral sequences and mutations"
Expand Down Expand Up @@ -510,7 +512,7 @@ rule translate:
input:
tree = refined_tree,
node_data = rules.ancestral.output.node_data,
reference = lambda w: resolve_config_path(config['genome_reference'] if w.segment=='genome' else files.reference, w)
reference = lambda w: resolve_config_path(config['genome_reference'] if w.segment=='genome' else files.reference)(w)
output:
node_data = "results/{subtype}/{segment}/{time}/aa-muts.json"
shell:
Expand Down Expand Up @@ -570,7 +572,7 @@ rule cleavage_site:
cleavage_site_annotations = "results/{subtype}/ha/{time}/cleavage-site.json",
cleavage_site_sequences = "results/{subtype}/ha/{time}/cleavage-site-sequences.json"
params:
script = os.path.join(workflow.basedir, "scripts/annotate-ha-cleavage-site.py")
script = os.path.join(workflow.current_basedir, "scripts/annotate-ha-cleavage-site.py")
shell:
"""
python {params.script} \
Expand Down Expand Up @@ -613,7 +615,7 @@ rule auspice_config:
If we implement config overlays in augur this rule will become unnecessary.
"""
input:
auspice_config = lambda w: resolve_config_path(files.auspice_config, w),
auspice_config = resolve_config_path(files.auspice_config),
output:
auspice_config = "results/{subtype}/{segment}/{time}/auspice-config.json",
run:
Expand All @@ -638,7 +640,7 @@ rule auspice_config:

rule colors:
input:
colors = lambda w: resolve_config_path(files.colors, w),
colors = resolve_config_path(files.colors),
output:
colors = "results/{subtype}/{segment}/{time}/colors.tsv",
shell:
Expand All @@ -656,9 +658,9 @@ rule export:
metadata = rules.filter.output.metadata,
node_data = export_node_data_files,
colors = "results/{subtype}/{segment}/{time}/colors.tsv",
lat_longs = lambda w: resolve_config_path(files.lat_longs, w),
lat_longs = resolve_config_path(files.lat_longs),
auspice_config = rules.auspice_config.output.auspice_config,
description = lambda w: resolve_config_path(files.description, w),
description = resolve_config_path(files.description),
output:
auspice_json = "results/{subtype}/{segment}/{time}/auspice-dataset.json"
params:
Expand Down Expand Up @@ -728,4 +730,4 @@ rule clean:


for rule_file in config.get('custom_rules', []):
include: rule_file
include: resolve_config_path(rule_file)({})
4 changes: 4 additions & 0 deletions gisaid/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
include: "../Snakefile"

rule _all:
input: rules.all.input
File renamed without changes.
4 changes: 4 additions & 0 deletions h5n1-cattle-outbreak/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
include: "../Snakefile"

rule _all:
input: rules.all.input
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# NOTE: The h5n1-cattle-outbreak builds use a specific rule-file (specified within this config)
# and that rule file may have some config-like parameters within it.
# If you are extending this workflow via a --configfile overlay you and you want to add
# your own custom rule you will need to include "rules/cattle-flu.smk" as an element in the
# list in your overlay, as lists are not merged when configs are combined.
custom_rules:
- "rules/cattle-flu.smk"

Expand Down Expand Up @@ -36,6 +39,7 @@ target_sequences_per_tree: 10_000


#### Config files ####

reference: config/h5n1/reference_h5n1_{segment}.gb # use H5N1 references
genome_reference: config/{subtype}/h5_cattle_genome_root.gb
auspice_config: config/{subtype}/auspice_config_{subtype}.json
Expand Down
12 changes: 6 additions & 6 deletions rules/cattle-flu.smk
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ rule filter_segments_for_genome:
input:
sequences = "results/{subtype}/{genome_seg}/sequences.fasta",
metadata = "results/{subtype}/metadata-with-clade.tsv", # TODO: use a function here instead of hardcoding
include = lambda w: resolve_config_path(config['include_strains'], w),
exclude = lambda w: resolve_config_path(config['dropped_strains'], w),
include = resolve_config_path(config['include_strains']),
exclude = resolve_config_path(config['dropped_strains']),
output:
sequences = "results/{subtype}/{segment}/{time}/filtered_{genome_seg}.fasta"
params:
Expand Down Expand Up @@ -40,7 +40,7 @@ rule align_segments_for_genome:
sequences = "results/{subtype}/{segment}/{time}/filtered_{genome_seg}.fasta",
# Use the H5N1 reference sequences for alignment
reference = lambda w: [
resolve_config_path(expanded, w)
resolve_config_path(expanded)(w)
for expanded in
expand(config['reference'], subtype='h5n1', segment=w.genome_seg)
]
Expand Down Expand Up @@ -76,7 +76,7 @@ rule join_segments:
segment = 'genome',
time = 'default',
params:
script = os.path.join(workflow.basedir, "scripts/join-segments.py")
script = os.path.join(workflow.current_basedir, "../scripts/join-segments.py")
shell:
"""
python {params.script} \
Expand Down Expand Up @@ -147,7 +147,7 @@ rule prune_tree:
subtype="h5n1-cattle-outbreak",
time="default",
params:
script = os.path.join(workflow.basedir, "scripts/restrict-via-common-ancestor.py")
script = os.path.join(workflow.current_basedir, "../scripts/restrict-via-common-ancestor.py")
shell:
r"""
python3 {params.script} \
Expand All @@ -170,7 +170,7 @@ rule colors_genome:
colors = "results/{subtype}/{segment}/{time}/colors.tsv",
params:
duplications = "division=division_metadata",
script = os.path.join(workflow.basedir, "scripts/assign-colors.py")
script = os.path.join(workflow.current_basedir, "../scripts/assign-colors.py")
wildcard_constraints:
subtype="h5n1-cattle-outbreak",
time="default",
Expand Down

0 comments on commit a519284

Please sign in to comment.