Skip to content

Commit

Permalink
WIP - allow running workflow from outside of repo
Browse files Browse the repository at this point in the history
  • Loading branch information
jameshadfield committed Nov 20, 2024
1 parent da8a4f9 commit e6b070d
Show file tree
Hide file tree
Showing 2 changed files with 115 additions and 21 deletions.
110 changes: 97 additions & 13 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,82 @@ wildcard_constraints:
# defined before extra rules `include`d as they reference this constant
SEGMENTS = ["pb2", "pb1", "pa", "ha","np", "na", "mp", "ns"]

# ----------------------------------------------------------------------------
# Allow this to work from a separate workdir by using a config in that workdir
# which extends one of our base configs
# ----------------------------------------------------------------------------
if os.path.exists("config.yaml"):
configfile: "config.yaml"
# See commentary below
# print("This doesn't work as expected! See commentary in Snakefile", file=sys.stderr)
# exit(2)

if config.get('extends', False):
extend_path = os.path.join(workflow.basedir, "config", config['extends'])
if not os.path.isfile(extend_path):
sys.exit(f"Your config tried to extend {config['extends']!r} but this doesn't exist. It must be relative to {os.path.join(workflow.basedir, 'config')}")
configfile: extend_path

# NOTE:
# In the situation where we're running outside of the repo, and we have a custom config YAML
# such as `foo.yaml`:
# extends: h5n1-cattle-outbreak.yaml
# segments: ['pb2']
# If we run with `--configfile foo.yaml` then the merging behaviour is strange (to me!)
# We've clearly parsed the --configfile, as we have config['extends']="h5n1-cattle-outbreak.yaml",
# and we do merge in all the config values of `h5n1-cattle-outbreak.yaml` (via the above code)
# so I expected we'd therefore have config['segments']=['genome', 'pb2', 'pb1', ...]
# as defined in 'h5n1-cattle-outbreak.yaml', however we end up with only config['segments']=['pb2'].
# So it seems like the `--configfile` definitions are being re-applied a second time?!?
#
# This _is not the case_ when we use the `os.path.exists("config.yaml")` approach,
# which is why it's not going to work without the following additional update_config
# step (or something else?)

if os.path.exists("config.yaml"):
# Following <https://github.com/snakemake/snakemake/blob/76d53290a003891c5ee41f81e8eb4821c406255d/snakemake/common/configfile.py#L7-L33>
import yte
with open("config.yaml", encoding='utf-8') as f:
overwrite_config = yte.process_yaml(f, require_use_yte=True)
snakemake.utils.update_config(config, overwrite_config)

from pprint import pp; pp(config, stream=sys.stderr) # TODO XXX remove


def resolve_config_path(original_path, wildcards=None):
"""
Resolve a relative *path* given in a configuration value.
Resolves *path* as relative to the workflow's ``config/`` directory (i.e.
``os.path.join(workflow.basedir, "config", path)``) if it doesn't exist
in the workflow's analysis directory (i.e. the current working
directory, or workdir, usually given by ``--directory`` (``-d``)).
This behaviour allows a default configuration value to point to a default
auxiliary file while also letting the file used be overridden either by
setting an alternate file path in the configuration or by creating a file
with the conventional name in the workflow's analysis directory.
"""
path = original_path.format(**wildcards) if wildcards else original_path

if re.search(r'\{.+\}', path):
if wildcards:
print(f"The call to `resolve_config_path({original_path!r}, wildcards)` still includes wildcard-like placeholders afrer resolving: {path!r}.", file=sys.stderr)
else:
print(f"The call to `resolve_config_path({original_path!r})` includes unresolved wildcards - please include the wildcards as the second argument to `resolve_config_path`.", file=sys.stderr)
exit(2)

if not os.path.exists(path):
# Check if the path exists relative to the basedir. This catches things like "config/…"
# as well as "clade-labeling/h5n1-clades.tsv"
basepath = os.path.join(workflow.basedir, path)
if os.path.exists(basepath):
return basepath

print(f"Unable to resolve the path {path!r} either within the working directory or within {workflow.basedir!r}", file=sys.stderr)
exit(2)

return path


# The config option `same_strains_per_segment=True'` (e.g. supplied to snakemake via --config command line argument)
# will change the behaviour of the workflow to use the same strains for each segment. This is achieved via these steps:
# (1) Filter the HA segment as normal plus filter to those strains with 8 segments
Expand Down Expand Up @@ -56,6 +132,8 @@ rule test_target:
"""
input: "auspice/avian-flu_h5n1_ha_all-time.json"

# TODO - I find this indirection more confusing than helpful and I'd rather just
# specify `config['colors']` in the rules which use it (e.g.)
rule files:
params:
dropped_strains = config['dropped_strains'],
Expand Down Expand Up @@ -207,12 +285,14 @@ rule add_h5_clade:
message: "Adding in a column for h5 clade numbering"
input:
metadata = "results/{subtype}/metadata.tsv",
clades_file = files.clades_file
clades_file = lambda w: resolve_config_path(files.clades_file, w)
output:
metadata= "results/{subtype}/metadata-with-clade.tsv"
params:
script = os.path.join(workflow.basedir, "clade-labeling/add-clades.py")
shell:
"""
python clade-labeling/add-clades.py \
r"""
python {params.script} \
--metadata {input.metadata} \
--output {output.metadata} \
--clades {input.clades_file}
Expand Down Expand Up @@ -265,8 +345,8 @@ rule filter:
input:
sequences = "results/{subtype}/{segment}/sequences.fasta",
metadata = metadata_by_wildcards,
exclude = files.dropped_strains,
include = files.include_strains,
exclude = lambda w: resolve_config_path(files.dropped_strains, w),
include = lambda w: resolve_config_path(files.include_strains, w),
strains = lambda w: f"results/{w.subtype}/ha/{w.time}/filtered.txt" if (SAME_STRAINS and w.segment!='ha') else [],
output:
sequences = "results/{subtype}/{segment}/{time}/filtered.fasta",
Expand Down Expand Up @@ -297,7 +377,7 @@ rule align:
"""
input:
sequences = rules.filter.output.sequences,
reference = files.reference
reference = lambda w: resolve_config_path(files.reference, w)
output:
alignment = "results/{subtype}/{segment}/{time}/aligned.fasta"
wildcard_constraints:
Expand Down Expand Up @@ -396,7 +476,9 @@ def ancestral_root_seq(wildcards):
root_seq = get_config('ancestral', 'genome_root_seq', wildcards) \
if wildcards.segment=='genome' \
else get_config('ancestral', 'root_seq', wildcards)
return f"--root-sequence {root_seq}" if root_seq else ""
if not root_seq:
return ""
return f"--root-sequence {resolve_config_path(root_seq, wildcards)}"

rule ancestral:
message: "Reconstructing ancestral sequences and mutations"
Expand Down Expand Up @@ -424,7 +506,7 @@ rule translate:
input:
tree = refined_tree,
node_data = rules.ancestral.output.node_data,
reference = lambda w: config['genome_reference'] if w.segment=='genome' else files.reference
reference = lambda w: resolve_config_path(config['genome_reference'] if w.segment=='genome' else files.reference, w)
output:
node_data = "results/{subtype}/{segment}/{time}/aa-muts.json"
shell:
Expand Down Expand Up @@ -483,9 +565,11 @@ rule cleavage_site:
output:
cleavage_site_annotations = "results/{subtype}/ha/{time}/cleavage-site.json",
cleavage_site_sequences = "results/{subtype}/ha/{time}/cleavage-site-sequences.json"
params:
script = os.path.join(workflow.basedir, "scripts/annotate-ha-cleavage-site.py")
shell:
"""
python scripts/annotate-ha-cleavage-site.py \
python {params.script} \
--alignment {input.alignment} \
--furin_site_motif {output.cleavage_site_annotations} \
--cleavage_site_sequence {output.cleavage_site_sequences}
Expand Down Expand Up @@ -525,7 +609,7 @@ rule auspice_config:
If we implement config overlays in augur this rule will become unnecessary.
"""
input:
auspice_config = files.auspice_config,
auspice_config = lambda w: resolve_config_path(files.auspice_config, w),
output:
auspice_config = "results/{subtype}/{segment}/{time}/auspice-config.json",
run:
Expand All @@ -550,7 +634,7 @@ rule auspice_config:

rule colors:
input:
colors = files.colors,
colors = lambda w: resolve_config_path(files.colors, w),
output:
colors = "results/{subtype}/{segment}/{time}/colors.tsv",
shell:
Expand All @@ -568,9 +652,9 @@ rule export:
metadata = rules.filter.output.metadata,
node_data = export_node_data_files,
colors = "results/{subtype}/{segment}/{time}/colors.tsv",
lat_longs = files.lat_longs,
lat_longs = lambda w: resolve_config_path(files.lat_longs, w),
auspice_config = rules.auspice_config.output.auspice_config,
description = files.description
description = lambda w: resolve_config_path(files.description, w),
output:
auspice_json = "results/{subtype}/{segment}/{time}/auspice-dataset.json"
params:
Expand Down
26 changes: 18 additions & 8 deletions rules/cattle-flu.smk
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ rule filter_segments_for_genome:
input:
sequences = "results/{subtype}/{genome_seg}/sequences.fasta",
metadata = "results/{subtype}/metadata-with-clade.tsv", # TODO: use a function here instead of hardcoding
include = config['include_strains'],
exclude = config['dropped_strains'],
include = lambda w: resolve_config_path(config['include_strains'], w),
exclude = lambda w: resolve_config_path(config['dropped_strains'], w),
output:
sequences = "results/{subtype}/{segment}/{time}/filtered_{genome_seg}.fasta"
params:
Expand Down Expand Up @@ -39,7 +39,11 @@ rule align_segments_for_genome:
input:
sequences = "results/{subtype}/{segment}/{time}/filtered_{genome_seg}.fasta",
# Use the H5N1 reference sequences for alignment
reference = lambda w: expand(config['reference'], subtype='h5n1', segment=w.genome_seg)
reference = lambda w: [
resolve_config_path(expanded, w)
for expanded in
expand(config['reference'], subtype='h5n1', segment=w.genome_seg)
]
output:
alignment = "results/{subtype}/{segment}/{time}/aligned_{genome_seg}.fasta"
wildcard_constraints:
Expand Down Expand Up @@ -71,9 +75,11 @@ rule join_segments:
subtype = 'h5n1-cattle-outbreak',
segment = 'genome',
time = 'default',
params:
script = os.path.join(workflow.basedir, "scripts/join-segments.py")
shell:
"""
python scripts/join-segments.py \
python {params.script} \
--segments {input.alignment} \
--output {output.alignment}
"""
Expand Down Expand Up @@ -140,9 +146,11 @@ rule prune_tree:
wildcard_constraints:
subtype="h5n1-cattle-outbreak",
time="default",
params:
script = os.path.join(workflow.basedir, "scripts/restrict-via-common-ancestor.py")
shell:
"""
python3 scripts/restrict-via-common-ancestor.py \
r"""
python3 {params.script} \
--tree {input.tree} \
--strains {input.strains} \
--output-tree {output.tree} \
Expand All @@ -165,10 +173,12 @@ rule colors_genome:
wildcard_constraints:
subtype="h5n1-cattle-outbreak",
time="default",
params:
script = os.path.join(workflow.basedir, "scripts/assign-colors.py")
shell:
"""
r"""
cp {input.colors} {output.colors} && \
python3 scripts/assign-colors.py \
python3 {params.script} \ \
--metadata {input.metadata} \
--ordering {input.ordering} \
--color-schemes {input.schemes} \
Expand Down

0 comments on commit e6b070d

Please sign in to comment.