Skip to content

Commit

Permalink
wip! Introduce workdir portability to all workflows
Browse files Browse the repository at this point in the history
XXX FIXME: rationale, relationship to "workflows as programs"
XXX FIXME: alternatives considered (but declined) for path-in-config-value handling
XXX FIXME: document layout/structure of workdir
  • Loading branch information
tsibley committed Oct 30, 2024
1 parent 790103f commit 6584e68
Show file tree
Hide file tree
Showing 16 changed files with 119 additions and 56 deletions.
20 changes: 17 additions & 3 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,17 @@
This is the main ingest Snakefile that orchestrates the full ingest workflow
and defines its default outputs.
"""
# Utility functions shared across all workflows.
include: "../shared/functions.smk"


# Use default configuration values. Extend with Snakemake's --configfile/--config options.
configfile: os.path.join(workflow.basedir, "defaults/config.yaml")

# Use custom configuration from analysis directory (i.e. working dir), if any.
if os.path.exists("config.yaml"):
configfile: "config.yaml"

# Use default configuration values. Override with Snakemake's --configfile/--config options.
configfile: "defaults/config.yaml"

# This is the default rule that Snakemake will run when there are no specified targets.
# The default output of the ingest workflow is usually the curated metadata and sequences.
Expand Down Expand Up @@ -42,4 +50,10 @@ include: "rules/nextclade.smk"
if "custom_rules" in config:
for rule_file in config["custom_rules"]:

include: rule_file
# Relative custom rule paths in the config are relative to the analysis
# directory (i.e. the current working directory, or workdir, usually
# given by --directory), but the "include" directive treats relative
# paths as relative to the workflow (e.g. workflow.current_basedir).
# Convert to an absolute path based on the analysis/current directory
# to avoid this mismatch of expectations.
include: os.path.join(os.getcwd(), rule_file)
12 changes: 7 additions & 5 deletions ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,10 @@ curate:
# For the Nextstrain team, this is currently
# 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv'
geolocation_rules_url: "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv"
# The path to the local geolocation rules within the pathogen repo
# The path should be relative to the ingest directory.
local_geolocation_rules: "defaults/geolocation_rules.tsv"
# The path to the local geolocation rules for this pathogen.
# The path should be relative to the working directory (e.g. --directory).
# If the path doesn't exist in the working directory, the file in the workflow's defaults/ directory is used instead (if it exists).
local_geolocation_rules: "geolocation_rules.tsv"
# List of field names to change where the key is the original field name and the value is the new field name
# The original field names should match the ncbi_datasets_fields provided above.
# This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
Expand Down Expand Up @@ -91,8 +92,9 @@ curate:
# Name to use for the generated abbreviated authors field
abbr_authors_field: "abbr_authors"
# Path to the manual annotations file
# The path should be relative to the ingest directory
annotations: "defaults/annotations.tsv"
# The path should be relative to the working directory (e.g. --directory).
# If the path doesn't exist in the working directory, the file in the workflow's defaults/ directory is used instead (if it exists).
annotations: "annotations.tsv"
# The ID field in the metadata to use to merge the manual annotations
annotations_id: "accession"
# The ID field in the metadata to use as the sequence id in the output FASTA file
Expand Down
18 changes: 9 additions & 9 deletions ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ rule fetch_general_geolocation_rules:
rule concat_geolocation_rules:
input:
general_geolocation_rules="data/general-geolocation-rules.tsv",
local_geolocation_rules=config["curate"]["local_geolocation_rules"],
local_geolocation_rules=resolve_config_path(config["curate"]["local_geolocation_rules"]),
output:
all_geolocation_rules="data/all-geolocation-rules.tsv",
shell:
Expand Down Expand Up @@ -59,7 +59,7 @@ rule curate:
sequences_ndjson="data/ncbi.ndjson",
# Change the geolocation_rules input path if you are removing the above two rules
all_geolocation_rules="data/all-geolocation-rules.tsv",
annotations=config["curate"]["annotations"],
annotations=resolve_config_path(config["curate"]["annotations"]),
output:
metadata="data/all_metadata.tsv",
sequences="results/sequences.fasta",
Expand All @@ -86,28 +86,28 @@ rule curate:
shell:
"""
(cat {input.sequences_ndjson} \
| ./vendored/transform-field-names \
| {workflow.basedir}/vendored/transform-field-names \
--field-map {params.field_map} \
| augur curate normalize-strings \
| ./vendored/transform-strain-names \
| {workflow.basedir}/vendored/transform-strain-names \
--strain-regex {params.strain_regex} \
--backup-fields {params.strain_backup_fields} \
| augur curate format-dates \
--date-fields {params.date_fields} \
--expected-date-formats {params.expected_date_formats} \
| ./vendored/transform-genbank-location \
| {workflow.basedir}/vendored/transform-genbank-location \
| augur curate titlecase \
--titlecase-fields {params.titlecase_fields} \
--articles {params.articles} \
--abbreviations {params.abbreviations} \
| ./vendored/transform-authors \
| {workflow.basedir}/vendored/transform-authors \
--authors-field {params.authors_field} \
--default-value {params.authors_default_value} \
--abbr-authors-field {params.abbr_authors_field} \
| ./vendored/apply-geolocation-rules \
| {workflow.basedir}/vendored/apply-geolocation-rules \
--geolocation-rules {input.all_geolocation_rules} \
| ./bin/parse-measles-genotype-names.py --genotype-field {params.genotype_field} \
| ./vendored/merge-user-metadata \
| {workflow.basedir}/bin/parse-measles-genotype-names.py --genotype-field {params.genotype_field} \
| {workflow.basedir}/vendored/merge-user-metadata \
--annotations {input.annotations} \
--id-field {params.annotations_id} \
| augur curate passthru \
Expand Down
9 changes: 7 additions & 2 deletions nextclade/Snakefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
configfile: "defaults/config.yaml"
include: "../shared/functions.smk"

configfile: os.path.join(workflow.basedir, "defaults/config.yaml")

if os.path.exists("config.yaml"):
configfile: "config.yaml"

rule all:
input:
Expand All @@ -13,7 +18,7 @@ include: "rules/export.smk"
if "custom_rules" in config:
for rule_file in config["custom_rules"]:

include: rule_file
include: os.path.join(os.getcwd(), rule_file)

rule clean:
"""Removing directories: {params}"""
Expand Down
14 changes: 7 additions & 7 deletions nextclade/defaults/config.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
strain_id_field: "accession"
files:
exclude: "defaults/dropped_strains.txt"
include: "defaults/include_strains.txt"
reference_N450: "defaults/measles_reference_N450.gb"
reference_N450_fasta: "defaults/measles_reference_N450.fasta"
clades: "defaults/clades.tsv"
colors: "defaults/colors.tsv"
auspice_config: "defaults/auspice_config.json"
exclude: "dropped_strains.txt"
include: "include_strains.txt"
reference_N450: "measles_reference_N450.gb"
reference_N450_fasta: "measles_reference_N450.fasta"
clades: "clades.tsv"
colors: "colors.tsv"
auspice_config: "auspice_config.json"
align_and_extract_N450:
min_length: 400
min_seed_cover: 0.01
Expand Down
6 changes: 3 additions & 3 deletions nextclade/rules/annotate_phylogeny.smk
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ rule ancestral:
node_data = "results/nt_muts.json"
params:
inference = config["ancestral"]["inference"],
reference_fasta = config["files"]["reference_N450_fasta"]
reference_fasta = resolve_config_path(config["files"]["reference_N450_fasta"])
shell:
"""
augur ancestral \
Expand All @@ -30,7 +30,7 @@ rule translate:
input:
tree = "results/tree.nwk",
node_data = "results/nt_muts.json",
reference = config["files"]["reference_N450"]
reference = resolve_config_path(config["files"]["reference_N450"])
output:
node_data = "results/aa_muts.json"
shell:
Expand All @@ -47,7 +47,7 @@ rule clades:
tree = "results/tree.nwk",
nt_muts = "results/nt_muts.json",
aa_muts = "results/aa_muts.json",
clade_defs = config["files"]["clades"]
clade_defs = resolve_config_path(config["files"]["clades"])
output:
clades = "results/clades.json"
shell:
Expand Down
4 changes: 2 additions & 2 deletions nextclade/rules/export.smk
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ rule export:
clades = "results/clades.json",
nt_muts = "results/nt_muts.json",
aa_muts = "results/aa_muts.json",
colors = config["files"]["colors"],
auspice_config = config["files"]["auspice_config"]
colors = resolve_config_path(config["files"]["colors"]),
auspice_config = resolve_config_path(config["files"]["auspice_config"])
output:
auspice_json = "auspice/measles.json"
params:
Expand Down
6 changes: 3 additions & 3 deletions nextclade/rules/prepare_sequences.smk
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ rule decompress:
rule align_and_extract_N450:
input:
sequences = "data/sequences.fasta",
reference = config["files"]["reference_N450_fasta"]
reference = resolve_config_path(config["files"]["reference_N450_fasta"])
output:
sequences = "results/sequences_N450.fasta"
params:
Expand All @@ -57,8 +57,8 @@ rule filter:
input:
sequences = "results/sequences_N450.fasta",
metadata = "data/metadata.tsv",
exclude = config["files"]["exclude"],
include = config["files"]["include"]
exclude = resolve_config_path(config["files"]["exclude"]),
include = resolve_config_path(config["files"]["include"])
output:
sequences = "results/aligned.fasta"
params:
Expand Down
9 changes: 7 additions & 2 deletions phylogenetic/Snakefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
genes = ['N450', 'genome']

configfile: "defaults/config.yaml"
include: "../shared/functions.smk"

configfile: os.path.join(workflow.basedir, "defaults/config.yaml")

if os.path.exists("config.yaml"):
configfile: "config.yaml"

rule all:
input:
Expand All @@ -17,7 +22,7 @@ include: "rules/export.smk"
if "custom_rules" in config:
for rule_file in config["custom_rules"]:

include: rule_file
include: os.path.join(os.getcwd(), rule_file)

rule clean:
"""Removing directories: {params}"""
Expand Down
20 changes: 10 additions & 10 deletions phylogenetic/defaults/config.yaml
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
strain_id_field: "accession"
files:
exclude: "defaults/dropped_strains.txt"
include_genome: "defaults/include_strains_genome.txt"
include_N450: "defaults/include_strains_N450.txt"
reference: "defaults/measles_reference.gb"
reference_N450: "defaults/measles_reference_N450.gb"
reference_N450_fasta: "defaults/measles_reference_N450.fasta"
colors: "defaults/colors.tsv"
auspice_config: "defaults/auspice_config.json"
auspice_config_N450: "defaults/auspice_config_N450.json"
description: "defaults/description.md"
exclude: "dropped_strains.txt"
include_genome: "include_strains_genome.txt"
include_N450: "include_strains_N450.txt"
reference: "measles_reference.gb"
reference_N450: "measles_reference_N450.gb"
reference_N450_fasta: "measles_reference_N450.fasta"
colors: "colors.tsv"
auspice_config: "auspice_config.json"
auspice_config_N450: "auspice_config_N450.json"
description: "description.md"
filter:
group_by: "country year"
sequences_per_group: 20
Expand Down
2 changes: 1 addition & 1 deletion phylogenetic/rules/annotate_phylogeny.smk
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ rule translate:
input:
tree = "results/{gene}/tree.nwk",
node_data = "results/{gene}/nt_muts.json",
reference = lambda wildcard: config["files"]["reference" if wildcard.gene == "genome" else f"reference_{wildcard.gene}"]
reference = lambda wildcard: resolve_config_path(config["files"]["reference" if wildcard.gene == "genome" else f"reference_{wildcard.gene}"])
output:
node_data = "results/{gene}/aa_muts.json"
shell:
Expand Down
6 changes: 3 additions & 3 deletions phylogenetic/rules/export.smk
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ rule export:
branch_lengths = "results/{gene}/branch_lengths.json",
nt_muts = "results/{gene}/nt_muts.json",
aa_muts = "results/{gene}/aa_muts.json",
colors = config["files"]["colors"],
auspice_config = lambda wildcard: config["files"]["auspice_config" if wildcard.gene == "genome" else f"auspice_config_{wildcard.gene}"],
description=config["files"]["description"]
colors = resolve_config_path(config["files"]["colors"]),
auspice_config = lambda wildcard: resolve_config_path(config["files"]["auspice_config" if wildcard.gene == "genome" else f"auspice_config_{wildcard.gene}"]),
description=resolve_config_path(config["files"]["description"])
output:
auspice_json = "auspice/measles_{gene}.json"
params:
Expand Down
6 changes: 3 additions & 3 deletions phylogenetic/rules/prepare_sequences.smk
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ rule filter:
input:
sequences = "data/sequences.fasta",
metadata = "data/metadata.tsv",
exclude = config["files"]["exclude"],
include = config["files"]["include_genome"]
exclude = resolve_config_path(config["files"]["exclude"]),
include = resolve_config_path(config["files"]["include_genome"])
output:
sequences = "results/genome/filtered.fasta"
params:
Expand Down Expand Up @@ -74,7 +74,7 @@ rule align:
"""
input:
sequences = "results/genome/filtered.fasta",
reference = config["files"]["reference"]
reference = resolve_config_path(config["files"]["reference"])
output:
alignment = "results/genome/aligned.fasta"
shell:
Expand Down
6 changes: 3 additions & 3 deletions phylogenetic/rules/prepare_sequences_N450.smk
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ See Augur's usage docs for these commands for more details.
rule align_and_extract_N450:
input:
sequences = "data/sequences.fasta",
reference = config["files"]["reference_N450_fasta"]
reference = resolve_config_path(config["files"]["reference_N450_fasta"])
output:
sequences = "results/N450/sequences.fasta"
params:
Expand All @@ -34,8 +34,8 @@ rule filter_N450:
input:
sequences = "results/N450/sequences.fasta",
metadata = "data/metadata.tsv",
exclude = config["files"]["exclude"],
include = config["files"]["include_N450"]
exclude = resolve_config_path(config["files"]["exclude"]),
include = resolve_config_path(config["files"]["include_N450"])
output:
sequences = "results/N450/aligned.fasta"
params:
Expand Down
6 changes: 6 additions & 0 deletions shared/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Shared

> **Warning**
> Please be aware of the multiple workflows that will be affected when editing files in this directory!
This directory holds files that are shared across multiple workflows.
31 changes: 31 additions & 0 deletions shared/functions.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import os.path

def resolve_config_path(path):
    """
    Resolve a relative *path* given in a configuration value.

    If *path* exists relative to the current working directory (the
    workflow's analysis directory, or workdir, usually given by
    ``--directory``/``-d``), it is returned unchanged.  Otherwise it is looked
    up in the workflow's ``defaults/`` directory (i.e.
    ``os.path.join(workflow.basedir, "defaults", path)``) and that location is
    returned if it exists.  If neither location exists, *path* is returned
    unchanged so that the caller reports the missing file under the name the
    user configured.

    This behaviour allows a default configuration value to point to a default
    auxiliary file while also letting the file used be overridden either by
    setting an alternate file path in the configuration or by creating a file
    with the conventional name in the workflow's analysis directory.

    An empty *path* is returned as-is and never resolved against
    ``defaults/``.

    ``workflow`` is read from the module globals; it is not defined in this
    file and is expected to be in scope wherever this file is included.
    """
    # An empty value names no file.  Without this guard it would fall through
    # to the defaults lookup, where joining "" onto the defaults directory
    # yields the directory itself (which exists) and would be returned.
    if path == "":
        return path

    # A file present in the analysis directory always wins.
    if os.path.exists(path):
        return path

    # Special-case defaults/… for backwards compatibility with older
    # configs.  We could achieve the same behaviour with a symlink
    # (defaults/defaults → .) but that seems less clear.
    if path.startswith("defaults/"):
        defaults_path = os.path.join(workflow.basedir, path)
    else:
        defaults_path = os.path.join(workflow.basedir, "defaults", path)

    # Fall back to the original value when no default exists either.
    return defaults_path if os.path.exists(defaults_path) else path

0 comments on commit 6584e68

Please sign in to comment.