From 6584e68622dd305592b98d8630fa94e191b4f164 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Wed, 30 Oct 2024 12:50:37 -0700 Subject: [PATCH] wip! Introduce workdir portability to all workflows XXX FIXME: rationale, relationship to "workflows as programs" XXX FIXME: alternatives considered (but declined) for path-in-config-value handling XXX FIXME: document layout/structure of workdir --- ingest/Snakefile | 20 ++++++++++-- ingest/defaults/config.yaml | 12 ++++--- ingest/rules/curate.smk | 18 +++++------ nextclade/Snakefile | 9 ++++-- nextclade/defaults/config.yaml | 14 ++++----- nextclade/rules/annotate_phylogeny.smk | 6 ++-- nextclade/rules/export.smk | 4 +-- nextclade/rules/prepare_sequences.smk | 6 ++-- phylogenetic/Snakefile | 9 ++++-- phylogenetic/defaults/config.yaml | 20 ++++++------ phylogenetic/rules/annotate_phylogeny.smk | 2 +- phylogenetic/rules/export.smk | 6 ++-- phylogenetic/rules/prepare_sequences.smk | 6 ++-- phylogenetic/rules/prepare_sequences_N450.smk | 6 ++-- shared/README.md | 6 ++++ shared/functions.smk | 31 +++++++++++++++++++ 16 files changed, 119 insertions(+), 56 deletions(-) create mode 100644 shared/README.md create mode 100644 shared/functions.smk diff --git a/ingest/Snakefile b/ingest/Snakefile index c7d297d..056f1d5 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -2,9 +2,17 @@ This is the main ingest Snakefile that orchestrates the full ingest workflow and defines its default outputs. """ +# Utility functions shared across all workflows. +include: "../shared/functions.smk" + + +# Use default configuration values. Extend with Snakemake's --configfile/--config options. +configfile: os.path.join(workflow.basedir, "defaults/config.yaml") + +# Use custom configuration from analysis directory (i.e. working dir), if any. +if os.path.exists("config.yaml"): + configfile: "config.yaml" -# Use default configuration values. Override with Snakemake's --configfile/--config options. 
-configfile: "defaults/config.yaml" # This is the default rule that Snakemake will run when there are no specified targets. # The default output of the ingest workflow is usually the curated metadata and sequences. @@ -42,4 +50,10 @@ include: "rules/nextclade.smk" if "custom_rules" in config: for rule_file in config["custom_rules"]: - include: rule_file + # Relative custom rule paths in the config are relative to the analysis + # directory (i.e. the current working directory, or workdir, usually + # given by --directory), but the "include" directive treats relative + # paths as relative to the workflow (e.g. workflow.current_basedir). + # Convert to an absolute path based on the analysis/current directory + # to avoid this mismatch of expectations. + include: os.path.join(os.getcwd(), rule_file) diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index d3d2ca3..504b132 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -37,9 +37,10 @@ curate: # For the Nextstrain team, this is currently # 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv' geolocation_rules_url: "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv" - # The path to the local geolocation rules within the pathogen repo - # The path should be relative to the ingest directory. - local_geolocation_rules: "defaults/geolocation_rules.tsv" + # The path to the local geolocation rules for this pathogen. + # The path should be relative to the working directory (e.g. --directory). + # If the path doesn't exist in the working directory, the file in the workflow's defaults/ directory is used instead (if it exists). + local_geolocation_rules: "geolocation_rules.tsv" # List of field names to change where the key is the original field name and the value is the new field name # The original field names should match the ncbi_datasets_fields provided above. 
# This is the first step in the pipeline, so any references to field names in the configs below should use the new field names @@ -91,8 +92,9 @@ curate: # Name to use for the generated abbreviated authors field abbr_authors_field: "abbr_authors" # Path to the manual annotations file - # The path should be relative to the ingest directory - annotations: "defaults/annotations.tsv" + # The path should be relative to the working directory (e.g. --directory). + # If the path doesn't exist in the working directory, the file in the workflow's defaults/ directory is used instead (if it exists). + annotations: "annotations.tsv" # The ID field in the metadata to use to merge the manual annotations annotations_id: "accession" # The ID field in the metadata to use as the sequence id in the output FASTA file diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index 2bfeb9d..0c30a1d 100644 --- a/ingest/rules/curate.smk +++ b/ingest/rules/curate.smk @@ -31,7 +31,7 @@ rule fetch_general_geolocation_rules: rule concat_geolocation_rules: input: general_geolocation_rules="data/general-geolocation-rules.tsv", - local_geolocation_rules=config["curate"]["local_geolocation_rules"], + local_geolocation_rules=resolve_config_path(config["curate"]["local_geolocation_rules"]), output: all_geolocation_rules="data/all-geolocation-rules.tsv", shell: @@ -59,7 +59,7 @@ rule curate: sequences_ndjson="data/ncbi.ndjson", # Change the geolocation_rules input path if you are removing the above two rules all_geolocation_rules="data/all-geolocation-rules.tsv", - annotations=config["curate"]["annotations"], + annotations=resolve_config_path(config["curate"]["annotations"]), output: metadata="data/all_metadata.tsv", sequences="results/sequences.fasta", @@ -86,28 +86,28 @@ rule curate: shell: """ (cat {input.sequences_ndjson} \ - | ./vendored/transform-field-names \ + | {workflow.basedir}/vendored/transform-field-names \ --field-map {params.field_map} \ | augur curate normalize-strings \ - | 
./vendored/transform-strain-names \ + | {workflow.basedir}/vendored/transform-strain-names \ --strain-regex {params.strain_regex} \ --backup-fields {params.strain_backup_fields} \ | augur curate format-dates \ --date-fields {params.date_fields} \ --expected-date-formats {params.expected_date_formats} \ - | ./vendored/transform-genbank-location \ + | {workflow.basedir}/vendored/transform-genbank-location \ | augur curate titlecase \ --titlecase-fields {params.titlecase_fields} \ --articles {params.articles} \ --abbreviations {params.abbreviations} \ - | ./vendored/transform-authors \ + | {workflow.basedir}/vendored/transform-authors \ --authors-field {params.authors_field} \ --default-value {params.authors_default_value} \ --abbr-authors-field {params.abbr_authors_field} \ - | ./vendored/apply-geolocation-rules \ + | {workflow.basedir}/vendored/apply-geolocation-rules \ --geolocation-rules {input.all_geolocation_rules} \ - | ./bin/parse-measles-genotype-names.py --genotype-field {params.genotype_field} \ - | ./vendored/merge-user-metadata \ + | {workflow.basedir}/bin/parse-measles-genotype-names.py --genotype-field {params.genotype_field} \ + | {workflow.basedir}/vendored/merge-user-metadata \ --annotations {input.annotations} \ --id-field {params.annotations_id} \ | augur curate passthru \ diff --git a/nextclade/Snakefile b/nextclade/Snakefile index dd876ff..8c2f20a 100644 --- a/nextclade/Snakefile +++ b/nextclade/Snakefile @@ -1,4 +1,9 @@ -configfile: "defaults/config.yaml" +include: "../shared/functions.smk" + +configfile: os.path.join(workflow.basedir, "defaults/config.yaml") + +if os.path.exists("config.yaml"): + configfile: "config.yaml" rule all: input: @@ -13,7 +18,7 @@ include: "rules/export.smk" if "custom_rules" in config: for rule_file in config["custom_rules"]: - include: rule_file + include: os.path.join(os.getcwd(), rule_file) rule clean: """Removing directories: {params}""" diff --git a/nextclade/defaults/config.yaml b/nextclade/defaults/config.yaml 
index 41895a9..8964c9d 100644 --- a/nextclade/defaults/config.yaml +++ b/nextclade/defaults/config.yaml @@ -1,12 +1,12 @@ strain_id_field: "accession" files: - exclude: "defaults/dropped_strains.txt" - include: "defaults/include_strains.txt" - reference_N450: "defaults/measles_reference_N450.gb" - reference_N450_fasta: "defaults/measles_reference_N450.fasta" - clades: "defaults/clades.tsv" - colors: "defaults/colors.tsv" - auspice_config: "defaults/auspice_config.json" + exclude: "dropped_strains.txt" + include: "include_strains.txt" + reference_N450: "measles_reference_N450.gb" + reference_N450_fasta: "measles_reference_N450.fasta" + clades: "clades.tsv" + colors: "colors.tsv" + auspice_config: "auspice_config.json" align_and_extract_N450: min_length: 400 min_seed_cover: 0.01 diff --git a/nextclade/rules/annotate_phylogeny.smk b/nextclade/rules/annotate_phylogeny.smk index 5726f92..97b1194 100644 --- a/nextclade/rules/annotate_phylogeny.smk +++ b/nextclade/rules/annotate_phylogeny.smk @@ -14,7 +14,7 @@ rule ancestral: node_data = "results/nt_muts.json" params: inference = config["ancestral"]["inference"], - reference_fasta = config["files"]["reference_N450_fasta"] + reference_fasta = resolve_config_path(config["files"]["reference_N450_fasta"]) shell: """ augur ancestral \ @@ -30,7 +30,7 @@ rule translate: input: tree = "results/tree.nwk", node_data = "results/nt_muts.json", - reference = config["files"]["reference_N450"] + reference = resolve_config_path(config["files"]["reference_N450"]) output: node_data = "results/aa_muts.json" shell: @@ -47,7 +47,7 @@ rule clades: tree = "results/tree.nwk", nt_muts = "results/nt_muts.json", aa_muts = "results/aa_muts.json", - clade_defs = config["files"]["clades"] + clade_defs = resolve_config_path(config["files"]["clades"]) output: clades = "results/clades.json" shell: diff --git a/nextclade/rules/export.smk b/nextclade/rules/export.smk index bc08b8c..18a3923 100644 --- a/nextclade/rules/export.smk +++ 
b/nextclade/rules/export.smk @@ -14,8 +14,8 @@ rule export: clades = "results/clades.json", nt_muts = "results/nt_muts.json", aa_muts = "results/aa_muts.json", - colors = config["files"]["colors"], - auspice_config = config["files"]["auspice_config"] + colors = resolve_config_path(config["files"]["colors"]), + auspice_config = resolve_config_path(config["files"]["auspice_config"]) output: auspice_json = "auspice/measles.json" params: diff --git a/nextclade/rules/prepare_sequences.smk b/nextclade/rules/prepare_sequences.smk index 428a0de..5ab7dc3 100644 --- a/nextclade/rules/prepare_sequences.smk +++ b/nextclade/rules/prepare_sequences.smk @@ -34,7 +34,7 @@ rule decompress: rule align_and_extract_N450: input: sequences = "data/sequences.fasta", - reference = config["files"]["reference_N450_fasta"] + reference = resolve_config_path(config["files"]["reference_N450_fasta"]) output: sequences = "results/sequences_N450.fasta" params: @@ -57,8 +57,8 @@ rule filter: input: sequences = "results/sequences_N450.fasta", metadata = "data/metadata.tsv", - exclude = config["files"]["exclude"], - include = config["files"]["include"] + exclude = resolve_config_path(config["files"]["exclude"]), + include = resolve_config_path(config["files"]["include"]) output: sequences = "results/aligned.fasta" params: diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index 0acf1a7..937b4b8 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -1,6 +1,11 @@ genes = ['N450', 'genome'] -configfile: "defaults/config.yaml" +include: "../shared/functions.smk" + +configfile: os.path.join(workflow.basedir, "defaults/config.yaml") + +if os.path.exists("config.yaml"): + configfile: "config.yaml" rule all: input: @@ -17,7 +22,7 @@ include: "rules/export.smk" if "custom_rules" in config: for rule_file in config["custom_rules"]: - include: rule_file + include: os.path.join(os.getcwd(), rule_file) rule clean: """Removing directories: {params}""" diff --git 
a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml index 01ff2ba..eda6802 100644 --- a/phylogenetic/defaults/config.yaml +++ b/phylogenetic/defaults/config.yaml @@ -1,15 +1,15 @@ strain_id_field: "accession" files: - exclude: "defaults/dropped_strains.txt" - include_genome: "defaults/include_strains_genome.txt" - include_N450: "defaults/include_strains_N450.txt" - reference: "defaults/measles_reference.gb" - reference_N450: "defaults/measles_reference_N450.gb" - reference_N450_fasta: "defaults/measles_reference_N450.fasta" - colors: "defaults/colors.tsv" - auspice_config: "defaults/auspice_config.json" - auspice_config_N450: "defaults/auspice_config_N450.json" - description: "defaults/description.md" + exclude: "dropped_strains.txt" + include_genome: "include_strains_genome.txt" + include_N450: "include_strains_N450.txt" + reference: "measles_reference.gb" + reference_N450: "measles_reference_N450.gb" + reference_N450_fasta: "measles_reference_N450.fasta" + colors: "colors.tsv" + auspice_config: "auspice_config.json" + auspice_config_N450: "auspice_config_N450.json" + description: "description.md" filter: group_by: "country year" sequences_per_group: 20 diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk index ea8f4e1..e27958a 100644 --- a/phylogenetic/rules/annotate_phylogeny.smk +++ b/phylogenetic/rules/annotate_phylogeny.smk @@ -28,7 +28,7 @@ rule translate: input: tree = "results/{gene}/tree.nwk", node_data = "results/{gene}/nt_muts.json", - reference = lambda wildcard: config["files"]["reference" if wildcard.gene == "genome" else f"reference_{wildcard.gene}"] + reference = lambda wildcard: resolve_config_path(config["files"]["reference" if wildcard.gene == "genome" else f"reference_{wildcard.gene}"]) output: node_data = "results/{gene}/aa_muts.json" shell: diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk index 53ca492..5586153 100644 --- 
a/phylogenetic/rules/export.smk +++ b/phylogenetic/rules/export.smk @@ -13,9 +13,9 @@ rule export: branch_lengths = "results/{gene}/branch_lengths.json", nt_muts = "results/{gene}/nt_muts.json", aa_muts = "results/{gene}/aa_muts.json", - colors = config["files"]["colors"], - auspice_config = lambda wildcard: config["files"]["auspice_config" if wildcard.gene == "genome" else f"auspice_config_{wildcard.gene}"], - description=config["files"]["description"] + colors = resolve_config_path(config["files"]["colors"]), + auspice_config = lambda wildcard: resolve_config_path(config["files"]["auspice_config" if wildcard.gene == "genome" else f"auspice_config_{wildcard.gene}"]), + description=resolve_config_path(config["files"]["description"]) output: auspice_json = "auspice/measles_{gene}.json" params: diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk index 6d5bf85..730ff09 100644 --- a/phylogenetic/rules/prepare_sequences.smk +++ b/phylogenetic/rules/prepare_sequences.smk @@ -42,8 +42,8 @@ rule filter: input: sequences = "data/sequences.fasta", metadata = "data/metadata.tsv", - exclude = config["files"]["exclude"], - include = config["files"]["include_genome"] + exclude = resolve_config_path(config["files"]["exclude"]), + include = resolve_config_path(config["files"]["include_genome"]) output: sequences = "results/genome/filtered.fasta" params: @@ -74,7 +74,7 @@ rule align: """ input: sequences = "results/genome/filtered.fasta", - reference = config["files"]["reference"] + reference = resolve_config_path(config["files"]["reference"]) output: alignment = "results/genome/aligned.fasta" shell: diff --git a/phylogenetic/rules/prepare_sequences_N450.smk b/phylogenetic/rules/prepare_sequences_N450.smk index d009c25..dc211e4 100644 --- a/phylogenetic/rules/prepare_sequences_N450.smk +++ b/phylogenetic/rules/prepare_sequences_N450.smk @@ -7,7 +7,7 @@ See Augur's usage docs for these commands for more details. 
rule align_and_extract_N450: input: sequences = "data/sequences.fasta", - reference = config["files"]["reference_N450_fasta"] + reference = resolve_config_path(config["files"]["reference_N450_fasta"]) output: sequences = "results/N450/sequences.fasta" params: @@ -34,8 +34,8 @@ rule filter_N450: input: sequences = "results/N450/sequences.fasta", metadata = "data/metadata.tsv", - exclude = config["files"]["exclude"], - include = config["files"]["include_N450"] + exclude = resolve_config_path(config["files"]["exclude"]), + include = resolve_config_path(config["files"]["include_N450"]) output: sequences = "results/N450/aligned.fasta" params: diff --git a/shared/README.md b/shared/README.md new file mode 100644 index 0000000..8e4c2d0 --- /dev/null +++ b/shared/README.md @@ -0,0 +1,6 @@ +# Shared + +> **Warning** +> Please be aware of the multiple workflows that will be affected when editing files in this directory! + +This directory holds files that are shared across multiple workflows. diff --git a/shared/functions.smk b/shared/functions.smk new file mode 100644 index 0000000..3b2b8b8 --- /dev/null +++ b/shared/functions.smk @@ -0,0 +1,31 @@ +import os.path + +def resolve_config_path(path): + """ + Resolve a relative *path* given in a configuration value. + + Resolves *path* as relative to the workflow's ``defaults/`` directory (i.e. + ``os.path.join(workflow.basedir, "defaults", path)``) if it doesn't exist + in the workflow's analysis directory (i.e. the current working + directory, or workdir, usually given by ``--directory`` (``-d``)). + + This behaviour allows a default configuration value to point to a default + auxiliary file while also letting the file used be overridden either by + setting an alternate file path in the configuration or by creating a file + with the conventional name in the workflow's analysis directory. + """ + global workflow + + if not os.path.exists(path): + # Special-case defaults/… for backwards compatibility with older + # configs. 
We could achieve the same behaviour with a symlink + # (defaults/defaults → .) but that seems less clear. + if path.startswith("defaults/"): + defaults_path = os.path.join(workflow.basedir, path) + else: + defaults_path = os.path.join(workflow.basedir, "defaults", path) + + if os.path.exists(defaults_path): + return defaults_path + + return path