From 6584e68622dd305592b98d8630fa94e191b4f164 Mon Sep 17 00:00:00 2001
From: Thomas Sibley <tsibley@fredhutch.org>
Date: Wed, 30 Oct 2024 12:50:37 -0700
Subject: [PATCH] wip! Introduce workdir portability to all workflows

XXX FIXME: rationale, relationship to "workflows as programs"
XXX FIXME: alternatives considered (but declined) for path-in-config-value handling
XXX FIXME: document layout/structure of workdir
---
 ingest/Snakefile                              | 20 ++++++++++--
 ingest/defaults/config.yaml                   | 12 ++++---
 ingest/rules/curate.smk                       | 18 +++++------
 nextclade/Snakefile                           |  9 ++++--
 nextclade/defaults/config.yaml                | 14 ++++-----
 nextclade/rules/annotate_phylogeny.smk        |  6 ++--
 nextclade/rules/export.smk                    |  4 +--
 nextclade/rules/prepare_sequences.smk         |  6 ++--
 phylogenetic/Snakefile                        |  9 ++++--
 phylogenetic/defaults/config.yaml             | 20 ++++++------
 phylogenetic/rules/annotate_phylogeny.smk     |  2 +-
 phylogenetic/rules/export.smk                 |  6 ++--
 phylogenetic/rules/prepare_sequences.smk      |  6 ++--
 phylogenetic/rules/prepare_sequences_N450.smk |  6 ++--
 shared/README.md                              |  6 ++++
 shared/functions.smk                          | 31 +++++++++++++++++++
 16 files changed, 119 insertions(+), 56 deletions(-)
 create mode 100644 shared/README.md
 create mode 100644 shared/functions.smk

diff --git a/ingest/Snakefile b/ingest/Snakefile
index c7d297d..056f1d5 100644
--- a/ingest/Snakefile
+++ b/ingest/Snakefile
@@ -2,9 +2,17 @@
 This is the main ingest Snakefile that orchestrates the full ingest workflow
 and defines its default outputs.
 """
+# Utility functions shared across all workflows.
+include: "../shared/functions.smk"
+
+
+# Use default configuration values. Extend with Snakemake's --configfile/--config options.
+configfile: os.path.join(workflow.basedir, "defaults/config.yaml")
+
+# Use custom configuration from analysis directory (i.e. working dir), if any.
+if os.path.exists("config.yaml"):
+    configfile: "config.yaml"
 
-# Use default configuration values. Override with Snakemake's --configfile/--config options.
-configfile: "defaults/config.yaml"
 
 # This is the default rule that Snakemake will run when there are no specified targets.
 # The default output of the ingest workflow is usually the curated metadata and sequences.
@@ -42,4 +50,10 @@ include: "rules/nextclade.smk"
 if "custom_rules" in config:
     for rule_file in config["custom_rules"]:
 
-        include: rule_file
+        # Relative custom rule paths in the config are relative to the analysis
+        # directory (i.e. the current working directory, or workdir, usually
+        # given by --directory), but the "include" directive treats relative
+        # paths as relative to the workflow (e.g. workflow.current_basedir).
+        # Convert to an absolute path based on the analysis/current directory
+        # to avoid this mismatch of expectations.
+        include: os.path.join(os.getcwd(), rule_file)
diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
index d3d2ca3..504b132 100644
--- a/ingest/defaults/config.yaml
+++ b/ingest/defaults/config.yaml
@@ -37,9 +37,10 @@ curate:
   # For the Nextstrain team, this is currently
   # 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv'
   geolocation_rules_url: "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv"
-  # The path to the local geolocation rules within the pathogen repo
-  # The path should be relative to the ingest directory.
-  local_geolocation_rules: "defaults/geolocation_rules.tsv"
+  # The path to the local geolocation rules for this pathogen.
+  # The path should be relative to the working directory (e.g. --directory).
+  # If the path doesn't exist in the working directory, the file in the workflow's defaults/ directory is used instead (if it exists).
+  local_geolocation_rules: "geolocation_rules.tsv"
   # List of field names to change where the key is the original field name and the value is the new field name
   # The original field names should match the ncbi_datasets_fields provided above.
   # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
@@ -91,8 +92,9 @@ curate:
   # Name to use for the generated abbreviated authors field
   abbr_authors_field: "abbr_authors"
   # Path to the manual annotations file
-  # The path should be relative to the ingest directory
-  annotations: "defaults/annotations.tsv"
+  # The path should be relative to the working directory (e.g. --directory).
+  # If the path doesn't exist in the working directory, the file in the workflow's defaults/ directory is used instead (if it exists).
+  annotations: "annotations.tsv"
   # The ID field in the metadata to use to merge the manual annotations
   annotations_id: "accession"
   # The ID field in the metadata to use as the sequence id in the output FASTA file
diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
index 2bfeb9d..0c30a1d 100644
--- a/ingest/rules/curate.smk
+++ b/ingest/rules/curate.smk
@@ -31,7 +31,7 @@ rule fetch_general_geolocation_rules:
 rule concat_geolocation_rules:
     input:
         general_geolocation_rules="data/general-geolocation-rules.tsv",
-        local_geolocation_rules=config["curate"]["local_geolocation_rules"],
+        local_geolocation_rules=resolve_config_path(config["curate"]["local_geolocation_rules"]),
     output:
         all_geolocation_rules="data/all-geolocation-rules.tsv",
     shell:
@@ -59,7 +59,7 @@ rule curate:
         sequences_ndjson="data/ncbi.ndjson",
         # Change the geolocation_rules input path if you are removing the above two rules
         all_geolocation_rules="data/all-geolocation-rules.tsv",
-        annotations=config["curate"]["annotations"],
+        annotations=resolve_config_path(config["curate"]["annotations"]),
     output:
         metadata="data/all_metadata.tsv",
         sequences="results/sequences.fasta",
@@ -86,28 +86,28 @@ rule curate:
     shell:
         """
         (cat {input.sequences_ndjson} \
-            | ./vendored/transform-field-names \
+            | {workflow.basedir}/vendored/transform-field-names \
                 --field-map {params.field_map} \
             | augur curate normalize-strings \
-            | ./vendored/transform-strain-names \
+            | {workflow.basedir}/vendored/transform-strain-names \
                 --strain-regex {params.strain_regex} \
                 --backup-fields {params.strain_backup_fields} \
             | augur curate format-dates \
                 --date-fields {params.date_fields} \
                 --expected-date-formats {params.expected_date_formats} \
-            | ./vendored/transform-genbank-location \
+            | {workflow.basedir}/vendored/transform-genbank-location \
             | augur curate titlecase \
                 --titlecase-fields {params.titlecase_fields} \
                 --articles {params.articles} \
                 --abbreviations {params.abbreviations} \
-            | ./vendored/transform-authors \
+            | {workflow.basedir}/vendored/transform-authors \
                 --authors-field {params.authors_field} \
                 --default-value {params.authors_default_value} \
                 --abbr-authors-field {params.abbr_authors_field} \
-            | ./vendored/apply-geolocation-rules \
+            | {workflow.basedir}/vendored/apply-geolocation-rules \
                 --geolocation-rules {input.all_geolocation_rules} \
-            | ./bin/parse-measles-genotype-names.py --genotype-field {params.genotype_field} \
-            | ./vendored/merge-user-metadata \
+            | {workflow.basedir}/bin/parse-measles-genotype-names.py --genotype-field {params.genotype_field} \
+            | {workflow.basedir}/vendored/merge-user-metadata \
                 --annotations {input.annotations} \
                 --id-field {params.annotations_id} \
             | augur curate passthru \
diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index dd876ff..8c2f20a 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -1,4 +1,9 @@
-configfile: "defaults/config.yaml" 
+include: "../shared/functions.smk"
+
+configfile: os.path.join(workflow.basedir, "defaults/config.yaml")
+
+if os.path.exists("config.yaml"):
+    configfile: "config.yaml"
 
 rule all:
     input:
@@ -13,7 +18,7 @@ include: "rules/export.smk"
 if "custom_rules" in config:
     for rule_file in config["custom_rules"]:
 
-        include: rule_file
+        include: os.path.join(os.getcwd(), rule_file)
 
 rule clean:
     """Removing directories: {params}"""
diff --git a/nextclade/defaults/config.yaml b/nextclade/defaults/config.yaml
index 41895a9..8964c9d 100644
--- a/nextclade/defaults/config.yaml
+++ b/nextclade/defaults/config.yaml
@@ -1,12 +1,12 @@
 strain_id_field: "accession"
 files:
-    exclude: "defaults/dropped_strains.txt"
-    include: "defaults/include_strains.txt"
-    reference_N450: "defaults/measles_reference_N450.gb"
-    reference_N450_fasta: "defaults/measles_reference_N450.fasta"
-    clades: "defaults/clades.tsv"
-    colors: "defaults/colors.tsv"
-    auspice_config: "defaults/auspice_config.json"
+    exclude: "dropped_strains.txt"
+    include: "include_strains.txt"
+    reference_N450: "measles_reference_N450.gb"
+    reference_N450_fasta: "measles_reference_N450.fasta"
+    clades: "clades.tsv"
+    colors: "colors.tsv"
+    auspice_config: "auspice_config.json"
 align_and_extract_N450:
     min_length: 400
     min_seed_cover: 0.01
diff --git a/nextclade/rules/annotate_phylogeny.smk b/nextclade/rules/annotate_phylogeny.smk
index 5726f92..97b1194 100644
--- a/nextclade/rules/annotate_phylogeny.smk
+++ b/nextclade/rules/annotate_phylogeny.smk
@@ -14,7 +14,7 @@ rule ancestral:
         node_data = "results/nt_muts.json"
     params:
         inference = config["ancestral"]["inference"],
-        reference_fasta = config["files"]["reference_N450_fasta"]
+        reference_fasta = resolve_config_path(config["files"]["reference_N450_fasta"])
     shell:
         """
         augur ancestral \
@@ -30,7 +30,7 @@ rule translate:
     input:
         tree = "results/tree.nwk",
         node_data = "results/nt_muts.json",
-        reference = config["files"]["reference_N450"]
+        reference = resolve_config_path(config["files"]["reference_N450"])
     output:
         node_data = "results/aa_muts.json"
     shell:
@@ -47,7 +47,7 @@ rule clades:
         tree = "results/tree.nwk",
         nt_muts = "results/nt_muts.json",
         aa_muts = "results/aa_muts.json",
-        clade_defs = config["files"]["clades"]
+        clade_defs = resolve_config_path(config["files"]["clades"])
     output:
         clades = "results/clades.json"
     shell:
diff --git a/nextclade/rules/export.smk b/nextclade/rules/export.smk
index bc08b8c..18a3923 100644
--- a/nextclade/rules/export.smk
+++ b/nextclade/rules/export.smk
@@ -14,8 +14,8 @@ rule export:
         clades = "results/clades.json",
         nt_muts = "results/nt_muts.json",
         aa_muts = "results/aa_muts.json",
-        colors = config["files"]["colors"],
-        auspice_config = config["files"]["auspice_config"]
+        colors = resolve_config_path(config["files"]["colors"]),
+        auspice_config = resolve_config_path(config["files"]["auspice_config"])
     output:
         auspice_json = "auspice/measles.json"
     params:
diff --git a/nextclade/rules/prepare_sequences.smk b/nextclade/rules/prepare_sequences.smk
index 428a0de..5ab7dc3 100644
--- a/nextclade/rules/prepare_sequences.smk
+++ b/nextclade/rules/prepare_sequences.smk
@@ -34,7 +34,7 @@ rule decompress:
 rule align_and_extract_N450:
     input:
         sequences = "data/sequences.fasta",
-        reference = config["files"]["reference_N450_fasta"]
+        reference = resolve_config_path(config["files"]["reference_N450_fasta"])
     output:
         sequences = "results/sequences_N450.fasta"
     params:
@@ -57,8 +57,8 @@ rule filter:
     input:
         sequences = "results/sequences_N450.fasta",
         metadata = "data/metadata.tsv",
-        exclude = config["files"]["exclude"],
-        include = config["files"]["include"]
+        exclude = resolve_config_path(config["files"]["exclude"]),
+        include = resolve_config_path(config["files"]["include"])
     output:
         sequences = "results/aligned.fasta"
     params:
diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index 0acf1a7..937b4b8 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -1,6 +1,11 @@
 genes = ['N450', 'genome']
 
-configfile: "defaults/config.yaml"
+include: "../shared/functions.smk"
+
+configfile: os.path.join(workflow.basedir, "defaults/config.yaml")
+
+if os.path.exists("config.yaml"):
+    configfile: "config.yaml"
 
 rule all:
     input:
@@ -17,7 +22,7 @@ include: "rules/export.smk"
 if "custom_rules" in config:
     for rule_file in config["custom_rules"]:
 
-        include: rule_file
+        include: os.path.join(os.getcwd(), rule_file)
 
 rule clean:
     """Removing directories: {params}"""
diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml
index 01ff2ba..eda6802 100644
--- a/phylogenetic/defaults/config.yaml
+++ b/phylogenetic/defaults/config.yaml
@@ -1,15 +1,15 @@
 strain_id_field: "accession"
 files:
-    exclude: "defaults/dropped_strains.txt"
-    include_genome: "defaults/include_strains_genome.txt"
-    include_N450: "defaults/include_strains_N450.txt"
-    reference: "defaults/measles_reference.gb"
-    reference_N450: "defaults/measles_reference_N450.gb"
-    reference_N450_fasta: "defaults/measles_reference_N450.fasta"
-    colors: "defaults/colors.tsv"
-    auspice_config: "defaults/auspice_config.json"
-    auspice_config_N450: "defaults/auspice_config_N450.json"
-    description: "defaults/description.md"
+    exclude: "dropped_strains.txt"
+    include_genome: "include_strains_genome.txt"
+    include_N450: "include_strains_N450.txt"
+    reference: "measles_reference.gb"
+    reference_N450: "measles_reference_N450.gb"
+    reference_N450_fasta: "measles_reference_N450.fasta"
+    colors: "colors.tsv"
+    auspice_config: "auspice_config.json"
+    auspice_config_N450: "auspice_config_N450.json"
+    description: "description.md"
 filter:
     group_by: "country year"
     sequences_per_group: 20
diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk
index ea8f4e1..e27958a 100644
--- a/phylogenetic/rules/annotate_phylogeny.smk
+++ b/phylogenetic/rules/annotate_phylogeny.smk
@@ -28,7 +28,7 @@ rule translate:
     input:
         tree = "results/{gene}/tree.nwk",
         node_data = "results/{gene}/nt_muts.json",
-        reference = lambda wildcard: config["files"]["reference" if wildcard.gene == "genome" else f"reference_{wildcard.gene}"]
+        reference = lambda wildcard: resolve_config_path(config["files"]["reference" if wildcard.gene == "genome" else f"reference_{wildcard.gene}"])
     output:
         node_data = "results/{gene}/aa_muts.json"
     shell:
diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk
index 53ca492..5586153 100644
--- a/phylogenetic/rules/export.smk
+++ b/phylogenetic/rules/export.smk
@@ -13,9 +13,9 @@ rule export:
         branch_lengths = "results/{gene}/branch_lengths.json",
         nt_muts = "results/{gene}/nt_muts.json",
         aa_muts = "results/{gene}/aa_muts.json",
-        colors = config["files"]["colors"],
-        auspice_config = lambda wildcard: config["files"]["auspice_config" if wildcard.gene == "genome" else f"auspice_config_{wildcard.gene}"],
-        description=config["files"]["description"]
+        colors = resolve_config_path(config["files"]["colors"]),
+        auspice_config = lambda wildcard: resolve_config_path(config["files"]["auspice_config" if wildcard.gene == "genome" else f"auspice_config_{wildcard.gene}"]),
+        description=resolve_config_path(config["files"]["description"])
     output:
         auspice_json = "auspice/measles_{gene}.json"
     params:
diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk
index 6d5bf85..730ff09 100644
--- a/phylogenetic/rules/prepare_sequences.smk
+++ b/phylogenetic/rules/prepare_sequences.smk
@@ -42,8 +42,8 @@ rule filter:
     input:
         sequences = "data/sequences.fasta",
         metadata = "data/metadata.tsv",
-        exclude = config["files"]["exclude"],
-        include = config["files"]["include_genome"]
+        exclude = resolve_config_path(config["files"]["exclude"]),
+        include = resolve_config_path(config["files"]["include_genome"])
     output:
         sequences = "results/genome/filtered.fasta"
     params:
@@ -74,7 +74,7 @@ rule align:
     """
     input:
         sequences = "results/genome/filtered.fasta",
-        reference = config["files"]["reference"]
+        reference = resolve_config_path(config["files"]["reference"])
     output:
         alignment = "results/genome/aligned.fasta"
     shell:
diff --git a/phylogenetic/rules/prepare_sequences_N450.smk b/phylogenetic/rules/prepare_sequences_N450.smk
index d009c25..dc211e4 100644
--- a/phylogenetic/rules/prepare_sequences_N450.smk
+++ b/phylogenetic/rules/prepare_sequences_N450.smk
@@ -7,7 +7,7 @@ See Augur's usage docs for these commands for more details.
 rule align_and_extract_N450:
     input:
         sequences = "data/sequences.fasta",
-        reference = config["files"]["reference_N450_fasta"]
+        reference = resolve_config_path(config["files"]["reference_N450_fasta"])
     output:
         sequences = "results/N450/sequences.fasta"
     params:
@@ -34,8 +34,8 @@ rule filter_N450:
     input:
         sequences = "results/N450/sequences.fasta",
         metadata = "data/metadata.tsv",
-        exclude = config["files"]["exclude"],
-        include = config["files"]["include_N450"]
+        exclude = resolve_config_path(config["files"]["exclude"]),
+        include = resolve_config_path(config["files"]["include_N450"])
     output:
         sequences = "results/N450/aligned.fasta"
     params:
diff --git a/shared/README.md b/shared/README.md
new file mode 100644
index 0000000..8e4c2d0
--- /dev/null
+++ b/shared/README.md
@@ -0,0 +1,6 @@
+# Shared
+
+> **Warning**
+> Please be aware of the multiple workflows that will be affected when editing files in this directory!
+
+This directory holds files that are shared across multiple workflows.
diff --git a/shared/functions.smk b/shared/functions.smk
new file mode 100644
index 0000000..3b2b8b8
--- /dev/null
+++ b/shared/functions.smk
@@ -0,0 +1,31 @@
+import os.path
+
+def resolve_config_path(path):
+    """
+    Resolve a relative *path* given in a configuration value.
+
+    Resolves *path* as relative to the workflow's ``defaults/`` directory (i.e.
+    ``os.path.join(workflow.basedir, "defaults", path)``) if it doesn't exist
+    in the workflow's analysis directory (i.e. the current working
+    directory, or workdir, usually given by ``--directory`` (``-d``)).
+
+    This behaviour allows a default configuration value to point to a default
+    auxiliary file while also letting the file used be overridden either by
+    setting an alternate file path in the configuration or by creating a file
+    with the conventional name in the workflow's analysis directory.
+    """
+    global workflow  # Snakemake-provided object; only read here, never reassigned
+
+    if not os.path.exists(path):  # a relative path is checked against the current working directory
+        # Special-case defaults/… for backwards compatibility with older
+        # configs: the prefix is already present, so don't add it again.  A
+        # symlink (defaults/defaults → .) would also work but seems less clear.
+        if path.startswith("defaults/"):
+            defaults_path = os.path.join(workflow.basedir, path)
+        else:
+            defaults_path = os.path.join(workflow.basedir, "defaults", path)
+
+        if os.path.exists(defaults_path):
+            return defaults_path
+
+    return path  # found in the workdir, or found nowhere: hand back the value unchanged