From bd47210e4243d7ae0b38b24cb87dc303be293b9a Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Thu, 14 Sep 2023 11:21:36 -0700
Subject: [PATCH 1/4] Remove unnecessary lambda from inputs/params

Add a note where it should be kept.
---
 workflow/snakemake_rules/core.smk   | 2 +-
 workflow/snakemake_rules/export.smk | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk
index 3ff7bea..402203c 100644
--- a/workflow/snakemake_rules/core.smk
+++ b/workflow/snakemake_rules/core.smk
@@ -10,7 +10,7 @@ rule wrangle_metadata:
     output:
         metadata="data/{a_or_b}/metadata_by_accession.tsv",
     params:
-        strain_id=lambda w: config.get("strain_id_field", "strain"),
+        strain_id=config.get("strain_id_field", "strain"),
     shell:
         """
         python3 scripts/wrangle_metadata.py --metadata {input.metadata} \
diff --git a/workflow/snakemake_rules/export.smk b/workflow/snakemake_rules/export.smk
index b8ac05b..eccc563 100644
--- a/workflow/snakemake_rules/export.smk
+++ b/workflow/snakemake_rules/export.smk
@@ -62,7 +62,7 @@ rule final_strain_name:
     output:
         auspice_json=build_dir + "/{a_or_b}/{build_name}/tree_renamed.json"
     params:
-        display_strain_field=lambda w: config.get("display_strain_field", "strain"),
+        display_strain_field=config.get("display_strain_field", "strain"),
     shell:
         """
         python3 scripts/set_final_strain_name.py --metadata {input.metadata} \

From 10bca6047a19372a9cad6e409cda041e2dd4d35b Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Thu, 14 Sep 2023 11:25:41 -0700
Subject: [PATCH 2/4] Standardize usage of Snakemake's "config" variable

Checking presence of a key in the config should be done with the
membership operator.
---
 Snakefile        | 2 +-
 ingest/Snakefile | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 0430647..cdd77ab 100644
--- a/Snakefile
+++ b/Snakefile
@@ -25,7 +25,7 @@ include: "workflow/snakemake_rules/glycosylation.smk"
 include: "workflow/snakemake_rules/clades.smk"
 
 
-if config.get("deploy_url"):
+if "deploy_url" in config:
     include: "workflow/snakemake_rules/nextstrain_automation.smk"
 
 rule clean:
diff --git a/ingest/Snakefile b/ingest/Snakefile
index 63d6972..6ebe391 100644
--- a/ingest/Snakefile
+++ b/ingest/Snakefile
@@ -25,7 +25,7 @@ def _get_all_targets(wildcards):
             )
         elif len(remote_file_names) != len(set(remote_file_names)):
             print(f"Skipping file upload for {target!r} because there are duplicate remote file names.")
-        elif not config.get("s3_dst"):
+        elif "s3_dst" not in config:
             print(f"Skipping file upload for {target!r} because the destintion was not defined.")
         else:
             all_targets.extend(

From 2c192fa6184da971c91d4a7b69a931fbe0978cfa Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Thu, 14 Sep 2023 11:49:56 -0700
Subject: [PATCH 3/4] Make the choice of "accession" on the config level

Don't set defaults when retrieving strain_id_field so "accession" is
only set on the config level.
---
 workflow/snakemake_rules/core.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk
index 402203c..3f85fdd 100644
--- a/workflow/snakemake_rules/core.smk
+++ b/workflow/snakemake_rules/core.smk
@@ -10,7 +10,7 @@ rule wrangle_metadata:
     output:
         metadata="data/{a_or_b}/metadata_by_accession.tsv",
     params:
-        strain_id=config.get("strain_id_field", "strain"),
+        strain_id=config["strain_id_field"],
     shell:
         """
         python3 scripts/wrangle_metadata.py --metadata {input.metadata} \

From d123d607d4fc636f85f2463814ca1d9ee66bd953 Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Mon, 17 Jul 2023 15:37:34 -0700
Subject: [PATCH 4/4] Use "accession" column as ID column directly

New options in Augur 22.2.0 allow usage of this column as the ID column
across all subcommands that read metadata.

For this workflow in particular, the metadata file can now be used
as-is. This removes the need for a modified copy of the metadata. It
also allows specifying an original metadata column "strain" as the
display strain field, rather than a column "strain_original" generated
during the Snakemake workflow.
---
 config/configfile.yaml              |  2 +-
 scripts/set_final_strain_name.py    |  6 ++++--
 scripts/wrangle_metadata.py         | 25 -------------------------
 workflow/envs/nextstrain.yaml       |  2 +-
 workflow/snakemake_rules/core.smk   | 28 ++++++++++------------------
 workflow/snakemake_rules/export.smk |  6 +++++-
 6 files changed, 21 insertions(+), 48 deletions(-)
 delete mode 100644 scripts/wrangle_metadata.py

diff --git a/config/configfile.yaml b/config/configfile.yaml
index 7122f48..1d2e7cb 100644
--- a/config/configfile.yaml
+++ b/config/configfile.yaml
@@ -9,7 +9,7 @@ exclude: "config/outliers.txt"
 description: "config/description.md"
 
 strain_id_field: "accession"
-display_strain_field: "strain_original"
+display_strain_field: "strain"
 
 subtypes: ['a', 'b']
 
diff --git a/scripts/set_final_strain_name.py b/scripts/set_final_strain_name.py
index 2b74f27..001d759 100644
--- a/scripts/set_final_strain_name.py
+++ b/scripts/set_final_strain_name.py
@@ -1,5 +1,6 @@
 import pandas as pd
 import json, argparse
+from augur.io import read_metadata
 
 def replace_name_recursive(node, lookup):
     if node["name"] in lookup:
@@ -17,14 +18,15 @@ def replace_name_recursive(node, lookup):
 
     parser.add_argument('--input-auspice-json', type=str, required=True, help="input auspice_json")
     parser.add_argument('--metadata', type=str, required=True, help="input data")
+    parser.add_argument('--metadata-id-columns', nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
     parser.add_argument('--display-strain-name', type=str, required=True, help="field to use as strain name in auspice")
     parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON")
     args = parser.parse_args()
 
-    metadata = pd.read_csv(args.metadata, sep='\t')
+    metadata = read_metadata(args.metadata, id_columns=args.metadata_id_columns)
     name_lookup = {}
     for ri, row in metadata.iterrows():
-        strain_id = row['strain']
+        strain_id = row.name
         name_lookup[strain_id] = args.display_strain_name if pd.isna(row[args.display_strain_name]) else row[args.display_strain_name]
 
     with open(args.input_auspice_json, 'r') as fh:
diff --git a/scripts/wrangle_metadata.py b/scripts/wrangle_metadata.py
deleted file mode 100644
index dc915ea..0000000
--- a/scripts/wrangle_metadata.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import pandas as pd
-import argparse
-
-if __name__=="__main__":
-    parser = argparse.ArgumentParser(
-        description="remove time info",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-
-    parser.add_argument('--metadata', type=str, required=True, help="input data")
-    parser.add_argument('--strain-id', type=str, required=True, help="field to use as strain id")
-    parser.add_argument('--output', type=str, required=True, help="output metadata")
-    args = parser.parse_args()
-
-    metadata = pd.read_csv(args.metadata, sep='\t')
-    if 'strain' in metadata.columns:
-        metadata.rename(columns={'strain': 'strain_original'}, inplace=True)
-
-    # copy column, retaining original
-    # ie keep "accession" but also include "strain" with data from previous "accession" column
-    # insert this as the first column in the dataframe
-    metadata.insert(0, "strain", metadata[args.strain_id])
-    # metadata["strain"] = metadata[args.strain_id]
-
-    metadata.to_csv(args.output, sep='\t', index=False)
diff --git a/workflow/envs/nextstrain.yaml b/workflow/envs/nextstrain.yaml
index 6f8b609..a820606 100644
--- a/workflow/envs/nextstrain.yaml
+++ b/workflow/envs/nextstrain.yaml
@@ -19,6 +19,6 @@ dependencies:
   - pip
   - pip:
       - awscli==1.18.45
-      - nextstrain-augur==11.3.0
+      - nextstrain-augur==22.2.0
       - nextstrain-cli==1.16.5
       - rethinkdb==2.3.0.post6
diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk
index 3f85fdd..4d91e50 100644
--- a/workflow/snakemake_rules/core.smk
+++ b/workflow/snakemake_rules/core.smk
@@ -4,20 +4,6 @@ This part of the workflow expects input files
             metadata = "data/metadata.tsv"
 '''
 
-rule wrangle_metadata:
-    input:
-        metadata="data/{a_or_b}/metadata.tsv",
-    output:
-        metadata="data/{a_or_b}/metadata_by_accession.tsv",
-    params:
-        strain_id=config["strain_id_field"],
-    shell:
-        """
-        python3 scripts/wrangle_metadata.py --metadata {input.metadata} \
-                    --strain-id {params.strain_id} \
-                    --output {output.metadata}
-        """
-
 rule index_sequences:
     message:
         """
@@ -64,7 +50,7 @@ rule filter:
     input:
         sequences = "data/{a_or_b}/sequences.fasta",
         reference = "config/{a_or_b}reference.gbk",
-        metadata = "data/{a_or_b}/metadata_by_accession.tsv",
+        metadata = "data/{a_or_b}/metadata.tsv",
         sequence_index = rules.index_sequences.output,
         exclude = config['exclude']
     output:
@@ -72,13 +58,15 @@ rule filter:
     params:
     	group_by = config["filter"]["group_by"],
     	min_coverage = lambda w: f'{w.build_name}_coverage>{config["filter"]["min_coverage"].get(w.build_name, 10000)}',
-    	subsample_max_sequences = lambda w: config["filter"]["subsample_max_sequences"].get(w.build_name, 1000)
+    	subsample_max_sequences = lambda w: config["filter"]["subsample_max_sequences"].get(w.build_name, 1000),
+        strain_id=config["strain_id_field"],
     shell:
         """
         augur filter \
             --sequences {input.sequences} \
             --sequence-index {input.sequence_index} \
             --metadata {input.metadata} \
+            --metadata-id-columns {params.strain_id} \
             --exclude {input.exclude} \
             --output {output.sequences} \
             --group-by {params.group_by} \
@@ -196,13 +184,15 @@ rule refine:
     params:
     	coalescent = config["refine"]["coalescent"],
     	clock_filter_iqd = config["refine"]["clock_filter_iqd"],
-    	date_inference = config["refine"]["date_inference"]
+    	date_inference = config["refine"]["date_inference"],
+        strain_id=config["strain_id_field"],
     shell:
         """
         augur refine \
             --tree {input.tree} \
             --alignment {input.alignment} \
             --metadata {input.metadata} \
+            --metadata-id-columns {params.strain_id} \
             --output-tree {output.tree} \
             --output-node-data {output.node_data} \
             --coalescent {params.coalescent} \
@@ -264,12 +254,14 @@ rule traits:
     log:
         "logs/{a_or_b}/traits_{build_name}_rsv.txt"
     params:
-    	columns = config["traits"]["columns"]
+    	columns = config["traits"]["columns"],
+        strain_id=config["strain_id_field"],
     shell:
         """
         augur traits \
             --tree {input.tree} \
             --metadata {input.metadata} \
+            --metadata-id-columns {params.strain_id} \
             --output {output.node_data} \
             --columns {params.columns} \
             --confidence
diff --git a/workflow/snakemake_rules/export.smk b/workflow/snakemake_rules/export.smk
index eccc563..acbfdea 100644
--- a/workflow/snakemake_rules/export.smk
+++ b/workflow/snakemake_rules/export.smk
@@ -40,12 +40,14 @@ rule export:
         auspice_json =  build_dir + "/{a_or_b}/{build_name}/tree.json",
         root_sequence = build_dir + "/{a_or_b}/{build_name}/tree_root-sequence.json"
     params:
-    	title = lambda w: f"RSV-{w.a_or_b.upper()} phylogeny"
+    	title = lambda w: f"RSV-{w.a_or_b.upper()} phylogeny",
+        strain_id=config["strain_id_field"],
     shell:
         """
         augur export v2 \
             --tree {input.tree} \
             --metadata {input.metadata} \
+            --metadata-id-columns {params.strain_id} \
             --node-data {input.node_data} \
             --title {params.title:q} \
             --description {input.description} \
@@ -62,10 +64,12 @@ rule final_strain_name:
     output:
         auspice_json=build_dir + "/{a_or_b}/{build_name}/tree_renamed.json"
     params:
+        strain_id=config["strain_id_field"],
         display_strain_field=config.get("display_strain_field", "strain"),
     shell:
         """
         python3 scripts/set_final_strain_name.py --metadata {input.metadata} \
+                --metadata-id-columns {params.strain_id} \
                 --input-auspice-json {input.auspice_json} \
                 --display-strain-name {params.display_strain_field} \
                 --output {output.auspice_json}