diff --git a/config/configfile.yaml b/config/configfile.yaml index 7122f48..1d2e7cb 100644 --- a/config/configfile.yaml +++ b/config/configfile.yaml @@ -9,7 +9,7 @@ exclude: "config/outliers.txt" description: "config/description.md" strain_id_field: "accession" -display_strain_field: "strain_original" +display_strain_field: "strain" subtypes: ['a', 'b'] diff --git a/scripts/set_final_strain_name.py b/scripts/set_final_strain_name.py index 2b74f27..001d759 100644 --- a/scripts/set_final_strain_name.py +++ b/scripts/set_final_strain_name.py @@ -1,5 +1,6 @@ import pandas as pd import json, argparse +from augur.io import read_metadata def replace_name_recursive(node, lookup): if node["name"] in lookup: @@ -17,14 +18,15 @@ def replace_name_recursive(node, lookup): parser.add_argument('--input-auspice-json', type=str, required=True, help="input auspice_json") parser.add_argument('--metadata', type=str, required=True, help="input data") + parser.add_argument('--metadata-id-columns', nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.") parser.add_argument('--display-strain-name', type=str, required=True, help="field to use as strain name in auspice") parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON") args = parser.parse_args() - metadata = pd.read_csv(args.metadata, sep='\t') + metadata = read_metadata(args.metadata, id_columns=args.metadata_id_columns) name_lookup = {} for ri, row in metadata.iterrows(): - strain_id = row['strain'] + strain_id = row.name name_lookup[strain_id] = args.display_strain_name if pd.isna(row[args.display_strain_name]) else row[args.display_strain_name] with open(args.input_auspice_json, 'r') as fh: diff --git a/scripts/wrangle_metadata.py b/scripts/wrangle_metadata.py deleted file mode 100644 index dc915ea..0000000 --- a/scripts/wrangle_metadata.py +++ /dev/null @@ -1,25 +0,0 @@ -import pandas as pd -import argparse - -if __name__=="__main__": - parser = argparse.ArgumentParser( - description="remove time info", - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - - parser.add_argument('--metadata', type=str, required=True, help="input data") - parser.add_argument('--strain-id', type=str, required=True, help="field to use as strain id") - parser.add_argument('--output', type=str, required=True, help="output metadata") - args = parser.parse_args() - - metadata = pd.read_csv(args.metadata, sep='\t') - if 'strain' in metadata.columns: - metadata.rename(columns={'strain': 'strain_original'}, inplace=True) - - # copy column, retaining original - # ie keep "accession" but also include "strain" with data from previous "accession" column - # insert this as the first column in the dataframe - metadata.insert(0, "strain", metadata[args.strain_id]) - # metadata["strain"] = metadata[args.strain_id] - - metadata.to_csv(args.output, sep='\t', index=False) diff --git a/workflow/envs/nextstrain.yaml b/workflow/envs/nextstrain.yaml index 6f8b609..a820606 100644 --- a/workflow/envs/nextstrain.yaml +++ b/workflow/envs/nextstrain.yaml @@ -19,6 +19,6 @@ dependencies: - pip - pip: - awscli==1.18.45 - - nextstrain-augur==11.3.0 + - nextstrain-augur==22.2.0 - nextstrain-cli==1.16.5 - rethinkdb==2.3.0.post6 diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index 3f85fdd..4d91e50 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -4,20 +4,6 @@ This part of the workflow expects input files metadata = "data/metadata.tsv" ''' -rule wrangle_metadata: - input: - metadata="data/{a_or_b}/metadata.tsv", - output: - metadata="data/{a_or_b}/metadata_by_accession.tsv", - params: - strain_id=config["strain_id_field"], - shell: - """ - python3 scripts/wrangle_metadata.py --metadata {input.metadata} \ - --strain-id {params.strain_id} \ - --output {output.metadata} - """ - rule index_sequences: message: """ @@ -64,7 +50,7 @@ rule filter: input: sequences = "data/{a_or_b}/sequences.fasta", reference = "config/{a_or_b}reference.gbk", - metadata = "data/{a_or_b}/metadata_by_accession.tsv", + metadata = "data/{a_or_b}/metadata.tsv", sequence_index = rules.index_sequences.output, exclude = config['exclude'] output: @@ -72,13 +58,15 @@ rule filter: params: group_by = config["filter"]["group_by"], min_coverage = lambda w: f'{w.build_name}_coverage>{config["filter"]["min_coverage"].get(w.build_name, 10000)}', - subsample_max_sequences = lambda w: config["filter"]["subsample_max_sequences"].get(w.build_name, 1000) + subsample_max_sequences = lambda w: config["filter"]["subsample_max_sequences"].get(w.build_name, 1000), + strain_id=config["strain_id_field"], shell: """ augur filter \ --sequences {input.sequences} \ --sequence-index {input.sequence_index} \ --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ --exclude {input.exclude} \ --output {output.sequences} \ --group-by {params.group_by} \ @@ -196,13 +184,15 @@ rule refine: params: coalescent = config["refine"]["coalescent"], clock_filter_iqd = config["refine"]["clock_filter_iqd"], - date_inference = config["refine"]["date_inference"] + date_inference = config["refine"]["date_inference"], + strain_id=config["strain_id_field"], shell: """ augur refine \ --tree {input.tree} \ --alignment {input.alignment} \ --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ --output-tree {output.tree} \ --output-node-data {output.node_data} \ --coalescent {params.coalescent} \ @@ -264,12 +254,14 @@ rule traits: log: "logs/{a_or_b}/traits_{build_name}_rsv.txt" params: - columns = config["traits"]["columns"] + columns = config["traits"]["columns"], + strain_id=config["strain_id_field"], shell: """ augur traits \ --tree {input.tree} \ --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ --output {output.node_data} \ --columns {params.columns} \ --confidence diff --git a/workflow/snakemake_rules/export.smk b/workflow/snakemake_rules/export.smk index eccc563..acbfdea 100644 --- a/workflow/snakemake_rules/export.smk +++ b/workflow/snakemake_rules/export.smk @@ -40,12 +40,14 @@ rule export: auspice_json = build_dir + "/{a_or_b}/{build_name}/tree.json", root_sequence = build_dir + "/{a_or_b}/{build_name}/tree_root-sequence.json" params: - title = lambda w: f"RSV-{w.a_or_b.upper()} phylogeny" + title = lambda w: f"RSV-{w.a_or_b.upper()} phylogeny", + strain_id=config["strain_id_field"], shell: """ augur export v2 \ --tree {input.tree} \ --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ --node-data {input.node_data} \ --title {params.title:q} \ --description {input.description} \ @@ -62,10 +64,12 @@ rule final_strain_name: output: auspice_json=build_dir + "/{a_or_b}/{build_name}/tree_renamed.json" params: + strain_id=config["strain_id_field"], display_strain_field=config.get("display_strain_field", "strain"), shell: """ python3 scripts/set_final_strain_name.py --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ --input-auspice-json {input.auspice_json} \ --display-strain-name {params.display_strain_field} \ --output {output.auspice_json}