diff --git a/phylogenetic/build-configs/inrb/auspice_config.json b/phylogenetic/build-configs/inrb/auspice_config.json index 4c711e6..5f5c796 100644 --- a/phylogenetic/build-configs/inrb/auspice_config.json +++ b/phylogenetic/build-configs/inrb/auspice_config.json @@ -15,6 +15,15 @@ ], "build_url": "https://github.com/nextstrain/mpox", "colorings": [ + { + "key": "clade_membership", + "title": "Clade", + "type": "categorical", + "scale": [ + ["Ia", "#4C90C0"], + ["Ib", "#CBB742"] + ] + }, { "key": "region", "title": "Region", diff --git a/phylogenetic/build-configs/inrb/config.yaml b/phylogenetic/build-configs/inrb/config.yaml index 91513fe..e8e958f 100644 --- a/phylogenetic/build-configs/inrb/config.yaml +++ b/phylogenetic/build-configs/inrb/config.yaml @@ -13,3 +13,9 @@ private_metadata: "data/metadata-private.tsv" traits: columns: region country division location sampling_bias_correction: 3 + +# Private INRB data doesn't have clade annotations so allow empty clade fields +# (i.e. we're assuming all INRB data is clade I) +subsample: + everything: + query: "'clade in [\"I\", \"Ia\", \"Ib\", \"\"]'" diff --git a/phylogenetic/build-configs/inrb/curate_private_data.py b/phylogenetic/build-configs/inrb/curate_private_data.py index d4febcf..9c8a869 100644 --- a/phylogenetic/build-configs/inrb/curate_private_data.py +++ b/phylogenetic/build-configs/inrb/curate_private_data.py @@ -5,6 +5,7 @@ from datetime import datetime from os import path, mkdir from sys import exit +import csv Sequences = dict[str, SeqIO.SeqRecord] Metadata = dict[str, dict[str, str]] @@ -238,10 +239,11 @@ def write_sequences(sequences: Sequences) -> None: def write_metadata(metadata: Metadata, header: MetadataHeader) -> None: fname = fname_in_data_dir("metadata-private.tsv") print(f"Writing metadata to {fname}") - with open(fname, "w") as fh: - print("\t".join(header), file=fh) - for _, value in metadata.items(): - print("\t".join([value[field] for field in header]), file=fh) + with open(fname, "w", newline='') as fh: + writer = csv.DictWriter(fh, header, extrasaction='ignore', delimiter='\t', lineterminator='\n') + writer.writeheader() + for line in metadata.values(): + writer.writerow(line) def parse_remap_columns(arg: list[str]) -> list[tuple[str, str]]: try: diff --git a/phylogenetic/scripts/assign-clades-via-metadata.py b/phylogenetic/scripts/assign-clades-via-metadata.py index bc8738e..ef467ce 100644 --- a/phylogenetic/scripts/assign-clades-via-metadata.py +++ b/phylogenetic/scripts/assign-clades-via-metadata.py @@ -1,24 +1,30 @@ """ -Use provided metadata to assign clade to terminal nodes. If the metadata is complete (i.e. all tips are assigned a clade) -then for each clade we assign the clade to internal nodes and set a label for the MRCA node, iff the tips are monophyletic. +Use provided metadata to assign clades to internal nodes and those with missing metadata. +For each valid clades (i.e. those which match a hardcoded list) as long as the tips are monophyletic +in the tree (ignoring missing data) then we label all internal nodes and missing tips with that clade +as well as labelling the MRCA branch. """ import argparse -from sys import stderr, stdout, exit +from sys import exit from Bio import Phylo import json from augur.io import read_metadata from augur.argparse_ import ExtendOverwriteDefault from collections import defaultdict +VALID_CLADES = set(['Ia', 'Ib', 'I']) +MISSING = '' -def assign_internal_nodes(clade_name, terminal_nodes, node_data): - print(f"Assigning {clade_name} to internal nodes") - mrca = t.is_monophyletic(terminal_nodes) - if not mrca: - print(f"WARNING: {clade_name} wasn't monophyletic! Clades will not be assigned to internal nodes") +def assign_internal_nodes(t, clade_name, terminal_nodes, missing_nodes, node_data): + print(f"[clade metadata] Assigning {clade_name} to internal nodes & labelling MRCA branch") + mrca = t.common_ancestor(terminal_nodes) + + if not all([(n in terminal_nodes or n in missing_nodes) for n in mrca.get_terminals()]): + print(f"[clade metadata] ERROR {clade_name} wasn't monophyloetic (after accounting for nodes missing clade values). Clades will not be assigned to internal nodes.") return - for node in mrca.get_nonterminals(): + + for node in (n for n in mrca.find_clades() if n not in terminal_nodes): # skip ones we have already assigned node_data['nodes'][node.name] = {'clade_membership': clade_name} node_data['branches'][mrca.name] = {'labels': {'clade': clade_name}} @@ -41,23 +47,29 @@ def assign_internal_nodes(clade_name, terminal_nodes, node_data): t = Phylo.read(args.tree, "newick") node_data = {'nodes': {}, 'branches': {}} - counts = [0,0] + counts = {'missing': 0, 'valid': 0, 'invalid': 0} terminals = defaultdict(list) + missing_nodes = [] + + print(f"[clade metadata] Pass 1/2 - assigning clades from metadata to nodes") for node in t.get_terminals(): - counts[0]+=1 - if node.name in clades: - counts[1]+=1 - node_data['nodes'][node.name] = {'clade_membership': clades[node.name]} - terminals[clades[node.name]].append(node) - - print(f"Metadata defined clades for {counts[1]}/{counts[0]} tips in tree", file=stderr) - - if counts[0]==counts[1]: - for clade_name, clade_terminals in terminals.items(): - assign_internal_nodes(clade_name, clade_terminals, node_data) - else: - print(f"WARNING: incomplete metadata. Assignment of clade labels is uncertain and thus we are skipping it") + clade_value = clades[node.name] + if clade_value == MISSING: + counts['missing']+=1 + missing_nodes.append(node) + elif clade_value in VALID_CLADES: + counts['valid']+=1 + node_data['nodes'][node.name] = {'clade_membership': clade_value} + terminals[clade_value].append(node) + else: + counts['invalid']+=1 + print(f"[clade metadata] {sum(counts.values())} tips: {counts['valid']} valid, {counts['invalid']} invalid, {counts['missing']} missing clade values") + + + print(f"[clade metadata] Pass 2/2 - if (valid) clades are monophyletic then label internal nodes (and missing tips)") + for clade_name, clade_terminals in terminals.items(): + assign_internal_nodes(t, clade_name, clade_terminals, missing_nodes, node_data) with open(args.output_node_data, 'w') as fh: json.dump(node_data, fh) diff --git a/phylogenetic/scripts/combine_data_sources.py b/phylogenetic/scripts/combine_data_sources.py index 4ec9d49..667dd7d 100644 --- a/phylogenetic/scripts/combine_data_sources.py +++ b/phylogenetic/scripts/combine_data_sources.py @@ -30,7 +30,7 @@ def parse_tsv(fname: str) -> Metadata: assert list(fname).count('=')<=1, f"Too many '=' characters in argument {fname!r}" if '=' in fname: source_name, fname = fname.split('=') - with open(fname, "r") as fh: + with open(fname, "r", newline='') as fh: reader = csv.DictReader(fh, delimiter="\t") metadata = [row for row in reader] if source_name: @@ -77,10 +77,11 @@ def write_sequences(fname: str, sequences: Sequences) -> None: def write_metadata(fname: str, metadata: Metadata, header: MetadataHeader) -> None: print(f"Writing metadata to {fname}") - with open(fname, "w") as fh: - print("\t".join(header), file=fh) + with open(fname, "w", newline='') as fh: + writer = csv.DictWriter(fh, header, extrasaction='ignore', delimiter='\t', lineterminator='\n') + writer.writeheader() for row in metadata: - print("\t".join([row.get(field, '') for field in header]), file=fh) + writer.writerow(row) if __name__=="__main__": args = parse_args()