From 7be3be25980240bd514cbce953c8fff16cacc552 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Tue, 19 Nov 2024 14:41:51 +1300 Subject: [PATCH 1/3] fix TSV quoting --- phylogenetic/build-configs/inrb/curate_private_data.py | 10 ++++++---- phylogenetic/scripts/combine_data_sources.py | 9 +++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/phylogenetic/build-configs/inrb/curate_private_data.py b/phylogenetic/build-configs/inrb/curate_private_data.py index d4febcf..9c8a869 100644 --- a/phylogenetic/build-configs/inrb/curate_private_data.py +++ b/phylogenetic/build-configs/inrb/curate_private_data.py @@ -5,6 +5,7 @@ from datetime import datetime from os import path, mkdir from sys import exit +import csv Sequences = dict[str, SeqIO.SeqRecord] Metadata = dict[str, dict[str, str]] @@ -238,10 +239,11 @@ def write_sequences(sequences: Sequences) -> None: def write_metadata(metadata: Metadata, header: MetadataHeader) -> None: fname = fname_in_data_dir("metadata-private.tsv") print(f"Writing metadata to {fname}") - with open(fname, "w") as fh: - print("\t".join(header), file=fh) - for _, value in metadata.items(): - print("\t".join([value[field] for field in header]), file=fh) + with open(fname, "w", newline='') as fh: + writer = csv.DictWriter(fh, header, extrasaction='ignore', delimiter='\t', lineterminator='\n') + writer.writeheader() + for line in metadata.values(): + writer.writerow(line) def parse_remap_columns(arg: list[str]) -> list[tuple[str, str]]: try: diff --git a/phylogenetic/scripts/combine_data_sources.py b/phylogenetic/scripts/combine_data_sources.py index 4ec9d49..667dd7d 100644 --- a/phylogenetic/scripts/combine_data_sources.py +++ b/phylogenetic/scripts/combine_data_sources.py @@ -30,7 +30,7 @@ def parse_tsv(fname: str) -> Metadata: assert list(fname).count('=')<=1, f"Too many '=' characters in argument {fname!r}" if '=' in fname: source_name, fname = fname.split('=') - with open(fname, "r") as fh: + with open(fname, "r", newline='') as fh: reader = csv.DictReader(fh, delimiter="\t") metadata = [row for row in reader] if source_name: @@ -77,10 +77,11 @@ def write_sequences(fname: str, sequences: Sequences) -> None: def write_metadata(fname: str, metadata: Metadata, header: MetadataHeader) -> None: print(f"Writing metadata to {fname}") - with open(fname, "w") as fh: - print("\t".join(header), file=fh) + with open(fname, "w", newline='') as fh: + writer = csv.DictWriter(fh, header, extrasaction='ignore', delimiter='\t', lineterminator='\n') + writer.writeheader() for row in metadata: - print("\t".join([row.get(field, '') for field in header]), file=fh) + writer.writerow(row) if __name__=="__main__": args = parse_args() From cd9585623f13e32394e56637afe3ee4955685498 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Tue, 19 Nov 2024 15:47:32 +1300 Subject: [PATCH 2/3] [INRB] Allow missing clade metadata Private INRB data doesn't have clade annotations so allow empty clade fields (i.e. we're assuming all INRB data is clade I). Modify the clade labelling script to allow missing tips which we assign a clade if the tips assigned to that clade are monophyletic once missing tips are excluded. --- phylogenetic/build-configs/inrb/config.yaml | 6 ++ .../scripts/assign-clades-via-metadata.py | 58 +++++++++++-------- 2 files changed, 41 insertions(+), 23 deletions(-) diff --git a/phylogenetic/build-configs/inrb/config.yaml b/phylogenetic/build-configs/inrb/config.yaml index 91513fe..e8e958f 100644 --- a/phylogenetic/build-configs/inrb/config.yaml +++ b/phylogenetic/build-configs/inrb/config.yaml @@ -13,3 +13,9 @@ private_metadata: "data/metadata-private.tsv" traits: columns: region country division location sampling_bias_correction: 3 + +# Private INRB data doesn't have clade annotations so allow empty clade fields +# (i.e. we're assuming all INRB data is clade I) +subsample: + everything: + query: "'clade in [\"I\", \"Ia\", \"Ib\", \"\"]'" diff --git a/phylogenetic/scripts/assign-clades-via-metadata.py b/phylogenetic/scripts/assign-clades-via-metadata.py index bc8738e..ef467ce 100644 --- a/phylogenetic/scripts/assign-clades-via-metadata.py +++ b/phylogenetic/scripts/assign-clades-via-metadata.py @@ -1,24 +1,30 @@ """ -Use provided metadata to assign clade to terminal nodes. If the metadata is complete (i.e. all tips are assigned a clade) -then for each clade we assign the clade to internal nodes and set a label for the MRCA node, iff the tips are monophyletic. +Use provided metadata to assign clades to internal nodes and those with missing metadata. +For each valid clades (i.e. those which match a hardcoded list) as long as the tips are monophyletic +in the tree (ignoring missing data) then we label all internal nodes and missing tips with that clade +as well as labelling the MRCA branch. """ import argparse -from sys import stderr, stdout, exit +from sys import exit from Bio import Phylo import json from augur.io import read_metadata from augur.argparse_ import ExtendOverwriteDefault from collections import defaultdict +VALID_CLADES = set(['Ia', 'Ib', 'I']) +MISSING = '' -def assign_internal_nodes(clade_name, terminal_nodes, node_data): - print(f"Assigning {clade_name} to internal nodes") - mrca = t.is_monophyletic(terminal_nodes) - if not mrca: - print(f"WARNING: {clade_name} wasn't monophyletic! Clades will not be assigned to internal nodes") +def assign_internal_nodes(t, clade_name, terminal_nodes, missing_nodes, node_data): + print(f"[clade metadata] Assigning {clade_name} to internal nodes & labelling MRCA branch") + mrca = t.common_ancestor(terminal_nodes) + + if not all([(n in terminal_nodes or n in missing_nodes) for n in mrca.get_terminals()]): + print(f"[clade metadata] ERROR {clade_name} wasn't monophyloetic (after accounting for nodes missing clade values). Clades will not be assigned to internal nodes.") return - for node in mrca.get_nonterminals(): + + for node in (n for n in mrca.find_clades() if n not in terminal_nodes): # skip ones we have already assigned node_data['nodes'][node.name] = {'clade_membership': clade_name} node_data['branches'][mrca.name] = {'labels': {'clade': clade_name}} @@ -41,23 +47,29 @@ def assign_internal_nodes(clade_name, terminal_nodes, node_data): t = Phylo.read(args.tree, "newick") node_data = {'nodes': {}, 'branches': {}} - counts = [0,0] + counts = {'missing': 0, 'valid': 0, 'invalid': 0} terminals = defaultdict(list) + missing_nodes = [] + + print(f"[clade metadata] Pass 1/2 - assigning clades from metadata to nodes") for node in t.get_terminals(): - counts[0]+=1 - if node.name in clades: - counts[1]+=1 - node_data['nodes'][node.name] = {'clade_membership': clades[node.name]} - terminals[clades[node.name]].append(node) - - print(f"Metadata defined clades for {counts[1]}/{counts[0]} tips in tree", file=stderr) - - if counts[0]==counts[1]: - for clade_name, clade_terminals in terminals.items(): - assign_internal_nodes(clade_name, clade_terminals, node_data) - else: - print(f"WARNING: incomplete metadata. Assignment of clade labels is uncertain and thus we are skipping it") + clade_value = clades[node.name] + if clade_value == MISSING: + counts['missing']+=1 + missing_nodes.append(node) + elif clade_value in VALID_CLADES: + counts['valid']+=1 + node_data['nodes'][node.name] = {'clade_membership': clade_value} + terminals[clade_value].append(node) + else: + counts['invalid']+=1 + print(f"[clade metadata] {sum(counts.values())} tips: {counts['valid']} valid, {counts['invalid']} invalid, {counts['missing']} missing clade values") + + + print(f"[clade metadata] Pass 2/2 - if (valid) clades are monophyletic then label internal nodes (and missing tips)") + for clade_name, clade_terminals in terminals.items(): + assign_internal_nodes(t, clade_name, clade_terminals, missing_nodes, node_data) with open(args.output_node_data, 'w') as fh: json.dump(node_data, fh) From 7e23eb5cf6836210ec650d9b95add58b26ff377f Mon Sep 17 00:00:00 2001 From: james hadfield Date: Mon, 25 Nov 2024 12:04:51 +1300 Subject: [PATCH 3/3] [INRB] Update clade colours Switch back to the Auspice defaults based on user feedback, but hardcode them here so the colours for each clade stay consistent. --- phylogenetic/build-configs/inrb/auspice_config.json | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/phylogenetic/build-configs/inrb/auspice_config.json b/phylogenetic/build-configs/inrb/auspice_config.json index 4c711e6..5f5c796 100644 --- a/phylogenetic/build-configs/inrb/auspice_config.json +++ b/phylogenetic/build-configs/inrb/auspice_config.json @@ -15,6 +15,15 @@ ], "build_url": "https://github.com/nextstrain/mpox", "colorings": [ + { + "key": "clade_membership", + "title": "Clade", + "type": "categorical", + "scale": [ + ["Ia", "#4C90C0"], + ["Ib", "#CBB742"] + ] + }, { "key": "region", "title": "Region",