Skip to content

Commit

Permalink
Merge pull request #294 from nextstrain/james/inrb-updates
Browse files Browse the repository at this point in the history
INRB updates
  • Loading branch information
jameshadfield authored Nov 24, 2024
2 parents 12996b6 + 7e23eb5 commit 4897f36
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 31 deletions.
9 changes: 9 additions & 0 deletions phylogenetic/build-configs/inrb/auspice_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,15 @@
],
"build_url": "https://github.com/nextstrain/mpox",
"colorings": [
{
"key": "clade_membership",
"title": "Clade",
"type": "categorical",
"scale": [
["Ia", "#4C90C0"],
["Ib", "#CBB742"]
]
},
{
"key": "region",
"title": "Region",
Expand Down
6 changes: 6 additions & 0 deletions phylogenetic/build-configs/inrb/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,9 @@ private_metadata: "data/metadata-private.tsv"
traits:
columns: region country division location
sampling_bias_correction: 3

# Private INRB data doesn't have clade annotations so allow empty clade fields
# (i.e. we're assuming all INRB data is clade I)
subsample:
everything:
query: "'clade in [\"I\", \"Ia\", \"Ib\", \"\"]'"
10 changes: 6 additions & 4 deletions phylogenetic/build-configs/inrb/curate_private_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from datetime import datetime
from os import path, mkdir
from sys import exit
import csv

Sequences = dict[str, SeqIO.SeqRecord]
Metadata = dict[str, dict[str, str]]
Expand Down Expand Up @@ -238,10 +239,11 @@ def write_sequences(sequences: Sequences) -> None:
def write_metadata(metadata: Metadata, header: MetadataHeader) -> None:
fname = fname_in_data_dir("metadata-private.tsv")
print(f"Writing metadata to {fname}")
with open(fname, "w") as fh:
print("\t".join(header), file=fh)
for _, value in metadata.items():
print("\t".join([value[field] for field in header]), file=fh)
with open(fname, "w", newline='') as fh:
writer = csv.DictWriter(fh, header, extrasaction='ignore', delimiter='\t', lineterminator='\n')
writer.writeheader()
for line in metadata.values():
writer.writerow(line)

def parse_remap_columns(arg: list[str]) -> list[tuple[str, str]]:
try:
Expand Down
58 changes: 35 additions & 23 deletions phylogenetic/scripts/assign-clades-via-metadata.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,30 @@
"""
Use provided metadata to assign clade to terminal nodes. If the metadata is complete (i.e. all tips are assigned a clade)
then for each clade we assign the clade to internal nodes and set a label for the MRCA node, iff the tips are monophyletic.
Use provided metadata to assign clades to internal nodes and those with missing metadata.
For each valid clades (i.e. those which match a hardcoded list) as long as the tips are monophyletic
in the tree (ignoring missing data) then we label all internal nodes and missing tips with that clade
as well as labelling the MRCA branch.
"""

import argparse
from sys import stderr, stdout, exit
from sys import exit
from Bio import Phylo
import json
from augur.io import read_metadata
from augur.argparse_ import ExtendOverwriteDefault
from collections import defaultdict

VALID_CLADES = set(['Ia', 'Ib', 'I'])
MISSING = ''

def assign_internal_nodes(clade_name, terminal_nodes, node_data):
print(f"Assigning {clade_name} to internal nodes")
mrca = t.is_monophyletic(terminal_nodes)
if not mrca:
print(f"WARNING: {clade_name} wasn't monophyletic! Clades will not be assigned to internal nodes")
def assign_internal_nodes(t, clade_name, terminal_nodes, missing_nodes, node_data):
print(f"[clade metadata] Assigning {clade_name} to internal nodes & labelling MRCA branch")
mrca = t.common_ancestor(terminal_nodes)

if not all([(n in terminal_nodes or n in missing_nodes) for n in mrca.get_terminals()]):
print(f"[clade metadata] ERROR {clade_name} wasn't monophyloetic (after accounting for nodes missing clade values). Clades will not be assigned to internal nodes.")
return
for node in mrca.get_nonterminals():

for node in (n for n in mrca.find_clades() if n not in terminal_nodes): # skip ones we have already assigned
node_data['nodes'][node.name] = {'clade_membership': clade_name}
node_data['branches'][mrca.name] = {'labels': {'clade': clade_name}}

Expand All @@ -41,23 +47,29 @@ def assign_internal_nodes(clade_name, terminal_nodes, node_data):
t = Phylo.read(args.tree, "newick")

node_data = {'nodes': {}, 'branches': {}}
counts = [0,0]
counts = {'missing': 0, 'valid': 0, 'invalid': 0}
terminals = defaultdict(list)
missing_nodes = []


print(f"[clade metadata] Pass 1/2 - assigning clades from metadata to nodes")
for node in t.get_terminals():
counts[0]+=1
if node.name in clades:
counts[1]+=1
node_data['nodes'][node.name] = {'clade_membership': clades[node.name]}
terminals[clades[node.name]].append(node)

print(f"Metadata defined clades for {counts[1]}/{counts[0]} tips in tree", file=stderr)

if counts[0]==counts[1]:
for clade_name, clade_terminals in terminals.items():
assign_internal_nodes(clade_name, clade_terminals, node_data)
else:
print(f"WARNING: incomplete metadata. Assignment of clade labels is uncertain and thus we are skipping it")
clade_value = clades[node.name]
if clade_value == MISSING:
counts['missing']+=1
missing_nodes.append(node)
elif clade_value in VALID_CLADES:
counts['valid']+=1
node_data['nodes'][node.name] = {'clade_membership': clade_value}
terminals[clade_value].append(node)
else:
counts['invalid']+=1
print(f"[clade metadata] {sum(counts.values())} tips: {counts['valid']} valid, {counts['invalid']} invalid, {counts['missing']} missing clade values")


print(f"[clade metadata] Pass 2/2 - if (valid) clades are monophyletic then label internal nodes (and missing tips)")
for clade_name, clade_terminals in terminals.items():
assign_internal_nodes(t, clade_name, clade_terminals, missing_nodes, node_data)

with open(args.output_node_data, 'w') as fh:
json.dump(node_data, fh)
9 changes: 5 additions & 4 deletions phylogenetic/scripts/combine_data_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def parse_tsv(fname: str) -> Metadata:
assert list(fname).count('=')<=1, f"Too many '=' characters in argument {fname!r}"
if '=' in fname:
source_name, fname = fname.split('=')
with open(fname, "r") as fh:
with open(fname, "r", newline='') as fh:
reader = csv.DictReader(fh, delimiter="\t")
metadata = [row for row in reader]
if source_name:
Expand Down Expand Up @@ -77,10 +77,11 @@ def write_sequences(fname: str, sequences: Sequences) -> None:

def write_metadata(fname: str, metadata: Metadata, header: MetadataHeader) -> None:
print(f"Writing metadata to {fname}")
with open(fname, "w") as fh:
print("\t".join(header), file=fh)
with open(fname, "w", newline='') as fh:
writer = csv.DictWriter(fh, header, extrasaction='ignore', delimiter='\t', lineterminator='\n')
writer.writeheader()
for row in metadata:
print("\t".join([row.get(field, '') for field in header]), file=fh)
writer.writerow(row)

if __name__=="__main__":
args = parse_args()
Expand Down

0 comments on commit 4897f36

Please sign in to comment.