Skip to content

Commit

Permalink
[export] allow multiple trees
Browse files Browse the repository at this point in the history
Multiple trees ("subtrees") have been available in Auspice since late
2021¹ and part of the associated schema since early 2022². Despite this
there was no way to produce such datasets within Augur itself, and
despite the schema changes the associated `augur validate` command was
never updated to allow them.

This commit adds multi-tree inputs to `augur export v2` as well as
allowing them to validate with our associated validation commands.

¹ <nextstrain/auspice#1442>
² <#851>
  • Loading branch information
jameshadfield committed May 6, 2024
1 parent 14e32ed commit ddba55a
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 17 deletions.
28 changes: 19 additions & 9 deletions augur/export_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,13 +118,13 @@ def order_nodes(node):
return od


def read_tree(fname):
tree = Phylo.read(fname, 'newick')
def read_trees(filenames):
trees = [Phylo.read(fname, 'newick') for fname in filenames]
# augur export requires unique node names (both terminal and external) as these
# are used to associate metadata/node-data with nodes. Any duplication is fatal.
# The exception to this is unlabelled node names, which auspice will handle but
# won't be associated with any metadata within export.
node_names = [clade.name for clade in tree.root.find_clades()]
node_names = [clade.name for tree in trees for clade in tree.root.find_clades()]
if None in node_names:
raise AugurError(f"Tree contains unnamed nodes. If these are internal nodes you may wish to run "+
"`augur refine --tree <newick> --output-tree <newick>` to name them.")
Expand All @@ -133,7 +133,7 @@ def read_tree(fname):
dups = [name for name, count in Counter(node_names).items() if count>1]
raise AugurError(f"{len(dups)} node names occur multiple times in the tree: " +
", ".join([f"{v!r}" for v in dups[0:5]]) + ("..." if len(dups)>5 else ""))
return (tree, node_names)
return (trees, node_names)


def node_div(T, node_attrs):
Expand Down Expand Up @@ -751,7 +751,12 @@ def _recursively_set_data(node):
node['branch_attrs'] = branch_attrs[node['name']]
for child in node.get("children", []):
_recursively_set_data(child)
_recursively_set_data(data_json["tree"])

if isinstance(data_json["tree"], list):
for subtree in data_json['tree']:
_recursively_set_data(subtree)
else:
_recursively_set_data(data_json["tree"])


def set_node_attrs_on_tree(data_json, node_attrs, additional_metadata_columns):
Expand Down Expand Up @@ -840,7 +845,11 @@ def _recursively_set_data(node):
for child in node.get("children", []):
_recursively_set_data(child)

_recursively_set_data(data_json["tree"])
if isinstance(data_json["tree"], list):
for subtree in data_json['tree']:
_recursively_set_data(subtree)
else:
_recursively_set_data(data_json["tree"])

def node_data_prop_is_normal_trait(name):
# those traits / keys / attrs which are not "special" and can be exported
Expand Down Expand Up @@ -894,7 +903,7 @@ def register_parser(parent_subparsers):
required = parser.add_argument_group(
title="REQUIRED"
)
required.add_argument('--tree','-t', metavar="newick", required=True, help="Phylogenetic tree, usually output from `augur refine`")
required.add_argument('--tree','-t', metavar="newick", nargs='+', action='extend', required=True, help="Phylogenetic tree(s), usually output from `augur refine`")
required.add_argument('--output', metavar="JSON", required=True, help="Output file (typically for visualisation in auspice)")

config = parser.add_argument_group(
Expand Down Expand Up @@ -1192,7 +1201,7 @@ def run(args):
metadata_file = {}

# parse input files
(T, node_names) = read_tree(args.tree)
(trees, node_names) = read_trees(args.tree)
node_data, node_attrs, node_data_names, metadata_names, branch_attrs = \
parse_node_data_and_metadata(node_names, node_data_file, metadata_file)
config = get_config(args)
Expand Down Expand Up @@ -1224,7 +1233,8 @@ def run(args):
set_filters(data_json, config)

# set tree structure
data_json["tree"] = convert_tree_to_json_structure(T.root, node_attrs, node_div(T, node_attrs))
trees_json = [convert_tree_to_json_structure(tree.root, node_attrs, node_div(tree, node_attrs)) for tree in trees]
data_json["tree"] = trees_json[0] if len(trees_json)==1 else trees_json
set_node_attrs_on_tree(data_json, node_attrs, additional_metadata_columns)
set_branch_attrs_on_tree(data_json, branch_attrs)

Expand Down
36 changes: 28 additions & 8 deletions augur/validate_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import sys
from collections import defaultdict

def ensure_no_duplicate_names(root, ValidateError):
def ensure_no_duplicate_names(tree, ValidateError):
"""
Check that all node names are identical, which is required for auspice (v2) JSONs.
"""
Expand All @@ -18,10 +18,14 @@ def recurse(node):
names.add(node["name"])
if "children" in node:
[recurse(child) for child in node["children"]]
recurse(root)
if isinstance(tree, list):
for subtree in tree:
recurse(subtree)
else:
recurse(tree)


def collectTreeAttrsV2(root, warn):
def collectTreeAttrsV2(tree, warn):
"""
Collect all keys specified on `node["node_attrs"]` throughout the tree
and the values associated with them. Note that this will only look at
Expand All @@ -47,7 +51,12 @@ def recurse(node):
[recurse(child) for child in node["children"]]
else:
num_terminal += 1
recurse(root)

if isinstance(tree, list):
for subtree in tree:
recurse(subtree)
else:
recurse(tree)

for data in seen.values():
if data["count"] == num_nodes:
Expand All @@ -56,7 +65,7 @@ def recurse(node):
return(seen, num_terminal)


def collectMutationGenes(root):
def collectMutationGenes(tree):
"""
Returns a set of all genes specified in the tree in the "aa_muts" objects
"""
Expand All @@ -67,17 +76,28 @@ def recurse(node):
genes.update(mutations.keys())
if "children" in node:
[recurse(child) for child in node["children"]]
recurse(root)

if isinstance(tree, list):
for subtree in tree:
recurse(subtree)
else:
recurse(tree)

genes -= {"nuc"}
return genes

def collectBranchLabels(root):
def collectBranchLabels(tree):
labels = set()
def recurse(node):
labels.update(node.get("branch_attrs", {}).get("labels", {}).keys())
if "children" in node:
[recurse(child) for child in node["children"]]
recurse(root)

if isinstance(tree, list):
for subtree in tree:
recurse(subtree)
else:
recurse(tree)
return labels

def verifyMainJSONIsInternallyConsistent(data, ValidateError):
Expand Down
53 changes: 53 additions & 0 deletions tests/functional/export_v2/cram/multi-tree.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@

Setup

$ source "$TESTDIR"/_setup.sh

Create a small second tree (which has different names/labels than 'tree.nwk')
$ cat > tree2.nwk <<~~
> (tipG:1,(tipH:1,tipI:1)internalHI:2)SECOND_ROOT:0;
> ~~

Minimal export -- no node data, no metadata etc etc
$ ${AUGUR} export v2 \
> --tree "$TESTDIR/../data/tree.nwk" tree2.nwk \
> --output minimal.json &> /dev/null

More realistic export - with node_data for all nodes and metadata for some of them

$ cat > metadata.tsv <<~~
> strain something
> tipA foo
> tipB foo
> tipC foo
> tipG bar
> tipH bar
> ~~


$ cat > node-data.json <<~~
> {
> "nodes": {
> "ROOT": {"mutation_length": 0},
> "tipA": {"mutation_length": 1},
> "internalBC": {"mutation_length": 2},
> "tipB": {"mutation_length": 1},
> "tipC": {"mutation_length": 1},
> "internalDEF": {"mutation_length": 5},
> "tipD": {"mutation_length": 3},
> "tipE": {"mutation_length": 4},
> "tipF": {"mutation_length": 1},
> "SECOND_ROOT": {"mutation_length": 0},
> "tipG": {"mutation_length": 1},
> "internalHI": {"mutation_length": 2},
> "tipH": {"mutation_length": 1},
> "tipI": {"mutation_length": 1}
> }
> }
> ~~

$ ${AUGUR} export v2 \
> --tree "$TESTDIR/../data/tree.nwk" tree2.nwk \
> --metadata metadata.tsv --color-by-metadata something \
> --node-data node-data.json \
> --output output.json &> /dev/null

0 comments on commit ddba55a

Please sign in to comment.