From ddba55a9ad1470b137038507b369bf83751a15dd Mon Sep 17 00:00:00 2001 From: james hadfield Date: Wed, 24 Apr 2024 10:29:53 +1200 Subject: [PATCH] [export] allow multiple trees MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Multiple trees ("subtrees") have been available in Auspice since late 2021¹ and part of the associated schema since early 2022². Despite this there was no way to produce such datasets within Augur itself, and despite the schema changes the associated `augur validate` command was never updated to allow them. This commit adds multi-tree inputs to `augur export v2` as well as allowing them to validate with our associated validation commands. ¹ ² --- augur/export_v2.py | 28 +++++++---- augur/validate_export.py | 36 ++++++++++--- tests/functional/export_v2/cram/multi-tree.t | 53 ++++++++++++++++++++ 3 files changed, 100 insertions(+), 17 deletions(-) create mode 100644 tests/functional/export_v2/cram/multi-tree.t diff --git a/augur/export_v2.py b/augur/export_v2.py index 05a9fac24..2b4a7cb2b 100644 --- a/augur/export_v2.py +++ b/augur/export_v2.py @@ -118,13 +118,13 @@ def order_nodes(node): return od -def read_tree(fname): - tree = Phylo.read(fname, 'newick') +def read_trees(filenames): + trees = [Phylo.read(fname, 'newick') for fname in filenames] # augur export requires unique node names (both terminal and external) as these # are used to associate metadata/node-data with nodes. Any duplication is fatal. # The exception to this is unlabelled node names, which auspice will handle but # won't be associated with any metadata within export. - node_names = [clade.name for clade in tree.root.find_clades()] + node_names = [clade.name for tree in trees for clade in tree.root.find_clades()] if None in node_names: raise AugurError(f"Tree contains unnamed nodes. If these are internal nodes you may wish to run "+ "`augur refine --tree --output-tree ` to name them.") @@ -133,7 +133,7 @@ def read_tree(fname): dups = [name for name, count in Counter(node_names).items() if count>1] raise AugurError(f"{len(dups)} node names occur multiple times in the tree: " + ", ".join([f"{v!r}" for v in dups[0:5]]) + ("..." if len(dups)>5 else "")) - return (tree, node_names) + return (trees, node_names) def node_div(T, node_attrs): @@ -751,7 +751,12 @@ def _recursively_set_data(node): node['branch_attrs'] = branch_attrs[node['name']] for child in node.get("children", []): _recursively_set_data(child) - _recursively_set_data(data_json["tree"]) + + if isinstance(data_json["tree"], list): + for subtree in data_json['tree']: + _recursively_set_data(subtree) + else: + _recursively_set_data(data_json["tree"]) def set_node_attrs_on_tree(data_json, node_attrs, additional_metadata_columns): @@ -840,7 +845,11 @@ def _recursively_set_data(node): for child in node.get("children", []): _recursively_set_data(child) - _recursively_set_data(data_json["tree"]) + if isinstance(data_json["tree"], list): + for subtree in data_json['tree']: + _recursively_set_data(subtree) + else: + _recursively_set_data(data_json["tree"]) def node_data_prop_is_normal_trait(name): # those traits / keys / attrs which are not "special" and can be exported @@ -894,7 +903,7 @@ def register_parser(parent_subparsers): required = parser.add_argument_group( title="REQUIRED" ) - required.add_argument('--tree','-t', metavar="newick", required=True, help="Phylogenetic tree, usually output from `augur refine`") + required.add_argument('--tree','-t', metavar="newick", nargs='+', action='extend', required=True, help="Phylogenetic tree(s), usually output from `augur refine`") required.add_argument('--output', metavar="JSON", required=True, help="Output file (typically for visualisation in auspice)") config = parser.add_argument_group( @@ -1192,7 +1201,7 @@ def run(args): metadata_file = {} # parse input files - (T, node_names) = read_tree(args.tree) + (trees, node_names) = read_trees(args.tree) node_data, node_attrs, node_data_names, metadata_names, branch_attrs = \ parse_node_data_and_metadata(node_names, node_data_file, metadata_file) config = get_config(args) @@ -1224,7 +1233,8 @@ def run(args): set_filters(data_json, config) # set tree structure - data_json["tree"] = convert_tree_to_json_structure(T.root, node_attrs, node_div(T, node_attrs)) + trees_json = [convert_tree_to_json_structure(tree.root, node_attrs, node_div(tree, node_attrs)) for tree in trees] + data_json["tree"] = trees_json[0] if len(trees_json)==1 else trees_json set_node_attrs_on_tree(data_json, node_attrs, additional_metadata_columns) set_branch_attrs_on_tree(data_json, branch_attrs) diff --git a/augur/validate_export.py b/augur/validate_export.py index 5cd3f4c4c..80da43bfe 100644 --- a/augur/validate_export.py +++ b/augur/validate_export.py @@ -7,7 +7,7 @@ import sys from collections import defaultdict -def ensure_no_duplicate_names(root, ValidateError): +def ensure_no_duplicate_names(tree, ValidateError): """ Check that all node names are identical, which is required for auspice (v2) JSONs. """ @@ -18,10 +18,14 @@ def recurse(node): names.add(node["name"]) if "children" in node: [recurse(child) for child in node["children"]] - recurse(root) + if isinstance(tree, list): + for subtree in tree: + recurse(subtree) + else: + recurse(tree) -def collectTreeAttrsV2(root, warn): +def collectTreeAttrsV2(tree, warn): """ Collect all keys specified on `node["node_attrs"]` throughout the tree and the values associated with them. Note that this will only look at @@ -47,7 +51,12 @@ def recurse(node): [recurse(child) for child in node["children"]] else: num_terminal += 1 - recurse(root) + + if isinstance(tree, list): + for subtree in tree: + recurse(subtree) + else: + recurse(tree) for data in seen.values(): if data["count"] == num_nodes: @@ -56,7 +65,7 @@ def recurse(node): return(seen, num_terminal) -def collectMutationGenes(root): +def collectMutationGenes(tree): """ Returns a set of all genes specified in the tree in the "aa_muts" objects """ @@ -67,17 +76,28 @@ def recurse(node): genes.update(mutations.keys()) if "children" in node: [recurse(child) for child in node["children"]] - recurse(root) + + if isinstance(tree, list): + for subtree in tree: + recurse(subtree) + else: + recurse(tree) + genes -= {"nuc"} return genes -def collectBranchLabels(root): +def collectBranchLabels(tree): labels = set() def recurse(node): labels.update(node.get("branch_attrs", {}).get("labels", {}).keys()) if "children" in node: [recurse(child) for child in node["children"]] - recurse(root) + + if isinstance(tree, list): + for subtree in tree: + recurse(subtree) + else: + recurse(tree) return labels def verifyMainJSONIsInternallyConsistent(data, ValidateError): diff --git a/tests/functional/export_v2/cram/multi-tree.t b/tests/functional/export_v2/cram/multi-tree.t new file mode 100644 index 000000000..a7a45b948 --- /dev/null +++ b/tests/functional/export_v2/cram/multi-tree.t @@ -0,0 +1,53 @@ + +Setup + + $ source "$TESTDIR"/_setup.sh + +Create a small second tree (which has different names/labels than 'tree.nwk') + $ cat > tree2.nwk <<~~ + > (tipG:1,(tipH:1,tipI:1)internalHI:2)SECOND_ROOT:0; + > ~~ + +Minimal export -- no node data, no metadata etc etc + $ ${AUGUR} export v2 \ + > --tree "$TESTDIR/../data/tree.nwk" tree2.nwk \ + > --output minimal.json &> /dev/null + +More realistic export - with node_data for all nodes and metadata for some of them + + $ cat > metadata.tsv <<~~ + > strain something + > tipA foo + > tipB foo + > tipC foo + > tipG bar + > tipH bar + > ~~ + + + $ cat > node-data.json <<~~ + > { + > "nodes": { + > "ROOT": {"mutation_length": 0}, + > "tipA": {"mutation_length": 1}, + > "internalBC": {"mutation_length": 2}, + > "tipB": {"mutation_length": 1}, + > "tipC": {"mutation_length": 1}, + > "internalDEF": {"mutation_length": 5}, + > "tipD": {"mutation_length": 3}, + > "tipE": {"mutation_length": 4}, + > "tipF": {"mutation_length": 1}, + > "SECOND_ROOT": {"mutation_length": 0}, + > "tipG": {"mutation_length": 1}, + > "internalHI": {"mutation_length": 2}, + > "tipH": {"mutation_length": 1}, + > "tipI": {"mutation_length": 1} + > } + > } + > ~~ + + $ ${AUGUR} export v2 \ + > --tree "$TESTDIR/../data/tree.nwk" tree2.nwk \ + > --metadata metadata.tsv --color-by-metadata something \ + > --node-data node-data.json \ + > --output output.json &> /dev/null