Skip to content

Commit

Permalink
[export] Node-data files are optional
Browse files Browse the repository at this point in the history
Allows a minimal `augur export` using only a (newick) tree as input,
functionality that we've wanted for over 4 years! To facilitate this we
parse branch lengths¹ from the newick file if such data wasn't available
in the node-data inputs (e.g. because there are none!).

The code for deciding where to read divergence from has been refactored
and in the process improved: the (rare? never encountered?) case where
divergence was sometimes read from node-data keys 'mutation_length' and
sometimes from 'branch_length' can no longer happen.

If data is provided which doesn't define divergence or num_date
(regardless of whether node-data files were provided as inputs), then
the resulting dataset will fail validation.

Closes #273 <#273>

¹ I suppose these might represent time in certain cases, but I haven't
seen such data in Newick files.
  • Loading branch information
jameshadfield committed Aug 31, 2023
1 parent 4701ba2 commit 4991b9a
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 41 deletions.
93 changes: 57 additions & 36 deletions augur/export_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,44 +111,62 @@ def order_nodes(node):
order_nodes(od['tree'])
return od

def convert_tree_to_json_structure(node, metadata, div=0):

def node_div(T, node_attrs):
    """
    Scans the provided tree & metadata to see if divergence is defined, and if so returns
    a function which gets it from individual nodes. Divergence may be defined via a number
    of sources, and we pick them in the following order:
    * metadata.mutation_length (typically via `augur refine`)
    * metadata.branch_length (typically via `augur refine`)
    * Branch lengths encoded in the Newick tree

    A node-data source is only used when the key is present for *every* node in the
    tree, so a single dataset never mixes 'mutation_length' and 'branch_length'.
    Newick branch lengths are only used when every non-root node has one.

    Parameters
    ----------
    T : tree object
        Must expose `T.root`, whose `find_clades()` yields every node (root included);
        each node must have `name` and `branch_length` attributes.
    node_attrs : dict
        Per-node metadata, with keys matching each node's `name`.

    Returns
    -------
    (function | None)
        * function with arguments: (node, metadata_for_node) which returns the node divergence
        * None (indicates that divergence is not available for this dataset)
    """
    # Materialise once: the traversal is needed for up to three separate scans.
    clades = list(T.root.find_clades())

    if all('mutation_length' in node_attrs[n.name] for n in clades):
        return lambda node, metadata: metadata['mutation_length']
    if all('branch_length' in node_attrs[n.name] for n in clades):
        return lambda node, metadata: metadata['branch_length']

    # Cumulative divergence only ever adds the lengths of branches leading to
    # non-root nodes, so those are the branches which must be defined. Newick
    # files frequently omit a length for the root, so inspecting the root alone
    # is not a reliable signal. For a root-only tree we fall back to the root.
    non_root = [n for n in clades if n is not T.root]
    if all(n.branch_length is not None for n in (non_root or [T.root])):
        return lambda node, metadata: node.branch_length
    return None

def convert_tree_to_json_structure(node, metadata, get_div, div=0):
    """
    Converts the Biopython tree structure to a dictionary that can
    be written to file as a JSON. This is called recursively.

    Creates the name property & divergence on each node

    Parameters
    ----------
    node : Bio.Phylo.Newick.Clade
    metadata : dict
        Per-node metadata, with keys matching `node.name`
    get_div : (None | function)
        Function returns divergence for this node. Arguments: (node, metadata_for_node)
        If None then divergence is not defined for this dataset and so 'div' is not set on returned nodes.
    div : number
        cumulative divergence leading to the current node (root = 0)

    Returns
    -------
    dict:
        See schema-export-v2.json#/$defs/tree for full details.
        Node names are always set, and divergence is set if applicable
    """
    node_struct = {'name': node.name, 'node_attrs': {}, 'branch_attrs': {}}

    if get_div is not None:  # Store the (cumulative) observed divergence prior to this node
        node_struct["node_attrs"]["div"] = div

    if node.clades:
        node_struct["children"] = []
        for child in node.clades:
            # A child's divergence is the parent's cumulative value plus the
            # length of the branch leading to the child. When divergence is
            # not defined, `cdiv` is passed down but never stored.
            cdiv = div
            if get_div is not None:
                cdiv += get_div(child, metadata[child.name])
            node_struct["children"].append(convert_tree_to_json_structure(child, metadata, get_div, div=cdiv))

    return node_struct

Expand Down Expand Up @@ -828,10 +846,9 @@ def register_parser(parent_subparsers):
title="REQUIRED"
)
required.add_argument('--tree','-t', metavar="newick", required=True, help="Phylogenetic tree, usually output from `augur refine`")
required.add_argument('--node-data', metavar="JSON", required=True, nargs='+', action=ExtendAction, help="JSON files containing metadata for nodes in the tree")
required.add_argument('--output', metavar="JSON", required=True, help="Ouput file (typically for visualisation in auspice)")
required.add_argument('--output', metavar="JSON", required=True, help="Output file (typically for visualisation in auspice)")

config = parser.add_argument_group(
config = parser.add_argument_group(
title="DISPLAY CONFIGURATION",
description="These control the display settings for auspice. \
You can supply a config JSON (which has all available options) or command line arguments (which are more limited but great to get started). \
Expand All @@ -849,6 +866,7 @@ def register_parser(parent_subparsers):
optional_inputs = parser.add_argument_group(
title="OPTIONAL INPUT FILES"
)
optional_inputs.add_argument('--node-data', metavar="JSON", nargs='+', action=ExtendAction, help="JSON files containing metadata for nodes in the tree")
optional_inputs.add_argument('--metadata', metavar="FILE", help="Additional metadata for strains in the tree")
optional_inputs.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+",
help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
Expand Down Expand Up @@ -1068,11 +1086,14 @@ def run(args):
data_json = {"version": "v2", "meta": {"updated": time.strftime('%Y-%m-%d')}}

#load input files
try:
node_data_file = read_node_data(args.node_data, validation_mode=args.validation_mode) # node_data_files is an array of multiple files (or a single file)
except FileNotFoundError:
print(f"ERROR: node data file ({args.node_data}) does not exist")
sys.exit(2)
if args.node_data is not None:
try:
node_data_file = read_node_data(args.node_data, validation_mode=args.validation_mode) # node_data_files is an array of multiple files (or a single file)
except FileNotFoundError:
print(f"ERROR: node data file ({args.node_data}) does not exist")
sys.exit(2)
else:
node_data_file = {'nodes': {}}

if args.metadata is not None:
try:
Expand Down Expand Up @@ -1132,7 +1153,7 @@ def run(args):
set_filters(data_json, config)

# set tree structure
data_json["tree"] = convert_tree_to_json_structure(T.root, node_attrs)
data_json["tree"] = convert_tree_to_json_structure(T.root, node_attrs, node_div(T, node_attrs))
set_node_attrs_on_tree(data_json, node_attrs)
set_branch_attrs_on_tree(data_json, branch_attrs)

Expand Down
29 changes: 26 additions & 3 deletions tests/functional/export_v2/cram/minimal.t
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,10 @@ Setup

$ source "$TESTDIR"/_setup.sh

Minimal export
Minimal export -- single input (tree) and single output (dataset JSON)

$ ${AUGUR} export v2 \
> --tree "$TESTDIR/../data/tree.nwk" \
> --node-data "$TESTDIR/../data/div_node-data.json" \
> --output minimal.json
WARNING: You didn't provide information on who is maintaining this analysis.

Expand All @@ -16,7 +15,31 @@ Minimal export
Validation of 'minimal.json' succeeded.


$ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/minimal.json" minimal.json \
The above minimal.json takes divergence from the newick file. This converts newick divergences of (e.g.) '1' to `1.0`
because BioPython uses floats (which is perfectly reasonable). Remove the decimal to diff the JSON.
(Note that Auspice won't behave any differently)
$ sed 's/\.0//' minimal.json > minimal.no-decimal.json


$ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/minimal.json" minimal.no-decimal.json \
> --exclude-paths "root['meta']['updated']"
{}

Almost minimal export -- divergence is encoded via the node-data JSON typically produced by `augur refine`

$ ${AUGUR} export v2 \
> --tree "$TESTDIR/../data/tree.nwk" \
> --node-data "$TESTDIR/../data/div_node-data.json" \
> --output almost-minimal.json
WARNING: You didn't provide information on who is maintaining this analysis.

Validating produced JSON
Validating schema of 'almost-minimal.json'...
Validating that the JSON is internally consistent...
Validation of 'almost-minimal.json' succeeded.


$ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/minimal.json" almost-minimal.json \
> --exclude-paths "root['meta']['updated']"
{}

Expand Down
4 changes: 2 additions & 2 deletions tests/test_validate_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@ def test_export_without_duplicate_names(self):
# Create a tree with unique tip names.
tree = Bio.Phylo.read(StringIO("root(A, internal(B, C))"), "newick")
metadata = {"A": {}, "B": {}, "C": {}, "root": {}, "internal": {}}
root = convert_tree_to_json_structure(tree.root, metadata)
root = convert_tree_to_json_structure(tree.root, metadata, None)
ensure_no_duplicate_names(root, ValidateError)

def test_export_with_duplicate_names(self):
    """Exporting a tree whose tips share a name must fail duplicate-name validation."""
    # Create a tree with duplicate tip names.
    tree = Bio.Phylo.read(StringIO("root(A, internal(B, B))"), "newick")
    metadata = {"A": {}, "B": {}, "root": {}, "internal": {}}
    # Divergence is irrelevant to this check, so no divergence getter is supplied.
    root = convert_tree_to_json_structure(tree.root, metadata, None)

    with pytest.raises(ValidateError):
        ensure_no_duplicate_names(root, ValidateError)

0 comments on commit 4991b9a

Please sign in to comment.