diff --git a/augur/export_v2.py b/augur/export_v2.py index 3cc5fae6a..004a5fa24 100644 --- a/augur/export_v2.py +++ b/augur/export_v2.py @@ -111,44 +111,63 @@ def order_nodes(node): order_nodes(od['tree']) return od -def convert_tree_to_json_structure(node, metadata, div=0): + +def node_div(T, node_attrs): + """ + Scans the provided tree & metadata to see if divergence is defined, and if so returns + a function which gets it from individual nodes. Divergence may be defined via a number + of sources, and we pick them in the following order: + * metadata.mutation_length (typically via `augur refine`) + * metadata.branch_length (typically via `augur refine`) + * Branch lengths encoded in the Newick tree + + Returns either: + * function with arguments: (node, metadata_for_node) which returns the node divergence + * None (indicates that divergence is not available for this dataset) + """ + if all(('mutation_length' in node_attrs[n.name] for n in T.root.find_clades())): + return lambda node, metadata: metadata['mutation_length'] + if all(('branch_length' in node_attrs[n.name] for n in T.root.find_clades())): + return lambda node, metadata: metadata['branch_length'] + if T.root.branch_length is not None: + return lambda node, metadata: node.branch_length + return None + +def convert_tree_to_json_structure(node, metadata, get_div, div=0): """ converts the Biopython tree structure to a dictionary that can be written to file as a json. This is called recursively. - Creates the name property & divergence on each node - - input - node -- node for which top level dict is produced. - div -- cumulative divergence (root = 0). False → divergence won't be exported. - returns - tree in JSON structure - list of strains + Parameters + ---------- + node : Bio.Phylo.Newick.Clade + metadata : dict + Per-node metadata, with keys matching `node.name` + get_div : + (None or function) + Function returns divergence for this node. Arguments: (node, metadata_for_node) + If None then divergence is not defined for this dataset and so 'div' is not set on returned nodes. + div : int + cumulative divergence leading to the current node (root = 0) + + Returns + ------- + dict: + See schema-export-v2.json#/$defs/tree for full details. + Node names are always set, and divergence is set if applicable """ - - # Does the tree have divergence? (BEAST trees may not) - # only calculate this for the root node! - if div == 0 and 'mutation_length' not in metadata[node.name] and 'branch_length' not in metadata[node.name]: - div = False - node_struct = {'name': node.name, 'node_attrs': {}, 'branch_attrs': {}} - if div is not False: # div=0 is ok + + if get_div is not None: # Store the (cumulative) observed divergence prior to this node node_struct["node_attrs"]["div"] = div if node.clades: node_struct["children"] = [] for child in node.clades: - if div is False: - cdiv=False - else: - if 'mutation_length' in metadata[child.name]: - cdiv = div + metadata[child.name]['mutation_length'] - elif 'branch_length' in metadata[child.name]: - cdiv = div + metadata[child.name]['branch_length'] - else: - print("ERROR: Cannot find branch length information for %s"%(child.name)) - - node_struct["children"].append(convert_tree_to_json_structure(child, metadata, div=cdiv)) + cdiv = div + if get_div: + cdiv += get_div(child, metadata[child.name]) + node_struct["children"].append(convert_tree_to_json_structure(child, metadata, get_div, div=cdiv)) return node_struct @@ -828,10 +847,9 @@ def register_parser(parent_subparsers): title="REQUIRED" ) required.add_argument('--tree','-t', metavar="newick", required=True, help="Phylogenetic tree, usually output from `augur refine`") - required.add_argument('--node-data', metavar="JSON", required=True, nargs='+', action=ExtendAction, help="JSON files containing metadata for nodes in the tree") - required.add_argument('--output', metavar="JSON", required=True, help="Ouput file (typically for visualisation in auspice)") + required.add_argument('--output', metavar="JSON", required=True, help="Output file (typically for visualisation in auspice)") - config = parser.add_argument_group( + config = parser.add_argument_group( title="DISPLAY CONFIGURATION", description="These control the display settings for auspice. \ You can supply a config JSON (which has all available options) or command line arguments (which are more limited but great to get started). \ @@ -849,6 +867,7 @@ def register_parser(parent_subparsers): optional_inputs = parser.add_argument_group( title="OPTIONAL INPUT FILES" ) + optional_inputs.add_argument('--node-data', metavar="JSON", nargs='+', action=ExtendAction, help="JSON files containing metadata for nodes in the tree") optional_inputs.add_argument('--metadata', metavar="FILE", help="Additional metadata for strains in the tree") optional_inputs.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.") @@ -1068,11 +1087,14 @@ def run(args): data_json = {"version": "v2", "meta": {"updated": time.strftime('%Y-%m-%d')}} #load input files - try: - node_data_file = read_node_data(args.node_data, validation_mode=args.validation_mode) # node_data_files is an array of multiple files (or a single file) - except FileNotFoundError: - print(f"ERROR: node data file ({args.node_data}) does not exist") - sys.exit(2) + if args.node_data is not None: + try: + node_data_file = read_node_data(args.node_data, validation_mode=args.validation_mode) # node_data_files is an array of multiple files (or a single file) + except FileNotFoundError: + print(f"ERROR: node data file ({args.node_data}) does not exist") + sys.exit(2) + else: + node_data_file = {'nodes': {}} if args.metadata is not None: try: @@ -1132,7 +1154,7 @@ def run(args): set_filters(data_json, config) # set tree structure - data_json["tree"] = convert_tree_to_json_structure(T.root, node_attrs) + data_json["tree"] = convert_tree_to_json_structure(T.root, node_attrs, node_div(T, node_attrs)) set_node_attrs_on_tree(data_json, node_attrs) set_branch_attrs_on_tree(data_json, branch_attrs) diff --git a/tests/functional/export_v2/cram/minimal.t b/tests/functional/export_v2/cram/minimal.t index f3175744c..a423a393a 100644 --- a/tests/functional/export_v2/cram/minimal.t +++ b/tests/functional/export_v2/cram/minimal.t @@ -2,11 +2,10 @@ Setup $ source "$TESTDIR"/_setup.sh -Minimal export +Minimal export -- single input (tree) and single output (dataset JSON) $ ${AUGUR} export v2 \ > --tree "$TESTDIR/../data/tree.nwk" \ - > --node-data "$TESTDIR/../data/div_node-data.json" \ > --output minimal.json WARNING: You didn't provide information on who is maintaining this analysis. @@ -16,7 +15,31 @@ Minimal export Validation of 'minimal.json' succeeded. - $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/minimal.json" minimal.json \ +The above minimal.json takes divergence from the newick file. This converts newick divergences of (e.g.) '1' to `1.0` +because BioPython uses floats (which is perfectly reasonable). Remove the decimal to diff the JSON. +(Note that Auspice won't behave any differently) + $ sed 's/\.0//' minimal.json > minimal.no-decimal.json + + + $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/minimal.json" minimal.no-decimal.json \ + > --exclude-paths "root['meta']['updated']" + {} + +Almost minimal export -- divergence is encoded via the node-data JSON typically produced by `augur refine` + + $ ${AUGUR} export v2 \ + > --tree "$TESTDIR/../data/tree.nwk" \ + > --node-data "$TESTDIR/../data/div_node-data.json" \ + > --output almost-minimal.json + WARNING: You didn't provide information on who is maintaining this analysis. + + Validating produced JSON + Validating schema of 'almost-minimal.json'... + Validating that the JSON is internally consistent... + Validation of 'almost-minimal.json' succeeded. + + + $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/minimal.json" almost-minimal.json \ > --exclude-paths "root['meta']['updated']" {} diff --git a/tests/test_validate_export.py b/tests/test_validate_export.py index 5cafc92b1..05d8ef025 100644 --- a/tests/test_validate_export.py +++ b/tests/test_validate_export.py @@ -17,14 +17,14 @@ def test_export_without_duplicate_names(self): # Create a tree with unique tip names. tree = Bio.Phylo.read(StringIO("root(A, internal(B, C))"), "newick") metadata = {"A": {}, "B": {}, "C": {}, "root": {}, "internal": {}} - root = convert_tree_to_json_structure(tree.root, metadata) + root = convert_tree_to_json_structure(tree.root, metadata, None) ensure_no_duplicate_names(root, ValidateError) def test_export_with_duplicate_names(self): # Create a tree with duplicate tip names. tree = Bio.Phylo.read(StringIO("root(A, internal(B, B))"), "newick") metadata = {"A": {}, "B": {}, "root": {}, "internal": {}} - root = convert_tree_to_json_structure(tree.root, metadata) + root = convert_tree_to_json_structure(tree.root, metadata, None) with pytest.raises(ValidateError): ensure_no_duplicate_names(root, ValidateError)