From fbbc319eb26d463a32da225f7b96ee7023b5df86 Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Wed, 30 Aug 2023 16:27:14 +1200 Subject: [PATCH 1/9] Remove unused properties from schema These properties were never actually used (neither exported from augur export v2 nor consumed by auspice) Closes #867 --- augur/data/schema-export-v2.json | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/augur/data/schema-export-v2.json b/augur/data/schema-export-v2.json index 75171ce04..b14882773 100644 --- a/augur/data/schema-export-v2.json +++ b/augur/data/schema-export-v2.json @@ -251,18 +251,6 @@ } } }, - "tree_name" : { - "description": "The name of the tree (e.g. segment name), if applicable", - "$comment": "This is required if you want to view two trees side-by-side", - "$comment": "It should match a field in the JSON filename after splitting on '_'", - "$comment": "e.g. `flu_h3n2_ha_3y` has a tree name of `ha`", - "type": "string" - }, - "frequencies": { - "$comment": "Frequencies could be specified here if desired", - "$comment": "If not specified, and frequencies are asked for in #/panels, then Auspice will attempt to fetch a seperate JSON", - "$comment": "cc John / Trevor" - }, "data_provenance": { "description": "Specify provenance of data included in this analysis", "type": "array", From 67689a72505bc5776819c62b9242991b02e5441f Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Wed, 30 Aug 2023 16:52:50 +1200 Subject: [PATCH 2/9] De-duplicate auspice-config + dataset schemas A number of parts of the auspice-config have identical (or almost-identical) shape to those in the resulting dataset JSON, although the actual data may be modified as it passes through `augur export v2`. Rather than referencing the entire auspice-config schema and pruning down properties, which I don't actually think is possible in jsonschema, I chose to use $refs at a more fine grained level which I find easier to read. 
The actual schema definitions should be unchanged by this commit, although comments / descriptions have been improved. --- augur/data/schema-auspice-config-v2.json | 36 +++-- augur/data/schema-export-v2.json | 189 ++--------------------- augur/validate.py | 1 + 3 files changed, 34 insertions(+), 192 deletions(-) diff --git a/augur/data/schema-auspice-config-v2.json b/augur/data/schema-auspice-config-v2.json index e7c507fea..87970092c 100644 --- a/augur/data/schema-auspice-config-v2.json +++ b/augur/data/schema-auspice-config-v2.json @@ -12,12 +12,12 @@ "type" : "string" }, "colorings": { - "description": "Set traits to be available as color-dropdown options", + "description": "Traits available as color-by options", "type": "array", "minItems": 1, "items": { "type": "object", - "description": "An indiviual color-by for auspice", + "description": "Each object here is an individual coloring, which will populate the sidebar dropdown in auspice", "additionalProperties": false, "required": ["key"], "properties": { @@ -32,19 +32,20 @@ }, "type": { "description": "Defines how the color scale should be constructed", + "$comment": "[augur export v2] will (try to) infer the type if this is not present", "type": "string", "enum": ["continuous", "temporal", "ordinal", "categorical", "boolean"] }, "scale": { - "description": "Provided mapping between trait values & hex values", - "$comment": "NOTE: if supplied here, we will not use information supplied to `augur export` via `--colors` for this coloring.", + "description": "Provided mapping between trait values & hex values. For continuous scales at least 2 items must be specified", + "$comment": "[augur export v2] preferentially uses this over colors TSV", "type": "array", "items": { "type": "array", "items": [ { "type": ["string", "number"], - "description": "For categorical/ordinal scales, this is the (string) value of the trait to associate with the colour. 
For continuous scales this is the (numeric) value to associate to with the colour, and interpolation will be used to span the domain" + "description": "For categorical/ordinal scales, this is the (string) value of the trait to associate with the colour. For continuous scales this is the (numeric) value to associate with the colour, and interpolation will be used to span the domain" }, {"type": "string", "description": "color hex value", "pattern": "^#[0-9A-Fa-f]{6}$"} ] @@ -143,18 +144,19 @@ }, "build_url": { "description": "URL with instructions to reproduce build, usually expected to be a GitHub repo URL", - "$comment": "optional", + "$comment": "Auspice displays this at the top of the page as part of a byline", "type": "string" }, "filters": { - "type": "array", - "uniqueItems": true, - "minItems": 0, - "items": { - "type": "string" - } + "description": "These appear as filters in the footer of Auspice (which populates the displayed values based upon the tree)", + "$comment": "These values must be present as keys on a tree node -> trait", + "type": "array", + "uniqueItems": true, + "items": {"type": "string"} }, "display_defaults": { + "description": "Set the defaults for certain display options in Auspice. 
All are optional.", + "$comment": "Anything able to be encoded in the auspice URL should eventually be an option here, so this will expand over time", "type": "object", "additionalProperties": false, "properties": { @@ -162,9 +164,11 @@ "type": "boolean" }, "geo_resolution": { + "$comment": "The value here must be present in geo_resolutions (see above)", "type": "string" }, "color_by": { + "$comment": "The value here must be present in the colorings (see above)", "type": "string" }, "distance_measure": { @@ -182,6 +186,7 @@ "pattern": "^(none|[a-zA-Z0-9]+)$" }, "transmission_lines": { + "$comment": "Transmission lines depend on the geo_resolution being defined for internal nodes", "type": "boolean" }, "language": { @@ -195,8 +200,7 @@ }, "panels": { "type": "array", - "description": "The panels to display by default.", - "$comment": "optional, but if present must not be empty", + "description": "Panels which start toggled on (default is for all available to be shown)", "minItems": 1, "items": { "type": "string", @@ -217,9 +221,9 @@ "type": "string" }, "panels": { + "description": "The panels available for display", + "$comment": "The frequencies & measurements panel will only be available if defined here (and if their sidecar files are available)", "type": "array", - "description": "The panels available for display.", - "$comment": "optional", "minItems": 1, "items": { "type": "string", diff --git a/augur/data/schema-export-v2.json b/augur/data/schema-export-v2.json index b14882773..ea80ce797 100644 --- a/augur/data/schema-export-v2.json +++ b/augur/data/schema-export-v2.json @@ -27,8 +27,7 @@ "pattern": "^[0-9X]{4}-[0-9X]{2}-[0-9X]{2}$" }, "build_url" : { - "description": "Auspice displays this at the top of the page as part of a byline", - "type" : "string" + "$ref": "https://nextstrain.org/schemas/auspice/config/v2#/properties/build_url" }, "description" : { "description": "Auspice displays this currently in the footer.", @@ -36,44 +35,19 @@ "type": "string" }, 
"maintainers": { - "description": "Who maintains this dataset?", - "$comment": "order similar to a publication", - "type": "array", - "uniqueItems": true, - "minItems": 1, - "items": { - "type": "object", - "properties": { - "name": {"type": "string"}, - "url": {"type": "string"} - }, - "required": ["name"] - } + "$ref": "https://nextstrain.org/schemas/auspice/config/v2#/properties/maintainers" }, "genome_annotations": { "$ref": "https://nextstrain.org/schemas/augur/annotations" }, "filters": { - "description": "These appear as filters in the footer of Auspice (which populates the displayed values based upon the tree)", - "$comment": "These values must be present as keys on a tree node -> trait", - "type": "array", - "uniqueItems": true, - "items": {"type": "string"} + "$ref": "https://nextstrain.org/schemas/auspice/config/v2#/properties/filters" }, "panels": { - "description": "Which panels should Auspice display?", - "$comment": "If additional JSONs are required (e.g. for frequencies), they will be fetched after parsing this array", - "type": "array", - "items": { - "type": "string", - "enum": ["tree", "map", "frequencies", "entropy", "measurements"] - }, - "uniqueItems": true, - "minItems": 1 + "$ref": "https://nextstrain.org/schemas/auspice/config/v2#/properties/panels" }, "extensions": { - "description": "Data to be passed through to the the resulting dataset JSON", - "$comment": "Any type is accepted" + "$ref": "https://nextstrain.org/schemas/auspice/config/v2#/properties/extensions" }, "geo_resolutions": { "description": "The available options for the geographic resolution dropdown, and their lat/long information", @@ -81,7 +55,7 @@ "minItems": 1, "items": { "type": "object", - "description": "Each object here is an indiviual geo resolution", + "description": "Each object here is an individual geo resolution", "additionalProperties": false, "required": ["key", "demes"], "properties": { @@ -95,12 +69,10 @@ }, "demes": { "type": "object", - "description": "The 
deme names & lat/long info for this geographic resolution", + "description": "Mapping from deme (trait values) to lat/long", "$comment": "Each value defined across the tree needs to be present here, else Auspice cannot display the deme appropriately", "patternProperties": { "^[a-z_]+$": { - "description": "Lat/long info for this deme", - "$comment": "one day this may define a shape / polygon", "type": "object", "additionalProperties": false, "properties": { @@ -122,155 +94,20 @@ } }, "colorings": { - "description": "Available colorBys for Auspice", "type": "array", "minItems": 1, "items": { - "type": "object", - "description": "Each object here is an individual coloring, which will populate the sidebar dropdown in auspice", - "required": ["key", "type"], - "properties": { - "key": { - "description": "They key used to access the value of this coloring on each node", - "type": "string" - }, - "title": { - "description": "Text to be displayed in the \"color by\" dropdown and the tree legend", - "$comment": "string is parsed unchanged by Auspice", - "type": "string" - }, - "type": { - "description": "Dictates how the color scale should be constructed", - "$comment": "The trait values (defined on tree nodes) must be numeric for continuous types, True / False for boolean, string or numeric for ordinal / categorical", - "type": "string", - "enum": ["continuous", "temporal", "ordinal", "categorical", "boolean"] - }, - "scale": { - "description": "Provided mapping between trait values & hex values", - "$comment": "For continuous scales at least 2 items must be specified", - "type": "array", - "items": { - "type": "array", - "items": [ - { - "type": ["string", "number"], - "description": "For categorical/ordinal scales, this is the (string) value of the trait to associate with the colour. 
For continuous scales this is the (numeric) value to associate to with the colour, and interpolation will be used to span the domain" - }, - {"type": "string", "description": "color hex value", "pattern": "^#[0-9A-Fa-f]{6}$"} - ] - } - }, - "legend": { - "description": "Specify the entries displayed in the legend. This can be used to restrict the entries in the legend for display without otherwise affecting the data viz", - "type": "array", - "items": { - "type": "object", - "required": ["value"], - "properties": { - "value": { - "description": "value to associate with this legend entry. Used to determine colour. For non-continuous scales this also determines the matching between legend items and data.", - "type": ["string", "number"], - "$comment": "Continuous scales must use a numeric value. Other scales can use either." - }, - "display": { - "description": "Label to display in the legend. Optional - `value` will be used if this is not provided.", - "type": ["string", "number"] - }, - "bounds": { - "description": "(for continuous scales only) provide the lower & upper bounds to match data to this legend entry. Bounds from different legend entries must not overlap. Matching is (a, b] - exclusive of the lower bound, inclusive of the upper.", - "type": "array", - "items": [ - {"type": "number", "description": "lower bound"}, - {"type": "number", "description": "upper bound"} - ] - } - } - } - } - } + "allOf": [ + {"$ref": "https://nextstrain.org/schemas/auspice/config/v2#/properties/colorings/items"} + ], + "required": ["key", "type"] } }, "display_defaults": { - "description": "Set the defaults for certain display options in Auspice. 
All are optional.", - "$comment": "Anything able to be encoded in the auspice URL should be an option here, so this will expand over time", - "type": "object", - "additionalProperties": false, - "properties": { - "geo_resolution": { - "description": "Default geographic resolution", - "$comment": "The value here must be present in the geo object (see above)", - "type": "string" - }, - "color_by": { - "description": "Default color by", - "$comment": "The value here must be present in the colorings object (see above)", - "type": "string" - }, - "distance_measure": { - "description": "Default tree metric", - "type": "string", - "enum": ["div", "num_date"] - }, - "layout": { - "description": "Default tree layout", - "type": "string", - "enum": ["rect", "radial", "unrooted", "clock"] - }, - "map_triplicate": { - "description": "Should the map be extended / wrapped around. Useful if transmissions are worldwide.", - "type": "boolean" - }, - "branch_label": { - "description": "What branch label should be displayed by default, or 'none' to hide labels by default.", - "$comment": "Should be a key present in the per-node branch_attrs.labels object of the exported JSON; pattern is from the schema for that object", - "type": "string", - "pattern": "^(none|[a-zA-Z0-9]+)$" - }, - "transmission_lines": { - "description": "Should transmission lines (if available) be displaye by default", - "type": "boolean" - }, - "language": { - "type": "string", - "minLength": 1, - "description": "A BCP 47 language tag specifying the default language in which to display Auspice's interface (if supported)" - }, - "sidebar": { - "type": "string", - "enum": ["open", "closed"] - }, - "panels": { - "type": "array", - "description": "The panels to display by default.", - "$comment": "optional, but if present must not be empty", - "minItems": 1, - "items": { - "type": "string", - "enum": ["tree", "map", "frequencies", "entropy"] - } - } - } + "$ref": 
"https://nextstrain.org/schemas/auspice/config/v2#/properties/display_defaults" }, "data_provenance": { - "description": "Specify provenance of data included in this analysis", - "type": "array", - "minItems": 1, - "items": { - "type": "object", - "description": "An individual data source", - "additionalProperties": false, - "required": ["name"], - "properties": { - "name": { - "description": "Name of the data source", - "type": "string" - }, - "url": { - "description": "URL to use in link to data source", - "type": "string" - } - } - } + "$ref": "https://nextstrain.org/schemas/auspice/config/v2#/properties/data_provenance" } } }, diff --git a/augur/validate.py b/augur/validate.py index c052ed6c6..e8e316257 100644 --- a/augur/validate.py +++ b/augur/validate.py @@ -192,6 +192,7 @@ def export_v2(main_json, **kwargs): refs = { 'https://nextstrain.org/schemas/augur/annotations': "schema-annotations.json", 'https://nextstrain.org/schemas/dataset/root-sequence': "schema-export-root-sequence.json", + 'https://nextstrain.org/schemas/auspice/config/v2': "schema-auspice-config-v2.json", } main_schema = load_json_schema("schema-export-v2.json", refs) From 6ce1bf70a4ff7ccfed3e3dde04f5c21c2c80c78b Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Thu, 31 Aug 2023 10:35:41 +1200 Subject: [PATCH 3/9] [schema] Allow default tip labels Auspice can already set the tip label via URL state (`?tl=...`) and will shortly be able to parse the display_default added here. 
Closes #1115 --- augur/data/schema-auspice-config-v2.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/augur/data/schema-auspice-config-v2.json b/augur/data/schema-auspice-config-v2.json index 87970092c..821f9ee77 100644 --- a/augur/data/schema-auspice-config-v2.json +++ b/augur/data/schema-auspice-config-v2.json @@ -185,6 +185,11 @@ "type": "string", "pattern": "^(none|[a-zA-Z0-9]+)$" }, + "tip_label": { + "description": "What tip label should be displayed by default, or 'none' to hide labels by default.", + "$comment": "Should be a key present in (at least some) node_attrs", + "type": "string" + }, "transmission_lines": { "$comment": "Transmission lines depend on the geo_resolution being defined for internal nodes", "type": "boolean" From cee69bc8fa18392b69dbdde52d7645ecae797e2f Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Thu, 31 Aug 2023 11:06:46 +1200 Subject: [PATCH 4/9] [schema] allow empty colorings definitions These work fine in Auspice. While the 'colorings' property is optional, `augur export v2` will always set a (possibly empty) array. I also chose to allow the auspice config file to have an empty colorings definition, which in practice behaves the same as leaving it out. 
Addresses comment in #273 --- augur/data/schema-auspice-config-v2.json | 1 - augur/data/schema-export-v2.json | 1 - tests/functional/export_v2/cram/minimal.t | 10 ++++++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/augur/data/schema-auspice-config-v2.json b/augur/data/schema-auspice-config-v2.json index 821f9ee77..5b6ed22f1 100644 --- a/augur/data/schema-auspice-config-v2.json +++ b/augur/data/schema-auspice-config-v2.json @@ -14,7 +14,6 @@ "colorings": { "description": "Traits available as color-by options", "type": "array", - "minItems": 1, "items": { "type": "object", "description": "Each object here is an individual coloring, which will populate the sidebar dropdown in auspice", diff --git a/augur/data/schema-export-v2.json b/augur/data/schema-export-v2.json index ea80ce797..0b780be1b 100644 --- a/augur/data/schema-export-v2.json +++ b/augur/data/schema-export-v2.json @@ -95,7 +95,6 @@ }, "colorings": { "type": "array", - "minItems": 1, "items": { "allOf": [ {"$ref": "https://nextstrain.org/schemas/auspice/config/v2#/properties/colorings/items"} diff --git a/tests/functional/export_v2/cram/minimal.t b/tests/functional/export_v2/cram/minimal.t index 543916e80..f3175744c 100644 --- a/tests/functional/export_v2/cram/minimal.t +++ b/tests/functional/export_v2/cram/minimal.t @@ -7,8 +7,14 @@ Minimal export $ ${AUGUR} export v2 \ > --tree "$TESTDIR/../data/tree.nwk" \ > --node-data "$TESTDIR/../data/div_node-data.json" \ - > --output minimal.json &>/dev/null - [2] + > --output minimal.json + WARNING: You didn't provide information on who is maintaining this analysis. + + Validating produced JSON + Validating schema of 'minimal.json'... + Validating that the JSON is internally consistent... + Validation of 'minimal.json' succeeded. 
+ $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/minimal.json" minimal.json \ > --exclude-paths "root['meta']['updated']" From 8209dadbe3efe3d859383fa744de3ab320b25460 Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Thu, 31 Aug 2023 11:28:45 +1200 Subject: [PATCH 5/9] [schema] improve mutations Removes previously valid string patterns which were never used within augur and would result in unexpected behaviour in auspice. Also updates the patternProperties of CDSs to match that used in the genome_annotations (schema-annotations.json) --- augur/data/schema-export-v2.json | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/augur/data/schema-export-v2.json b/augur/data/schema-export-v2.json index 0b780be1b..444fbcf43 100644 --- a/augur/data/schema-export-v2.json +++ b/augur/data/schema-export-v2.json @@ -292,27 +292,22 @@ } }, "mutations": { - "description": "Mutations occuring between the parent and this node", - "$comment": "same numbering scheme as used by the meta.JSON -> annotations", - "$comment": "combining nuc + AAs parallels the metadata -> annotations structure", + "description": "Mutations on the branch leading to this node. 
1-based numbering (same as genome_annotations)", "type": "object", "additionalProperties": false, "properties": { "nuc": { - "description": "nucelotide mutations", + "description": "nucleotide mutations", "type": "array", "items": { - "oneOf": [ - {"type": "string", "pattern": "^[ATCGNYRWSKMDVHB-][0-9]+[ATCGNYRWSKMDVHB-]$"}, - {"type": "string", "pattern": "^insertion [0-9]+-[0-9]+$", "$comment": "TODO unused by auspice"}, - {"type": "string", "pattern": "^deletion [0-9]+-[0-9]+$", "$comment": "TODO unused by auspice"} - ] + "type": "string", + "pattern": "^[ATCGNYRWSKMDVHB-][0-9]+[ATCGNYRWSKMDVHB-]$" } } }, "patternProperties": { - "^[a-zA-Z0-9_-]+$": { - "description": "Amino acid mutations for this gene (or annotated region)", + "^(?!nuc)[a-zA-Z0-9*_-]+$": { + "description": "Amino acid mutations for this CDS", "$comment": "properties must exist in the meta.JSON -> annotation object", "type": "array", "items": { From 58fed99d7256fc350e31d7f6b888d809a354c20c Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Thu, 31 Aug 2023 14:17:06 +1200 Subject: [PATCH 6/9] [schema] Enforce some distance measure on trees Trees could currently be produced with neither "div" nor "num_date" information. Arguably Auspice could interpret these as cladograms but as it stands these datasets aren't rendered by Auspice. Datasets without this information are easy to create but rare in practice: If the node-data files don't define "mutation_length" or "branch_length" then there's no "div" and if they don't define "num_date" then that's not there either. Note that what we really want to require is that "div" is present on all nodes and/or "num_date" is present on all nodes, but the schema doesn't let us do this. 
--- augur/data/schema-export-v2.json | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/augur/data/schema-export-v2.json b/augur/data/schema-export-v2.json index 444fbcf43..94e71859d 100644 --- a/augur/data/schema-export-v2.json +++ b/augur/data/schema-export-v2.json @@ -130,7 +130,7 @@ "type" : "object", "$comment": "The phylogeny in a nested JSON structure", "additionalProperties": false, - "required": ["name"], + "required": ["name", "node_attrs"], "properties": { "name": { "description": "Strain name. Must be unique. No spaces", @@ -139,6 +139,10 @@ "node_attrs": { "description": "attributes associated with the node (sequence, date, location) as opposed to changes from one node to another.", "type": "object", + "anyOf": [ + {"required": ["div"]}, + {"required": ["num_date"]} + ], "properties": { "div": { "description": "Node (phylogenetic) divergence", From 4582a1e9c9ea09589e6c42605cf587c2573307bd Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Thu, 31 Aug 2023 15:41:40 +1200 Subject: [PATCH 7/9] [export] Node-data files are optional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allows a minimal `augur export` using only a (newick) tree as input, functionality that we've wanted for over 4 years! To facilitate this we parse branch lengths¹ from the newick file if such data wasn't available in the node-data inputs (e.g. because there are none!). The code for deciding where to read divergence from has been refactored and in the process improved: the (rare? never encountered?) case where divergence was sometimes read from node-data keys 'mutation_length' and sometimes from 'branch_length' can no longer happen. If data is provided which doesn't define divergence or num_date (regardless of whether node-data files were provided as inputs), then the resulting dataset will fail validation. 
Closes #273 ¹ I suppose these might represent time in certain cases, but I haven't seen such data in Newick files. --- augur/export_v2.py | 94 ++++++++++++++--------- tests/functional/export_v2/cram/minimal.t | 29 ++++++- tests/test_validate_export.py | 4 +- 3 files changed, 86 insertions(+), 41 deletions(-) diff --git a/augur/export_v2.py b/augur/export_v2.py index 813e4302e..6e9daedf4 100644 --- a/augur/export_v2.py +++ b/augur/export_v2.py @@ -110,44 +110,63 @@ def order_nodes(node): order_nodes(od['tree']) return od -def convert_tree_to_json_structure(node, metadata, div=0): + +def node_div(T, node_attrs): + """ + Scans the provided tree & metadata to see if divergence is defined, and if so returns + a function which gets it from individual nodes. Divergence may be defined via a number + of sources, and we pick them in the following order: + * metadata.mutation_length (typically via `augur refine`) + * metadata.branch_length (typically via `augur refine`) + * Branch lengths encoded in the Newick tree + + Returns either: + * function with arguments: (node, metadata_for_node) which returns the node divergence + * None (indicates that divergence is not available for this dataset) + """ + if all(('mutation_length' in node_attrs[n.name] for n in T.root.find_clades())): + return lambda node, metadata: metadata['mutation_length'] + if all(('branch_length' in node_attrs[n.name] for n in T.root.find_clades())): + return lambda node, metadata: metadata['branch_length'] + if T.root.branch_length is not None: + return lambda node, metadata: node.branch_length + return None + +def convert_tree_to_json_structure(node, metadata, get_div, div=0): """ converts the Biopython tree structure to a dictionary that can be written to file as a json. This is called recursively. - Creates the name property & divergence on each node - - input - node -- node for which top level dict is produced. - div -- cumulative divergence (root = 0). False → divergence won't be exported. 
- returns - tree in JSON structure - list of strains + Parameters + ---------- + node : Bio.Phylo.Newick.Clade + metadata : dict + Per-node metadata, with keys matching `node.name` + get_div : + (None or function) + Function returns divergence for this node. Arguments: (node, metadata_for_node) + If None then divergence is not defined for this dataset and so 'div' is not set on returned nodes. + div : int + cumulative divergence leading to the current node (root = 0) + + Returns + ------- + dict: + See schema-export-v2.json#/$defs/tree for full details. + Node names are always set, and divergence is set if applicable """ - - # Does the tree have divergence? (BEAST trees may not) - # only calculate this for the root node! - if div == 0 and 'mutation_length' not in metadata[node.name] and 'branch_length' not in metadata[node.name]: - div = False - node_struct = {'name': node.name, 'node_attrs': {}, 'branch_attrs': {}} - if div is not False: # div=0 is ok + + if get_div is not None: # Store the (cumulative) observed divergence prior to this node node_struct["node_attrs"]["div"] = div if node.clades: node_struct["children"] = [] for child in node.clades: - if div is False: - cdiv=False - else: - if 'mutation_length' in metadata[child.name]: - cdiv = div + metadata[child.name]['mutation_length'] - elif 'branch_length' in metadata[child.name]: - cdiv = div + metadata[child.name]['branch_length'] - else: - print("ERROR: Cannot find branch length information for %s"%(child.name)) - - node_struct["children"].append(convert_tree_to_json_structure(child, metadata, div=cdiv)) + cdiv = div + if get_div: + cdiv += get_div(child, metadata[child.name]) + node_struct["children"].append(convert_tree_to_json_structure(child, metadata, get_div, div=cdiv)) return node_struct @@ -827,10 +846,9 @@ def register_parser(parent_subparsers): title="REQUIRED" ) required.add_argument('--tree','-t', metavar="newick", required=True, help="Phylogenetic tree, usually output from `augur refine`") - 
required.add_argument('--node-data', metavar="JSON", required=True, nargs='+', action="extend", help="JSON files containing metadata for nodes in the tree") - required.add_argument('--output', metavar="JSON", required=True, help="Ouput file (typically for visualisation in auspice)") + required.add_argument('--output', metavar="JSON", required=True, help="Output file (typically for visualisation in auspice)") - config = parser.add_argument_group( + config = parser.add_argument_group( title="DISPLAY CONFIGURATION", description="These control the display settings for auspice. \ You can supply a config JSON (which has all available options) or command line arguments (which are more limited but great to get started). \ @@ -848,6 +866,7 @@ def register_parser(parent_subparsers): optional_inputs = parser.add_argument_group( title="OPTIONAL INPUT FILES" ) + optional_inputs.add_argument('--node-data', metavar="JSON", nargs='+', action="extend", help="JSON files containing metadata for nodes in the tree") optional_inputs.add_argument('--metadata', metavar="FILE", help="Additional metadata for strains in the tree") optional_inputs.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", help="delimiters to accept when reading a metadata file. 
Only one delimiter will be inferred.") @@ -1067,11 +1086,14 @@ def run(args): data_json = {"version": "v2", "meta": {"updated": time.strftime('%Y-%m-%d')}} #load input files - try: - node_data_file = read_node_data(args.node_data, validation_mode=args.validation_mode) # node_data_files is an array of multiple files (or a single file) - except FileNotFoundError: - print(f"ERROR: node data file ({args.node_data}) does not exist") - sys.exit(2) + if args.node_data is not None: + try: + node_data_file = read_node_data(args.node_data, validation_mode=args.validation_mode) # node_data_files is an array of multiple files (or a single file) + except FileNotFoundError: + print(f"ERROR: node data file ({args.node_data}) does not exist") + sys.exit(2) + else: + node_data_file = {'nodes': {}} if args.metadata is not None: try: @@ -1131,7 +1153,7 @@ def run(args): set_filters(data_json, config) # set tree structure - data_json["tree"] = convert_tree_to_json_structure(T.root, node_attrs) + data_json["tree"] = convert_tree_to_json_structure(T.root, node_attrs, node_div(T, node_attrs)) set_node_attrs_on_tree(data_json, node_attrs) set_branch_attrs_on_tree(data_json, branch_attrs) diff --git a/tests/functional/export_v2/cram/minimal.t b/tests/functional/export_v2/cram/minimal.t index f3175744c..a423a393a 100644 --- a/tests/functional/export_v2/cram/minimal.t +++ b/tests/functional/export_v2/cram/minimal.t @@ -2,11 +2,10 @@ Setup $ source "$TESTDIR"/_setup.sh -Minimal export +Minimal export -- single input (tree) and single output (dataset JSON) $ ${AUGUR} export v2 \ > --tree "$TESTDIR/../data/tree.nwk" \ - > --node-data "$TESTDIR/../data/div_node-data.json" \ > --output minimal.json WARNING: You didn't provide information on who is maintaining this analysis. @@ -16,7 +15,31 @@ Minimal export Validation of 'minimal.json' succeeded. 
- $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/minimal.json" minimal.json \ +The above minimal.json takes divergence from the newick file. This converts newick divergences of (e.g.) '1' to `1.0` +because BioPython uses floats (which is perfectly reasonable). Remove the decimal to diff the JSON. +(Note that Auspice won't behave any differently) + $ sed 's/\.0//' minimal.json > minimal.no-decimal.json + + + $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/minimal.json" minimal.no-decimal.json \ + > --exclude-paths "root['meta']['updated']" + {} + +Almost minimal export -- divergence is encoded via the node-data JSON typically produced by `augur refine` + + $ ${AUGUR} export v2 \ + > --tree "$TESTDIR/../data/tree.nwk" \ + > --node-data "$TESTDIR/../data/div_node-data.json" \ + > --output almost-minimal.json + WARNING: You didn't provide information on who is maintaining this analysis. + + Validating produced JSON + Validating schema of 'almost-minimal.json'... + Validating that the JSON is internally consistent... + Validation of 'almost-minimal.json' succeeded. + + + $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/minimal.json" almost-minimal.json \ > --exclude-paths "root['meta']['updated']" {} diff --git a/tests/test_validate_export.py b/tests/test_validate_export.py index 5cafc92b1..05d8ef025 100644 --- a/tests/test_validate_export.py +++ b/tests/test_validate_export.py @@ -17,14 +17,14 @@ def test_export_without_duplicate_names(self): # Create a tree with unique tip names. tree = Bio.Phylo.read(StringIO("root(A, internal(B, C))"), "newick") metadata = {"A": {}, "B": {}, "C": {}, "root": {}, "internal": {}} - root = convert_tree_to_json_structure(tree.root, metadata) + root = convert_tree_to_json_structure(tree.root, metadata, None) ensure_no_duplicate_names(root, ValidateError) def test_export_with_duplicate_names(self): # Create a tree with duplicate tip names. 
tree = Bio.Phylo.read(StringIO("root(A, internal(B, B))"), "newick") metadata = {"A": {}, "B": {}, "root": {}, "internal": {}} - root = convert_tree_to_json_structure(tree.root, metadata) + root = convert_tree_to_json_structure(tree.root, metadata, None) with pytest.raises(ValidateError): ensure_no_duplicate_names(root, ValidateError) From 22fe6c61c81cbf16c3643ce0d9ddffded1b3c0c7 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Tue, 19 Sep 2023 09:39:26 -0700 Subject: [PATCH 8/9] Optionally ignore numeric type changes Adds a flag to the diff JSONs script to ignore numeric type changes when running DeepDiff [1]. Updates the export v2 minimal export test to use this new flag instead of creating an intermediate file with sed. [1] https://zepworks.com/deepdiff/current/ignore_types_or_values.html --- scripts/diff_jsons.py | 2 ++ tests/functional/export_v2/cram/minimal.t | 6 ++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/diff_jsons.py b/scripts/diff_jsons.py index ece69de4d..ce215e855 100644 --- a/scripts/diff_jsons.py +++ b/scripts/diff_jsons.py @@ -15,6 +15,7 @@ parser.add_argument("--significant-digits", type=int, default=5, help="number of significant digits to use when comparing numeric values") parser.add_argument("--exclude-paths", nargs="+", help="list of paths to exclude from consideration when performing a diff", default=["root['generated_by']['version']"]) parser.add_argument("--exclude-regex-paths", nargs="+", help="list of path regular expressions to exclude from consideration when performing a diff") + parser.add_argument("--ignore-numeric-type-changes", action="store_true", help="ignore numeric type changes in the diff (e.g., int of 1 to float of 1.0)") args = parser.parse_args() @@ -31,5 +32,6 @@ significant_digits=args.significant_digits, exclude_paths=args.exclude_paths, exclude_regex_paths=args.exclude_regex_paths, + ignore_numeric_type_changes=args.ignore_numeric_type_changes, ) ) diff --git 
a/tests/functional/export_v2/cram/minimal.t b/tests/functional/export_v2/cram/minimal.t index a423a393a..9fd1c1eda 100644 --- a/tests/functional/export_v2/cram/minimal.t +++ b/tests/functional/export_v2/cram/minimal.t @@ -16,12 +16,10 @@ Minimal export -- single input (tree) and single output (dataset JSON) The above minimal.json takes divergence from the newick file. This converts newick divergences of (e.g.) '1' to `1.0` -because BioPython uses floats (which is perfectly reasonable). Remove the decimal to diff the JSON. +because BioPython uses floats (which is perfectly reasonable). Ignore this type change in the JSON diff. (Note that Auspice won't behave any differently) - $ sed 's/\.0//' minimal.json > minimal.no-decimal.json - - $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/minimal.json" minimal.no-decimal.json \ + $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" --ignore-numeric-type-changes "$TESTDIR/../data/minimal.json" minimal.json \ > --exclude-paths "root['meta']['updated']" {} From 9afa278c1ad50d90ce7ada3e224f09eae15cddc3 Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Wed, 20 Sep 2023 16:34:37 +1200 Subject: [PATCH 9/9] changelog --- CHANGES.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 19dabb3e1..2cf2ff234 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -5,8 +5,15 @@ ### Features * Support treetime 0.11.* [#1310][] (@corneliusroemer) +* Allow minimal export using only a (newick) tree in `augur export v2`. [#1299][] (@jameshadfield) +* A number of schema updates and improvements [#1299][] (@jameshadfield) + * We now require all nodes to have `node_attrs` on them with one of `div` or `num_date` present + * Some never-used properties are removed from the schemas, including a pattern for defining nucleotide INDELs which was never used by augur or auspice. 
+ * Tip label defaults are now settable within the auspice-config JSON + * Empty colorings definitions are allowed (the tree will be grey in Auspice) [#1310]: https://github.com/nextstrain/augur/pull/1310 +[#1299]: https://github.com/nextstrain/augur/pull/1299 ## 23.0.0 (5 September 2023)