diff --git a/CHANGES.md b/CHANGES.md index 9777a99e5..a409d4d66 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -6,7 +6,7 @@ * ancestral: Improvements to command line arguments. [#1344][] (@jameshadfield) * Incompatible arguments are now checked, especially related to VCF vs FASTA inputs. * `--vcf-reference` and `--root-sequence` are now mutually exclusive. - +* translate: Tree nodes are checked against the node-data JSON input to ensure sequences are present. [#1348][] (@jameshadfield) * translate: Improvements to command line arguments. [#1348][] (@jameshadfield) * `--tree` and `--ancestral-sequences` are now required arguments. * separate VCF-only arguments into their own group diff --git a/augur/translate.py b/augur/translate.py index 461738b29..1fc73040f 100644 --- a/augur/translate.py +++ b/augur/translate.py @@ -20,6 +20,7 @@ from .utils import read_node_data, load_features, write_json, get_json_name from treetime.vcf_utils import read_vcf from augur.errors import AugurError +from textwrap import dedent class MissingNodeError(Exception): pass @@ -339,7 +340,7 @@ def sequences_json(node_data_json, tree): Extract the full nuc sequence for each node in the provided node-data JSON. Returns a dict, keys are node names and values are a string of the genome sequence (nuc) """ - node_data = read_node_data(node_data_json, tree) + node_data = read_node_data(node_data_json) if node_data is None: raise AugurError("could not read node data (incl sequences)") # extract sequences from node meta data @@ -347,6 +348,14 @@ def sequences_json(node_data_json, tree): for k,v in node_data['nodes'].items(): if 'sequence' in v: sequences[k] = v['sequence'] + tree_nodes = {c.name for c in tree.find_clades()} + tree_nodes_missing_sequences = tree_nodes - set(sequences.keys()) + if len(tree_nodes_missing_sequences): + raise AugurError(dedent(f"""\ + {len(tree_nodes_missing_sequences)} nodes on the tree are missing nucleotide sequences in the node-data JSON. 
+ These must be present under 'nodes' → <node_name> → 'sequence'. + This error may originate from using 'augur ancestral' with VCF input; in this case try using VCF output from that command here. + """)) return sequences def register_parser(parent_subparsers): @@ -412,7 +421,7 @@ def run(args): if len(features_without_variation): print("{} genes had no mutations and so have been be excluded.".format(len(features_without_variation))) else: - sequences = sequences_json(args.ancestral_sequences, args.tree) + sequences = sequences_json(args.ancestral_sequences, tree) translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if feat.type != 'source'} ## glob the annotations for later auspice export