From 5608714a9037b9162b313670646502ca4f0c5182 Mon Sep 17 00:00:00 2001
From: ivan-aksamentov
Date: Tue, 4 Jun 2024 22:38:13 +0200
Subject: [PATCH] feat: warn when standalone ref and tree ref don't match exactly

This is the same check as in https://github.com/nextstrain/nextclade/pull/1474,
but during dataset indexing, to catch it earlier for datasets we control.
---
Review notes (this section is ignored when the patch is applied):
- Line structure was reconstructed; 2-space indentation and blank-line
  placement are inferred from the hunk headers - confirm against scripts/rebuild.
- check_ref_seq_mismatch was revised: guard clauses, a docstring, and a warning
  message that names the dataset and the tree file. The commit id and the
  `index` post-image hash below still refer to the unrevised change.

 scripts/rebuild | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/scripts/rebuild b/scripts/rebuild
index e0eb3fd1..a31676eb 100755
--- a/scripts/rebuild
+++ b/scripts/rebuild
@@ -126,6 +126,7 @@ def index_one_dataset(args, pathogen_json_path: str, dataset: object, tag: str,
   path = relpath(dataset_dir, args.input_dir)
 
   ref = get_ref_seq(pathogen_json, dataset_dir)
+  check_ref_seq_mismatch(ref, pathogen_json, dataset_dir)
 
   versions, last_version = dataset_get_versions(dataset)
 
@@ -163,6 +164,33 @@ def get_ref_seq(pathogen_json, dataset_dir):
     raise ValueError(f"When reading reference sequence") from e
 
 
+def check_ref_seq_mismatch(standalone_ref, pathogen_json, dataset_dir):
+  """
+  Warn when the standalone reference sequence differs from the tree's root sequence.
+
+  Reads the tree file named by `files.treeJson` in `pathogen_json` (joined onto `dataset_dir`) and compares
+  its `root_sequence.nuc` entry against `standalone_ref.seq`. The check is skipped silently when
+  `files.treeJson` is unset or empty, the file does not exist, or `root_sequence.nuc` resolves to None.
+  A mismatch only emits a warning naming the dataset directory and the tree file; nothing is raised for it.
+  """
+  tree_filename = dict_get(pathogen_json, ["files", "treeJson"])
+  if not tree_filename:
+    return
+
+  tree_json_path = join(dataset_dir, tree_filename)
+  if not isfile(tree_json_path):
+    return
+
+  tree_ref = dict_get(json_read(tree_json_path), ["root_sequence", "nuc"])
+  if tree_ref is not None and standalone_ref.seq != tree_ref:
+    # NOTE(review): if `l` is a stdlib `logging.Logger`, `warn` is a deprecated alias of `warning` - confirm.
+    l.warn(
+      f"Reference sequence provided for dataset '{dataset_dir}' does not exactly match reference (root) sequence "
+      f"in Auspice JSON '{tree_json_path}'. This warning signals that there is a potential for failures if the "
+      "mismatch is not intended."
+    )
+
+
 def get_new_dataset_order(datasets, dataset_order):
   paths = list(map(lambda d: d["path"], datasets))
 