From 77be5a7ef027e872f8c28ef6864d20246943ced6 Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Mon, 25 Nov 2024 20:51:21 +0100
Subject: [PATCH 01/11] fix: explicitly specify how the root and ambiguous
 states are handled during sequence reconstruction and mutation counting

---
 augur/ancestral.py | 14 +++++++-------
 augur/refine.py    |  5 ++++-
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/augur/ancestral.py b/augur/ancestral.py
index dad3497bc..9a59eca2c 100644
--- a/augur/ancestral.py
+++ b/augur/ancestral.py
@@ -85,7 +85,7 @@ def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
 
     # only infer ancestral sequences, leave branch length untouched
     tt.infer_ancestral_sequences(infer_gtr=infer_gtr, marginal=bool_marginal,
-                                 reconstruct_tip_states=infer_tips)
+                                 reconstruct_tip_states=infer_tips, sample_from_profile='root')
 
     return tt
 
@@ -104,7 +104,7 @@ def create_mask(is_vcf, tt, reference_sequence, aln):
         only used if is_vcf
     aln : dict
         describes variation (relative to reference) per sample. Only used if is_vcf.
-        
+
     Returns
     -------
     numpy.ndarray(bool)
@@ -122,7 +122,7 @@ def create_mask(is_vcf, tt, reference_sequence, aln):
                 if alt=='N':
                     ambig_positions.append(pos)
             # ambig_positions = [pos for pos, alt in sample_data.items() if alt=='N']
-            np.add.at(ambiguous_count, ambig_positions, 1)       
+            np.add.at(ambiguous_count, ambig_positions, 1)
         # Secondly, if the VCF defines no mutations but the ref is "N":
         no_info_sites = np.array(list(reference_sequence)) == 'N'
         no_info_sites[list(variable_sites)] = False
@@ -171,7 +171,7 @@ def collect_mutations(tt, mask, character_map=None, reference_sequence=None, inf
 
     # Note that for mutations reported across the tree we don't have to consider
     # the mask, because while sites which are all Ns may have an inferred base,
-    # there will be no variablity and thus no mutations.  
+    # there will be no variablity and thus no mutations.
     for n in tt.tree.find_clades():
         data[n.name] = [a+str(int(pos)+inc)+cm(d)
                         for a,pos,d in n.mutations]
@@ -185,7 +185,7 @@ def collect_mutations(tt, mask, character_map=None, reference_sequence=None, inf
                 data[tt.tree.root.name].append(f"{root_state}{pos+1}{tree_state}")
 
     return data
-        
+
 def collect_sequences(tt, mask, reference_sequence=None, infer_ambiguous=False):
     """
     Create a full sequence for every node on the tree. Masked positions will
@@ -198,7 +198,7 @@ def collect_sequences(tt, mask, reference_sequence=None, infer_ambiguous=False):
         instance of treetime with valid ancestral reconstruction
     mask : numpy.ndarray(bool)
         Mask these positions by changing them to the ambiguous nucleotide
-    reference_sequence : str or None 
+    reference_sequence : str or None
     infer_ambiguous : bool, optional
         if true, request the reconstructed sequences from treetime, otherwise retain input ambiguities
 
@@ -437,7 +437,7 @@ def run(args):
             features['nuc'].location.end != anc_seqs['annotations']['nuc']['end']):
             raise AugurError(f"The 'nuc' annotation coordinates parsed from {args.annotation!r} ({features['nuc'].location.start+1}..{features['nuc'].location.end})"
                 f" don't match the provided sequence data coordinates ({anc_seqs['annotations']['nuc']['start']}..{anc_seqs['annotations']['nuc']['end']}).")
-        
+
         print("Read in {} features from reference sequence file".format(len(features)))
         for gene in genes:
             print(f"Processing gene: {gene}")
diff --git a/augur/refine.py b/augur/refine.py
index 158c86afd..408ec495e 100644
--- a/augur/refine.py
+++ b/augur/refine.py
@@ -313,7 +313,10 @@ def run(args):
         pass
     elif args.divergence_units=='mutations':
         if not args.timetree:
-            tt.infer_ancestral_sequences()
+            # infer ancestral sequence for the purpose of counting mutations
+            # sample mutations from the root profile, otherwise use most likely state.
+            # Reconstruct tip states to avoid mutations to N or W etc be counted
+            tt.infer_ancestral_sequences(marginal=True, reconstruct_tip_states='True', sample_from_profile='root')
         nuc_map = profile_maps['nuc']
 
         def are_sequence_states_different(nuc1, nuc2):

From 3adf44b13eee6e3b4055ed3f63f928028edf11d3 Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Mon, 25 Nov 2024 20:57:29 +0100
Subject: [PATCH 02/11] add to changelog

---
 CHANGES.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGES.md b/CHANGES.md
index aaf8c06cb..f9497381b 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -4,9 +4,12 @@
 
 ### Bug Fixes
 
+* Explicitly specify how the root and ambiguous states are handled during sequence reconstruction and mutation counting. Fixes issue [#1689][] (@rneher) and was resolved in PR [#1690][].
 * titers: Fix type errors in code associated with cross-validation of models. [#1688][] (@huddlej)
 
 [#1688]: https://github.com/nextstrain/augur/pull/1688
+[#1689]: https://github.com/nextstrain/augur/pull/1689
+[#1690]: https://github.com/nextstrain/augur/pull/1690
 
 ## 27.0.0 (9 December 2024)
 

From 22fa17f5dd44719e894e37db0517a8fac2e4ad1f Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Wed, 11 Dec 2024 21:40:14 +0100
Subject: [PATCH 03/11] fix: change string 'True' to bool

Co-authored-by: Victor Lin <13424970+victorlin@users.noreply.github.com>
---
 augur/refine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/augur/refine.py b/augur/refine.py
index 408ec495e..c5cbdfe67 100644
--- a/augur/refine.py
+++ b/augur/refine.py
@@ -316,7 +316,7 @@ def run(args):
             # infer ancestral sequence for the purpose of counting mutations
             # sample mutations from the root profile, otherwise use most likely state.
             # Reconstruct tip states to avoid mutations to N or W etc be counted
-            tt.infer_ancestral_sequences(marginal=True, reconstruct_tip_states='True', sample_from_profile='root')
+            tt.infer_ancestral_sequences(marginal=True, reconstruct_tip_states=True, sample_from_profile='root')
         nuc_map = profile_maps['nuc']
 
         def are_sequence_states_different(nuc1, nuc2):

From 2b862de0cf65d9fa325f6f55efec1cf4344fa1dc Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Tue, 17 Dec 2024 13:58:52 -0800
Subject: [PATCH 04/11] Add and use random seed interface for ancestral

Adds a new `--seed` argument to the ancestral command, following the
existing pattern in the refine command, and passes the user-provided
value to TreeTime's `TreeAnc` class to ensure deterministic outcomes for
random samples. Since this PR changes ancestral's root sequence
inference to be stochastic, this new feature allows us to get the same
"random" result with each run of our functional tests.

Correspondingly, this commit updates the non-VCF functional tests of
ancestral to use a fixed seed. This seed should fix the random behavior
to a specific outcome, but we need to update our expected JSON outputs
to match that outcome here. In this case, the change is an introduction
of a "T" at position 14 in the node_root of the "simple genome" data.
---
 augur/ancestral.py                               | 16 ++++++++++------
 tests/functional/ancestral/cram/case-sensitive.t |  2 ++
 tests/functional/ancestral/cram/general.t        |  2 ++
 .../data/simple-genome/nt_muts.no-ref-seq.json   | 11 ++++++-----
 .../data/simple-genome/nt_muts.ref-seq.json      | 10 ++++++----
 5 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/augur/ancestral.py b/augur/ancestral.py
index 9a59eca2c..b1cb84f60 100644
--- a/augur/ancestral.py
+++ b/augur/ancestral.py
@@ -39,7 +39,7 @@
 
 def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
                                  marginal=False, fill_overhangs=True, infer_tips=False,
-                                 alphabet='nuc'):
+                                 alphabet='nuc', rng_seed=None):
     """infer ancestral sequences using TreeTime
 
     Parameters
@@ -68,6 +68,8 @@ def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
         alphabet to use for ancestral sequence inference. Default is the nucleotide
         alphabet that included a gap character 'nuc'. Alternative is `aa` for amino
         acids.
+    rng_seed : int, optional
+        random seed value to use for inference
 
     Returns
     -------
@@ -78,7 +80,7 @@ def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
     from treetime import TreeAnc
 
     tt = TreeAnc(tree=tree, aln=aln, ref=ref, gtr='JC69', alphabet=alphabet,
-                 fill_overhangs=fill_overhangs, verbose=1)
+                 fill_overhangs=fill_overhangs, verbose=1, rng_seed=rng_seed)
 
     # convert marginal (from args.inference) from 'joint' or 'marginal' to True or False
     bool_marginal = (marginal == "marginal")
@@ -227,14 +229,14 @@ def collect_sequences(tt, mask, reference_sequence=None, infer_ambiguous=False):
     return sequences
 
 def run_ancestral(T, aln, reference_sequence=None, is_vcf=False, full_sequences=False, fill_overhangs=False,
-                  infer_ambiguous=False, marginal=False, alphabet='nuc'):
+                  infer_ambiguous=False, marginal=False, alphabet='nuc', rng_seed=None):
     """
     ancestral nucleotide reconstruction using TreeTime
     """
 
     tt = ancestral_sequence_inference(tree=T, aln=aln, ref=reference_sequence if is_vcf else None, marginal=marginal,
                                       fill_overhangs = fill_overhangs, alphabet=alphabet,
-                                      infer_tips = infer_ambiguous)
+                                      infer_tips = infer_ambiguous, rng_seed=rng_seed)
 
     character_map = {}
     for x in tt.gtr.profile_map:
@@ -295,6 +297,7 @@ def register_parser(parent_subparsers):
     )
     global_options_group.add_argument('--inference', default='joint', choices=["joint", "marginal"],
                                     help="calculate joint or marginal maximum likelihood ancestral sequence states")
+    global_options_group.add_argument('--seed', type=int, help="seed for random number generation")
 
     nucleotide_options_group = parser.add_argument_group(
         "nucleotide options",
@@ -416,7 +419,8 @@ def run(args):
     infer_ambiguous = args.infer_ambiguous and not args.keep_ambiguous
     full_sequences = not is_vcf
     nuc_result = run_ancestral(T, aln, reference_sequence=ref if ref else None, is_vcf=is_vcf, fill_overhangs=not args.keep_overhangs,
-                               full_sequences=full_sequences, marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='nuc')
+                               full_sequences=full_sequences, marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='nuc',
+                               rng_seed=args.seed)
     anc_seqs = nuc_result['mutations']
     anc_seqs['reference'] = {'nuc': nuc_result['root_seq']}
 
@@ -446,7 +450,7 @@ def run(args):
             reference_sequence = str(feat.extract(Seq(ref)).translate()) if ref else None
 
             aa_result = run_ancestral(T, fname, reference_sequence=reference_sequence, is_vcf=is_vcf, fill_overhangs=not args.keep_overhangs,
-                                        marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='aa')
+                                        marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='aa', rng_seed=args.seed)
             len_translated_alignment = aa_result['tt'].data.full_length*3
             if len_translated_alignment != len(feat):
                 raise AugurError(f"length of translated alignment for {gene} ({len_translated_alignment})"
diff --git a/tests/functional/ancestral/cram/case-sensitive.t b/tests/functional/ancestral/cram/case-sensitive.t
index fbe35df75..2df0b825d 100644
--- a/tests/functional/ancestral/cram/case-sensitive.t
+++ b/tests/functional/ancestral/cram/case-sensitive.t
@@ -11,6 +11,7 @@ Change the _reference_ to lowercase
   >  --alignment "$TESTDIR/../data/simple-genome/sequences.fasta" \
   >  --root-sequence ref.lower.fasta \
   >  --output-node-data "nt_muts.ref-seq.json" \
+  >  --seed 314159 \
   >  --inference marginal > /dev/null
 
   $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
@@ -33,6 +34,7 @@ be lowecase which will be compared against the uppercase reference
   >  --alignment sequences.lower.fasta \
   >  --root-sequence "$TESTDIR/../data/simple-genome/reference.fasta" \
   >  --output-node-data "nt_muts.ref-seq.json" \
+  >  --seed 314159 \
   >  --inference marginal > /dev/null
 
   $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
diff --git a/tests/functional/ancestral/cram/general.t b/tests/functional/ancestral/cram/general.t
index 9591db36a..1f68d8a9a 100644
--- a/tests/functional/ancestral/cram/general.t
+++ b/tests/functional/ancestral/cram/general.t
@@ -14,6 +14,7 @@ node-data JSON we diff against.
   >  --alignment "$TESTDIR/../data/simple-genome/sequences.fasta" \
   >  --root-sequence "$TESTDIR/../data/simple-genome/reference.fasta" \
   >  --output-node-data "nt_muts.ref-seq.json" \
+  >  --seed 314159 \
   >  --inference marginal > /dev/null
 
 
@@ -34,6 +35,7 @@ mutations (as there's nothing to compare the root node to)
   >  --tree "$TESTDIR/../data/simple-genome/tree.nwk" \
   >  --alignment "$TESTDIR/../data/simple-genome/sequences.fasta" \
   >  --output-node-data "nt_muts.no-ref-seq.json" \
+  >  --seed 314159 \
   >  --inference marginal > /dev/null
 
 
diff --git a/tests/functional/ancestral/data/simple-genome/nt_muts.no-ref-seq.json b/tests/functional/ancestral/data/simple-genome/nt_muts.no-ref-seq.json
index 402055aa5..69e72ee12 100644
--- a/tests/functional/ancestral/data/simple-genome/nt_muts.no-ref-seq.json
+++ b/tests/functional/ancestral/data/simple-genome/nt_muts.no-ref-seq.json
@@ -16,7 +16,6 @@
     "node_AB": {
       "muts": [
         "A7G",
-        "C14T",
         "T18C",
         "A33G",
         "A43T"
@@ -25,7 +24,7 @@
     },
     "node_root": {
       "muts": [],
-      "sequence": "AAAACAAAAATGCCCTGTGGGTAAAAAAAAAAAAACTACTTGACCATAAA"
+      "sequence": "AAAACAAAAATGCTCTGTGGGTAAAAAAAAAAAAACTACTTGACCATAAA"
     },
     "sample_A": {
       "muts": [
@@ -41,11 +40,13 @@
       "sequence": "AAAACAGAAATGCTCTGCGGGTAAAAAAAAAAGAACTACTTATCCATAAA"
     },
     "sample_C": {
-      "muts": [],
+      "muts": [
+        "T14C"
+      ],
       "sequence": "AAAACAAAAATGCCCTGTGGGTAAAAAAAAAAAAACTACTTGACCATAAA"
     }
   },
   "reference": {
-    "nuc": "AAAACAAAAATGCCCTGTGGGTAAAAAAAAAAAAACTACTTGACCATAAA"
+    "nuc": "AAAACAAAAATGCTCTGTGGGTAAAAAAAAAAAAACTACTTGACCATAAA"
   }
-}
\ No newline at end of file
+}
diff --git a/tests/functional/ancestral/data/simple-genome/nt_muts.ref-seq.json b/tests/functional/ancestral/data/simple-genome/nt_muts.ref-seq.json
index b059b5647..dd92237cc 100644
--- a/tests/functional/ancestral/data/simple-genome/nt_muts.ref-seq.json
+++ b/tests/functional/ancestral/data/simple-genome/nt_muts.ref-seq.json
@@ -12,7 +12,6 @@
     "node_AB": {
       "muts": [
         "A7G",
-        "C14T",
         "T18C",
         "A33G",
         "A43T"
@@ -22,9 +21,10 @@
     "node_root": {
       "muts": [
         "A5C",
+        "C14T",
         "C18T"
       ],
-      "sequence": "AAAACAAAAATGCCCTGTGGGTAAAAAAAAAAAAACTACTTGACCATAAA"
+      "sequence": "AAAACAAAAATGCTCTGTGGGTAAAAAAAAAAAAACTACTTGACCATAAA"
     },
     "sample_A": {
       "muts": [
@@ -40,11 +40,13 @@
       "sequence": "AAAACAGAAATGCTCTGCGGGTAAAAAAAAAAGAACTACTTATCCATAAA"
     },
     "sample_C": {
-      "muts": [],
+      "muts": [
+        "T14C"
+      ],
       "sequence": "AAAACAAAAATGCCCTGTGGGTAAAAAAAAAAAAACTACTTGACCATAAA"
     }
   },
   "reference": {
     "nuc": "AAAAAAAAAATGCCCTGCGGGTAAAAAAAAAAAAACTACTTGACCATAAA"
   }
-}
\ No newline at end of file
+}

From 0da5c11011b50e641a1a206474631c6fbf67981c Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Tue, 17 Dec 2024 14:04:23 -0800
Subject: [PATCH 05/11] Use random seed in simple VCF test of ancestral

As in the previous commit, this commit updates the expected VCF output
to match the fixed "random" outcome for the given seed where position 14
is a T in the root node.
---
 tests/functional/ancestral/cram/vcf.t | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/functional/ancestral/cram/vcf.t b/tests/functional/ancestral/cram/vcf.t
index ae234d7f5..d1f2f3f67 100644
--- a/tests/functional/ancestral/cram/vcf.t
+++ b/tests/functional/ancestral/cram/vcf.t
@@ -15,6 +15,7 @@ but it will have the reference sequence attached.
   >  --vcf-reference $DATA/reference.fasta \
   >  --output-node-data "nt_muts.vcf-input.ref-seq.json" \
   >  --output-vcf nt_muts.vcf \
+  >  --seed 314159 \
   >  --inference marginal > /dev/null
 
 
@@ -33,7 +34,7 @@ here as it's not relevant to what I'm trying to test.
   > #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	node_root	sample_C	node_AB	sample_B	sample_A
   > 1	5	.	A	C	.	PASS	.	GT	1	1	1	1	1
   > 1	7	.	A	G	.	PASS	.	GT	0	0	1	1	1
-  > 1	14	.	C	T	.	PASS	.	GT	0	0	1	1	1
+  > 1	14	.	C	T	.	PASS	.	GT	1	0	1	1	1
   > 1	18	.	C	T	.	PASS	.	GT	1	1	0	0	0
   > 1	33	.	A	C,G	.	PASS	.	GT	0	0	2	2	1
   > 1	39	.	C	T	.	PASS	.	GT	0	0	0	0	1

From e0a9a1cc0f536ca0be189b354619a6ca7ec52f36 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Tue, 17 Dec 2024 14:06:48 -0800
Subject: [PATCH 06/11] Use random seed for multiallele VCF ancestral test

Updates the functional test of multiallele VCF behavior when one of the
alleles is a "N" character. The multiallele VCF in the original
test (setting sample B to genotype of 30G) caused the inference of
alleles at sites 7 and 14 to change substantially in this PR's new
implementation. This commit stabilizes the test by assigning both
samples A and B to the 30G genotype such that all nodes are inferred to
have a 30G and the root node gets a A30G mutation. It should maintain
the originally intended behavior of the test [1].

As part of this change, I've replaced the dynamically modified VCF used
by this test with a static version of the VCF in version control. This
change simplified test debugging by allowing me to inspect the input
file outside of the cram temporary environment and it also fixes an
issue with the original dynamic VCF where an additional line for site 30
was added but the original line was not being removed.

[1] https://github.com/nextstrain/augur/issues/1380
---
 .../functional/ancestral/cram/vcf-multi-allele.t  | 14 ++++++--------
 .../simple-genome/snps_with_multiple_alleles.vcf  | 15 +++++++++++++++
 2 files changed, 21 insertions(+), 8 deletions(-)
 create mode 100644 tests/functional/ancestral/data/simple-genome/snps_with_multiple_alleles.vcf

diff --git a/tests/functional/ancestral/cram/vcf-multi-allele.t b/tests/functional/ancestral/cram/vcf-multi-allele.t
index dfa52a7f4..cb26dc8e9 100644
--- a/tests/functional/ancestral/cram/vcf-multi-allele.t
+++ b/tests/functional/ancestral/cram/vcf-multi-allele.t
@@ -4,20 +4,18 @@ Setup
 
   $ export DATA="$TESTDIR/../data/simple-genome"
 
-We take the same `snps.vcf` file used in `general.t` but add another
+We used a modified `snps.vcf` file used in `vcf.t` but with an additional
 allele at site 30 - sample_B has a "G". Since the root is A and this
 is the only sample with G it's 'A30G'.
 See <https://github.com/nextstrain/augur/issues/1380> for the bug this is testing.
 
-  $ sed '11s/^/1\t30\t.\tA\tG,N\t.\t.\t.\tGT\t0\t1\t2\n/' \
-  > "$DATA/snps.vcf" > snps2.vcf
-
   $ ${AUGUR} ancestral \
   >  --tree $DATA/tree.nwk \
-  >  --alignment snps2.vcf \
+  >  --alignment $DATA/snps_with_multiple_alleles.vcf \
   >  --vcf-reference $DATA/reference.fasta \
   >  --output-node-data nt_muts.json \
   >  --output-vcf nt_muts.vcf \
+  >  --seed 314159 \
   >  --inference marginal > /dev/null
 
 
@@ -25,15 +23,15 @@ See <https://github.com/nextstrain/augur/issues/1380> for the bug this is testin
   >   "$DATA/nt_muts.ref-seq.json" \
   >   nt_muts.json \
   >   --exclude-regex-paths "root\['nodes'\]\['.+'\]\['sequence'\]" "root\['generated_by'\]"
-  {'iterable_item_added': {"root['nodes']['sample_B']['muts'][0]": 'A30G'}}
+  {'iterable_item_added': {"root['nodes']['node_root']['muts'][3]": 'A30G'}}
 
   $ cat > expected.vcf <<EOF
   > #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	node_root	sample_C	node_AB	sample_B	sample_A
   > 1	5	.	A	C	.	PASS	.	GT	1	1	1	1	1
   > 1	7	.	A	G	.	PASS	.	GT	0	0	1	1	1
-  > 1	14	.	C	T	.	PASS	.	GT	0	0	1	1	1
+  > 1	14	.	C	T	.	PASS	.	GT	1	0	1	1	1
   > 1	18	.	C	T	.	PASS	.	GT	1	1	0	0	0
-  > 1	30	.	A	G	.	PASS	.	GT	0	0	0	1	0
+  > 1	30	.	A	G	.	PASS	.	GT	1	1	1	1	1
   > 1	33	.	A	C,G	.	PASS	.	GT	0	0	2	2	1
   > 1	39	.	C	T	.	PASS	.	GT	0	0	0	0	1
   > 1	42	.	G	A	.	PASS	.	GT	0	0	0	1	0
diff --git a/tests/functional/ancestral/data/simple-genome/snps_with_multiple_alleles.vcf b/tests/functional/ancestral/data/simple-genome/snps_with_multiple_alleles.vcf
new file mode 100644
index 000000000..4a311800d
--- /dev/null
+++ b/tests/functional/ancestral/data/simple-genome/snps_with_multiple_alleles.vcf
@@ -0,0 +1,15 @@
+##fileformat=VCFv4.3
+##contig=<ID=1,length=50>
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample_A	sample_B	sample_C
+1	5	.	A	C	.	.	.	GT	1	1	1
+1	7	.	A	G	.	.	.	GT	1	1	0
+1	14	.	C	T	.	.	.	GT	1	1	0
+1	18	.	C	T	.	.	.	GT	0	0	1
+1	28	.	A	N	.	.	.	GT	0	0	1
+1	29	.	A	N	.	.	.	GT	0	0	1
+1	30	.	A	G,N	.	.	.	GT	1	1	2
+1	33	.	A	C,G	.	.	.	GT	1	2	0
+1	39	.	C	T	.	.	.	GT	1	0	0
+1	42	.	G	A	.	.	.	GT	0	1	0
+1	43	.	A	T	.	.	.	GT	1	1	0

From 10881771e373cf895249fcb9c40cc3bb5c736ad1 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Tue, 17 Dec 2024 14:43:35 -0800
Subject: [PATCH 07/11] Update expected branch lengths for functional test

The change to how augur refine can stochastically assign the root node
mutations led to a change in inferred branch lengths for our functional
tests. Since the test uses a random seed argument for augur refine, the
results of subsequent runs should be deterministically successful after
this change.
---
 tests/functional/refine/data/integer_branch_lengths.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/functional/refine/data/integer_branch_lengths.json b/tests/functional/refine/data/integer_branch_lengths.json
index e577c8787..ee9b99082 100644
--- a/tests/functional/refine/data/integer_branch_lengths.json
+++ b/tests/functional/refine/data/integer_branch_lengths.json
@@ -46,10 +46,10 @@
       "branch_length": 2
     },
     "NODE_0000006": {
-      "branch_length": 11
+      "branch_length": 10
     },
     "NODE_0000007": {
-      "branch_length": 2
+      "branch_length": 3
     },
     "NODE_0000008": {
       "branch_length": 6

From 28dfae9491aee3a7c7548c829dc164f7d2677966 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Tue, 17 Dec 2024 16:35:24 -0800
Subject: [PATCH 08/11] Align data for ancestral and translate tests

Updates data for augur translate functional tests to match the updated
test data for the ancestral command.
---
 tests/functional/translate/cram/root-mutations.t |  2 +-
 .../translate/data/simple-genome/aa_muts.json    | 16 +++++++++-------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/tests/functional/translate/cram/root-mutations.t b/tests/functional/translate/cram/root-mutations.t
index de9c0f55d..151f36e93 100644
--- a/tests/functional/translate/cram/root-mutations.t
+++ b/tests/functional/translate/cram/root-mutations.t
@@ -28,4 +28,4 @@ is unchanged (MPCG*). There is also a mutation E4G at the root node to compensat
   >   "$DATA/aa_muts.json" \
   >   "aa_muts.json" \
   >   --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" "root['meta']['updated']"
-  {'values_changed': {"root['reference']['gene1']": {'new_value': 'MPCE*', 'old_value': 'MPCG*'}}, 'iterable_item_added': {"root['nodes']['node_root']['aa_muts']['gene1'][0]": 'E4G'}}
\ No newline at end of file
+  {'values_changed': {"root['reference']['gene1']": {'new_value': 'MPCE*', 'old_value': 'MPCG*'}}, 'iterable_item_added': {"root['nodes']['node_root']['aa_muts']['gene1'][1]": 'E4G'}}
diff --git a/tests/functional/translate/data/simple-genome/aa_muts.json b/tests/functional/translate/data/simple-genome/aa_muts.json
index 375ee7cb3..0fc6298bb 100644
--- a/tests/functional/translate/data/simple-genome/aa_muts.json
+++ b/tests/functional/translate/data/simple-genome/aa_muts.json
@@ -29,9 +29,7 @@
   "nodes": {
     "node_AB": {
       "aa_muts": {
-        "gene1": [
-          "P2L"
-        ],
+        "gene1": [],
         "gene2": [
           "V2D"
         ]
@@ -39,11 +37,13 @@
     },
     "node_root": {
       "aa_muts": {
-        "gene1": [],
+        "gene1": [
+          "P2L"
+        ],
         "gene2": []
       },
       "aa_sequences": {
-        "gene1": "MPCG*",
+        "gene1": "MLCG*",
         "gene2": "MVK*"
       }
     },
@@ -61,7 +61,9 @@
     },
     "sample_C": {
       "aa_muts": {
-        "gene1": [],
+        "gene1": [
+          "L2P"
+        ],
         "gene2": []
       }
     }
@@ -70,4 +72,4 @@
     "gene1": "MPCG*",
     "gene2": "MVK*"
   }
-}
\ No newline at end of file
+}

From bb9603cd0c7fc44e894e2cf20994d5261d7541f1 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Wed, 18 Dec 2024 08:58:12 -0800
Subject: [PATCH 09/11] Fix test data for augur translate tests

Updates SNPs in manually-curated VCF used to test augur translate so
they match the new expected genotype for the "simple genome" data at
position 14. This fixes the vcf.t translate test. Also updates the
expected amino acid translations JSON for the vcf with root mutation
test. As part of this update, I've created a complete copy of the
"truth" aa_muts.json with the root mutation and used this instead of the
version that was previously created by modifying the original aa_muts
file with sed. Although this change duplicates test data, I was able to
debug the test more easily with a complete version of the JSON living
outside of the cram tests.
---
 .../translate/cram/vcf-with-root-mutation.t   | 12 ++-
 .../aa_muts_with_root_mutation.json           | 76 +++++++++++++++++++
 .../data/simple-genome/snps-inferred.vcf      |  2 +-
 3 files changed, 82 insertions(+), 8 deletions(-)
 create mode 100644 tests/functional/translate/data/simple-genome/aa_muts_with_root_mutation.json

diff --git a/tests/functional/translate/cram/vcf-with-root-mutation.t b/tests/functional/translate/cram/vcf-with-root-mutation.t
index 36b4a4f65..ef808cc8b 100644
--- a/tests/functional/translate/cram/vcf-with-root-mutation.t
+++ b/tests/functional/translate/cram/vcf-with-root-mutation.t
@@ -34,14 +34,12 @@ The _reference_ produced is the actual reference, not using the mutations in the
   MVK* (no-eol)
 
 However the aa_mutations should annotate the aa_sequence on the root node as
-having the G3E AA mutation, i.e. MPCE* instead of MPCG*, as well as a
-corresponding AA mutation on the root node G4E (i.e. reference is G, but root
-node is E (and so are all the other nodes))
-
-  $ sed '46s/MPCG/MPCE/' "$DATA/aa_muts.json" |  sed '42s/\[\]/\["G4E"\]/' > aa_muts.truth.json
+having the G4E AA mutation, as well as a corresponding AA mutation on the root
+node G4E (i.e. reference is G, but root node is E (and so are all the other
+nodes)).
 
   $ python3 "$SCRIPTS/diff_jsons.py" \
-  >   aa_muts.truth.json \
+  >   $DATA/aa_muts_with_root_mutation.json \
   >   aa_muts.json \
   >   --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" "root['meta']['updated']"
-  {}
\ No newline at end of file
+  {}
diff --git a/tests/functional/translate/data/simple-genome/aa_muts_with_root_mutation.json b/tests/functional/translate/data/simple-genome/aa_muts_with_root_mutation.json
new file mode 100644
index 000000000..794cfe42f
--- /dev/null
+++ b/tests/functional/translate/data/simple-genome/aa_muts_with_root_mutation.json
@@ -0,0 +1,76 @@
+{
+  "annotations": {
+    "gene1": {
+      "end": 24,
+      "seqid": "data/reference.gff",
+      "start": 10,
+      "strand": "+",
+      "type": "gene"
+    },
+    "gene2": {
+      "end": 47,
+      "seqid": "data/reference.gff",
+      "start": 36,
+      "strand": "-",
+      "type": "gene"
+    },
+    "nuc": {
+      "end": 50,
+      "seqid": "data/reference.gff",
+      "start": 1,
+      "strand": "+",
+      "type": "##sequence-region pragma"
+    }
+  },
+  "generated_by": {
+    "program": "augur",
+    "version": "23.1.1"
+  },
+  "nodes": {
+    "node_AB": {
+      "aa_muts": {
+        "gene1": [],
+        "gene2": [
+          "V2D"
+        ]
+      }
+    },
+    "node_root": {
+      "aa_muts": {
+        "gene1": [
+          "P2L",
+          "G4E"
+        ],
+        "gene2": []
+      },
+      "aa_sequences": {
+        "gene1": "MLCE*",
+        "gene2": "MVK*"
+      }
+    },
+    "sample_A": {
+      "aa_muts": {
+        "gene1": [],
+        "gene2": []
+      }
+    },
+    "sample_B": {
+      "aa_muts": {
+        "gene1": [],
+        "gene2": []
+      }
+    },
+    "sample_C": {
+      "aa_muts": {
+        "gene1": [
+          "L2P"
+        ],
+        "gene2": []
+      }
+    }
+  },
+  "reference": {
+    "gene1": "MPCG*",
+    "gene2": "MVK*"
+  }
+}
diff --git a/tests/functional/translate/data/simple-genome/snps-inferred.vcf b/tests/functional/translate/data/simple-genome/snps-inferred.vcf
index b5a3c571e..e267817b4 100644
--- a/tests/functional/translate/data/simple-genome/snps-inferred.vcf
+++ b/tests/functional/translate/data/simple-genome/snps-inferred.vcf
@@ -5,7 +5,7 @@
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	node_root	node_AB	sample_A	sample_B	sample_C
 1	5	.	A	C	.	.	.	GT	1	1	1	1	1
 1	7	.	A	G	.	.	.	GT	0	1	1	1	0
-1	14	.	C	T	.	.	.	GT	0	1	1	1	0
+1	14	.	C	T	.	.	.	GT	1	1	1	1	0
 1	18	.	C	T	.	.	.	GT	1	0	0	0	1
 1	33	.	A	G,C	.	.	.	GT	0	1	2	1	0
 1	39	.	C	T	.	.	.	GT	0	0	1	0	0

From d39198977aeb63c6bda732e94005c3119d7b2c07 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Wed, 18 Dec 2024 09:31:05 -0800
Subject: [PATCH 10/11] Use random seed in all ancestral tests

Adds random seed argument to all functional tests of the ancestral
command. This should hopefully resolve remaining stochastic changes in
cram runs during CI.
---
 tests/functional/ancestral/cram/ambiguous-positions.t  |  2 +-
 .../ancestral/cram/default-nucleotide-reconstruction.t |  1 +
 .../ancestral/cram/infer-ambiguous-nucleotides.t       |  1 +
 .../cram/infer-amino-acid-sequences-genes-file.t       |  1 +
 .../infer-amino-acid-sequences-with-root-sequence.t    |  1 +
 .../ancestral/cram/infer-amino-acid-sequences.t        |  1 +
 tests/functional/ancestral/cram/invalid-args.t         | 10 ++++++++--
 .../ancestral/cram/keep-ambiguous-nucleotides.t        |  1 +
 8 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/tests/functional/ancestral/cram/ambiguous-positions.t b/tests/functional/ancestral/cram/ambiguous-positions.t
index 0fc7b124e..ae82549d7 100644
--- a/tests/functional/ancestral/cram/ambiguous-positions.t
+++ b/tests/functional/ancestral/cram/ambiguous-positions.t
@@ -19,6 +19,7 @@ Setup
   >  --tree "$DATA/tree.nwk" \
   >  --alignment snps.vcf \
   >  --vcf-reference reference.fasta \
+  >  --seed 314159 \
   >  --output-node-data nt_muts.json \
   >  --output-vcf nt_muts.vcf > /dev/null
 
@@ -44,4 +45,3 @@ Setup
   >   --vcf nt_muts.vcf \
   >   --json nt_muts.json \
   >   --ref reference.fasta > /dev/null
-
diff --git a/tests/functional/ancestral/cram/default-nucleotide-reconstruction.t b/tests/functional/ancestral/cram/default-nucleotide-reconstruction.t
index cc38cdbfe..2f3b59bdd 100644
--- a/tests/functional/ancestral/cram/default-nucleotide-reconstruction.t
+++ b/tests/functional/ancestral/cram/default-nucleotide-reconstruction.t
@@ -8,6 +8,7 @@ The default is to infer ambiguous bases, so there should not be N bases in the i
   $ ${AUGUR} ancestral \
   >  --tree $TESTDIR/../data/tree.nwk \
   >  --alignment $TESTDIR/../data/aligned.fasta \
+  >  --seed 314159 \
   >  --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \
   >  --output-sequences "$CRAMTMP/$TESTFILE/ancestral_sequences.fasta" > /dev/null
 
diff --git a/tests/functional/ancestral/cram/infer-ambiguous-nucleotides.t b/tests/functional/ancestral/cram/infer-ambiguous-nucleotides.t
index 96700d336..d6a7e1926 100644
--- a/tests/functional/ancestral/cram/infer-ambiguous-nucleotides.t
+++ b/tests/functional/ancestral/cram/infer-ambiguous-nucleotides.t
@@ -9,6 +9,7 @@ There should not be N bases in the inferred output sequences.
   >  --tree $TESTDIR/../data/tree.nwk \
   >  --alignment $TESTDIR/../data/aligned.fasta \
   >  --infer-ambiguous \
+  >  --seed 314159 \
   >  --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \
   >  --output-sequences "$CRAMTMP/$TESTFILE/ancestral_sequences.fasta" > /dev/null
 
diff --git a/tests/functional/ancestral/cram/infer-amino-acid-sequences-genes-file.t b/tests/functional/ancestral/cram/infer-amino-acid-sequences-genes-file.t
index 09ed75bb4..164a9e45c 100644
--- a/tests/functional/ancestral/cram/infer-amino-acid-sequences-genes-file.t
+++ b/tests/functional/ancestral/cram/infer-amino-acid-sequences-genes-file.t
@@ -10,6 +10,7 @@ Infer ancestral nucleotide and amino acid sequences, using a genes file.
   >  --annotation $TESTDIR/../data/zika_outgroup.gb \
   >  --genes $TESTDIR/../data/genes.txt \
   >  --translations $TESTDIR/../data/aa_sequences_%GENE.fasta \
+  >  --seed 314159 \
   >  --output-node-data ancestral_mutations.json \
   >  --output-sequences ancestral_sequences.fasta \
   >  --output-translations ancestral_aa_sequences_%GENE.fasta > /dev/null
diff --git a/tests/functional/ancestral/cram/infer-amino-acid-sequences-with-root-sequence.t b/tests/functional/ancestral/cram/infer-amino-acid-sequences-with-root-sequence.t
index 0d61ca296..333fb0fb0 100644
--- a/tests/functional/ancestral/cram/infer-amino-acid-sequences-with-root-sequence.t
+++ b/tests/functional/ancestral/cram/infer-amino-acid-sequences-with-root-sequence.t
@@ -15,6 +15,7 @@ ancestor).
   >  --root-sequence $TESTDIR/../data/zika_outgroup.gb \
   >  --genes ENV PRO \
   >  --translations $TESTDIR/../data/aa_sequences_%GENE.fasta \
+  >  --seed 314159 \
   >  --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" > /dev/null
 
 Check that the reference length was correctly exported as the nuc annotation
diff --git a/tests/functional/ancestral/cram/infer-amino-acid-sequences.t b/tests/functional/ancestral/cram/infer-amino-acid-sequences.t
index 35804fdf1..d1c3a8f23 100644
--- a/tests/functional/ancestral/cram/infer-amino-acid-sequences.t
+++ b/tests/functional/ancestral/cram/infer-amino-acid-sequences.t
@@ -10,6 +10,7 @@ Infer ancestral nucleotide and amino acid sequences.
   >  --annotation $TESTDIR/../data/zika_outgroup.gb \
   >  --genes ENV PRO \
   >  --translations $TESTDIR/../data/aa_sequences_%GENE.fasta \
+  >  --seed 314159 \
   >  --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \
   >  --output-sequences "$CRAMTMP/$TESTFILE/ancestral_sequences.fasta" \
   >  --output-translations "$CRAMTMP/$TESTFILE/ancestral_aa_sequences_%GENE.fasta" > /dev/null
diff --git a/tests/functional/ancestral/cram/invalid-args.t b/tests/functional/ancestral/cram/invalid-args.t
index e65d64768..5cc1d8632 100644
--- a/tests/functional/ancestral/cram/invalid-args.t
+++ b/tests/functional/ancestral/cram/invalid-args.t
@@ -7,6 +7,7 @@ Input FASTA + VCF output is not possible
   $ ${AUGUR} ancestral \
   >  --tree $TESTDIR/../data/tree.nwk \
   >  --alignment $TESTDIR/../data/aligned.fasta \
+  >  --seed 314159 \
   >  --output-vcf "output.vcf" > /dev/null
   ERROR: VCF output has been requested but the input alignment is not VCF.
   [2]
@@ -16,6 +17,7 @@ Input VCF + FASTA output is not possible (Note that the input file doesn't exist
   $ ${AUGUR} ancestral \
   >  --tree $TESTDIR/../data/tree.nwk \
   >  --alignment $TESTDIR/../data/snps.vcf \
+  >  --seed 314159 \
   >  --output-sequences "output.fasta" > /dev/null
   ERROR: Sequence (fasta) output has been requested but the input alignment is VCF.
   [2]
@@ -25,6 +27,7 @@ Output FASTA _and_ VCF is not possible
   $ ${AUGUR} ancestral \
   >  --tree $TESTDIR/../data/tree.nwk \
   >  --alignment $TESTDIR/../data/aligned.fasta \
+  >  --seed 314159 \
   >  --output-vcf "output.vcf" \
   >  --output-sequences "output.fasta" > /dev/null
   ERROR: Both sequence (fasta) and VCF output have been requested, but these are incompatible.
@@ -39,28 +42,31 @@ This should fail.
   >  --alignment $TESTDIR/../data/aligned.fasta \
   >  --annotation $TESTDIR/../data/zika_outgroup.gb \
   >  --genes ENV PRO \
+  >  --seed 314159 \
   >  --output-node-data "ancestral_mutations.json" > /dev/null
   ERROR: For amino acid sequence reconstruction, you must provide an annotation file, a list of genes, and a template path to amino acid sequences.
   [2]
 
-Missing tree file 
+Missing tree file
 
   $ ${AUGUR} ancestral \
   >  --tree $TESTDIR/../data/tree-doesnt-exist.nwk \
   >  --alignment $TESTDIR/../data/aligned.fasta \
+  >  --seed 314159 \
   >  --output-sequences "output.fasta" > /dev/null
   ERROR: The provided tree file .* doesn't exist (re)
   [2]
 
 
 Attempting to use FASTA-input reference and VCF-input reference args
-(The files here don't exist, but we exit before they're checked) 
+(The files here don't exist, but we exit before they're checked)
 
   $ ${AUGUR} ancestral \
   >  --tree $TESTDIR/../data/tree-doesnt-exist.nwk \
   >  --alignment $TESTDIR/../data/aligned.fasta \
   >  --root-sequence $TESTDIR/../data/reference.fasta \
   >  --vcf-reference $TESTDIR/../data/reference.fasta \
+  >  --seed 314159 \
   >  --output-sequences "output.fasta" > /dev/null 2>"err-args.txt"
   [2]
 
diff --git a/tests/functional/ancestral/cram/keep-ambiguous-nucleotides.t b/tests/functional/ancestral/cram/keep-ambiguous-nucleotides.t
index f8d4fc597..55dac8e32 100644
--- a/tests/functional/ancestral/cram/keep-ambiguous-nucleotides.t
+++ b/tests/functional/ancestral/cram/keep-ambiguous-nucleotides.t
@@ -9,6 +9,7 @@ There should be N bases in the inferred output sequences.
   >  --tree $TESTDIR/../data/tree.nwk \
   >  --alignment $TESTDIR/../data/aligned.fasta \
   >  --keep-ambiguous \
+  >  --seed 314159 \
   >  --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \
   >  --output-sequences "$CRAMTMP/$TESTFILE/ancestral_sequences.fasta" > /dev/null
 

From d618f5325b80c0a1c38cb30a79bb16ebcc476b2e Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Thu, 19 Dec 2024 08:29:20 -0800
Subject: [PATCH 11/11] Update change log

---
 CHANGES.md | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index f9497381b..9d209f767 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -2,13 +2,16 @@
 
 ## __NEXT__
 
+### Features
+
+* ancestral: Add `--seed` argument to enable deterministic inference of root states by TreeTime. [#1690][] (@huddlej)
+
 ### Bug Fixes
 
-* Explicitly specify how the root and ambiguous states are handled during sequence reconstruction and mutation counting. Fixes issue [#1689][] (@rneher) and was resolved in PR [#1690][].
+* ancestral, refine: Explicitly specify how the root and ambiguous states are handled during sequence reconstruction and mutation counting. [#1690][] (@rneher)
 * titers: Fix type errors in code associated with cross-validation of models. [#1688][] (@huddlej)
 
 [#1688]: https://github.com/nextstrain/augur/pull/1688
-[#1689]: https://github.com/nextstrain/augur/pull/1689
 [#1690]: https://github.com/nextstrain/augur/pull/1690
 
 ## 27.0.0 (9 December 2024)
@@ -32,8 +35,8 @@
 
 ### Features
 
-- This is the first version to officially support Python 3.12 and Pandas v2. [#1671] [#1678] (@corneliusroemer, @victorlin)
-- curate: change output metadata to [RFC 4180 CSV-like TSVs][] to match the TSV format output by other Augur subcommands and the Nextstrain ecosystem as discussed in [#1566][]. [#1565][] (@joverlee521)
+* This is the first version to officially support Python 3.12 and Pandas v2. [#1671] [#1678] (@corneliusroemer, @victorlin)
+* curate: change output metadata to [RFC 4180 CSV-like TSVs][] to match the TSV format output by other Augur subcommands and the Nextstrain ecosystem as discussed in [#1566][]. [#1565][] (@joverlee521)
 
 [#1565]: https://github.com/nextstrain/augur/pull/1565
 [#1566]: https://github.com/nextstrain/augur/issues/1566