From 77be5a7ef027e872f8c28ef6864d20246943ced6 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Mon, 25 Nov 2024 20:51:21 +0100 Subject: [PATCH 01/11] fix: explicitly specify how the root and ambiguous states are handled during sequence reconstruction and mutation counting --- augur/ancestral.py | 14 +++++++------- augur/refine.py | 5 ++++- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/augur/ancestral.py b/augur/ancestral.py index dad3497bc..9a59eca2c 100644 --- a/augur/ancestral.py +++ b/augur/ancestral.py @@ -85,7 +85,7 @@ def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True, # only infer ancestral sequences, leave branch length untouched tt.infer_ancestral_sequences(infer_gtr=infer_gtr, marginal=bool_marginal, - reconstruct_tip_states=infer_tips) + reconstruct_tip_states=infer_tips, sample_from_profile='root') return tt @@ -104,7 +104,7 @@ def create_mask(is_vcf, tt, reference_sequence, aln): only used if is_vcf aln : dict describes variation (relative to reference) per sample. Only used if is_vcf. - + Returns ------- numpy.ndarray(bool) @@ -122,7 +122,7 @@ def create_mask(is_vcf, tt, reference_sequence, aln): if alt=='N': ambig_positions.append(pos) # ambig_positions = [pos for pos, alt in sample_data.items() if alt=='N'] - np.add.at(ambiguous_count, ambig_positions, 1) + np.add.at(ambiguous_count, ambig_positions, 1) # Secondly, if the VCF defines no mutations but the ref is "N": no_info_sites = np.array(list(reference_sequence)) == 'N' no_info_sites[list(variable_sites)] = False @@ -171,7 +171,7 @@ def collect_mutations(tt, mask, character_map=None, reference_sequence=None, inf # Note that for mutations reported across the tree we don't have to consider # the mask, because while sites which are all Ns may have an inferred base, - # there will be no variablity and thus no mutations. + # there will be no variablity and thus no mutations. for n in tt.tree.find_clades(): data[n.name] = [a+str(int(pos)+inc)+cm(d) for a,pos,d in n.mutations] @@ -185,7 +185,7 @@ def collect_mutations(tt, mask, character_map=None, reference_sequence=None, inf data[tt.tree.root.name].append(f"{root_state}{pos+1}{tree_state}") return data - + def collect_sequences(tt, mask, reference_sequence=None, infer_ambiguous=False): """ Create a full sequence for every node on the tree. Masked positions will @@ -198,7 +198,7 @@ def collect_sequences(tt, mask, reference_sequence=None, infer_ambiguous=False): instance of treetime with valid ancestral reconstruction mask : numpy.ndarray(bool) Mask these positions by changing them to the ambiguous nucleotide - reference_sequence : str or None + reference_sequence : str or None infer_ambiguous : bool, optional if true, request the reconstructed sequences from treetime, otherwise retain input ambiguities @@ -437,7 +437,7 @@ def run(args): features['nuc'].location.end != anc_seqs['annotations']['nuc']['end']): raise AugurError(f"The 'nuc' annotation coordinates parsed from {args.annotation!r} ({features['nuc'].location.start+1}..{features['nuc'].location.end})" f" don't match the provided sequence data coordinates ({anc_seqs['annotations']['nuc']['start']}..{anc_seqs['annotations']['nuc']['end']}).") - + print("Read in {} features from reference sequence file".format(len(features))) for gene in genes: print(f"Processing gene: {gene}") diff --git a/augur/refine.py b/augur/refine.py index 158c86afd..408ec495e 100644 --- a/augur/refine.py +++ b/augur/refine.py @@ -313,7 +313,10 @@ def run(args): pass elif args.divergence_units=='mutations': if not args.timetree: - tt.infer_ancestral_sequences() + # infer ancestral sequence for the purpose of counting mutations + # sample mutations from the root profile, otherwise use most likely state. + # Reconstruct tip states to avoid mutations to N or W etc be counted + tt.infer_ancestral_sequences(marginal=True, reconstruct_tip_states='True', sample_from_profile='root') nuc_map = profile_maps['nuc'] def are_sequence_states_different(nuc1, nuc2): From 3adf44b13eee6e3b4055ed3f63f928028edf11d3 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Mon, 25 Nov 2024 20:57:29 +0100 Subject: [PATCH 02/11] add to changelog --- CHANGES.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index aaf8c06cb..f9497381b 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,9 +4,12 @@ ### Bug Fixes +* Explicitly specify how the root and ambiguous states are handled during sequence reconstruction and mutation counting. Fixes issue [#1689][] (@rneher) and was resolved in PR [#1690][]. * titers: Fix type errors in code associated with cross-validation of models. [#1688][] (@huddlej) [#1688]: https://github.com/nextstrain/augur/pull/1688 +[#1689]: https://github.com/nextstrain/augur/pull/1689 +[#1690]: https://github.com/nextstrain/augur/pull/1690 ## 27.0.0 (9 December 2024) From 22fa17f5dd44719e894e37db0517a8fac2e4ad1f Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Wed, 11 Dec 2024 21:40:14 +0100 Subject: [PATCH 03/11] fix: change string 'True' to bool Co-authored-by: Victor Lin <13424970+victorlin@users.noreply.github.com> --- augur/refine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/refine.py b/augur/refine.py index 408ec495e..c5cbdfe67 100644 --- a/augur/refine.py +++ b/augur/refine.py @@ -316,7 +316,7 @@ def run(args): # infer ancestral sequence for the purpose of counting mutations # sample mutations from the root profile, otherwise use most likely state. # Reconstruct tip states to avoid mutations to N or W etc be counted - tt.infer_ancestral_sequences(marginal=True, reconstruct_tip_states='True', sample_from_profile='root') + tt.infer_ancestral_sequences(marginal=True, reconstruct_tip_states=True, sample_from_profile='root') nuc_map = profile_maps['nuc'] def are_sequence_states_different(nuc1, nuc2): From 2b862de0cf65d9fa325f6f55efec1cf4344fa1dc Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Tue, 17 Dec 2024 13:58:52 -0800 Subject: [PATCH 04/11] Add and use random seed interface for ancestral Adds a new `--seed` argument to the ancestral command, following the existing pattern in the refine command, and passes the user-provided value to TreeTime's `TreeAnc` class to ensure deterministic outcomes for random samples. Since this PR changes ancestral's root sequence inference to be stochastic, this new feature allows us to get the same "random" result with each run of our functional tests. Correspondingly, this commit updates the non-VCF functional tests of ancestral to use a fixed seed. This seed should fix the random behavior to a specific outcome, but we need to update our expected JSON outputs to match that outcome here. In this case, the change is an introduction of a "T" at position 14 in the node_root of the "simple genome" data. --- augur/ancestral.py | 16 ++++++++++------ tests/functional/ancestral/cram/case-sensitive.t | 2 ++ tests/functional/ancestral/cram/general.t | 2 ++ .../data/simple-genome/nt_muts.no-ref-seq.json | 11 ++++++----- .../data/simple-genome/nt_muts.ref-seq.json | 10 ++++++---- 5 files changed, 26 insertions(+), 15 deletions(-) diff --git a/augur/ancestral.py b/augur/ancestral.py index 9a59eca2c..b1cb84f60 100644 --- a/augur/ancestral.py +++ b/augur/ancestral.py @@ -39,7 +39,7 @@ def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True, marginal=False, fill_overhangs=True, infer_tips=False, - alphabet='nuc'): + alphabet='nuc', rng_seed=None): """infer ancestral sequences using TreeTime Parameters @@ -68,6 +68,8 @@ def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True, alphabet to use for ancestral sequence inference. Default is the nucleotide alphabet that included a gap character 'nuc'. Alternative is `aa` for amino acids. + rng_seed : int, optional + random seed value to use for inference Returns ------- @@ -78,7 +80,7 @@ def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True, from treetime import TreeAnc tt = TreeAnc(tree=tree, aln=aln, ref=ref, gtr='JC69', alphabet=alphabet, - fill_overhangs=fill_overhangs, verbose=1) + fill_overhangs=fill_overhangs, verbose=1, rng_seed=rng_seed) # convert marginal (from args.inference) from 'joint' or 'marginal' to True or False bool_marginal = (marginal == "marginal") @@ -227,14 +229,14 @@ def collect_sequences(tt, mask, reference_sequence=None, infer_ambiguous=False): return sequences def run_ancestral(T, aln, reference_sequence=None, is_vcf=False, full_sequences=False, fill_overhangs=False, - infer_ambiguous=False, marginal=False, alphabet='nuc'): + infer_ambiguous=False, marginal=False, alphabet='nuc', rng_seed=None): """ ancestral nucleotide reconstruction using TreeTime """ tt = ancestral_sequence_inference(tree=T, aln=aln, ref=reference_sequence if is_vcf else None, marginal=marginal, fill_overhangs = fill_overhangs, alphabet=alphabet, - infer_tips = infer_ambiguous) + infer_tips = infer_ambiguous, rng_seed=rng_seed) character_map = {} for x in tt.gtr.profile_map: @@ -295,6 +297,7 @@ def register_parser(parent_subparsers): ) global_options_group.add_argument('--inference', default='joint', choices=["joint", "marginal"], help="calculate joint or marginal maximum likelihood ancestral sequence states") + global_options_group.add_argument('--seed', type=int, help="seed for random number generation") nucleotide_options_group = parser.add_argument_group( "nucleotide options", @@ -416,7 +419,8 @@ def run(args): infer_ambiguous = args.infer_ambiguous and not args.keep_ambiguous full_sequences = not is_vcf nuc_result = run_ancestral(T, aln, reference_sequence=ref if ref else None, is_vcf=is_vcf, fill_overhangs=not args.keep_overhangs, - full_sequences=full_sequences, marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='nuc') + full_sequences=full_sequences, marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='nuc', + rng_seed=args.seed) anc_seqs = nuc_result['mutations'] anc_seqs['reference'] = {'nuc': nuc_result['root_seq']} @@ -446,7 +450,7 @@ def run(args): reference_sequence = str(feat.extract(Seq(ref)).translate()) if ref else None aa_result = run_ancestral(T, fname, reference_sequence=reference_sequence, is_vcf=is_vcf, fill_overhangs=not args.keep_overhangs, - marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='aa') + marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='aa', rng_seed=args.seed) len_translated_alignment = aa_result['tt'].data.full_length*3 if len_translated_alignment != len(feat): raise AugurError(f"length of translated alignment for {gene} ({len_translated_alignment})" diff --git a/tests/functional/ancestral/cram/case-sensitive.t b/tests/functional/ancestral/cram/case-sensitive.t index fbe35df75..2df0b825d 100644 --- a/tests/functional/ancestral/cram/case-sensitive.t +++ b/tests/functional/ancestral/cram/case-sensitive.t @@ -11,6 +11,7 @@ Change the _reference_ to lowercase > --alignment "$TESTDIR/../data/simple-genome/sequences.fasta" \ > --root-sequence ref.lower.fasta \ > --output-node-data "nt_muts.ref-seq.json" \ + > --seed 314159 \ > --inference marginal > /dev/null $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \ @@ -33,6 +34,7 @@ be lowecase which will be compared against the uppercase reference > --alignment sequences.lower.fasta \ > --root-sequence "$TESTDIR/../data/simple-genome/reference.fasta" \ > --output-node-data "nt_muts.ref-seq.json" \ + > --seed 314159 \ > --inference marginal > /dev/null $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \ diff --git a/tests/functional/ancestral/cram/general.t b/tests/functional/ancestral/cram/general.t index 9591db36a..1f68d8a9a 100644 --- a/tests/functional/ancestral/cram/general.t +++ b/tests/functional/ancestral/cram/general.t @@ -14,6 +14,7 @@ node-data JSON we diff against. > --alignment "$TESTDIR/../data/simple-genome/sequences.fasta" \ > --root-sequence "$TESTDIR/../data/simple-genome/reference.fasta" \ > --output-node-data "nt_muts.ref-seq.json" \ + > --seed 314159 \ > --inference marginal > /dev/null @@ -34,6 +35,7 @@ mutations (as there's nothing to compare the root node to) > --tree "$TESTDIR/../data/simple-genome/tree.nwk" \ > --alignment "$TESTDIR/../data/simple-genome/sequences.fasta" \ > --output-node-data "nt_muts.no-ref-seq.json" \ + > --seed 314159 \ > --inference marginal > /dev/null diff --git a/tests/functional/ancestral/data/simple-genome/nt_muts.no-ref-seq.json b/tests/functional/ancestral/data/simple-genome/nt_muts.no-ref-seq.json index 402055aa5..69e72ee12 100644 --- a/tests/functional/ancestral/data/simple-genome/nt_muts.no-ref-seq.json +++ b/tests/functional/ancestral/data/simple-genome/nt_muts.no-ref-seq.json @@ -16,7 +16,6 @@ "node_AB": { "muts": [ "A7G", - "C14T", "T18C", "A33G", "A43T" @@ -25,7 +24,7 @@ }, "node_root": { "muts": [], - "sequence": "AAAACAAAAATGCCCTGTGGGTAAAAAAAAAAAAACTACTTGACCATAAA" + "sequence": "AAAACAAAAATGCTCTGTGGGTAAAAAAAAAAAAACTACTTGACCATAAA" }, "sample_A": { "muts": [ @@ -41,11 +40,13 @@ "sequence": "AAAACAGAAATGCTCTGCGGGTAAAAAAAAAAGAACTACTTATCCATAAA" }, "sample_C": { - "muts": [], + "muts": [ + "T14C" + ], "sequence": "AAAACAAAAATGCCCTGTGGGTAAAAAAAAAAAAACTACTTGACCATAAA" } }, "reference": { - "nuc": "AAAACAAAAATGCCCTGTGGGTAAAAAAAAAAAAACTACTTGACCATAAA" + "nuc": "AAAACAAAAATGCTCTGTGGGTAAAAAAAAAAAAACTACTTGACCATAAA" } -} \ No newline at end of file +} diff --git a/tests/functional/ancestral/data/simple-genome/nt_muts.ref-seq.json b/tests/functional/ancestral/data/simple-genome/nt_muts.ref-seq.json index b059b5647..dd92237cc 100644 --- a/tests/functional/ancestral/data/simple-genome/nt_muts.ref-seq.json +++ b/tests/functional/ancestral/data/simple-genome/nt_muts.ref-seq.json @@ -12,7 +12,6 @@ "node_AB": { "muts": [ "A7G", - "C14T", "T18C", "A33G", "A43T" @@ -22,9 +21,10 @@ "node_root": { "muts": [ "A5C", + "C14T", "C18T" ], - "sequence": "AAAACAAAAATGCCCTGTGGGTAAAAAAAAAAAAACTACTTGACCATAAA" + "sequence": "AAAACAAAAATGCTCTGTGGGTAAAAAAAAAAAAACTACTTGACCATAAA" }, "sample_A": { "muts": [ @@ -40,11 +40,13 @@ "sequence": "AAAACAGAAATGCTCTGCGGGTAAAAAAAAAAGAACTACTTATCCATAAA" }, "sample_C": { - "muts": [], + "muts": [ + "T14C" + ], "sequence": "AAAACAAAAATGCCCTGTGGGTAAAAAAAAAAAAACTACTTGACCATAAA" } }, "reference": { "nuc": "AAAAAAAAAATGCCCTGCGGGTAAAAAAAAAAAAACTACTTGACCATAAA" } -} \ No newline at end of file +} From 0da5c11011b50e641a1a206474631c6fbf67981c Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Tue, 17 Dec 2024 14:04:23 -0800 Subject: [PATCH 05/11] Use random seed in simple VCF test of ancestral As in the previous commit, this commit updates the expected VCF output to match the fixed "random" outcome for the given seed where position 14 is a T in the root node. --- tests/functional/ancestral/cram/vcf.t | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/functional/ancestral/cram/vcf.t b/tests/functional/ancestral/cram/vcf.t index ae234d7f5..d1f2f3f67 100644 --- a/tests/functional/ancestral/cram/vcf.t +++ b/tests/functional/ancestral/cram/vcf.t @@ -15,6 +15,7 @@ but it will have the reference sequence attached. > --vcf-reference $DATA/reference.fasta \ > --output-node-data "nt_muts.vcf-input.ref-seq.json" \ > --output-vcf nt_muts.vcf \ + > --seed 314159 \ > --inference marginal > /dev/null @@ -33,7 +34,7 @@ here as it's not relevant to what I'm trying to test. > #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT node_root sample_C node_AB sample_B sample_A > 1 5 . A C . PASS . GT 1 1 1 1 1 > 1 7 . A G . PASS . GT 0 0 1 1 1 - > 1 14 . C T . PASS . GT 0 0 1 1 1 + > 1 14 . C T . PASS . GT 1 0 1 1 1 > 1 18 . C T . PASS . GT 1 1 0 0 0 > 1 33 . A C,G . PASS . GT 0 0 2 2 1 > 1 39 . C T . PASS . GT 0 0 0 0 1 From e0a9a1cc0f536ca0be189b354619a6ca7ec52f36 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Tue, 17 Dec 2024 14:06:48 -0800 Subject: [PATCH 06/11] Use random seed for multiallele VCF ancestral test Updates the functional test of multiallele VCF behavior when one of the alleles is a "N" character. The multiallele VCF in the original test (setting sample B to genotype of 30G) caused the inference of alleles at sites 7 and 14 to change substantially in this PR's new implementation. This commit stabilizes the test by assigning both samples A and B to the 30G genotype such that all nodes are inferred to have a 30G and the root node gets a A30G mutation. It should maintain the originally intended behavior of the test [1]. As part of this change, I've replaced the dynamically modified VCF used by this test with a static version of the VCF in version control. This change simplified test debugging by allowing me to inspect the input file outside of the cram temporary environment and it also fixes an issue with the original dynamic VCF where an additional line for site 30 was added but the original line was not being removed. [1] https://github.com/nextstrain/augur/issues/1380 --- .../functional/ancestral/cram/vcf-multi-allele.t | 14 ++++++-------- .../simple-genome/snps_with_multiple_alleles.vcf | 15 +++++++++++++++ 2 files changed, 21 insertions(+), 8 deletions(-) create mode 100644 tests/functional/ancestral/data/simple-genome/snps_with_multiple_alleles.vcf diff --git a/tests/functional/ancestral/cram/vcf-multi-allele.t b/tests/functional/ancestral/cram/vcf-multi-allele.t index dfa52a7f4..cb26dc8e9 100644 --- a/tests/functional/ancestral/cram/vcf-multi-allele.t +++ b/tests/functional/ancestral/cram/vcf-multi-allele.t @@ -4,20 +4,18 @@ Setup $ export DATA="$TESTDIR/../data/simple-genome" -We take the same `snps.vcf` file used in `general.t` but add another +We used a modified `snps.vcf` file used in `vcf.t` but with an additional allele at site 30 - sample_B has a "G". Since the root is A and this is the only sample with G it's 'A30G'. See for the bug this is testing. - $ sed '11s/^/1\t30\t.\tA\tG,N\t.\t.\t.\tGT\t0\t1\t2\n/' \ - > "$DATA/snps.vcf" > snps2.vcf - $ ${AUGUR} ancestral \ > --tree $DATA/tree.nwk \ - > --alignment snps2.vcf \ + > --alignment $DATA/snps_with_multiple_alleles.vcf \ > --vcf-reference $DATA/reference.fasta \ > --output-node-data nt_muts.json \ > --output-vcf nt_muts.vcf \ + > --seed 314159 \ > --inference marginal > /dev/null @@ -25,15 +23,15 @@ See for the bug this is testin > "$DATA/nt_muts.ref-seq.json" \ > nt_muts.json \ > --exclude-regex-paths "root\['nodes'\]\['.+'\]\['sequence'\]" "root\['generated_by'\]" - {'iterable_item_added': {"root['nodes']['sample_B']['muts'][0]": 'A30G'}} + {'iterable_item_added': {"root['nodes']['node_root']['muts'][3]": 'A30G'}} $ cat > expected.vcf < #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT node_root sample_C node_AB sample_B sample_A > 1 5 . A C . PASS . GT 1 1 1 1 1 > 1 7 . A G . PASS . GT 0 0 1 1 1 - > 1 14 . C T . PASS . GT 0 0 1 1 1 + > 1 14 . C T . PASS . GT 1 0 1 1 1 > 1 18 . C T . PASS . GT 1 1 0 0 0 - > 1 30 . A G . PASS . GT 0 0 0 1 0 + > 1 30 . A G . PASS . GT 1 1 1 1 1 > 1 33 . A C,G . PASS . GT 0 0 2 2 1 > 1 39 . C T . PASS . GT 0 0 0 0 1 > 1 42 . G A . PASS . GT 0 0 0 1 0 diff --git a/tests/functional/ancestral/data/simple-genome/snps_with_multiple_alleles.vcf b/tests/functional/ancestral/data/simple-genome/snps_with_multiple_alleles.vcf new file mode 100644 index 000000000..4a311800d --- /dev/null +++ b/tests/functional/ancestral/data/simple-genome/snps_with_multiple_alleles.vcf @@ -0,0 +1,15 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample_A sample_B sample_C +1 5 . A C . . . GT 1 1 1 +1 7 . A G . . . GT 1 1 0 +1 14 . C T . . . GT 1 1 0 +1 18 . C T . . . GT 0 0 1 +1 28 . A N . . . GT 0 0 1 +1 29 . A N . . . GT 0 0 1 +1 30 . A G,N . . . GT 1 1 2 +1 33 . A C,G . . . GT 1 2 0 +1 39 . C T . . . GT 1 0 0 +1 42 . G A . . . GT 0 1 0 +1 43 . A T . . . GT 1 1 0 From 10881771e373cf895249fcb9c40cc3bb5c736ad1 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Tue, 17 Dec 2024 14:43:35 -0800 Subject: [PATCH 07/11] Update expected branch lengths for functional test The change to how augur refine can stochastically assign the root node mutations led to a change in inferred branch lengths for our functional tests. Since the test uses a random seed argument for augur refine, the results of subsequent runs should be deterministically successful after this change. --- tests/functional/refine/data/integer_branch_lengths.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/functional/refine/data/integer_branch_lengths.json b/tests/functional/refine/data/integer_branch_lengths.json index e577c8787..ee9b99082 100644 --- a/tests/functional/refine/data/integer_branch_lengths.json +++ b/tests/functional/refine/data/integer_branch_lengths.json @@ -46,10 +46,10 @@ "branch_length": 2 }, "NODE_0000006": { - "branch_length": 11 + "branch_length": 10 }, "NODE_0000007": { - "branch_length": 2 + "branch_length": 3 }, "NODE_0000008": { "branch_length": 6 From 28dfae9491aee3a7c7548c829dc164f7d2677966 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Tue, 17 Dec 2024 16:35:24 -0800 Subject: [PATCH 08/11] Align data for ancestral and translate tests Updates data for augur translate functional tests to match the updated test data for the ancestral command. --- tests/functional/translate/cram/root-mutations.t | 2 +- .../translate/data/simple-genome/aa_muts.json | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/tests/functional/translate/cram/root-mutations.t b/tests/functional/translate/cram/root-mutations.t index de9c0f55d..151f36e93 100644 --- a/tests/functional/translate/cram/root-mutations.t +++ b/tests/functional/translate/cram/root-mutations.t @@ -28,4 +28,4 @@ is unchanged (MPCG*). There is also a mutation E4G at the root node to compensat > "$DATA/aa_muts.json" \ > "aa_muts.json" \ > --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" "root['meta']['updated']" - {'values_changed': {"root['reference']['gene1']": {'new_value': 'MPCE*', 'old_value': 'MPCG*'}}, 'iterable_item_added': {"root['nodes']['node_root']['aa_muts']['gene1'][0]": 'E4G'}} \ No newline at end of file + {'values_changed': {"root['reference']['gene1']": {'new_value': 'MPCE*', 'old_value': 'MPCG*'}}, 'iterable_item_added': {"root['nodes']['node_root']['aa_muts']['gene1'][1]": 'E4G'}} diff --git a/tests/functional/translate/data/simple-genome/aa_muts.json b/tests/functional/translate/data/simple-genome/aa_muts.json index 375ee7cb3..0fc6298bb 100644 --- a/tests/functional/translate/data/simple-genome/aa_muts.json +++ b/tests/functional/translate/data/simple-genome/aa_muts.json @@ -29,9 +29,7 @@ "nodes": { "node_AB": { "aa_muts": { - "gene1": [ - "P2L" - ], + "gene1": [], "gene2": [ "V2D" ] @@ -39,11 +37,13 @@ }, "node_root": { "aa_muts": { - "gene1": [], + "gene1": [ + "P2L" + ], "gene2": [] }, "aa_sequences": { - "gene1": "MPCG*", + "gene1": "MLCG*", "gene2": "MVK*" } }, @@ -61,7 +61,9 @@ }, "sample_C": { "aa_muts": { - "gene1": [], + "gene1": [ + "L2P" + ], "gene2": [] } } @@ -70,4 +72,4 @@ "gene1": "MPCG*", "gene2": "MVK*" } -} \ No newline at end of file +} From bb9603cd0c7fc44e894e2cf20994d5261d7541f1 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Wed, 18 Dec 2024 08:58:12 -0800 Subject: [PATCH 09/11] Fix test data for augur translate tests Updates SNPs in manually-curated VCF used to test augur translate so they match the new expected genotype for the "simple genome" data at position 14. This fixes the vcf.t translate test. Also updates the expected amino acid translations JSON for the vcf with root mutation test. As part of this update, I've created a complete copy of the "truth" aa_muts.json with the root mutation and used this instead of the version that was previously created by modifying the original aa_muts file with sed. Although this change duplicates test data, I was able to debug the test more easily with a complete version of the JSON living outside of the cram tests. --- .../translate/cram/vcf-with-root-mutation.t | 12 ++- .../aa_muts_with_root_mutation.json | 76 +++++++++++++++++++ .../data/simple-genome/snps-inferred.vcf | 2 +- 3 files changed, 82 insertions(+), 8 deletions(-) create mode 100644 tests/functional/translate/data/simple-genome/aa_muts_with_root_mutation.json diff --git a/tests/functional/translate/cram/vcf-with-root-mutation.t b/tests/functional/translate/cram/vcf-with-root-mutation.t index 36b4a4f65..ef808cc8b 100644 --- a/tests/functional/translate/cram/vcf-with-root-mutation.t +++ b/tests/functional/translate/cram/vcf-with-root-mutation.t @@ -34,14 +34,12 @@ The _reference_ produced is the actual reference, not using the mutations in the MVK* (no-eol) However the aa_mutations should annotate the aa_sequence on the root node as -having the G3E AA mutation, i.e. MPCE* instead of MPCG*, as well as a -corresponding AA mutation on the root node G4E (i.e. reference is G, but root -node is E (and so are all the other nodes)) - - $ sed '46s/MPCG/MPCE/' "$DATA/aa_muts.json" | sed '42s/\[\]/\["G4E"\]/' > aa_muts.truth.json +having the G4E AA mutation, as well as a corresponding AA mutation on the root +node G4E (i.e. reference is G, but root node is E (and so are all the other +nodes)). $ python3 "$SCRIPTS/diff_jsons.py" \ - > aa_muts.truth.json \ + > $DATA/aa_muts_with_root_mutation.json \ > aa_muts.json \ > --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" "root['meta']['updated']" - {} \ No newline at end of file + {} diff --git a/tests/functional/translate/data/simple-genome/aa_muts_with_root_mutation.json b/tests/functional/translate/data/simple-genome/aa_muts_with_root_mutation.json new file mode 100644 index 000000000..794cfe42f --- /dev/null +++ b/tests/functional/translate/data/simple-genome/aa_muts_with_root_mutation.json @@ -0,0 +1,76 @@ +{ + "annotations": { + "gene1": { + "end": 24, + "seqid": "data/reference.gff", + "start": 10, + "strand": "+", + "type": "gene" + }, + "gene2": { + "end": 47, + "seqid": "data/reference.gff", + "start": 36, + "strand": "-", + "type": "gene" + }, + "nuc": { + "end": 50, + "seqid": "data/reference.gff", + "start": 1, + "strand": "+", + "type": "##sequence-region pragma" + } + }, + "generated_by": { + "program": "augur", + "version": "23.1.1" + }, + "nodes": { + "node_AB": { + "aa_muts": { + "gene1": [], + "gene2": [ + "V2D" + ] + } + }, + "node_root": { + "aa_muts": { + "gene1": [ + "P2L", + "G4E" + ], + "gene2": [] + }, + "aa_sequences": { + "gene1": "MLCE*", + "gene2": "MVK*" + } + }, + "sample_A": { + "aa_muts": { + "gene1": [], + "gene2": [] + } + }, + "sample_B": { + "aa_muts": { + "gene1": [], + "gene2": [] + } + }, + "sample_C": { + "aa_muts": { + "gene1": [ + "L2P" + ], + "gene2": [] + } + } + }, + "reference": { + "gene1": "MPCG*", + "gene2": "MVK*" + } +} diff --git a/tests/functional/translate/data/simple-genome/snps-inferred.vcf b/tests/functional/translate/data/simple-genome/snps-inferred.vcf index b5a3c571e..e267817b4 100644 --- a/tests/functional/translate/data/simple-genome/snps-inferred.vcf +++ b/tests/functional/translate/data/simple-genome/snps-inferred.vcf @@ -5,7 +5,7 @@ #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT node_root node_AB sample_A sample_B sample_C 1 5 . A C . . . GT 1 1 1 1 1 1 7 . A G . . . GT 0 1 1 1 0 -1 14 . C T . . . GT 0 1 1 1 0 +1 14 . C T . . . GT 1 1 1 1 0 1 18 . C T . . . GT 1 0 0 0 1 1 33 . A G,C . . . GT 0 1 2 1 0 1 39 . C T . . . GT 0 0 1 0 0 From d39198977aeb63c6bda732e94005c3119d7b2c07 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Wed, 18 Dec 2024 09:31:05 -0800 Subject: [PATCH 10/11] Use random seed in all ancestral tests Adds random seed argument to all functional tests of the ancestral command. This should hopefully resolve remaining stochastic changes in cram runs during CI. --- tests/functional/ancestral/cram/ambiguous-positions.t | 2 +- .../ancestral/cram/default-nucleotide-reconstruction.t | 1 + .../ancestral/cram/infer-ambiguous-nucleotides.t | 1 + .../cram/infer-amino-acid-sequences-genes-file.t | 1 + .../infer-amino-acid-sequences-with-root-sequence.t | 1 + .../ancestral/cram/infer-amino-acid-sequences.t | 1 + tests/functional/ancestral/cram/invalid-args.t | 10 ++++++++-- .../ancestral/cram/keep-ambiguous-nucleotides.t | 1 + 8 files changed, 15 insertions(+), 3 deletions(-) diff --git a/tests/functional/ancestral/cram/ambiguous-positions.t b/tests/functional/ancestral/cram/ambiguous-positions.t index 0fc7b124e..ae82549d7 100644 --- a/tests/functional/ancestral/cram/ambiguous-positions.t +++ b/tests/functional/ancestral/cram/ambiguous-positions.t @@ -19,6 +19,7 @@ Setup > --tree "$DATA/tree.nwk" \ > --alignment snps.vcf \ > --vcf-reference reference.fasta \ + > --seed 314159 \ > --output-node-data nt_muts.json \ > --output-vcf nt_muts.vcf > /dev/null @@ -44,4 +45,3 @@ Setup > --vcf nt_muts.vcf \ > --json nt_muts.json \ > --ref reference.fasta > /dev/null - diff --git a/tests/functional/ancestral/cram/default-nucleotide-reconstruction.t b/tests/functional/ancestral/cram/default-nucleotide-reconstruction.t index cc38cdbfe..2f3b59bdd 100644 --- a/tests/functional/ancestral/cram/default-nucleotide-reconstruction.t +++ b/tests/functional/ancestral/cram/default-nucleotide-reconstruction.t @@ -8,6 +8,7 @@ The default is to infer ambiguous bases, so there should not be N bases in the i $ ${AUGUR} ancestral \ > --tree $TESTDIR/../data/tree.nwk \ > --alignment $TESTDIR/../data/aligned.fasta \ + > --seed 314159 \ > --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \ > --output-sequences "$CRAMTMP/$TESTFILE/ancestral_sequences.fasta" > /dev/null diff --git a/tests/functional/ancestral/cram/infer-ambiguous-nucleotides.t b/tests/functional/ancestral/cram/infer-ambiguous-nucleotides.t index 96700d336..d6a7e1926 100644 --- a/tests/functional/ancestral/cram/infer-ambiguous-nucleotides.t +++ b/tests/functional/ancestral/cram/infer-ambiguous-nucleotides.t @@ -9,6 +9,7 @@ There should not be N bases in the inferred output sequences. > --tree $TESTDIR/../data/tree.nwk \ > --alignment $TESTDIR/../data/aligned.fasta \ > --infer-ambiguous \ + > --seed 314159 \ > --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \ > --output-sequences "$CRAMTMP/$TESTFILE/ancestral_sequences.fasta" > /dev/null diff --git a/tests/functional/ancestral/cram/infer-amino-acid-sequences-genes-file.t b/tests/functional/ancestral/cram/infer-amino-acid-sequences-genes-file.t index 09ed75bb4..164a9e45c 100644 --- a/tests/functional/ancestral/cram/infer-amino-acid-sequences-genes-file.t +++ b/tests/functional/ancestral/cram/infer-amino-acid-sequences-genes-file.t @@ -10,6 +10,7 @@ Infer ancestral nucleotide and amino acid sequences, using a genes file. > --annotation $TESTDIR/../data/zika_outgroup.gb \ > --genes $TESTDIR/../data/genes.txt \ > --translations $TESTDIR/../data/aa_sequences_%GENE.fasta \ + > --seed 314159 \ > --output-node-data ancestral_mutations.json \ > --output-sequences ancestral_sequences.fasta \ > --output-translations ancestral_aa_sequences_%GENE.fasta > /dev/null diff --git a/tests/functional/ancestral/cram/infer-amino-acid-sequences-with-root-sequence.t b/tests/functional/ancestral/cram/infer-amino-acid-sequences-with-root-sequence.t index 0d61ca296..333fb0fb0 100644 --- a/tests/functional/ancestral/cram/infer-amino-acid-sequences-with-root-sequence.t +++ b/tests/functional/ancestral/cram/infer-amino-acid-sequences-with-root-sequence.t @@ -15,6 +15,7 @@ ancestor). > --root-sequence $TESTDIR/../data/zika_outgroup.gb \ > --genes ENV PRO \ > --translations $TESTDIR/../data/aa_sequences_%GENE.fasta \ + > --seed 314159 \ > --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" > /dev/null Check that the reference length was correctly exported as the nuc annotation diff --git a/tests/functional/ancestral/cram/infer-amino-acid-sequences.t b/tests/functional/ancestral/cram/infer-amino-acid-sequences.t index 35804fdf1..d1c3a8f23 100644 --- a/tests/functional/ancestral/cram/infer-amino-acid-sequences.t +++ b/tests/functional/ancestral/cram/infer-amino-acid-sequences.t @@ -10,6 +10,7 @@ Infer ancestral nucleotide and amino acid sequences. > --annotation $TESTDIR/../data/zika_outgroup.gb \ > --genes ENV PRO \ > --translations $TESTDIR/../data/aa_sequences_%GENE.fasta \ + > --seed 314159 \ > --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \ > --output-sequences "$CRAMTMP/$TESTFILE/ancestral_sequences.fasta" \ > --output-translations "$CRAMTMP/$TESTFILE/ancestral_aa_sequences_%GENE.fasta" > /dev/null diff --git a/tests/functional/ancestral/cram/invalid-args.t b/tests/functional/ancestral/cram/invalid-args.t index e65d64768..5cc1d8632 100644 --- a/tests/functional/ancestral/cram/invalid-args.t +++ b/tests/functional/ancestral/cram/invalid-args.t @@ -7,6 +7,7 @@ Input FASTA + VCF output is not possible $ ${AUGUR} ancestral \ > --tree $TESTDIR/../data/tree.nwk \ > --alignment $TESTDIR/../data/aligned.fasta \ + > --seed 314159 \ > --output-vcf "output.vcf" > /dev/null ERROR: VCF output has been requested but the input alignment is not VCF. [2] @@ -16,6 +17,7 @@ Input VCF + FASTA output is not possible (Note that the input file doesn't exist $ ${AUGUR} ancestral \ > --tree $TESTDIR/../data/tree.nwk \ > --alignment $TESTDIR/../data/snps.vcf \ + > --seed 314159 \ > --output-sequences "output.fasta" > /dev/null ERROR: Sequence (fasta) output has been requested but the input alignment is VCF. [2] @@ -25,6 +27,7 @@ Output FASTA _and_ VCF is not possible $ ${AUGUR} ancestral \ > --tree $TESTDIR/../data/tree.nwk \ > --alignment $TESTDIR/../data/aligned.fasta \ + > --seed 314159 \ > --output-vcf "output.vcf" \ > --output-sequences "output.fasta" > /dev/null ERROR: Both sequence (fasta) and VCF output have been requested, but these are incompatible. @@ -39,28 +42,31 @@ This should fail. > --alignment $TESTDIR/../data/aligned.fasta \ > --annotation $TESTDIR/../data/zika_outgroup.gb \ > --genes ENV PRO \ + > --seed 314159 \ > --output-node-data "ancestral_mutations.json" > /dev/null ERROR: For amino acid sequence reconstruction, you must provide an annotation file, a list of genes, and a template path to amino acid sequences. [2] -Missing tree file +Missing tree file $ ${AUGUR} ancestral \ > --tree $TESTDIR/../data/tree-doesnt-exist.nwk \ > --alignment $TESTDIR/../data/aligned.fasta \ + > --seed 314159 \ > --output-sequences "output.fasta" > /dev/null ERROR: The provided tree file .* doesn't exist (re) [2] Attempting to use FASTA-input reference and VCF-input reference args -(The files here don't exist, but we exit before they're checked) +(The files here don't exist, but we exit before they're checked) $ ${AUGUR} ancestral \ > --tree $TESTDIR/../data/tree-doesnt-exist.nwk \ > --alignment $TESTDIR/../data/aligned.fasta \ > --root-sequence $TESTDIR/../data/reference.fasta \ > --vcf-reference $TESTDIR/../data/reference.fasta \ + > --seed 314159 \ > --output-sequences "output.fasta" > /dev/null 2>"err-args.txt" [2] diff --git a/tests/functional/ancestral/cram/keep-ambiguous-nucleotides.t b/tests/functional/ancestral/cram/keep-ambiguous-nucleotides.t index f8d4fc597..55dac8e32 100644 --- a/tests/functional/ancestral/cram/keep-ambiguous-nucleotides.t +++ b/tests/functional/ancestral/cram/keep-ambiguous-nucleotides.t @@ -9,6 +9,7 @@ There should be N bases in the inferred output sequences. > --tree $TESTDIR/../data/tree.nwk \ > --alignment $TESTDIR/../data/aligned.fasta \ > --keep-ambiguous \ + > --seed 314159 \ > --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \ > --output-sequences "$CRAMTMP/$TESTFILE/ancestral_sequences.fasta" > /dev/null From d618f5325b80c0a1c38cb30a79bb16ebcc476b2e Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Thu, 19 Dec 2024 08:29:20 -0800 Subject: [PATCH 11/11] Update change log --- CHANGES.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index f9497381b..9d209f767 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,13 +2,16 @@ ## __NEXT__ +### Features + +* ancestral: Add `--seed` argument to enable deterministic inference of root states by TreeTime. [#1690][] (@huddlej) + ### Bug Fixes -* Explicitly specify how the root and ambiguous states are handled during sequence reconstruction and mutation counting. Fixes issue [#1689][] (@rneher) and was resolved in PR [#1690][]. +* ancestral, refine: Explicitly specify how the root and ambiguous states are handled during sequence reconstruction and mutation counting. [#1690][] (@rneher) * titers: Fix type errors in code associated with cross-validation of models. [#1688][] (@huddlej) [#1688]: https://github.com/nextstrain/augur/pull/1688 -[#1689]: https://github.com/nextstrain/augur/pull/1689 [#1690]: https://github.com/nextstrain/augur/pull/1690 ## 27.0.0 (9 December 2024) @@ -32,8 +35,8 @@ ### Features -- This is the first version to officially support Python 3.12 and Pandas v2. [#1671] [#1678] (@corneliusroemer, @victorlin) -- curate: change output metadata to [RFC 4180 CSV-like TSVs][] to match the TSV format output by other Augur subcommands and the Nextstrain ecosystem as discussed in [#1566][]. [#1565][] (@joverlee521) +* This is the first version to officially support Python 3.12 and Pandas v2. [#1671] [#1678] (@corneliusroemer, @victorlin) +* curate: change output metadata to [RFC 4180 CSV-like TSVs][] to match the TSV format output by other Augur subcommands and the Nextstrain ecosystem as discussed in [#1566][]. [#1565][] (@joverlee521) [#1565]: https://github.com/nextstrain/augur/pull/1565 [#1566]: https://github.com/nextstrain/augur/issues/1566