Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: explicitly specify how the root and ambiguous states are handled… #1690

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
10 changes: 8 additions & 2 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,17 @@

## __NEXT__

### Features

* ancestral: Add `--seed` argument to enable deterministic inference of root states by TreeTime. [#1690][] (@huddlej)

### Bug Fixes

* ancestral, refine: Explicitly specify how the root and ambiguous states are handled during sequence reconstruction and mutation counting. [#1690][] (@rneher)
* titers: Fix type errors in code associated with cross-validation of models. [#1688][] (@huddlej)

[#1688]: https://github.com/nextstrain/augur/pull/1688
[#1690]: https://github.com/nextstrain/augur/pull/1690

## 27.0.0 (9 December 2024)

Expand All @@ -29,8 +35,8 @@

### Features

- This is the first version to officially support Python 3.12 and Pandas v2. [#1671] [#1678] (@corneliusroemer, @victorlin)
- curate: change output metadata to [RFC 4180 CSV-like TSVs][] to match the TSV format output by other Augur subcommands and the Nextstrain ecosystem as discussed in [#1566][]. [#1565][] (@joverlee521)
* This is the first version to officially support Python 3.12 and Pandas v2. [#1671] [#1678] (@corneliusroemer, @victorlin)
* curate: change output metadata to [RFC 4180 CSV-like TSVs][] to match the TSV format output by other Augur subcommands and the Nextstrain ecosystem as discussed in [#1566][]. [#1565][] (@joverlee521)

[#1565]: https://github.com/nextstrain/augur/pull/1565
[#1566]: https://github.com/nextstrain/augur/issues/1566
Expand Down
30 changes: 17 additions & 13 deletions augur/ancestral.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@

def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
marginal=False, fill_overhangs=True, infer_tips=False,
alphabet='nuc'):
alphabet='nuc', rng_seed=None):
"""infer ancestral sequences using TreeTime

Parameters
Expand Down Expand Up @@ -68,6 +68,8 @@ def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
alphabet to use for ancestral sequence inference. Default is the nucleotide
alphabet that included a gap character 'nuc'. Alternative is `aa` for amino
acids.
rng_seed : int, optional
random seed value to use for inference

Returns
-------
Expand All @@ -78,14 +80,14 @@ def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
from treetime import TreeAnc

tt = TreeAnc(tree=tree, aln=aln, ref=ref, gtr='JC69', alphabet=alphabet,
fill_overhangs=fill_overhangs, verbose=1)
fill_overhangs=fill_overhangs, verbose=1, rng_seed=rng_seed)

# convert marginal (from args.inference) from 'joint' or 'marginal' to True or False
bool_marginal = (marginal == "marginal")

# only infer ancestral sequences, leave branch length untouched
tt.infer_ancestral_sequences(infer_gtr=infer_gtr, marginal=bool_marginal,
reconstruct_tip_states=infer_tips)
reconstruct_tip_states=infer_tips, sample_from_profile='root')

return tt

Expand All @@ -104,7 +106,7 @@ def create_mask(is_vcf, tt, reference_sequence, aln):
only used if is_vcf
aln : dict
describes variation (relative to reference) per sample. Only used if is_vcf.

Returns
-------
numpy.ndarray(bool)
Expand All @@ -122,7 +124,7 @@ def create_mask(is_vcf, tt, reference_sequence, aln):
if alt=='N':
ambig_positions.append(pos)
# ambig_positions = [pos for pos, alt in sample_data.items() if alt=='N']
np.add.at(ambiguous_count, ambig_positions, 1)
np.add.at(ambiguous_count, ambig_positions, 1)
# Secondly, if the VCF defines no mutations but the ref is "N":
no_info_sites = np.array(list(reference_sequence)) == 'N'
no_info_sites[list(variable_sites)] = False
Expand Down Expand Up @@ -171,7 +173,7 @@ def collect_mutations(tt, mask, character_map=None, reference_sequence=None, inf

# Note that for mutations reported across the tree we don't have to consider
# the mask, because while sites which are all Ns may have an inferred base,
# there will be no variablity and thus no mutations.
# there will be no variablity and thus no mutations.
for n in tt.tree.find_clades():
data[n.name] = [a+str(int(pos)+inc)+cm(d)
for a,pos,d in n.mutations]
Expand All @@ -185,7 +187,7 @@ def collect_mutations(tt, mask, character_map=None, reference_sequence=None, inf
data[tt.tree.root.name].append(f"{root_state}{pos+1}{tree_state}")

return data

def collect_sequences(tt, mask, reference_sequence=None, infer_ambiguous=False):
"""
Create a full sequence for every node on the tree. Masked positions will
Expand All @@ -198,7 +200,7 @@ def collect_sequences(tt, mask, reference_sequence=None, infer_ambiguous=False):
instance of treetime with valid ancestral reconstruction
mask : numpy.ndarray(bool)
Mask these positions by changing them to the ambiguous nucleotide
reference_sequence : str or None
reference_sequence : str or None
infer_ambiguous : bool, optional
if true, request the reconstructed sequences from treetime, otherwise retain input ambiguities

Expand Down Expand Up @@ -227,14 +229,14 @@ def collect_sequences(tt, mask, reference_sequence=None, infer_ambiguous=False):
return sequences

def run_ancestral(T, aln, reference_sequence=None, is_vcf=False, full_sequences=False, fill_overhangs=False,
infer_ambiguous=False, marginal=False, alphabet='nuc'):
infer_ambiguous=False, marginal=False, alphabet='nuc', rng_seed=None):
"""
ancestral nucleotide reconstruction using TreeTime
"""

tt = ancestral_sequence_inference(tree=T, aln=aln, ref=reference_sequence if is_vcf else None, marginal=marginal,
fill_overhangs = fill_overhangs, alphabet=alphabet,
infer_tips = infer_ambiguous)
infer_tips = infer_ambiguous, rng_seed=rng_seed)

character_map = {}
for x in tt.gtr.profile_map:
Expand Down Expand Up @@ -295,6 +297,7 @@ def register_parser(parent_subparsers):
)
global_options_group.add_argument('--inference', default='joint', choices=["joint", "marginal"],
help="calculate joint or marginal maximum likelihood ancestral sequence states")
global_options_group.add_argument('--seed', type=int, help="seed for random number generation")

nucleotide_options_group = parser.add_argument_group(
"nucleotide options",
Expand Down Expand Up @@ -416,7 +419,8 @@ def run(args):
infer_ambiguous = args.infer_ambiguous and not args.keep_ambiguous
full_sequences = not is_vcf
nuc_result = run_ancestral(T, aln, reference_sequence=ref if ref else None, is_vcf=is_vcf, fill_overhangs=not args.keep_overhangs,
full_sequences=full_sequences, marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='nuc')
full_sequences=full_sequences, marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='nuc',
rng_seed=args.seed)
anc_seqs = nuc_result['mutations']
anc_seqs['reference'] = {'nuc': nuc_result['root_seq']}

Expand All @@ -437,7 +441,7 @@ def run(args):
features['nuc'].location.end != anc_seqs['annotations']['nuc']['end']):
raise AugurError(f"The 'nuc' annotation coordinates parsed from {args.annotation!r} ({features['nuc'].location.start+1}..{features['nuc'].location.end})"
f" don't match the provided sequence data coordinates ({anc_seqs['annotations']['nuc']['start']}..{anc_seqs['annotations']['nuc']['end']}).")

print("Read in {} features from reference sequence file".format(len(features)))
for gene in genes:
print(f"Processing gene: {gene}")
Expand All @@ -446,7 +450,7 @@ def run(args):
reference_sequence = str(feat.extract(Seq(ref)).translate()) if ref else None

aa_result = run_ancestral(T, fname, reference_sequence=reference_sequence, is_vcf=is_vcf, fill_overhangs=not args.keep_overhangs,
marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='aa')
marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='aa', rng_seed=args.seed)
len_translated_alignment = aa_result['tt'].data.full_length*3
if len_translated_alignment != len(feat):
raise AugurError(f"length of translated alignment for {gene} ({len_translated_alignment})"
Expand Down
5 changes: 4 additions & 1 deletion augur/refine.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,10 @@ def run(args):
pass
elif args.divergence_units=='mutations':
if not args.timetree:
tt.infer_ancestral_sequences()
# infer ancestral sequence for the purpose of counting mutations
# sample mutations from the root profile, otherwise use most likely state.
# Reconstruct tip states to avoid mutations to N or W etc be counted
tt.infer_ancestral_sequences(marginal=True, reconstruct_tip_states=True, sample_from_profile='root')
nuc_map = profile_maps['nuc']

def are_sequence_states_different(nuc1, nuc2):
Expand Down
2 changes: 1 addition & 1 deletion tests/functional/ancestral/cram/ambiguous-positions.t
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Setup
> --tree "$DATA/tree.nwk" \
> --alignment snps.vcf \
> --vcf-reference reference.fasta \
> --seed 314159 \
> --output-node-data nt_muts.json \
> --output-vcf nt_muts.vcf > /dev/null

Expand All @@ -44,4 +45,3 @@ Setup
> --vcf nt_muts.vcf \
> --json nt_muts.json \
> --ref reference.fasta > /dev/null

2 changes: 2 additions & 0 deletions tests/functional/ancestral/cram/case-sensitive.t
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Change the _reference_ to lowercase
> --alignment "$TESTDIR/../data/simple-genome/sequences.fasta" \
> --root-sequence ref.lower.fasta \
> --output-node-data "nt_muts.ref-seq.json" \
> --seed 314159 \
> --inference marginal > /dev/null

$ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
Expand All @@ -33,6 +34,7 @@ be lowecase which will be compared against the uppercase reference
> --alignment sequences.lower.fasta \
> --root-sequence "$TESTDIR/../data/simple-genome/reference.fasta" \
> --output-node-data "nt_muts.ref-seq.json" \
> --seed 314159 \
> --inference marginal > /dev/null

$ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ The default is to infer ambiguous bases, so there should not be N bases in the i
$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/tree.nwk \
> --alignment $TESTDIR/../data/aligned.fasta \
> --seed 314159 \
> --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \
> --output-sequences "$CRAMTMP/$TESTFILE/ancestral_sequences.fasta" > /dev/null

Expand Down
2 changes: 2 additions & 0 deletions tests/functional/ancestral/cram/general.t
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ node-data JSON we diff against.
> --alignment "$TESTDIR/../data/simple-genome/sequences.fasta" \
> --root-sequence "$TESTDIR/../data/simple-genome/reference.fasta" \
> --output-node-data "nt_muts.ref-seq.json" \
> --seed 314159 \
> --inference marginal > /dev/null


Expand All @@ -34,6 +35,7 @@ mutations (as there's nothing to compare the root node to)
> --tree "$TESTDIR/../data/simple-genome/tree.nwk" \
> --alignment "$TESTDIR/../data/simple-genome/sequences.fasta" \
> --output-node-data "nt_muts.no-ref-seq.json" \
> --seed 314159 \
> --inference marginal > /dev/null


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ There should not be N bases in the inferred output sequences.
> --tree $TESTDIR/../data/tree.nwk \
> --alignment $TESTDIR/../data/aligned.fasta \
> --infer-ambiguous \
> --seed 314159 \
> --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \
> --output-sequences "$CRAMTMP/$TESTFILE/ancestral_sequences.fasta" > /dev/null

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Infer ancestral nucleotide and amino acid sequences, using a genes file.
> --annotation $TESTDIR/../data/zika_outgroup.gb \
> --genes $TESTDIR/../data/genes.txt \
> --translations $TESTDIR/../data/aa_sequences_%GENE.fasta \
> --seed 314159 \
> --output-node-data ancestral_mutations.json \
> --output-sequences ancestral_sequences.fasta \
> --output-translations ancestral_aa_sequences_%GENE.fasta > /dev/null
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ ancestor).
> --root-sequence $TESTDIR/../data/zika_outgroup.gb \
> --genes ENV PRO \
> --translations $TESTDIR/../data/aa_sequences_%GENE.fasta \
> --seed 314159 \
> --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" > /dev/null

Check that the reference length was correctly exported as the nuc annotation
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Infer ancestral nucleotide and amino acid sequences.
> --annotation $TESTDIR/../data/zika_outgroup.gb \
> --genes ENV PRO \
> --translations $TESTDIR/../data/aa_sequences_%GENE.fasta \
> --seed 314159 \
> --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \
> --output-sequences "$CRAMTMP/$TESTFILE/ancestral_sequences.fasta" \
> --output-translations "$CRAMTMP/$TESTFILE/ancestral_aa_sequences_%GENE.fasta" > /dev/null
Expand Down
10 changes: 8 additions & 2 deletions tests/functional/ancestral/cram/invalid-args.t
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Input FASTA + VCF output is not possible
$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/tree.nwk \
> --alignment $TESTDIR/../data/aligned.fasta \
> --seed 314159 \
> --output-vcf "output.vcf" > /dev/null
ERROR: VCF output has been requested but the input alignment is not VCF.
[2]
Expand All @@ -16,6 +17,7 @@ Input VCF + FASTA output is not possible (Note that the input file doesn't exist
$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/tree.nwk \
> --alignment $TESTDIR/../data/snps.vcf \
> --seed 314159 \
> --output-sequences "output.fasta" > /dev/null
ERROR: Sequence (fasta) output has been requested but the input alignment is VCF.
[2]
Expand All @@ -25,6 +27,7 @@ Output FASTA _and_ VCF is not possible
$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/tree.nwk \
> --alignment $TESTDIR/../data/aligned.fasta \
> --seed 314159 \
> --output-vcf "output.vcf" \
> --output-sequences "output.fasta" > /dev/null
ERROR: Both sequence (fasta) and VCF output have been requested, but these are incompatible.
Expand All @@ -39,28 +42,31 @@ This should fail.
> --alignment $TESTDIR/../data/aligned.fasta \
> --annotation $TESTDIR/../data/zika_outgroup.gb \
> --genes ENV PRO \
> --seed 314159 \
> --output-node-data "ancestral_mutations.json" > /dev/null
ERROR: For amino acid sequence reconstruction, you must provide an annotation file, a list of genes, and a template path to amino acid sequences.
[2]

Missing tree file
Missing tree file

$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/tree-doesnt-exist.nwk \
> --alignment $TESTDIR/../data/aligned.fasta \
> --seed 314159 \
> --output-sequences "output.fasta" > /dev/null
ERROR: The provided tree file .* doesn't exist (re)
[2]


Attempting to use FASTA-input reference and VCF-input reference args
(The files here don't exist, but we exit before they're checked)
(The files here don't exist, but we exit before they're checked)

$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/tree-doesnt-exist.nwk \
> --alignment $TESTDIR/../data/aligned.fasta \
> --root-sequence $TESTDIR/../data/reference.fasta \
> --vcf-reference $TESTDIR/../data/reference.fasta \
> --seed 314159 \
> --output-sequences "output.fasta" > /dev/null 2>"err-args.txt"
[2]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ There should be N bases in the inferred output sequences.
> --tree $TESTDIR/../data/tree.nwk \
> --alignment $TESTDIR/../data/aligned.fasta \
> --keep-ambiguous \
> --seed 314159 \
> --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \
> --output-sequences "$CRAMTMP/$TESTFILE/ancestral_sequences.fasta" > /dev/null

Expand Down
14 changes: 6 additions & 8 deletions tests/functional/ancestral/cram/vcf-multi-allele.t
Original file line number Diff line number Diff line change
Expand Up @@ -4,36 +4,34 @@ Setup

$ export DATA="$TESTDIR/../data/simple-genome"

We take the same `snps.vcf` file used in `general.t` but add another
We used a modified `snps.vcf` file used in `vcf.t` but with an additional
allele at site 30 - sample_B has a "G". Since the root is A and this
is the only sample with G it's 'A30G'.
See <https://github.com/nextstrain/augur/issues/1380> for the bug this is testing.

$ sed '11s/^/1\t30\t.\tA\tG,N\t.\t.\t.\tGT\t0\t1\t2\n/' \
> "$DATA/snps.vcf" > snps2.vcf

$ ${AUGUR} ancestral \
> --tree $DATA/tree.nwk \
> --alignment snps2.vcf \
> --alignment $DATA/snps_with_multiple_alleles.vcf \
> --vcf-reference $DATA/reference.fasta \
> --output-node-data nt_muts.json \
> --output-vcf nt_muts.vcf \
> --seed 314159 \
> --inference marginal > /dev/null


$ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
> "$DATA/nt_muts.ref-seq.json" \
> nt_muts.json \
> --exclude-regex-paths "root\['nodes'\]\['.+'\]\['sequence'\]" "root\['generated_by'\]"
{'iterable_item_added': {"root['nodes']['sample_B']['muts'][0]": 'A30G'}}
{'iterable_item_added': {"root['nodes']['node_root']['muts'][3]": 'A30G'}}

$ cat > expected.vcf <<EOF
> #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT node_root sample_C node_AB sample_B sample_A
> 1 5 . A C . PASS . GT 1 1 1 1 1
> 1 7 . A G . PASS . GT 0 0 1 1 1
> 1 14 . C T . PASS . GT 0 0 1 1 1
> 1 14 . C T . PASS . GT 1 0 1 1 1
> 1 18 . C T . PASS . GT 1 1 0 0 0
> 1 30 . A G . PASS . GT 0 0 0 1 0
> 1 30 . A G . PASS . GT 1 1 1 1 1
> 1 33 . A C,G . PASS . GT 0 0 2 2 1
> 1 39 . C T . PASS . GT 0 0 0 0 1
> 1 42 . G A . PASS . GT 0 0 0 1 0
Expand Down
3 changes: 2 additions & 1 deletion tests/functional/ancestral/cram/vcf.t
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ but it will have the reference sequence attached.
> --vcf-reference $DATA/reference.fasta \
> --output-node-data "nt_muts.vcf-input.ref-seq.json" \
> --output-vcf nt_muts.vcf \
> --seed 314159 \
> --inference marginal > /dev/null


Expand All @@ -33,7 +34,7 @@ here as it's not relevant to what I'm trying to test.
> #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT node_root sample_C node_AB sample_B sample_A
> 1 5 . A C . PASS . GT 1 1 1 1 1
> 1 7 . A G . PASS . GT 0 0 1 1 1
> 1 14 . C T . PASS . GT 0 0 1 1 1
> 1 14 . C T . PASS . GT 1 0 1 1 1
> 1 18 . C T . PASS . GT 1 1 0 0 0
> 1 33 . A C,G . PASS . GT 0 0 2 2 1
> 1 39 . C T . PASS . GT 0 0 0 0 1
Expand Down
Loading
Loading