Skip to content

Commit

Permalink
feat(call): include larger VCF anchors - capture 5' variation
Browse files Browse the repository at this point in the history
  • Loading branch information
davidlougheed committed Nov 19, 2024
1 parent 155b0ff commit 7c43188
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 14 deletions.
4 changes: 4 additions & 0 deletions docs/caller_usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@
* `--consensus` or `-c`: Turn on consensus calculation for alleles. This adds runtime, but gives a better idea of STR
structure and is useful for comparing alleles beyond copy number. If `--vcf` is set, this option is forced on.
**Default:** off
* `--vcf-anchor-size`: Number of bases upstream (5') of the tandem repeat to include in the VCF output. This can include
small indels, and having a size above `1` may be beneficial or detrimental to the use case at hand, but is nice for
benchmarking and in case of slight misalignment. This is clamped to being in the range of `[1, flank_size]`.
**Default:** 5
* `--num-bootstrap ###` or `-b`: Now many bootstrap re-samplings to perform. **Default:** 100
* `--sex-chr ??` or `-x`: Sex chromosome configuration. **Without this, loci in sex chromosomes will not be genotyped.**
Can be any configuration of Xs and Ys; only count matters. **Default:** *none*
Expand Down
14 changes: 10 additions & 4 deletions strkit/call/call_locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -839,6 +839,7 @@ def call_locus(
min_read_align_score = params.min_read_align_score
snv_min_base_qual = params.snv_min_base_qual
use_hp = params.use_hp
vcf_anchor_size = params.vcf_anchor_size
# ----------------------------------

rng = np.random.default_rng(seed=seed)
Expand Down Expand Up @@ -1290,9 +1291,13 @@ def get_read_length_partition_mean(p_idx: int) -> float:

read_start_anchor: str = ""
if consensus:
anchor_pair_idx, anchor_pair_found = find_pair_by_ref_pos(r_coords, left_coord_adj - 1, 0)
if anchor_pair_found:
read_start_anchor = qs[q_coords[anchor_pair_idx]:left_flank_end]
for anchor_offset in range(vcf_anchor_size, 0, -1):
# start from largest - want to include small indels if they appear immediately upstream
read_start_anchor = flank_left_seq
anchor_pair_idx, anchor_pair_found = find_pair_by_ref_pos(r_coords, left_coord_adj - anchor_offset, 0)
if anchor_pair_found:
read_start_anchor = qs[q_coords[anchor_pair_idx]:left_flank_end]
break
# otherwise, leave as blank - anchor base deleted

# ---
Expand Down Expand Up @@ -1367,7 +1372,8 @@ def get_read_length_partition_mean(p_idx: int) -> float:
n_reads_in_dict: int = len(read_dict)

locus_result.update({
**({"ref_start_anchor": ref_left_flank_seq[-1].upper(), "ref_seq": ref_seq} if consensus else {}),
# **({"ref_start_anchor": ref_left_flank_seq[-1].upper(), "ref_seq": ref_seq} if consensus else {}),
**({"ref_start_anchor": ref_left_flank_seq[-vcf_anchor_size:].upper(), "ref_seq": ref_seq} if consensus else {}),
"reads": read_dict,
})

Expand Down
41 changes: 31 additions & 10 deletions strkit/call/output/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import logging

from collections import Counter
# from os.path import commonprefix
from os.path import commonprefix
from pathlib import Path
from pysam import FastaFile, VariantFile, VariantHeader, VariantRecord
from typing import Iterable, Optional
Expand Down Expand Up @@ -34,6 +34,7 @@
VCF_INFO_VT = "VT"
VCF_INFO_MOTIF = "MOTIF"
VCF_INFO_REFMC = "REFMC"
VCF_INFO_ANCH = "ANCH"

VT_STR = "str"
VT_SNV = "snv"
Expand Down Expand Up @@ -63,6 +64,7 @@ def build_vcf_header(sample_id: str, reference_file: str) -> VariantHeader:

# Set up basic VCF formats
vh.formats.add("AD", ".", "Integer", "Read depth for each allele")
vh.formats.add("ANCL", ".", "Integer", "Anchor length for each allele, five-prime of TR sequence")
vh.formats.add("DP", 1, "Integer", "Read depth")
vh.formats.add("DPS", 1, "Integer", "Read depth (supporting reads only)")
vh.formats.add("GT", 1, "String", "Genotype")
Expand All @@ -77,6 +79,7 @@ def build_vcf_header(sample_id: str, reference_file: str) -> VariantHeader:
vh.info.add(VCF_INFO_VT, 1, "String", "Variant record type (str/snv)")
vh.info.add(VCF_INFO_MOTIF, 1, "String", "Motif string")
vh.info.add(VCF_INFO_REFMC, 1, "Integer", "Motif copy number in the reference genome")
vh.info.add(VCF_INFO_ANCH, 1, "Integer", "Five-prime anchor size")

# Add INFO records for tandem repeat copies - these are new to VCF4.4! TODO
# for iv in VCF_TR_INFO_RECORDS:
Expand Down Expand Up @@ -140,19 +143,32 @@ def output_contig_vcf_lines(
logger.error(f"Encountered None in results[{result_idx}].peaks.start_anchor_seqs: {peak_start_anchor_seqs}")
continue

peak_start_anchor_seqs_upper = tuple(iter_to_upper(peak_start_anchor_seqs))
common_anchor_prefix = commonprefix([ref_start_anchor, *peak_start_anchor_seqs_upper])
# anchor_offset = how many bases we can cut off from the front of the anchor
# since they're shared between all alleles.
# - we need to leave one base as an anchor for VCF compliance though, thus the min(...)
anchor_offset = min(len(common_anchor_prefix), params.vcf_anchor_size - 1)

ref_start_anchor = ref_start_anchor[anchor_offset:]
ref_seq_with_anchor = ref_start_anchor + ref_seq

seqs: tuple[str, ...] = tuple(iter_to_upper(peak_seqs))
seqs_with_anchors = tuple(zip(seqs, tuple(iter_to_upper(peak_start_anchor_seqs))))
seqs_with_anchors: list[tuple[str, str]] = list(
zip(seqs, map(lambda a: a[anchor_offset:], peak_start_anchor_seqs_upper))
)

if 0 < len(seqs) < n_alleles:
seqs = tuple([seqs[0]] * n_alleles)
seqs_with_anchors = tuple([seqs_with_anchors[0]] * n_alleles)
seqs_with_anchors = [seqs_with_anchors[0]] * n_alleles

seq_alts = sorted(
set(filter(lambda c: not (c[0] == ref_seq and c[1] == ref_start_anchor), seqs_with_anchors)),
set(filter(lambda c: not (c[1] + c[0] == ref_seq_with_anchor), seqs_with_anchors)),
key=idx_0_getter
)

# common_suffix_idx = -1 * len(commonprefix(tuple(map(_reversed_str, (ref_seq, *seqs)))))
# If there's no anchor variation, we will use the last base for a compact representation.
# Otherwise, we include the full VCF anchor sized-sequence. TODO: common prefix strip

call = result["call"]
call_95_cis = result["call_95_cis"]
Expand All @@ -163,12 +179,15 @@ def output_contig_vcf_lines(
else ()
)

# seq_alleles: list[str] = [ref_start_anchor + (ref_seq[:common_suffix_idx] if common_suffix_idx else ref_seq)]
seq_alleles: list[str] = [ref_start_anchor + ref_seq]
seq_alleles: list[str] = [ref_seq_with_anchor]

if call is not None and seq_alts:
# seq_alleles.extend(a[1] + (a[0][:common_suffix_idx] if common_suffix_idx else a[0]) for a in seq_alts)
# If we have a complete deletion, including the anchor, use a symbolic allele meaning "upstream deletion"
seq_alleles.extend((a[1] + a[0] if a[1] or a[0] else "*") for a in seq_alts)
for alt_tr_seq, alt_anchor in seq_alts:
if not alt_tr_seq and not alt_anchor:
seq_alleles.append("*")
continue
seq_alleles.append(alt_anchor + alt_tr_seq)
else:
seq_alleles.append(".")

Expand All @@ -183,13 +202,13 @@ def output_contig_vcf_lines(
vr.info[VCF_INFO_VT] = VT_STR
vr.info[VCF_INFO_MOTIF] = result["motif"]
vr.info[VCF_INFO_REFMC] = result["ref_cn"]
vr.info[VCF_INFO_ANCH] = params.vcf_anchor_size - anchor_offset

vr.samples[sample_id]["GT"] = (
tuple(map(seq_alleles_raw.index, seqs_with_anchors))
if call is not None and seqs
else _blank_entry(n_alleles)
)
del seq_alleles_raw

if am := result.get("assign_method"):
vr.samples[sample_id]["PM"] = am
Expand All @@ -207,6 +226,8 @@ def output_contig_vcf_lines(
vr.samples[sample_id]["MC"] = tuple(map(int, call))
vr.samples[sample_id]["MCCI"] = tuple(f"{x[0]}-{x[1]}" for x in call_95_cis)

vr.samples[sample_id]["ANCL"] = tuple(len(ar[1]) for ar in seq_alleles_raw if ar is not None)

# Produces a histogram-like format for read-level copy numbers
# e.g., for two alleles with 8 and 9 copy-number respectively, we may get: 7x1|8x10|9x1,8x2|9x12
vr.samples[sample_id]["MCRL"] = tuple(
Expand Down
4 changes: 4 additions & 0 deletions strkit/call/params.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def __init__(
respect_ref: bool = False,
count_kmers: str = "none", # "none" | "peak" | "read"
consensus: bool = False,
vcf_anchor_size: int = 5,
# ---
log_level: int = logging.WARNING,
seed: Optional[int] = None,
Expand All @@ -61,6 +62,7 @@ def __init__(
self.respect_ref: bool = respect_ref
self.count_kmers: str = count_kmers
self.consensus: bool = consensus
self.vcf_anchor_size: int = vcf_anchor_size
# ---
self.log_level: int = log_level
self.seed: Optional[int] = seed
Expand Down Expand Up @@ -112,6 +114,7 @@ def from_args(cls, logger: logging.Logger, p_args):
respect_ref=p_args.respect_ref,
count_kmers=p_args.count_kmers,
consensus=p_args.consensus or not (not p_args.vcf), # Consensus calculation is required for VCF output.
vcf_anchor_size=min(max(p_args.vcf_anchor_size, 1), p_args.flank_size),
# ---
log_level=log_levels[p_args.log_level],
seed=p_args.seed,
Expand Down Expand Up @@ -139,6 +142,7 @@ def to_dict(self, as_inputted: bool = False):
"respect_ref": self.respect_ref,
"count_kmers": self.count_kmers,
"consensus": self.consensus,
"vcf_anchor_size": self.vcf_anchor_size,
"log_level": self.log_level,
"seed": self.seed,
"processes": self.processes,
Expand Down
6 changes: 6 additions & 0 deletions strkit/entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,12 @@ def add_call_parser_args(call_parser):
type=str,
help="Path to write VCF-formatted calls to. STR alleles are currently formatted symbolically rather than as "
"consensus sequences.")
call_parser.add_argument(
"--vcf-anchor-size",
type=int,
default=5,
help="Size of upstream sequence (including potential small upstream variants) to include in VCF output "
"consensus sequences.")
# End VCF output arguments -----------------------------------------------------------------------------------------

call_parser.add_argument(
Expand Down

0 comments on commit 7c43188

Please sign in to comment.