feat(call): include larger VCF anchors - capture 5' variation

davidlougheed · Nov 19, 2024 · 7c43188 · 7c43188
1 parent 155b0ff
commit 7c43188
Show file tree

Hide file tree

Showing 5 changed files with 55 additions and 14 deletions.
diff --git a/docs/caller_usage.md b/docs/caller_usage.md
@@ -48,6 +48,10 @@
 * `--consensus` or `-c`: Turn on consensus calculation for alleles. This adds runtime, but gives a better idea of STR 
   structure and is useful for comparing alleles beyond copy number. If `--vcf` is set, this option is forced on. 
   **Default:** off
+* `--vcf-anchor-size`: Number of bases upstream (5') of the tandem repeat to include in the VCF output. This can include
+  small indels, and having a size above `1` may be beneficial or detrimental to the use case at hand, but is nice for 
+  benchmarking and in case of slight misalignment. This is clamped to being in the range of `[1, flank_size]`.
+  **Default:** 5
 * `--num-bootstrap ###` or `-b`: Now many bootstrap re-samplings to perform. **Default:** 100
 * `--sex-chr ??` or `-x`: Sex chromosome configuration. **Without this, loci in sex chromosomes will not be genotyped.**
   Can be any configuration of Xs and Ys; only count matters. **Default:** *none*

diff --git a/strkit/call/call_locus.py b/strkit/call/call_locus.py
@@ -839,6 +839,7 @@ def call_locus(
     min_read_align_score = params.min_read_align_score
     snv_min_base_qual = params.snv_min_base_qual
     use_hp = params.use_hp
+    vcf_anchor_size = params.vcf_anchor_size
     # ----------------------------------
 
     rng = np.random.default_rng(seed=seed)
@@ -1290,9 +1291,13 @@ def get_read_length_partition_mean(p_idx: int) -> float:
 
         read_start_anchor: str = ""
         if consensus:
-            anchor_pair_idx, anchor_pair_found = find_pair_by_ref_pos(r_coords, left_coord_adj - 1, 0)
-            if anchor_pair_found:
-                read_start_anchor = qs[q_coords[anchor_pair_idx]:left_flank_end]
+            for anchor_offset in range(vcf_anchor_size, 0, -1):
+                # start from largest - want to include small indels if they appear immediately upstream
+                read_start_anchor = flank_left_seq
+                anchor_pair_idx, anchor_pair_found = find_pair_by_ref_pos(r_coords, left_coord_adj - anchor_offset, 0)
+                if anchor_pair_found:
+                    read_start_anchor = qs[q_coords[anchor_pair_idx]:left_flank_end]
+                    break
             # otherwise, leave as blank - anchor base deleted
 
         # ---
@@ -1367,7 +1372,8 @@ def get_read_length_partition_mean(p_idx: int) -> float:
     n_reads_in_dict: int = len(read_dict)
 
     locus_result.update({
-        **({"ref_start_anchor": ref_left_flank_seq[-1].upper(), "ref_seq": ref_seq} if consensus else {}),
+        # **({"ref_start_anchor": ref_left_flank_seq[-1].upper(), "ref_seq": ref_seq} if consensus else {}),
+        **({"ref_start_anchor": ref_left_flank_seq[-vcf_anchor_size:].upper(), "ref_seq": ref_seq} if consensus else {}),
         "reads": read_dict,
     })
 

diff --git a/strkit/call/output/vcf.py b/strkit/call/output/vcf.py
@@ -2,7 +2,7 @@
 import logging
 
 from collections import Counter
-# from os.path import commonprefix
+from os.path import commonprefix
 from pathlib import Path
 from pysam import FastaFile, VariantFile, VariantHeader, VariantRecord
 from typing import Iterable, Optional
@@ -34,6 +34,7 @@
 VCF_INFO_VT = "VT"
 VCF_INFO_MOTIF = "MOTIF"
 VCF_INFO_REFMC = "REFMC"
+VCF_INFO_ANCH = "ANCH"
 
 VT_STR = "str"
 VT_SNV = "snv"
@@ -63,6 +64,7 @@ def build_vcf_header(sample_id: str, reference_file: str) -> VariantHeader:
 
     # Set up basic VCF formats
     vh.formats.add("AD", ".", "Integer", "Read depth for each allele")
+    vh.formats.add("ANCL", ".", "Integer", "Anchor length for each allele, five-prime of TR sequence")
     vh.formats.add("DP", 1, "Integer", "Read depth")
     vh.formats.add("DPS", 1, "Integer", "Read depth (supporting reads only)")
     vh.formats.add("GT", 1, "String", "Genotype")
@@ -77,6 +79,7 @@ def build_vcf_header(sample_id: str, reference_file: str) -> VariantHeader:
     vh.info.add(VCF_INFO_VT, 1, "String", "Variant record type (str/snv)")
     vh.info.add(VCF_INFO_MOTIF, 1, "String", "Motif string")
     vh.info.add(VCF_INFO_REFMC, 1, "Integer", "Motif copy number in the reference genome")
+    vh.info.add(VCF_INFO_ANCH, 1, "Integer", "Five-prime anchor size")
 
     # Add INFO records for tandem repeat copies - these are new to VCF4.4!  TODO
     # for iv in VCF_TR_INFO_RECORDS:
@@ -140,19 +143,32 @@ def output_contig_vcf_lines(
             logger.error(f"Encountered None in results[{result_idx}].peaks.start_anchor_seqs: {peak_start_anchor_seqs}")
             continue
 
+        peak_start_anchor_seqs_upper = tuple(iter_to_upper(peak_start_anchor_seqs))
+        common_anchor_prefix = commonprefix([ref_start_anchor, *peak_start_anchor_seqs_upper])
+        # anchor_offset = how many bases we can cut off from the front of the anchor
+        # since they're shared between all alleles.
+        #  - we need to leave one base as an anchor for VCF compliance though, thus the min(...)
+        anchor_offset = min(len(common_anchor_prefix), params.vcf_anchor_size - 1)
+
+        ref_start_anchor = ref_start_anchor[anchor_offset:]
+        ref_seq_with_anchor = ref_start_anchor + ref_seq
+
         seqs: tuple[str, ...] = tuple(iter_to_upper(peak_seqs))
-        seqs_with_anchors = tuple(zip(seqs, tuple(iter_to_upper(peak_start_anchor_seqs))))
+        seqs_with_anchors: list[tuple[str, str]] = list(
+            zip(seqs, map(lambda a: a[anchor_offset:], peak_start_anchor_seqs_upper))
+        )
 
         if 0 < len(seqs) < n_alleles:
             seqs = tuple([seqs[0]] * n_alleles)
-            seqs_with_anchors = tuple([seqs_with_anchors[0]] * n_alleles)
+            seqs_with_anchors = [seqs_with_anchors[0]] * n_alleles
 
         seq_alts = sorted(
-            set(filter(lambda c: not (c[0] == ref_seq and c[1] == ref_start_anchor), seqs_with_anchors)),
+            set(filter(lambda c: not (c[1] + c[0] == ref_seq_with_anchor), seqs_with_anchors)),
             key=idx_0_getter
         )
 
-        # common_suffix_idx = -1 * len(commonprefix(tuple(map(_reversed_str, (ref_seq, *seqs)))))
+        # If there's no anchor variation, we will use the last base for a compact representation.
+        # Otherwise, we include the full VCF anchor sized-sequence.  TODO: common prefix strip
 
         call = result["call"]
         call_95_cis = result["call_95_cis"]
@@ -163,12 +179,15 @@ def output_contig_vcf_lines(
             else ()
         )
 
-        # seq_alleles: list[str] = [ref_start_anchor + (ref_seq[:common_suffix_idx] if common_suffix_idx else ref_seq)]
-        seq_alleles: list[str] = [ref_start_anchor + ref_seq]
+        seq_alleles: list[str] = [ref_seq_with_anchor]
+
         if call is not None and seq_alts:
-            # seq_alleles.extend(a[1] + (a[0][:common_suffix_idx] if common_suffix_idx else a[0]) for a in seq_alts)
             # If we have a complete deletion, including the anchor, use a symbolic allele meaning "upstream deletion"
-            seq_alleles.extend((a[1] + a[0] if a[1] or a[0] else "*") for a in seq_alts)
+            for alt_tr_seq, alt_anchor in seq_alts:
+                if not alt_tr_seq and not alt_anchor:
+                    seq_alleles.append("*")
+                    continue
+                seq_alleles.append(alt_anchor + alt_tr_seq)
         else:
             seq_alleles.append(".")
 
@@ -183,13 +202,13 @@ def output_contig_vcf_lines(
         vr.info[VCF_INFO_VT] = VT_STR
         vr.info[VCF_INFO_MOTIF] = result["motif"]
         vr.info[VCF_INFO_REFMC] = result["ref_cn"]
+        vr.info[VCF_INFO_ANCH] = params.vcf_anchor_size - anchor_offset
 
         vr.samples[sample_id]["GT"] = (
             tuple(map(seq_alleles_raw.index, seqs_with_anchors))
             if call is not None and seqs
             else _blank_entry(n_alleles)
         )
-        del seq_alleles_raw
 
         if am := result.get("assign_method"):
             vr.samples[sample_id]["PM"] = am
@@ -207,6 +226,8 @@ def output_contig_vcf_lines(
             vr.samples[sample_id]["MC"] = tuple(map(int, call))
             vr.samples[sample_id]["MCCI"] = tuple(f"{x[0]}-{x[1]}" for x in call_95_cis)
 
+            vr.samples[sample_id]["ANCL"] = tuple(len(ar[1]) for ar in seq_alleles_raw if ar is not None)
+
             # Produces a histogram-like format for read-level copy numbers
             # e.g., for two alleles with 8 and 9 copy-number respectively, we may get: 7x1|8x10|9x1,8x2|9x12
             vr.samples[sample_id]["MCRL"] = tuple(

diff --git a/strkit/call/params.py b/strkit/call/params.py
@@ -36,6 +36,7 @@ def __init__(
         respect_ref: bool = False,
         count_kmers: str = "none",  # "none" | "peak" | "read"
         consensus: bool = False,
+        vcf_anchor_size: int = 5,
         # ---
         log_level: int = logging.WARNING,
         seed: Optional[int] = None,
@@ -61,6 +62,7 @@ def __init__(
         self.respect_ref: bool = respect_ref
         self.count_kmers: str = count_kmers
         self.consensus: bool = consensus
+        self.vcf_anchor_size: int = vcf_anchor_size
         # ---
         self.log_level: int = log_level
         self.seed: Optional[int] = seed
@@ -112,6 +114,7 @@ def from_args(cls, logger: logging.Logger, p_args):
             respect_ref=p_args.respect_ref,
             count_kmers=p_args.count_kmers,
             consensus=p_args.consensus or not (not p_args.vcf),  # Consensus calculation is required for VCF output.
+            vcf_anchor_size=min(max(p_args.vcf_anchor_size, 1), p_args.flank_size),
             # ---
             log_level=log_levels[p_args.log_level],
             seed=p_args.seed,
@@ -139,6 +142,7 @@ def to_dict(self, as_inputted: bool = False):
             "respect_ref": self.respect_ref,
             "count_kmers": self.count_kmers,
             "consensus": self.consensus,
+            "vcf_anchor_size": self.vcf_anchor_size,
             "log_level": self.log_level,
             "seed": self.seed,
             "processes": self.processes,

diff --git a/strkit/entry.py b/strkit/entry.py
@@ -177,6 +177,12 @@ def add_call_parser_args(call_parser):
         type=str,
         help="Path to write VCF-formatted calls to. STR alleles are currently formatted symbolically rather than as "
              "consensus sequences.")
+    call_parser.add_argument(
+        "--vcf-anchor-size",
+        type=int,
+        default=5,
+        help="Size of upstream sequence (including potential small upstream variants) to include in VCF output "
+             "consensus sequences.")
     # End VCF output arguments -----------------------------------------------------------------------------------------
 
     call_parser.add_argument(