feat(call): don't skip supplementary reads by default

Adds options for skipping supplementary and secondary alignments.
davidlougheed · Dec 13, 2024 · b3eb2ed · b3eb2ed
1 parent a357704
commit b3eb2ed
Show file tree

Hide file tree

Showing 7 changed files with 35 additions and 7 deletions.
diff --git a/docs/caller_usage.md b/docs/caller_usage.md
@@ -26,6 +26,8 @@
   Use for CCS reads or similar data (e.g., R10 nanopore data) ONLY! **Default:** off
 * `--use-hp`: Whether to incorporate `HP` tags from a haplotagged alignment file. This should speed up runtime and 
   will potentially improve calling results. **This flag is experimental, and has not been tested extensively.**
+* `--skip-supplementary` or `--skip-supp`: Skip supplementary alignments. **Default:** off
+* `--skip-secondary` or `--skip-sec`: Skip secondary alignments. **Default:** off
 * `--incorporate-snvs [path]` or `--snv [path]`: A path to a VCF with SNVs to incorporate into the calling process and 
   final output. This file is just used as an SNV loci catalog; STRkit itself will perform the SNV calling. Empirically 
   improves calling quality a small amount, speeds up runtime, and gives nearby SNV calls for downstream analysis.

diff --git a/requirements.txt b/requirements.txt
@@ -27,7 +27,7 @@ scikit-learn==1.4.2
 scipy==1.13.1
 six==1.16.0
 statsmodels==0.14.3
-strkit_rust_ext==0.18.3
+strkit_rust_ext==0.19.0
 threadpoolctl==3.4.0
 tomli==2.0.1
 tzdata==2024.2

diff --git a/setup.py b/setup.py
@@ -22,7 +22,7 @@
         "scikit-learn>=1.2.1,<1.6",
         "scipy>=1.10,<1.14",
         "statsmodels>=0.14.0,<0.15",
-        "strkit_rust_ext==0.18.3",
+        "strkit_rust_ext==0.19.0",
     ],
 
     description="A toolkit for analyzing variation in short(ish) tandem repeats.",

diff --git a/strkit/call/call_locus.py b/strkit/call/call_locus.py
@@ -247,7 +247,7 @@ def call_alleles_with_haplotags(
     cdd: list[CallDict] = []
 
     for hi, hp in enumerate(haplotags):
-        cc: Optional[CallDict] = call_alleles(
+        cc: CallDict | None = call_alleles(
             cns[hi], EMPTY_NP_ARRAY,  # Don't bother separating by strand for now...
             c_ws[hi], (),
             params=params,
@@ -928,8 +928,7 @@ def call_locus(
         chimeric_read_status,
         left_most_coord,
         right_most_coord,
-    ) = bf.get_overlapping_segments_and_related_data(
-        read_contig, left_flank_coord, right_flank_coord, max_reads, logger_, locus_log_str)
+    ) = bf.get_overlapping_segments_and_related_data(read_contig, left_flank_coord, right_flank_coord, locus_log_str)
 
     logger_.debug("%s - got %d overlapping aligned segments", locus_log_str, n_overlapping_reads)
 
@@ -1083,7 +1082,7 @@ def get_read_length_partition_mean(p_idx: int) -> float:
             )
             continue
 
-        # -----
+        # --------------------------------------------------------------------------------------------------------------
 
         # Truncate to flank_size (plus some leeway for small indels in flanking region) to stop relatively distant
         # expansion sequences from accidentally being included in the flanking region; e.g. if the insert gets mapped

diff --git a/strkit/call/call_sample.py b/strkit/call/call_sample.py
@@ -94,7 +94,16 @@ def locus_worker(
     sample_id = params.sample_id
 
     ref = FastaFile(params.reference_file)
-    bf = STRkitBAMReader(params.read_file, params.reference_file)
+    bf = STRkitBAMReader(
+        params.read_file,
+        params.reference_file,
+        params.max_reads,
+        params.skip_supplementary,
+        params.skip_secondary,
+        params.use_hp,
+        lg,
+        params.log_level == logging.DEBUG,
+    )
 
     snv_vcf_contigs: list[str] = []
     if params.snv_vcf:

diff --git a/strkit/call/params.py b/strkit/call/params.py
@@ -25,6 +25,8 @@ def __init__(
         min_read_align_score: float = 0.9,
         num_bootstrap: int = 100,
         flank_size: int = 70,
+        skip_supplementary: bool = False,
+        skip_secondary: bool = False,
         sex_chroms: str | None = None,
         realign: bool = False,
         hq: bool = False,
@@ -51,6 +53,8 @@ def __init__(
         self.min_read_align_score: float = min_read_align_score
         self.num_bootstrap: int = num_bootstrap
         self.flank_size: int = flank_size
+        self.skip_supplementary: bool = skip_supplementary
+        self.skip_secondary: bool = skip_secondary
         self.sex_chroms: str | None = sex_chroms
         self.realign: bool = realign
         self.hq: bool = hq
@@ -103,6 +107,8 @@ def from_args(cls, logger: logging.Logger, p_args):
             min_read_align_score=p_args.min_read_align_score,
             num_bootstrap=p_args.num_bootstrap,
             flank_size=p_args.flank_size,
+            skip_supplementary=p_args.skip_supplementary,
+            skip_secondary=p_args.skip_secondary,
             sex_chroms=p_args.sex_chr,
             realign=p_args.realign,
             hq=p_args.hq,
@@ -131,6 +137,8 @@ def to_dict(self, as_inputted: bool = False):
             "min_read_align_score": self.min_read_align_score,
             "num_bootstrap": self.num_bootstrap,
             "flank_size": self.flank_size,
+            "skip_supplementary": self.skip_supplementary,
+            "skip_secondary": self.skip_secondary,
             "sample_id": self._sample_id_orig if as_inputted else self.sample_id,
             "realign": self.realign,
             "hq": self.hq,

diff --git a/strkit/entry.py b/strkit/entry.py
@@ -136,6 +136,16 @@ def add_call_parser_args(call_parser):
         default=70,
         help="Number of bases around the locus to use for context.")
 
+    call_parser.add_argument(
+        "--skip-supplementary", "--skip-supp",
+        action="store_true",
+        help="Whether to skip supplementary aligned reads.")
+
+    call_parser.add_argument(
+        "--skip-secondary", "--skip-sec",
+        action="store_true",
+        help="Whether to skip secondary aligned reads.")
+
     call_parser.add_argument(
         "--processes", "-p",
         type=int,