Skip to content

Commit

Permalink
feat(call): subset reads with long tracts for consensus
Browse files Browse the repository at this point in the history
  • Loading branch information
davidlougheed committed Dec 20, 2024
1 parent 1b02d12 commit b93bcf3
Showing 1 changed file with 11 additions and 9 deletions.
20 changes: 11 additions & 9 deletions strkit/call/call_locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,12 @@
significant_clip_threshold = 100
significant_clip_snv_take_in = 250

# above large_consensus_length, the number of reads used for consensus is limited to
# max_n_large_consensus_reads (guards against POA stalling out on very long tract sequences):
large_consensus_length: int = 2000
max_n_large_consensus_reads: int = 20

# maximum median number of bases before we can't use POA for consensus anymore due to performance:
max_mdn_poa_length: int = 5000


# property getters & other partials
Expand Down Expand Up @@ -1563,14 +1567,12 @@ def get_read_length_partition_mean(p_idx: int) -> float:

if call_data and consensus:
def _consensi_for_key(k: Literal["_tr_seq", "_start_anchor"]):
return map(
lambda a: consensus_seq(
list(map(lambda rr: read_dict_extra[rr][k], a)),
logger_,
max_mdn_poa_length,
),
allele_reads,
)
for a in allele_reads:
seqs = list(map(lambda rr: read_dict_extra[rr][k], a))
if seqs and len(seqs[0]) > large_consensus_length:
# if we're dealing with large sequences, use a subset of the reads to prevent stalling out.
seqs = seqs[:max_n_large_consensus_reads]
yield consensus_seq(seqs, logger_, max_mdn_poa_length)

call_seqs.extend(_consensi_for_key("_tr_seq"))
call_anchor_seqs.extend(_consensi_for_key("_start_anchor"))
Expand Down

0 comments on commit b93bcf3

Please sign in to comment.