Skip to content

Commit

Permalink
feat(call): output read-level copy number histograms in VCF
Browse files Browse the repository at this point in the history
  • Loading branch information
davidlougheed committed Oct 2, 2024
1 parent c4cfef6 commit 88f4502
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 0 deletions.
4 changes: 4 additions & 0 deletions docs/output_formats.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,10 @@ VCF format fields (i.e., for each variant sample entry):
* `GT`: Genotype
* `MC`: Motif copy number for each allele (STR calls only)
* `MCCI`: Motif copy number 95% confidence intervals for each allele (STR calls only)
* `MCRL`: Read-level copy number histogram for each allele. Allele entries are comma-delimited, and copy numbers within
an allele's read-set are pipe (`|`)-delimited. For example, for two alleles with 8 and 9 copy-number respectively, we
may get `7:1|8:10|9:1,8:2|9:12` — the first allele has one 7-copy read, ten 8-copy reads, and one 9-copy read. The
second allele has two 8-copy reads and twelve 9-copy reads.
* `PS`: Phase set
* `PM`: Peak-calling method (`dist`/`single`/`snv+dist`/`snv`/`hp`; STR calls only)

Expand Down
18 changes: 18 additions & 0 deletions strkit/call/output/vcf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import functools
import logging

from collections import Counter
# from os.path import commonprefix
from pathlib import Path
from pysam import FastaFile, VariantFile, VariantHeader, VariantRecord
Expand Down Expand Up @@ -62,6 +63,7 @@ def build_vcf_header(sample_id: str, reference_file: str) -> VariantHeader:
vh.formats.add("GT", 1, "String", "Genotype")
vh.formats.add("MC", ".", "Integer", "Motif copy number for each allele")
vh.formats.add("MCCI", ".", "String", "Motif copy number 95% confidence interval for each allele")
vh.formats.add("MCRL", ".", "String", "Read-level motif copy numbers for each allele")
vh.formats.add("PS", 1, "Integer", "Phase set")
vh.formats.add("PM", 1, "String", "Peak-calling method (dist/snv+dist/snv/hp)")

Expand Down Expand Up @@ -194,6 +196,22 @@ def output_contig_vcf_lines(
vr.samples[sample_id]["MC"] = tuple(map(int, call))
vr.samples[sample_id]["MCCI"] = tuple(f"{x[0]}-{x[1]}" for x in call_95_cis)

# Produces a histogram-like format for read-level copy numbers
# e.g., for two alleles with 8 and 9 copy-number respectively, we may get: 7:1|8:10|9:1,8:2|9:12
vr.samples[sample_id]["MCRL"] = tuple(
"|".join(
map(
lambda pair: ":".join(map(str, pair)),
sorted(
Counter(
map(lambda r: r["cn"], filter(lambda r: r.get("p") == pi, res_reads.values()))
).items()
)
)
)
for pi in range(res_peaks["modal_n"])
)

ps = result["ps"]

try:
Expand Down

0 comments on commit 88f4502

Please sign in to comment.