From 88f4502f9c8da5d9c6ce28df5ddb289eeebc853d Mon Sep 17 00:00:00 2001
From: David Lougheed <david.lougheed@gmail.com>
Date: Wed, 2 Oct 2024 12:15:35 -0400
Subject: [PATCH] feat(call): output read-level copy number histograms in VCF

---
 docs/output_formats.md    |  4 ++++
 strkit/call/output/vcf.py | 18 ++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/docs/output_formats.md b/docs/output_formats.md
index 8284e12..71d7bea 100644
--- a/docs/output_formats.md
+++ b/docs/output_formats.md
@@ -157,6 +157,10 @@ VCF format fields (i.e., for each variant sample entry):
 * `GT`: Genotype
 * `MC`: Motif copy number for each allele (STR calls only)
 * `MCCI`: Motif copy number 95% confidence intervals for each allele (STR calls only)
+* `MCRL`: Read-level copy number histogram for each allele. Allele entries are comma-delimited, and copy numbers within 
+  an allele's read-set are pipe (`|`)-delimited. For example, for two alleles with 8 and 9 copy-number respectively, we 
+  may get `7:1|8:10|9:1,8:2|9:12` — the first allele has one 7-copy read, ten 8-copy reads, and one 9-copy read. The 
+  second allele has two 8-copy reads and twelve 9-copy reads.
 * `PS`: Phase set
 * `PM`: Peak-calling method (`dist`/`single`/`snv+dist`/`snv`/`hp`; STR calls only)
 
diff --git a/strkit/call/output/vcf.py b/strkit/call/output/vcf.py
index 3f51d69..5afb58a 100644
--- a/strkit/call/output/vcf.py
+++ b/strkit/call/output/vcf.py
@@ -1,6 +1,7 @@
 import functools
 import logging
 
+from collections import Counter
 # from os.path import commonprefix
 from pathlib import Path
 from pysam import FastaFile, VariantFile, VariantHeader, VariantRecord
@@ -62,6 +63,7 @@ def build_vcf_header(sample_id: str, reference_file: str) -> VariantHeader:
     vh.formats.add("GT", 1, "String", "Genotype")
     vh.formats.add("MC", ".", "Integer", "Motif copy number for each allele")
     vh.formats.add("MCCI", ".", "String", "Motif copy number 95% confidence interval for each allele")
+    vh.formats.add("MCRL", ".", "String", "Read-level motif copy numbers for each allele")
     vh.formats.add("PS", 1, "Integer", "Phase set")
     vh.formats.add("PM", 1, "String", "Peak-calling method (dist/snv+dist/snv/hp)")
 
@@ -194,6 +196,22 @@ def output_contig_vcf_lines(
             vr.samples[sample_id]["MC"] = tuple(map(int, call))
             vr.samples[sample_id]["MCCI"] = tuple(f"{x[0]}-{x[1]}" for x in call_95_cis)
 
+            # Produces a histogram-like format for read-level copy numbers
+            # e.g., for two alleles with 8 and 9 copy-number respectively, we may get: 7:1|8:10|9:1,8:2|9:12
+            vr.samples[sample_id]["MCRL"] = tuple(
+                "|".join(
+                    map(
+                        lambda pair: ":".join(map(str, pair)),
+                        sorted(
+                            Counter(
+                                map(lambda r: r["cn"], filter(lambda r: r.get("p") == pi, res_reads.values()))
+                            ).items()
+                        )
+                    )
+                )
+                for pi in range(res_peaks["modal_n"])
+            )
+
             ps = result["ps"]
 
             try: