From 88f4502f9c8da5d9c6ce28df5ddb289eeebc853d Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 2 Oct 2024 12:15:35 -0400 Subject: [PATCH] feat(call): output read-level copy number histograms in VCF --- docs/output_formats.md | 4 ++++ strkit/call/output/vcf.py | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/docs/output_formats.md b/docs/output_formats.md index 8284e12..71d7bea 100644 --- a/docs/output_formats.md +++ b/docs/output_formats.md @@ -157,6 +157,10 @@ VCF format fields (i.e., for each variant sample entry): * `GT`: Genotype * `MC`: Motif copy number for each allele (STR calls only) * `MCCI`: Motif copy number 95% confidence intervals for each allele (STR calls only) +* `MCRL`: Read-level copy number histogram for each allele. Allele entries are comma-delimited, and copy numbers within + an allele's read-set are pipe (`|`)-delimited. For example, for two alleles with 8 and 9 copy-number respectively, we + may get `7:1|8:10|9:1,8:2|9:12` — the first allele has one 7-copy read, ten 8-copy reads, and one 9-copy read. The + second allele has two 8-copy reads and twelve 9-copy reads. * `PS`: Phase set * `PM`: Peak-calling method (`dist`/`single`/`snv+dist`/`snv`/`hp`; STR calls only) diff --git a/strkit/call/output/vcf.py b/strkit/call/output/vcf.py index 3f51d69..5afb58a 100644 --- a/strkit/call/output/vcf.py +++ b/strkit/call/output/vcf.py @@ -1,6 +1,7 @@ import functools import logging +from collections import Counter # from os.path import commonprefix from pathlib import Path from pysam import FastaFile, VariantFile, VariantHeader, VariantRecord @@ -62,6 +63,7 @@ def build_vcf_header(sample_id: str, reference_file: str) -> VariantHeader: vh.formats.add("GT", 1, "String", "Genotype") vh.formats.add("MC", ".", "Integer", "Motif copy number for each allele") vh.formats.add("MCCI", ".", "String", "Motif copy number 95% confidence interval for each allele") + vh.formats.add("MCRL", ".", "String", "Read-level motif copy numbers for each allele") vh.formats.add("PS", 1, "Integer", "Phase set") vh.formats.add("PM", 1, "String", "Peak-calling method (dist/snv+dist/snv/hp)") @@ -194,6 +196,22 @@ def output_contig_vcf_lines( vr.samples[sample_id]["MC"] = tuple(map(int, call)) vr.samples[sample_id]["MCCI"] = tuple(f"{x[0]}-{x[1]}" for x in call_95_cis) + # Produces a histogram-like format for read-level copy numbers + # e.g., for two alleles with 8 and 9 copy-number respectively, we may get: 7:1|8:10|9:1,8:2|9:12 + vr.samples[sample_id]["MCRL"] = tuple( + "|".join( + map( + lambda pair: ":".join(map(str, pair)), + sorted( + Counter( + map(lambda r: r["cn"], filter(lambda r: r.get("p") == pi, res_reads.values())) + ).items() + ) + ) + ) + for pi in range(res_peaks["modal_n"]) + ) + ps = result["ps"] try: