Skip to content

Commit

Permalink
Updated Cyrius to call all star alleles (up to *139) (#3)
Browse files Browse the repository at this point in the history
Removed the --knownFunction and --includeNewStar options.
  • Loading branch information
xiao-chen-xc authored Jul 24, 2020
1 parent caa0fe5 commit ed35cde
Show file tree
Hide file tree
Showing 39 changed files with 598 additions and 1,097 deletions.
15 changes: 7 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
# Cyrius: WGS-based CYP2D6 genotyper
Cyrius is a tool to genotype CYP2D6 from a whole-genome sequencing (WGS) BAM file. Cyrius uses a novel method to solve the problems caused by the high sequence similarity with the pseudogene paralog CYP2D7 and thus is able to detect all star alleles, particularly those that contain structural variants, accurately. Please refer to our [preprint](https://www.biorxiv.org/content/10.1101/2020.05.05.077966v1) for details about the method.
Cyrius is a tool to genotype CYP2D6 from a whole-genome sequencing (WGS) BAM file. Cyrius uses a novel method to solve the problems caused by the high sequence similarity with the pseudogene paralog CYP2D7 and thus is able to detect all star alleles, particularly those that contain structural variants, accurately. Please refer to our [preprint](https://www.biorxiv.org/content/10.1101/2020.05.05.077966v2) for details about the method.

## Running the program

This Python3 program can be run as follows:
```bash
star_caller.py --manifest MANIFEST_FILE \
--genome [19/37/38] \
--prefix OUTPUT_FILE_PREFIX \
--outDir OUTPUT_DIRECTORY \
--threads NUMBER_THREADS
python3 star_caller.py --manifest MANIFEST_FILE \
--genome [19/37/38] \
--prefix OUTPUT_FILE_PREFIX \
--outDir OUTPUT_DIRECTORY \
--threads NUMBER_THREADS
```
The manifest is a text file in which each line should list the absolute path to an input BAM/CRAM file.
For CRAM input, it’s suggested to provide the path to the reference fasta file with `--reference` in the command.
Additionally, there is an option `--knownFunction` to call only star alleles with known functions, as well as an option `--includeNewStar` to call all star alleles including the newly added, uncurated ones (\*115-\*139) in PharmVar.
For CRAM input, it’s suggested to provide the path to the reference fasta file with `--reference` in the command.

## Interpreting the output

Expand Down
104 changes: 94 additions & 10 deletions caller/call_cn.py → caller/call_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,15 @@
process_raw_call_gc,
process_raw_call_denovo,
)
from depth_calling.haplotype import get_haplotypes_from_bam, extract_hap
from depth_calling.haplotype import (
get_haplotypes_from_bam,
get_haplotypes_from_bam_single_region,
extract_hap,
)
from depth_calling.snp_count import (
get_supporting_reads,
get_supporting_reads_single_region,
)


INTRON1_BP_APPROX = 42130500
Expand Down Expand Up @@ -93,6 +101,11 @@
"g.42129042T>C",
"g.42129174C>A",
"g.42129180A>T",
"g.42127526C>T",
"g.42128325A>G",
"g.42126877G>A",
"g.42127973T>C",
"g.42127556T>C",
]


Expand Down Expand Up @@ -197,7 +210,7 @@ def good_read(read):
return read.is_secondary == 0 and read.is_supplementary == 0


def get_allele_counts_42128936(bamfile_handle, genome):
def get_allele_counts_var42128936(bamfile_handle, genome):
"""
Search for the inserstions at 42128936 defining
*30/*40/*58 in read sequences
Expand All @@ -223,6 +236,23 @@ def get_allele_counts_42128936(bamfile_handle, genome):
return (ref_read, long_ins_read, short_ins_read)


def update_var42128936(
var_list, var_alt, var_ref, ref_read, long_ins_read, short_ins_read
):
"""
Update variant read counts for g42128936.
"""
if "g.42128936-42128937insGGGGCGAAAGGGGCGAAA" in var_list:
long_ins_index = var_list.index("g.42128936-42128937insGGGGCGAAAGGGGCGAAA")
var_alt[long_ins_index] = long_ins_read
var_ref[long_ins_index] = short_ins_read + ref_read
if "g.42128936-42128937insGGGGCGAAA" in var_list:
short_ins_index = var_list.index("g.42128936-42128937insGGGGCGAAA")
var_alt[short_ins_index] = short_ins_read
var_ref[short_ins_index] = long_ins_read + ref_read
return var_alt, var_ref


def call_exon9gc(d6_count, d7_count, full_length_cn):
"""
Call exon 9 conversion
Expand Down Expand Up @@ -257,28 +287,82 @@ def call_exon9gc(d6_count, d7_count, full_length_cn):
return None


def call_var42126938(bamfile, cnvtag, site42126938, base_db, target_positions):
def call_var42126938(bamfile, full_length_cn, base_db):
"""
Call variant g.42126938C>T (gene conversion variant in homology region)
Call variant g.42126938C>T (gene conversion variant in homology region)
based on read depth and phased haplotypes
"""
dcn = {"star5": 3, "cn2": 4}
assert cnvtag in dcn
full_length_cn = dcn[cnvtag]
d6_cn = call_cn_snp(full_length_cn, [site42126938[0]], [site42126938[1]], 0.8)[0]
var_called = []
# Whether g.42126938C>T is on the same haplotype as g.42126611C>G
G_haplotype = False
snp_d6, snp_d7 = get_supporting_reads(
bamfile, base_db.dsnp1, base_db.dsnp2, base_db.nchr, base_db.dindex
)
d6_d7_base_count = [snp_d6[-1], snp_d7[-1]]
d6_cn = call_cn_snp(
full_length_cn, [d6_d7_base_count[0]], [d6_d7_base_count[1]], 0.8
)[0]
if d6_cn is not None and d6_cn < full_length_cn - 2:
haplotype_per_read = get_haplotypes_from_bam(bamfile, base_db, target_positions)
haplotype_per_read = get_haplotypes_from_bam(
bamfile, base_db, range(len(base_db.dsnp1))
)
recombinant_read_count = extract_hap(haplotype_per_read, [0, 2])
if "12" in recombinant_read_count and sum(recombinant_read_count["12"]) > 1:
G_hap_count = extract_hap(haplotype_per_read, [1, 2])
for _ in range(full_length_cn - 2 - d6_cn):
var_called.append("g.42126938C>T")
if "12" in G_hap_count and sum(G_hap_count["12"]) > 1:
G_haplotype = True
return var_called, G_haplotype
return d6_d7_base_count, var_called, G_haplotype


def call_var42127526_var42127556(bamfile, cnvtag, base_db):
"""
Call variant g.42127526C>T (gene conversion variant in homology region)
based on read depth and phased haplotypes
"""
var_called = []
var_ref, var_alt, var_ref_forward, var_ref_reverse = get_supporting_reads_single_region(
bamfile, base_db.dsnp1, base_db.nchr, base_db.dindex
)
var7526_count = [var_ref[0], var_alt[0]]
var7556_count = [var_ref[1], var_alt[1]]
if cnvtag in CNVTAG_LOOKUP_TABLE:
d6_cn = CNVTAG_LOOKUP_TABLE[cnvtag].exon9_to_intron1
var7526_cn = call_cn_snp(d6_cn, [var7526_count[1]], [var7526_count[0]])[0]
var7556_cn = call_cn_snp(d6_cn, [var7556_count[1]], [var7556_count[0]])[0]
haplotype_per_read = get_haplotypes_from_bam_single_region(
bamfile, base_db, range(len(base_db.dsnp1))
)
recombinant_read_count = extract_hap(haplotype_per_read, [0, 1, 2])
if "211" in recombinant_read_count and sum(recombinant_read_count["211"]) > 1:
for _ in range(var7526_cn):
var_called.append("g.42127526C>T")
elif "221" in recombinant_read_count and sum(recombinant_read_count["221"]) > 1:
for _ in range(min(var7526_cn, var7556_cn)):
var_called.append("g.42127526C>T")
var_called.append("g.42127556T>C")
return var7526_count, var7556_count, var_called


def call_var42127803hap(bamfile, cnvtag, base_db):
"""
Call haplotype with regard to g.42127803C>T and g.42127941G>A
"""
diff_haplotype = False
if cnvtag == "cn2":
haplotype_per_read = get_haplotypes_from_bam_single_region(
bamfile, base_db, range(len(base_db.dsnp1))
)
recombinant_read_count = extract_hap(haplotype_per_read, [0, 1])
if (
"12" in recombinant_read_count
and sum(recombinant_read_count["12"]) > 1
and "21" in recombinant_read_count
and sum(recombinant_read_count["21"]) > 1
):
diff_haplotype = True
return diff_haplotype


def get_called_variants(var_list, cn_prob_processed, starting_index=0):
Expand Down
5 changes: 5 additions & 0 deletions caller/cnv_hybrid.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,11 @@ def get_cnvtag(total_cn, rawv, cn_call_per_site, exon9gc_call_stringent, spacer_
and exon9gc_call_stringent <= exon9_intron4_sites_consensus
):
exon9region_sites_consensus = exon9gc_call_stringent
elif (
exon9region_sites_consensus > exon9gc_call_stringent
and exon9gc_call_stringent >= exon9_intron4_sites_consensus
):
exon9region_sites_consensus = exon9gc_call_stringent
else:
exon9region_sites = [
a
Expand Down
6 changes: 3 additions & 3 deletions caller/construct_star_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@


# Exon 9 gene conversion
EXON9GC_ALLELES = ["*36", "*4N", "*57", "*83"]
EXON9GC_PAIR_ALLELES = {"*36": "*10", "*4N": "*4A"}
EXON9GC_ALLELES = ["*36", "*4.013", "*57", "*83"]
EXON9GC_PAIR_ALLELES = {"*36": "*10", "*4.013": "*4"}


def make_hap_dic(variant_list, star_set, hap_dic):
Expand Down Expand Up @@ -51,7 +51,7 @@ def get_hap_table(hap_table):
for line in f:
at = line.strip().split()
star_id = at[0]
variant_list = sorted(at[1:-2])
variant_list = sorted(at[1:-1])
var_list_joined = "_".join(variant_list)
dhap.setdefault(var_list_joined, star_id)
dstar.setdefault(star_id, var_list_joined)
Expand Down
Loading

0 comments on commit ed35cde

Please sign in to comment.