From ed35cde6567993d53024f547337c721fc81a8ddd Mon Sep 17 00:00:00 2001 From: Xiao Chen Date: Thu, 23 Jul 2020 22:46:31 -0700 Subject: [PATCH] Updated Cyrius to call all star alleles (up to *139) (#3) Removed the --knownFunction and --includeNewStar options. --- README.md | 15 +- caller/{call_cn.py => call_variants.py} | 104 +++++++- caller/cnv_hybrid.py | 5 + caller/construct_star_table.py | 6 +- caller/match_star_allele.py | 229 ++++++++++-------- ...{test_call_cn.py => test_call_variants.py} | 8 +- caller/tests/test_match_star.py | 67 +++-- data/CYP2D6_SNP_19.txt | 4 +- data/CYP2D6_SNP_37.txt | 4 +- data/CYP2D6_SNP_38.txt | 4 +- data/CYP2D6_haplotype_19.txt | 9 + data/CYP2D6_haplotype_37.txt | 9 + data/CYP2D6_haplotype_38.txt | 9 + .../CYP2D6_target_variant_19.txt | 4 + .../CYP2D6_target_variant_37.txt | 4 + .../CYP2D6_target_variant_38.txt | 4 + ...P2D6_target_variant_homology_region_19.txt | 2 +- ...P2D6_target_variant_homology_region_37.txt | 2 +- ...P2D6_target_variant_homology_region_38.txt | 2 +- .../CYP2D6_target_variant_19.txt | 80 ------ .../CYP2D6_target_variant_37.txt | 80 ------ .../CYP2D6_target_variant_38.txt | 80 ------ ...P2D6_target_variant_homology_region_19.txt | 18 -- ...P2D6_target_variant_homology_region_37.txt | 18 -- ...P2D6_target_variant_homology_region_38.txt | 18 -- data/full_star_table/star_table.txt | 115 --------- data/include_new_star_table/star_table.txt | 133 ---------- .../CYP2D6_target_variant_19.txt | 48 ---- .../CYP2D6_target_variant_37.txt | 48 ---- .../CYP2D6_target_variant_38.txt | 48 ---- ...P2D6_target_variant_homology_region_19.txt | 10 - ...P2D6_target_variant_homology_region_37.txt | 10 - ...P2D6_target_variant_homology_region_38.txt | 10 - data/known_function_star_table/star_table.txt | 65 ----- data/star_table.txt | 146 +++++++++++ depth_calling/haplotype.py | 27 ++- depth_calling/snp_count.py | 68 +++--- depth_calling/tests/test_snp_count.py | 12 +- star_caller.py | 170 ++++++------- 39 files changed, 598 insertions(+), 1097 deletions(-) rename caller/{call_cn.py => call_variants.py} (72%) rename caller/tests/{test_call_cn.py => test_call_variants.py} (93%) create mode 100644 data/CYP2D6_haplotype_19.txt create mode 100644 data/CYP2D6_haplotype_37.txt create mode 100644 data/CYP2D6_haplotype_38.txt rename data/{include_new_star_table => }/CYP2D6_target_variant_19.txt (96%) rename data/{include_new_star_table => }/CYP2D6_target_variant_37.txt (96%) rename data/{include_new_star_table => }/CYP2D6_target_variant_38.txt (96%) rename data/{include_new_star_table => }/CYP2D6_target_variant_homology_region_19.txt (94%) rename data/{include_new_star_table => }/CYP2D6_target_variant_homology_region_37.txt (94%) rename data/{include_new_star_table => }/CYP2D6_target_variant_homology_region_38.txt (94%) delete mode 100644 data/full_star_table/CYP2D6_target_variant_19.txt delete mode 100644 data/full_star_table/CYP2D6_target_variant_37.txt delete mode 100644 data/full_star_table/CYP2D6_target_variant_38.txt delete mode 100644 data/full_star_table/CYP2D6_target_variant_homology_region_19.txt delete mode 100644 data/full_star_table/CYP2D6_target_variant_homology_region_37.txt delete mode 100644 data/full_star_table/CYP2D6_target_variant_homology_region_38.txt delete mode 100644 data/full_star_table/star_table.txt delete mode 100644 data/include_new_star_table/star_table.txt delete mode 100644 data/known_function_star_table/CYP2D6_target_variant_19.txt delete mode 100644 data/known_function_star_table/CYP2D6_target_variant_37.txt delete mode 100644 data/known_function_star_table/CYP2D6_target_variant_38.txt delete mode 100644 data/known_function_star_table/CYP2D6_target_variant_homology_region_19.txt delete mode 100644 data/known_function_star_table/CYP2D6_target_variant_homology_region_37.txt delete mode 100644 data/known_function_star_table/CYP2D6_target_variant_homology_region_38.txt delete mode 100644 data/known_function_star_table/star_table.txt create mode 100644 data/star_table.txt diff --git a/README.md b/README.md index ad28d74..ad2013b 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,18 @@ # Cyrius: WGS-based CYP2D6 genotyper -Cyrius is a tool to genotype CYP2D6 from a whole-genome sequencing (WGS) BAM file. Cyrius uses a novel method to solve the problems caused by the high sequence similarity with the pseudogene paralog CYP2D7 and thus is able to detect all star alleles, particularly those that contain structural variants, accurately. Please refer to our [preprint](https://www.biorxiv.org/content/10.1101/2020.05.05.077966v1) for details about the method. +Cyrius is a tool to genotype CYP2D6 from a whole-genome sequencing (WGS) BAM file. Cyrius uses a novel method to solve the problems caused by the high sequence similarity with the pseudogene paralog CYP2D7 and thus is able to detect all star alleles, particularly those that contain structural variants, accurately. Please refer to our [preprint](https://www.biorxiv.org/content/10.1101/2020.05.05.077966v2) for details about the method. ## Running the program This Python3 program can be run as follows: ```bash -star_caller.py --manifest MANIFEST_FILE \ - --genome [19/37/38] \ - --prefix OUTPUT_FILE_PREFIX \ - --outDir OUTPUT_DIRECTORY \ - --threads NUMBER_THREADS +python3 star_caller.py --manifest MANIFEST_FILE \ + --genome [19/37/38] \ + --prefix OUTPUT_FILE_PREFIX \ + --outDir OUTPUT_DIRECTORY \ + --threads NUMBER_THREADS ``` The manifest is a text file in which each line should list the absolute path to an input BAM/CRAM file. -For CRAM input, it’s suggested to provide the path to the reference fasta file with `--reference` in the command. -Additionally, there is an option `--knownFunction` to call only star alleles with known functions, as well as an option `--includeNewStar` to call all star alleles including the newly added, uncurated ones (\*115-\*139) in PharmVar. +For CRAM input, it’s suggested to provide the path to the reference fasta file with `--reference` in the command. ## Interpreting the output diff --git a/caller/call_cn.py b/caller/call_variants.py similarity index 72% rename from caller/call_cn.py rename to caller/call_variants.py index 5a1065e..f6ad11a 100644 --- a/caller/call_cn.py +++ b/caller/call_variants.py @@ -33,7 +33,15 @@ process_raw_call_gc, process_raw_call_denovo, ) -from depth_calling.haplotype import get_haplotypes_from_bam, extract_hap +from depth_calling.haplotype import ( + get_haplotypes_from_bam, + get_haplotypes_from_bam_single_region, + extract_hap, +) +from depth_calling.snp_count import ( + get_supporting_reads, + get_supporting_reads_single_region, +) INTRON1_BP_APPROX = 42130500 @@ -93,6 +101,11 @@ "g.42129042T>C", "g.42129174C>A", "g.42129180A>T", + "g.42127526C>T", + "g.42128325A>G", + "g.42126877G>A", + "g.42127973T>C", + "g.42127556T>C", ] @@ -197,7 +210,7 @@ def good_read(read): return read.is_secondary == 0 and read.is_supplementary == 0 -def get_allele_counts_42128936(bamfile_handle, genome): +def get_allele_counts_var42128936(bamfile_handle, genome): """ Search for the inserstions at 42128936 defining *30/*40/*58 in read sequences @@ -223,6 +236,23 @@ def get_allele_counts_42128936(bamfile_handle, genome): return (ref_read, long_ins_read, short_ins_read) +def update_var42128936( + var_list, var_alt, var_ref, ref_read, long_ins_read, short_ins_read +): + """ + Update variant read counts for g42128936. + """ + if "g.42128936-42128937insGGGGCGAAAGGGGCGAAA" in var_list: + long_ins_index = var_list.index("g.42128936-42128937insGGGGCGAAAGGGGCGAAA") + var_alt[long_ins_index] = long_ins_read + var_ref[long_ins_index] = short_ins_read + ref_read + if "g.42128936-42128937insGGGGCGAAA" in var_list: + short_ins_index = var_list.index("g.42128936-42128937insGGGGCGAAA") + var_alt[short_ins_index] = short_ins_read + var_ref[short_ins_index] = long_ins_read + ref_read + return var_alt, var_ref + + def call_exon9gc(d6_count, d7_count, full_length_cn): """ Call exon 9 conversion @@ -257,20 +287,25 @@ def call_exon9gc(d6_count, d7_count, full_length_cn): return None -def call_var42126938(bamfile, cnvtag, site42126938, base_db, target_positions): +def call_var42126938(bamfile, full_length_cn, base_db): """ - Call variant g.42126938C>T (gene conversion variant in homology region) + Call variant g.42126938C>T (gene conversion variant in homology region) based on read depth and phased haplotypes """ - dcn = {"star5": 3, "cn2": 4} - assert cnvtag in dcn - full_length_cn = dcn[cnvtag] - d6_cn = call_cn_snp(full_length_cn, [site42126938[0]], [site42126938[1]], 0.8)[0] var_called = [] # Whether g.42126938C>T is on the same haplotype as g.42126611C>G G_haplotype = False + snp_d6, snp_d7 = get_supporting_reads( + bamfile, base_db.dsnp1, base_db.dsnp2, base_db.nchr, base_db.dindex + ) + d6_d7_base_count = [snp_d6[-1], snp_d7[-1]] + d6_cn = call_cn_snp( + full_length_cn, [d6_d7_base_count[0]], [d6_d7_base_count[1]], 0.8 + )[0] if d6_cn is not None and d6_cn < full_length_cn - 2: - haplotype_per_read = get_haplotypes_from_bam(bamfile, base_db, target_positions) + haplotype_per_read = get_haplotypes_from_bam( + bamfile, base_db, range(len(base_db.dsnp1)) + ) recombinant_read_count = extract_hap(haplotype_per_read, [0, 2]) if "12" in recombinant_read_count and sum(recombinant_read_count["12"]) > 1: G_hap_count = extract_hap(haplotype_per_read, [1, 2]) @@ -278,7 +313,56 @@ def call_var42126938(bamfile, cnvtag, site42126938, base_db, target_positions): var_called.append("g.42126938C>T") if "12" in G_hap_count and sum(G_hap_count["12"]) > 1: G_haplotype = True - return var_called, G_haplotype + return d6_d7_base_count, var_called, G_haplotype + + +def call_var42127526_var42127556(bamfile, cnvtag, base_db): + """ + Call variant g.42127526C>T (gene conversion variant in homology region) + based on read depth and phased haplotypes + """ + var_called = [] + var_ref, var_alt, var_ref_forward, var_ref_reverse = get_supporting_reads_single_region( + bamfile, base_db.dsnp1, base_db.nchr, base_db.dindex + ) + var7526_count = [var_ref[0], var_alt[0]] + var7556_count = [var_ref[1], var_alt[1]] + if cnvtag in CNVTAG_LOOKUP_TABLE: + d6_cn = CNVTAG_LOOKUP_TABLE[cnvtag].exon9_to_intron1 + var7526_cn = call_cn_snp(d6_cn, [var7526_count[1]], [var7526_count[0]])[0] + var7556_cn = call_cn_snp(d6_cn, [var7556_count[1]], [var7556_count[0]])[0] + haplotype_per_read = get_haplotypes_from_bam_single_region( + bamfile, base_db, range(len(base_db.dsnp1)) + ) + recombinant_read_count = extract_hap(haplotype_per_read, [0, 1, 2]) + if "211" in recombinant_read_count and sum(recombinant_read_count["211"]) > 1: + for _ in range(var7526_cn): + var_called.append("g.42127526C>T") + elif "221" in recombinant_read_count and sum(recombinant_read_count["221"]) > 1: + for _ in range(min(var7526_cn, var7556_cn)): + var_called.append("g.42127526C>T") + var_called.append("g.42127556T>C") + return var7526_count, var7556_count, var_called + + +def call_var42127803hap(bamfile, cnvtag, base_db): + """ + Call haplotype with regard to g.42127803C>T and g.42127941G>A + """ + diff_haplotype = False + if cnvtag == "cn2": + haplotype_per_read = get_haplotypes_from_bam_single_region( + bamfile, base_db, range(len(base_db.dsnp1)) + ) + recombinant_read_count = extract_hap(haplotype_per_read, [0, 1]) + if ( + "12" in recombinant_read_count + and sum(recombinant_read_count["12"]) > 1 + and "21" in recombinant_read_count + and sum(recombinant_read_count["21"]) > 1 + ): + diff_haplotype = True + return diff_haplotype def get_called_variants(var_list, cn_prob_processed, starting_index=0): diff --git a/caller/cnv_hybrid.py b/caller/cnv_hybrid.py index 9588125..d8213f0 100644 --- a/caller/cnv_hybrid.py +++ b/caller/cnv_hybrid.py @@ -115,6 +115,11 @@ def get_cnvtag(total_cn, rawv, cn_call_per_site, exon9gc_call_stringent, spacer_ and exon9gc_call_stringent <= exon9_intron4_sites_consensus ): exon9region_sites_consensus = exon9gc_call_stringent + elif ( + exon9region_sites_consensus > exon9gc_call_stringent + and exon9gc_call_stringent >= exon9_intron4_sites_consensus + ): + exon9region_sites_consensus = exon9gc_call_stringent else: exon9region_sites = [ a diff --git a/caller/construct_star_table.py b/caller/construct_star_table.py index 17b15af..4e36b37 100644 --- a/caller/construct_star_table.py +++ b/caller/construct_star_table.py @@ -22,8 +22,8 @@ # Exon 9 gene conversion -EXON9GC_ALLELES = ["*36", "*4N", "*57", "*83"] -EXON9GC_PAIR_ALLELES = {"*36": "*10", "*4N": "*4A"} +EXON9GC_ALLELES = ["*36", "*4.013", "*57", "*83"] +EXON9GC_PAIR_ALLELES = {"*36": "*10", "*4.013": "*4"} def make_hap_dic(variant_list, star_set, hap_dic): @@ -51,7 +51,7 @@ def get_hap_table(hap_table): for line in f: at = line.strip().split() star_id = at[0] - variant_list = sorted(at[1:-2]) + variant_list = sorted(at[1:-1]) var_list_joined = "_".join(variant_list) dhap.setdefault(var_list_joined, star_id) dstar.setdefault(star_id, var_list_joined) diff --git a/caller/match_star_allele.py b/caller/match_star_allele.py index 9d9c1f1..911524c 100644 --- a/caller/match_star_allele.py +++ b/caller/match_star_allele.py @@ -20,6 +20,7 @@ import os from collections import namedtuple +import re CNVTAG_TO_GENOTYPE = { "star5_star5": "*5/*5", @@ -29,10 +30,10 @@ } # These suballeles below are not converted to main alleles as # they reflect SVs. -KEPT_SUBALLELES = ["*4N"] +KEPT_SUBALLELES = ["*4.013"] # Rare alleles lead to nonunique diplotypes. Select against these # when there are nonunique calls. -RARE_ALLELES = ["*34", "*39", "*4J"] +RARE_ALLELES = ["*34", "*39", "*4.009", "*139"] raw_star = namedtuple("raw_star", "call_info candidate star_call") @@ -45,21 +46,21 @@ def get_var_list(var_observed): return "_".join(var_observed) -def check_name(star_alleles): +def convert_to_main_allele(list_of_star): """ - Convert sub-alleles to main alleles + Convert suballeles to main alleles """ - sname = set() - for var in star_alleles: - lname = [] - for hap in var.split("_"): - if hap[-1].isalpha(): - lname.append(hap[:-1]) + converted_list = set() + for stars in list_of_star: + star_split = stars.split("_") + converted_star = [] + for star in star_split: + if star not in KEPT_SUBALLELES: + converted_star.append(star.split(".")[0]) else: - lname.append(hap) - lname = sorted(lname) - sname.add("_".join(lname)) - return sname + converted_star.append(star) + converted_list.add("_".join(sorted(converted_star))) + return list(converted_list) def get_star(var_observed, dic): @@ -83,21 +84,22 @@ def get_star(var_observed, dic): if "*" in dic[var_list]: match_tag = "unique_match" raw_stars = [dic[var_list]] - processed_stars = raw_stars + processed_stars = convert_to_main_allele(raw_stars) # More than one match elif len(dic[var_list]) > 1: raw_stars = dic[var_list] - main_allele_name = check_name(raw_stars) - if len(list(main_allele_name)) == 1: + processed_stars = convert_to_main_allele(raw_stars) + if len(processed_stars) == 1: match_tag = "unique_star" - processed_stars = list(main_allele_name) else: rare_stars_found = [] for haplotype in raw_stars: for rare_allele in RARE_ALLELES: if rare_allele in haplotype: rare_stars_found.append(haplotype) - processed_stars = [a for a in raw_stars if a not in rare_stars_found] + processed_stars = convert_to_main_allele( + [a for a in raw_stars if a not in rare_stars_found] + ) if len(processed_stars) == 1: match_tag = "pick_common_allele" else: @@ -106,7 +108,7 @@ def get_star(var_observed, dic): # Unique match match_tag = "unique_match" raw_stars = dic[var_list] - processed_stars = [raw_stars[0]] + processed_stars = convert_to_main_allele([raw_stars[0]]) return raw_star(match_tag, raw_stars, processed_stars) @@ -138,11 +140,8 @@ def call_star68(var_observed, cnvcall, dic): hap_list = [] for tag in matchtag: hap_list += tag[-1] - if len(hap_list) == 1: - return raw_star("unique_match", hap_list, hap_list) - main_allele_name = check_name(hap_list) - if len(list(main_allele_name)) == 1: - return raw_star("unique_star", hap_list, list(main_allele_name)) + if len(set(hap_list)) == 1: + return raw_star("unique_match", hap_list, [hap_list[0]]) rare_stars_found = [] for haplotype in hap_list: @@ -150,7 +149,7 @@ def call_star68(var_observed, cnvcall, dic): if rare_allele in haplotype: rare_stars_found.append(haplotype) processed_stars = [a for a in hap_list if a not in rare_stars_found] - if len(processed_stars) == 1: + if len(set(processed_stars)) == 1: return raw_star("pick_common_allele", hap_list, [processed_stars[0]]) # The one with g.42130692G>A removed is the most likely case @@ -196,22 +195,25 @@ def get_final_call_clean(final_call, cnvcall, spacer_cn): Clean up final call to report diplotypes in *#/*# format whenever possible. """ # zero or more than one set + final_call = sorted(final_call) if len(final_call) == 2 and cnvcall == "cn2": diplotype1 = final_call[0].split("_") diplotype2 = final_call[1].split("_") return "/".join(diplotype1) + ";" + "/".join(diplotype2) if final_call == [] or len(final_call) > 1: + if final_call == ["*10_*10_*4.013", "*10_*36_*4"]: + return "*4/*36+*10" return ";".join(final_call) called_stars = final_call[0] if cnvcall == "star5_star68": - if called_stars == "*4A": - return "*5/*4A+*68" + if called_stars == "*4": + return "*5/*68+*4" return "*68/" + called_stars if cnvcall == "star13_star68": - if called_stars in ["*4A", "*4"]: - return "*13/*4A+*68" + if called_stars == "*4": + return "*13/*68+*4" return "*13_" + called_stars + "_*68" split_call = called_stars.split("_") @@ -228,7 +230,7 @@ def get_final_call_clean(final_call, cnvcall, spacer_cn): if split_call.count("*2") >= 2: split_call.remove("*2") split_call.remove("*2") - return "*2+*13/" + split_call[0] + return "*13+*2/" + split_call[0] return None if cnvcall == "dup_star13": if spacer_cn is not None and spacer_cn == 1: @@ -270,23 +272,23 @@ def get_final_call_clean(final_call, cnvcall, spacer_cn): # for these two cases, check spacer CN to determine if they are on the same chromosome if split_call == ["*10", "*10"]: if spacer_cn is not None and spacer_cn > 2: - return "*5/*10+*36" + return "*5/*36+*10" else: return None - if split_call == ["*4A", "*4A"]: + if split_call == ["*4", "*4"]: if spacer_cn is not None and spacer_cn > 2: - return "*5/*4A+*4N" + return "*5/*4.013+*4" return "/".join(split_call) if cnvcall == "exon9hyb": - if "*4A" in split_call and "*4N" in split_call: + if "*4" in split_call and "*4.013" in split_call: remain_index = [ n for n in range(3) - if n not in [split_call.index("*4A"), split_call.index("*4N")] + if n not in [split_call.index("*4"), split_call.index("*4.013")] ] assert len(remain_index) == 1 - return split_call[remain_index[0]] + "/*4A+*4N" + return split_call[remain_index[0]] + "/*4.013+*4" if "*10" in split_call and "*36" in split_call: remain_index = [ n @@ -294,7 +296,7 @@ def get_final_call_clean(final_call, cnvcall, spacer_cn): if n not in [split_call.index("*10"), split_call.index("*36")] ] assert len(remain_index) == 1 - return split_call[remain_index[0]] + "/*10+*36" + return split_call[remain_index[0]] + "/*36+*10" if split_call.count("*36") == 2: remain_star = [a for a in split_call if a != "*36"] return "*36+*36/" + remain_star[0] @@ -303,36 +305,36 @@ def get_final_call_clean(final_call, cnvcall, spacer_cn): var = [] if "*36" in split_call: var = [a for a in split_call if a not in ["*10", "*36"]] - elif "*4N" in split_call: - var = [a for a in split_call if a not in ["*4A", "*4N"]] + elif "*4.013" in split_call: + var = [a for a in split_call if a not in ["*4", "*4.013"]] if len(var) == 2: split_call.remove(var[0]) split_call.remove(var[1]) if len(set(var)) == 1: - return var[0] + "x2/" + "+".join(split_call) + return var[0] + "x2/" + "+".join(sorted(split_call, reverse=True)) else: - return "+".join(var) + "/" + "+".join(split_call) + return "+".join(var) + "/" + "+".join(sorted(split_call, reverse=True)) if var == []: if called_stars == "*10_*10_*10_*36": - return "*10x2/*10+*36" - if called_stars == "*4A_*4A_*4A_*4N": - return "*4Ax2/*4A+*4N" + return "*10x2/*36+*10" + if called_stars == "*4_*4_*4_*4.013": + return "*4x2/*4.013+*4" if ( cnvcall == "exon9hyb_exon9hyb" or cnvcall == "exon9hyb_exon9hyb_exon9hyb" or cnvcall == "exon9hyb_exon9hyb_exon9hyb_exon9hyb" ): - if called_stars == "*4A_*4A_*4N_*4N": - return "*4A+*4N/*4A+*4N" + if called_stars == "*4_*4_*4.013_*4.013": + return "*4.013+*4/*4.013+*4" if called_stars == "*10_*10_*36_*36": - return "*10+*36/*10+*36" + return "*36+*10/*36+*10" if called_stars == "*10_*36_*36_*36": - return "*10+*36/*36+*36" + return "*36+*10/*36+*36" if called_stars == "*10_*10_*36_*36_*36": - return "*10+*36/*10+*36+*36" + return "*36+*10/*36+*36+*10" if called_stars == "*10_*10_*36_*36_*36_*36": - return "*10+*36+*36/*10+*36+*36" + return "*36+*36+*10/*36+*36+*10" if ( cnvcall == "exon9hyb_exon9hyb_exon9hyb" and "*10" in split_call @@ -343,43 +345,44 @@ def get_final_call_clean(final_call, cnvcall, spacer_cn): split_call.remove("*36") split_call.remove("*36") split_call.remove("*83") - return split_call[0] + "/*10+*36+*36+*83" + return split_call[0] + "/*36+*36+*83+*10" var = [a for a in split_call if a not in ["*10", "*36", "*83"]] if len(var) == 1: split_call.remove(var[0]) - return var[0] + "/" + "+".join(split_call) + return var[0] + "/" + "+".join(sorted(split_call, reverse=True)) if "star68" in cnvcall: cn = cnvcall.split("_").count("star68") if len(set(cnvcall.split("_"))) == 1: - if "*4A" in split_call: - var = [a for a in split_call if a != "*4A"] + if "*4" in split_call: + var = [a for a in split_call if a != "*4"] if len(var) == 1: - genotype = var[0] + "/*4A" + genotype = var[0] + "/" for _ in range(cn): - genotype += "+*68" + genotype += "*68+" + genotype += "*4" return genotype - elif split_call == ["*4A", "*4A"]: + elif split_call == ["*4", "*4"]: if cn == 2: - return "*4A+*68/*4A+*68" + return "*68+*4/*68+*4" elif cn == 3: - return "*4A+*68/*4A+*68+*68" + return "*68+*4/*68+*68+*4" elif cn == 4: - return "*4A+*68+*68/*4A+*68+*68" + return "*68+*68+*4/*68+*68+*4" if cnvcall == "star68": - if split_call[0] in ["*4", "*4A"]: - return split_call[1] + "/*4A+*68" - if split_call[1] in ["*4", "*4A"]: - return split_call[0] + "/*4A+*68" + if split_call[0] == "*4": + return split_call[1] + "/*68+*4" + if split_call[1] == "*4": + return split_call[0] + "/*68+*4" if cnvcall == "dup_star68": - var = [a for a in split_call if a not in ["*4A", "*68"]] + var = [a for a in split_call if a not in ["*4", "*68"]] if len(var) == 2 and len(set(var)) == 1: - return var[0] + "x2/*4A+*68" - if var == [] and called_stars == "*4A_*4A_*4A": - return "*4Ax2/*4A+*68" + return var[0] + "x2/*68+*4" + if var == [] and called_stars == "*4_*4_*4": + return "*4x2/*68+*4" if cnvcall == "exon9hyb_star68": - if called_stars == "*4A_*4A_*4N": - return "*4A+*4N/*4A+*68" + if called_stars == "*4_*4_*4.013": + return "*4.013+*4/*68+*4" for _ in range(cn): split_call.append("*68") return "_".join(split_call) @@ -387,26 +390,6 @@ def get_final_call_clean(final_call, cnvcall, spacer_cn): return called_stars -def convert_to_main_allele(final_call): - """ - Convert suballeles to main alleles in final call - """ - if final_call is None: - return None - suballeles = [] - for index, element in enumerate(final_call): - if element.isalpha() and element != "x": - for i in range(4): - if final_call[index - i] == "*": - main_allele = final_call[index - i : index] - break - if main_allele + element not in KEPT_SUBALLELES: - suballeles.append(main_allele + element) - for suballele in suballeles: - final_call = final_call.replace(suballele, suballele[:-1]) - return final_call - - def update_variants(var_observed, cnvcall, exon9): """ Update variants based on called CNV. @@ -418,6 +401,13 @@ def update_variants(var_observed, cnvcall, exon9): ): var_observed.append("g.42129819G>T") + # g.42127556T>C is included in g.42127565T>C definition for *108. + if "g.42127565T>C" in var_observed: + for _ in range( + var_observed.count("g.42127565T>C") - var_observed.count("g.42127556T>C") + ): + var_observed.append("g.42127556T>C") + # g.42126611C>G is in the D6 part of the hybrid gene. if "star13" in cnvcall and "intron1" not in cnvcall: if "g.42126611C>G" in var_observed: @@ -432,16 +422,23 @@ def update_variants(var_observed, cnvcall, exon9): var_observed.append("g.42126611C>G") # Add these variants if they are not called to the sufficient copy number. # These variants belong to *10 or *4 - e9hyb_variant = [ - "g.42129754G>A", - "g.42130692G>A", - "g.42128945C>T", - "g.42129809T>C", - "g.42129819G>T", - ] - for var_to_add in e9hyb_variant: + for var_to_add in ["g.42130692G>A"]: if var_to_add in var_observed and var_observed.count(var_to_add) <= cn: var_observed.append(var_to_add) + for var_to_add in ["g.42129754G>A"]: + if ( + var_to_add in var_observed + and var_observed.count(var_to_add) <= cn + and "g.42128945C>T" not in var_observed + ): + var_observed.append(var_to_add) + for var_to_add in ["g.42128945C>T", "g.42129809T>C", "g.42129819G>T"]: + if ( + var_to_add in var_observed + and var_observed.count(var_to_add) <= cn + and "g.42129754G>A" not in var_observed + ): + var_observed.append(var_to_add) exon9_values = namedtuple( "exon9_values", "exon9_cn exon9cn_in_consensus exon9_raw_site1 exon9_raw_site2" @@ -471,7 +468,15 @@ def update_variants(var_observed, cnvcall, exon9): return var_observed -def match_star(var_observed, cnvcall, spacer_cn, star_combinations, exon9): +def match_star( + var_observed, + cnvcall, + spacer_cn, + star_combinations, + exon9, + var42126938_G_haplotype, + var42127803_diff_haplotype, +): """ Return the star allele call based on the called cnv/hybrid group and small variants """ @@ -508,7 +513,6 @@ def match_star(var_observed, cnvcall, spacer_cn, star_combinations, exon9): matchtag_new = matched_calls[0] final_call = matchtag_new.star_call final_call_clean = get_final_call_clean(final_call, cnvcall, spacer_cn) - final_call_clean = convert_to_main_allele(final_call_clean) call_info = matchtag_new.call_info raw_call = matchtag_new.candidate return star_call( @@ -517,8 +521,28 @@ def match_star(var_observed, cnvcall, spacer_cn, star_combinations, exon9): final_call = matchtag.star_call final_call_clean = get_final_call_clean(final_call, cnvcall, spacer_cn) - final_call_clean = convert_to_main_allele(final_call_clean) call_info = matchtag.call_info + if call_info == "more_than_one_match" and cnvcall == "cn2": + if sorted(re.split(r"[;/]+", final_call_clean)) == [ + "*1", + "*27", + "*32", + "*41", + ]: + if var42126938_G_haplotype: + final_call_clean = "*1/*32" + else: + final_call_clean = "*27/*41" + if sorted(re.split(r"[;/]+", final_call_clean)) == [ + "*1", + "*119", + "*2", + "*41", + ]: + if var42127803_diff_haplotype: + final_call_clean = "*119/*2" + else: + final_call_clean = "*1/*41" raw_call = matchtag.candidate return star_call(call_info, " ".join(var_observed), raw_call, final_call_clean) @@ -527,7 +551,6 @@ def match_star(var_observed, cnvcall, spacer_cn, star_combinations, exon9): matchtag = call_star68(var_observed, cnvcall, dic) final_call = matchtag.star_call final_call_clean = get_final_call_clean(final_call, cnvcall, spacer_cn) - final_call_clean = convert_to_main_allele(final_call_clean) call_info = matchtag.call_info raw_call = matchtag.candidate return star_call( diff --git a/caller/tests/test_call_cn.py b/caller/tests/test_call_variants.py similarity index 93% rename from caller/tests/test_call_cn.py rename to caller/tests/test_call_variants.py index 55dedb9..1925037 100644 --- a/caller/tests/test_call_cn.py +++ b/caller/tests/test_call_variants.py @@ -24,10 +24,10 @@ import pysam -from ..call_cn import ( +from ..call_variants import ( process_raw_call_gc, process_raw_call_denovo, - get_allele_counts_42128936, + get_allele_counts_var42128936, call_exon9gc, get_called_variants, ) @@ -40,7 +40,9 @@ class TestCallCN(object): def test_call_42128936(self): bam = pysam.AlignmentFile(os.path.join(test_data_dir, "NA23275.bam"), "rb") - ref_read, long_ins_read, short_ins_read = get_allele_counts_42128936(bam, "37") + ref_read, long_ins_read, short_ins_read = get_allele_counts_var42128936( + bam, "37" + ) assert long_ins_read == 6 def test_get_called_variants(self): diff --git a/caller/tests/test_match_star.py b/caller/tests/test_match_star.py index b97fc2a..e361cd4 100644 --- a/caller/tests/test_match_star.py +++ b/caller/tests/test_match_star.py @@ -25,7 +25,7 @@ from ..construct_star_table import get_hap_table from ..match_star_allele import ( - check_name, + convert_to_main_allele, get_final_call_clean, CNVTAG_TO_GENOTYPE, get_dic, @@ -40,7 +40,6 @@ def test_accepted_cnv(self): star_table = os.path.join( os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "data", - "full_star_table", "star_table.txt", ) star_combinations = get_hap_table(star_table) @@ -51,22 +50,22 @@ def test_accepted_cnv(self): def test_check_name(self): var_called = ["*1_*2"] - main_allele = check_name(var_called) - assert list(main_allele) == ["*1_*2"] + main_allele = convert_to_main_allele(var_called) + assert main_allele == ["*1_*2"] - var_called = ["*1_*4A"] - main_allele = check_name(var_called) - assert list(main_allele) == ["*1_*4"] + var_called = ["*1_*4"] + main_allele = convert_to_main_allele(var_called) + assert main_allele == ["*1_*4"] - var_called = ["*1_*4A", "*1_*2"] - main_allele = check_name(var_called) + var_called = ["*1_*4", "*1_*2"] + main_allele = convert_to_main_allele(var_called) assert len(main_allele) == 2 assert "*1_*2" in main_allele assert "*1_*4" in main_allele - var_called = ["*1_*4A", "*1_*4D"] - main_allele = check_name(var_called) - assert list(main_allele) == ["*1_*4"] + var_called = ["*1_*4", "*1_*4.009"] + main_allele = convert_to_main_allele(var_called) + assert main_allele == ["*1_*4"] def test_clean_call(self): cnvcall = "star5" @@ -104,12 +103,12 @@ def test_clean_call(self): spacer_cn = 3 final_call = ["*10_*10"] clean_call = get_final_call_clean(final_call, cnvcall, spacer_cn) - assert clean_call == "*5/*10+*36" + assert clean_call == "*5/*36+*10" spacer_cn = 4 final_call = ["*10_*10"] clean_call = get_final_call_clean(final_call, cnvcall, spacer_cn) - assert clean_call == "*5/*10+*36" + assert clean_call == "*5/*36+*10" cnvcall = "star13" spacer_cn = None @@ -129,7 +128,7 @@ def test_clean_call(self): cnvcall = "dup_star13intron1" final_call = ["*1_*2_*2"] clean_call = get_final_call_clean(final_call, cnvcall, spacer_cn) - assert clean_call == "*2+*13/*1" + assert clean_call == "*13+*2/*1" cnvcall = "dup_star13" final_call = ["*1_*2"] @@ -169,74 +168,74 @@ def test_clean_call(self): cnvcall = "exon9hyb" final_call = ["*10_*10_*36"] clean_call = get_final_call_clean(final_call, cnvcall, spacer_cn) - assert clean_call == "*10/*10+*36" + assert clean_call == "*10/*36+*10" - final_call = ["*1_*4A_*4N"] + final_call = ["*1_*4_*4.013"] clean_call = get_final_call_clean(final_call, cnvcall, spacer_cn) - assert clean_call == "*1/*4A+*4N" + assert clean_call == "*1/*4.013+*4" cnvcall = "exon9hyb_exon9hyb" final_call = ["*10_*10_*36_*36"] clean_call = get_final_call_clean(final_call, cnvcall, spacer_cn) - assert clean_call == "*10+*36/*10+*36" + assert clean_call == "*36+*10/*36+*10" final_call = ["*1_*10_*36_*36"] clean_call = get_final_call_clean(final_call, cnvcall, spacer_cn) - assert clean_call == "*1/*10+*36+*36" + assert clean_call == "*1/*36+*36+*10" cnvcall = "exon9hyb_exon9hyb_exon9hyb" final_call = ["*10_*10_*36_*36_*36"] clean_call = get_final_call_clean(final_call, cnvcall, spacer_cn) - assert clean_call == "*10+*36/*10+*36+*36" + assert clean_call == "*36+*10/*36+*36+*10" final_call = ["*1_*10_*36_*36_*36"] clean_call = get_final_call_clean(final_call, cnvcall, spacer_cn) - assert clean_call == "*1/*10+*36+*36+*36" + assert clean_call == "*1/*36+*36+*36+*10" cnvcall = "exon9hyb_exon9hyb_exon9hyb_exon9hyb" final_call = ["*10_*10_*36_*36_*36_*36"] clean_call = get_final_call_clean(final_call, cnvcall, spacer_cn) - assert clean_call == "*10+*36+*36/*10+*36+*36" + assert clean_call == "*36+*36+*10/*36+*36+*10" final_call = ["*1_*10_*36_*36_*36_*36"] clean_call = get_final_call_clean(final_call, cnvcall, spacer_cn) - assert clean_call == "*1/*10+*36+*36+*36+*36" + assert clean_call == "*1/*36+*36+*36+*36+*10" cnvcall = "star5_star68" - final_call = ["*4A"] + final_call = ["*4"] clean_call = get_final_call_clean(final_call, cnvcall, spacer_cn) - assert clean_call == "*5/*4A+*68" + assert clean_call == "*5/*68+*4" final_call = ["*10"] clean_call = get_final_call_clean(final_call, cnvcall, spacer_cn) assert clean_call == "*68/*10" cnvcall = "star68" - final_call = ["*4A_*40"] + final_call = ["*4_*40"] clean_call = get_final_call_clean(final_call, cnvcall, spacer_cn) - assert clean_call == "*40/*4A+*68" + assert clean_call == "*40/*68+*4" final_call = ["*10_*40"] clean_call = get_final_call_clean(final_call, cnvcall, spacer_cn) assert clean_call == "*10_*40_*68" cnvcall = "star68_star68" - final_call = ["*4A_*4A"] + final_call = ["*4_*4"] clean_call = get_final_call_clean(final_call, cnvcall, spacer_cn) - assert clean_call == "*4A+*68/*4A+*68" + assert clean_call == "*68+*4/*68+*4" final_call = ["*10_*40"] clean_call = get_final_call_clean(final_call, cnvcall, spacer_cn) assert clean_call == "*10_*40_*68_*68" cnvcall = "star68_star68_star68" - final_call = ["*4A_*4A"] + final_call = ["*4_*4"] clean_call = get_final_call_clean(final_call, cnvcall, spacer_cn) - assert clean_call == "*4A+*68/*4A+*68+*68" + assert clean_call == "*68+*4/*68+*68+*4" - final_call = ["*2A_*4A"] + final_call = ["*2_*4"] clean_call = get_final_call_clean(final_call, cnvcall, spacer_cn) - assert clean_call == "*2A/*4A+*68+*68+*68" + assert clean_call == "*2/*68+*68+*68+*4" final_call = ["*10_*40"] clean_call = get_final_call_clean(final_call, cnvcall, spacer_cn) diff --git a/data/CYP2D6_SNP_19.txt b/data/CYP2D6_SNP_19.txt index 39037b1..a9d9827 100644 --- a/data/CYP2D6_SNP_19.txt +++ b/data/CYP2D6_SNP_19.txt @@ -6,10 +6,8 @@ chr22 42521951 a 42534107 g downstream_exon9 chr22 42521994 a 42535713 g downstream_exon9 chr22 42522027 c 42535746 t downstream_exon9 chr22 42522106 g 42535818 a downstream_exon9 -chr22 42522613 GTCACCAGGAAAGCAAA,CTCACCAGGAAAGCAAA 42536326 GTCACCAGAAAGCTGAC exon9 +chr22 42522613 GTCACCAGGAAAGCAA,CTCACCAGGAAAGCAA 42536326 GTCACCAGAAAGCTGA exon9 chr22 42522660 AGTGGGCACC 42536373 GGCGGCCACG exon9 -chr22 42522613 GTCACCAGGAAA 42536326 CTCACCAGGAAA g.42126938_haplotype -chr22 42522940 c 42536653 t g.42126938C>T chr22 42523636 c 42537349 a exon7 chr22 42523709 a 42537409 g intron6 chr22 42523720 g 42537420 a intron6 diff --git a/data/CYP2D6_SNP_37.txt b/data/CYP2D6_SNP_37.txt index eb10dfe..5b33506 100644 --- a/data/CYP2D6_SNP_37.txt +++ b/data/CYP2D6_SNP_37.txt @@ -6,10 +6,8 @@ 22 42521994 a 42535713 g downstream_exon9 22 42522027 c 42535746 t downstream_exon9 22 42522106 g 42535818 a downstream_exon9 -22 42522613 GTCACCAGGAAAGCAAA,CTCACCAGGAAAGCAAA 42536326 GTCACCAGAAAGCTGAC exon9 +22 42522613 GTCACCAGGAAAGCAA,CTCACCAGGAAAGCAA 42536326 GTCACCAGAAAGCTGA exon9 22 42522660 AGTGGGCACC 42536373 GGCGGCCACG exon9 -22 42522613 GTCACCAGGAAA 42536326 CTCACCAGGAAA g.42126938_haplotype -22 42522940 c 42536653 t g.42126938C>T 22 42523636 c 42537349 a exon7 22 42523709 a 42537409 g intron6 22 42523720 g 42537420 a intron6 diff --git a/data/CYP2D6_SNP_38.txt b/data/CYP2D6_SNP_38.txt index 3ec615e..c99b91e 100644 --- a/data/CYP2D6_SNP_38.txt +++ b/data/CYP2D6_SNP_38.txt @@ -6,10 +6,8 @@ chr22 42125946 a 42138098 g downstream_exon9 chr22 42125989 a 42139702 g downstream_exon9 chr22 42126022 c 42139735 t downstream_exon9 chr22 42126101 g 42139807 a downstream_exon9 -chr22 42126611 GTCACCAGGAAAGCAAA,CTCACCAGGAAAGCAAA 42140315 GTCACCAGAAAGCTGAC exon9 +chr22 42126611 GTCACCAGGAAAGCAA,CTCACCAGGAAAGCAA 42140315 GTCACCAGAAAGCTGA exon9 chr22 42126658 AGTGGGCACC 42140362 GGCGGCCACG exon9 -chr22 42126611 GTCACCAGGAAA 42140315 CTCACCAGGAAA g.42126938_haplotype -chr22 42126938 c 42140642 t g.42126938C>T chr22 42127634 c 42141339 a exon7 chr22 42127707 a 42141399 g intron6 chr22 42127718 g 42141410 a intron6 diff --git a/data/CYP2D6_haplotype_19.txt b/data/CYP2D6_haplotype_19.txt new file mode 100644 index 0000000..5ffab0c --- /dev/null +++ b/data/CYP2D6_haplotype_19.txt @@ -0,0 +1,9 @@ +#chr pos_CYP2D6 CYP2D6_base pos_CYP2D7 CYP2D7_or_variant_base annotation +chr22 42522660 AGTGGGCACC 42536373 GGCGGCCACG g.42126938C>T +chr22 42522613 GTCACCAGGAAA 42536326 CTCACCAGGAAA g.42126938C>T +chr22 42522940 c 42536653 t g.42126938C>T +chr22 42523805 C 42537505 T g.42127803C>T +chr22 42523943 G 42537643 A g.42127803C>T +chr22 42523528 C 42537241 T g.42127526C>T_g.42127556T>C +chr22 42523558 T 42537271 C g.42127526C>T_g.42127556T>C +chr22 42523813 g 42537513 a g.42127526C>T_g.42127556T>C diff --git a/data/CYP2D6_haplotype_37.txt b/data/CYP2D6_haplotype_37.txt new file mode 100644 index 0000000..0a027fb --- /dev/null +++ b/data/CYP2D6_haplotype_37.txt @@ -0,0 +1,9 @@ +#chr pos_CYP2D6 CYP2D6_base pos_CYP2D7 CYP2D7_or_variant_base annotation +22 42522660 AGTGGGCACC 42536373 GGCGGCCACG g.42126938C>T +22 42522613 GTCACCAGGAAA 42536326 CTCACCAGGAAA g.42126938C>T +22 42522940 c 42536653 t g.42126938C>T +22 42523805 C 42537505 T g.42127803C>T +22 42523943 G 42537643 A g.42127803C>T +22 42523528 C 42537241 T g.42127526C>T_g.42127556T>C +22 42523558 T 42537271 C g.42127526C>T_g.42127556T>C +22 42523813 g 42537513 a g.42127526C>T_g.42127556T>C diff --git a/data/CYP2D6_haplotype_38.txt b/data/CYP2D6_haplotype_38.txt new file mode 100644 index 0000000..0a65b4f --- /dev/null +++ b/data/CYP2D6_haplotype_38.txt @@ -0,0 +1,9 @@ +#chr pos_CYP2D6 CYP2D6_base pos_CYP2D7 CYP2D7_or_variant_base annotation +chr22 42126658 AGTGGGCACC 42140362 GGCGGCCACG g.42126938C>T +chr22 42126611 GTCACCAGGAAA 42140315 CTCACCAGGAAA g.42126938C>T +chr22 42126938 c 42140642 t g.42126938C>T +chr22 42127803 C 42141495 T g.42127803C>T +chr22 42127941 G 42141633 A g.42127803C>T +chr22 42127526 C 42141231 T g.42127526C>T_g.42127556T>C +chr22 42127556 T 42141261 C g.42127526C>T_g.42127556T>C +chr22 42127811 g 42141503 a g.42127526C>T_g.42127556T>C diff --git a/data/include_new_star_table/CYP2D6_target_variant_19.txt b/data/CYP2D6_target_variant_19.txt similarity index 96% rename from data/include_new_star_table/CYP2D6_target_variant_19.txt rename to data/CYP2D6_target_variant_19.txt index 9a9dd32..55769c8 100644 --- a/data/include_new_star_table/CYP2D6_target_variant_19.txt +++ b/data/CYP2D6_target_variant_19.txt @@ -1,7 +1,10 @@ #chr pos_CYP2D6 base_ALT pos_CYP2D7 base_REF variant_type variant_name chr22 42522613 GTCACCAGGAAA 42536326 CTCACCAGGAAA gene_conversion g.42126611C>G +chr22 42522879 A 42536592 G snp g.42126877G>A +chr22 42522928 A 42536641 G snp g.42126926G>A chr22 42523459 T 42537172 C snp g.42127457C>T chr22 42523475 T 42537188 C snp g.42127473C>T +chr22 42523505 CCAGGGGGAT 42537218 CCAGGGGGAC snp g.42127512C>T chr22 42523516 G 42537229 A snp g.42127514A>G chr22 42523525 G 42537238 A snp g.42127523A>G chr22 42523535 CACC 42537248 CC indel g.42127533-42127534insAC @@ -40,6 +43,7 @@ chr22 42524243 CGT 42537945 CTGT indel g.42128242delT chr22 42524250 CAGCA 42537952 CAGTT indel g.42128249delAGTT chr22 42524274 G 42537976 T snp g.42128272T>G chr22 42524310 A 42538012 C snp g.42128308C>A +chr22 42524323 GTGCG 42538025 ATGCA snp g.42128325A>G chr22 42524797 AC 42538491 AGC indel g.42128796delG chr22 42524816 TCCCCT 42538510 TCCCT indel g.42128817-42128818insC chr22 42524817 T 42538511 C snp g.42128815C>T diff --git a/data/include_new_star_table/CYP2D6_target_variant_37.txt b/data/CYP2D6_target_variant_37.txt similarity index 96% rename from data/include_new_star_table/CYP2D6_target_variant_37.txt rename to data/CYP2D6_target_variant_37.txt index 4915ac2..6eeb8b5 100644 --- a/data/include_new_star_table/CYP2D6_target_variant_37.txt +++ b/data/CYP2D6_target_variant_37.txt @@ -1,7 +1,10 @@ #chr pos_CYP2D6 base_ALT pos_CYP2D7 base_REF variant_type variant_name 22 42522613 GTCACCAGGAAA 42536326 CTCACCAGGAAA gene_conversion g.42126611C>G +22 42522879 A 42536592 G snp g.42126877G>A +22 42522928 A 42536641 G snp g.42126926G>A 22 42523459 T 42537172 C snp g.42127457C>T 22 42523475 T 42537188 C snp g.42127473C>T +22 42523505 CCAGGGGGAT 42537218 CCAGGGGGAC snp g.42127512C>T 22 42523516 G 42537229 A snp g.42127514A>G 22 42523525 G 42537238 A snp g.42127523A>G 22 42523535 CACC 42537248 CC indel g.42127533-42127534insAC @@ -40,6 +43,7 @@ 22 42524250 CAGCA 42537952 CAGTT indel g.42128249delAGTT 22 42524274 G 42537976 T snp g.42128272T>G 22 42524310 A 42538012 C snp g.42128308C>A +22 42524323 GTGCG 42538025 ATGCA snp g.42128325A>G 22 42524797 AC 42538491 AGC indel g.42128796delG 22 42524816 TCCCCT 42538510 TCCCT indel g.42128817-42128818insC 22 42524817 T 42538511 C snp g.42128815C>T diff --git a/data/include_new_star_table/CYP2D6_target_variant_38.txt b/data/CYP2D6_target_variant_38.txt similarity index 96% rename from data/include_new_star_table/CYP2D6_target_variant_38.txt rename to data/CYP2D6_target_variant_38.txt index 4e2c06d..d7e646d 100644 --- a/data/include_new_star_table/CYP2D6_target_variant_38.txt +++ b/data/CYP2D6_target_variant_38.txt @@ -1,7 +1,10 @@ #chr pos_CYP2D6 base_ALT pos_CYP2D7 base_REF variant_type variant_name chr22 42126611 GTCACCAGGAAA 42140315 CTCACCAGGAAA gene_conversion g.42126611C>G +chr22 42126877 A 42140581 G snp g.42126877G>A +chr22 42126926 A 42140630 G snp g.42126926G>A chr22 42127457 T 42141162 C snp g.42127457C>T chr22 42127473 T 42141178 C snp g.42127473C>T +chr22 42127503 CCAGGGGGAT 42141208 CCAGGGGGAC snp g.42127512C>T chr22 42127514 G 42141219 A snp g.42127514A>G chr22 42127523 G 42141228 A snp g.42127523A>G chr22 42127533 CACC 42141238 CC indel g.42127533-42127534insAC @@ -40,6 +43,7 @@ chr22 42128241 CGT 42141935 CTGT indel g.42128242delT chr22 42128248 CAGCA 42141942 CAGTT indel g.42128249delAGTT chr22 42128272 G 42141966 T snp g.42128272T>G chr22 42128308 A 42142002 C snp g.42128308C>A +chr22 42128321 GTGCG 42142015 ATGCA snp g.42128325A>G chr22 42128795 AC 42142490 AGC indel g.42128796delG chr22 42128814 TCCCCT 42142509 TCCCT indel g.42128817-42128818insC chr22 42128815 T 42142510 C snp g.42128815C>T diff --git a/data/include_new_star_table/CYP2D6_target_variant_homology_region_19.txt b/data/CYP2D6_target_variant_homology_region_19.txt similarity index 94% rename from data/include_new_star_table/CYP2D6_target_variant_homology_region_19.txt rename to data/CYP2D6_target_variant_homology_region_19.txt index 353c384..b73293b 100644 --- a/data/include_new_star_table/CYP2D6_target_variant_homology_region_19.txt +++ b/data/CYP2D6_target_variant_homology_region_19.txt @@ -17,7 +17,7 @@ chr22 42522906 G 42536619 A snp g.42126904A>G chr22 42522916 G 42536629 c,t snp_multi g.42126914C>G chr22 42522916 T 42536629 c,g snp_multi g.42126914C>T chr22 42522958 G 42536671 T snp g.42126956T>G -chr22 42522982 TGAGAGT 42536695 TGAGT snp g.42126981_42126982insGA +chr22 42522982 TGAGAGT 42536695 TGAGT snp g.42126981-42126982insGA chr22 42525889 G 42539560 A snp g.42129887A>G chr22 42525908 A 42539579 G snp g.42129906G>A chr22 42525912 G 42539583 C snp g.42129910C>G diff --git a/data/include_new_star_table/CYP2D6_target_variant_homology_region_37.txt b/data/CYP2D6_target_variant_homology_region_37.txt similarity index 94% rename from data/include_new_star_table/CYP2D6_target_variant_homology_region_37.txt rename to data/CYP2D6_target_variant_homology_region_37.txt index f4f4b97..9d3e7f8 100644 --- a/data/include_new_star_table/CYP2D6_target_variant_homology_region_37.txt +++ b/data/CYP2D6_target_variant_homology_region_37.txt @@ -17,7 +17,7 @@ 22 42522916 G 42536629 c,t snp_multi g.42126914C>G 22 42522916 T 42536629 c,g snp_multi g.42126914C>T 22 42522958 G 42536671 T snp g.42126956T>G -22 42522982 TGAGAGT 42536695 TGAGT snp g.42126981_42126982insGA +22 42522982 TGAGAGT 42536695 TGAGT snp g.42126981-42126982insGA 22 42525889 G 42539560 A snp g.42129887A>G 22 42525908 A 42539579 G snp g.42129906G>A 22 42525912 G 42539583 C snp g.42129910C>G diff --git a/data/include_new_star_table/CYP2D6_target_variant_homology_region_38.txt b/data/CYP2D6_target_variant_homology_region_38.txt similarity index 94% rename from data/include_new_star_table/CYP2D6_target_variant_homology_region_38.txt rename to data/CYP2D6_target_variant_homology_region_38.txt index 08408bf..f25339f 100644 --- a/data/include_new_star_table/CYP2D6_target_variant_homology_region_38.txt +++ b/data/CYP2D6_target_variant_homology_region_38.txt @@ -17,7 +17,7 @@ chr22 42126904 G 42140608 A snp g.42126904A>G chr22 42126914 G 42140618 c,t snp_multi g.42126914C>G chr22 42126914 T 42140618 c,g snp_multi g.42126914C>T chr22 42126956 G 42140660 T snp g.42126956T>G -chr22 42126980 TGAGAGT 42140684 TGAGT snp g.42126981_42126982insGA +chr22 42126980 TGAGAGT 42140684 TGAGT snp g.42126981-42126982insGA chr22 42129887 G 42143559 A snp g.42129887A>G chr22 42129906 A 42143578 G snp g.42129906G>A chr22 42129910 G 42143582 C snp g.42129910C>G diff --git a/data/full_star_table/CYP2D6_target_variant_19.txt b/data/full_star_table/CYP2D6_target_variant_19.txt deleted file mode 100644 index 1e3119a..0000000 --- a/data/full_star_table/CYP2D6_target_variant_19.txt +++ /dev/null @@ -1,80 +0,0 @@ -#chr pos_CYP2D6 base_ALT pos_CYP2D7 base_REF variant_type variant_name -chr22 42522613 GTCACCAGGAAA 42536326 CTCACCAGGAAA gene_conversion g.42126611C>G -chr22 42523459 T 42537172 C snp g.42127457C>T -chr22 42523475 T 42537188 C snp g.42127473C>T -chr22 42523516 G 42537229 A snp g.42127514A>G -chr22 42523525 G 42537238 A snp g.42127523A>G -chr22 42523535 CACC 42537248 CC indel g.42127533-42127534insAC -chr22 42523539 ATGAATCACGGCAGTGGTGCAGGGCATGC 42537252 ATGAATCACGGCAGTGGTGTAGGGCATGT,ATGAATCACGGCAGTGGTGTAGGGCATGC,ATGAATCACGGCAGTGGTGCAGGGCATGT snp g.42127565T>C -chr22 42523592 A 42537305 G snp g.42127590G>A -chr22 42523595 C 42537308 G snp g.42127593G>C -chr22 42523610 T 42537323 C snp g.42127608C>T -chr22 42523612 C 42537325 T snp g.42127610T>C -chr22 42523621 G 42537334 T snp g.42127619T>G -chr22 42523805 T 42537505 C snp g.42127803C>T -chr22 42523843 G 42537543 C snp g.42127841C>G -chr22 42523847 GAT 42537547 GCAC indel g.42127846delCACATCCGGATGTAGGATC -chr22 42523854 T 42537554 C snp g.42127852C>T -chr22 42523855 A 42537555 G gene_conversion g.42127853G>A -chr22 42523858 G 42537558 T snp g.42127856T>G -chr22 42523940 G 42537640 T snp g.42127938T>G -chr22 42523943 A 42537643 G gene_conversion g.42127941G>A -chr22 42523964 TC 42537664 TGC indel g.42127963delG -chr22 42524175 CCTCCA 42537877 CCTTCTCC indel g.42128174delCTT -chr22 42524183 T 42537885 A gene_conversion g.42128181A>T -chr22 42524187 T 42537889 C gene_conversion g.42128185C>T -chr22 42524202 CAGGT 42537904 CAGTCA indel g.42128201delAGTC -chr22 42524213 CGGGGGGGGC 42537915 CGGGGGGGC,CGGGGTGGC indel g.42128218-42128219insG -chr22 42524214 A 42537916 G snp g.42128212G>A -chr22 42524217 A 42537919 G snp g.42128215G>A -chr22 42524219 T 42537921 G snp g.42128217G>T -chr22 42524237 A 42537939 G snp g.42128235G>A -chr22 42524243 CGT 42537945 CTGT indel g.42128242delT -chr22 42524250 CAGCA 42537952 CAGTT indel g.42128249delAGTT -chr22 42524274 G 42537976 T snp g.42128272T>G -chr22 42524310 A 42538012 C snp g.42128308C>A -chr22 42524797 AC 42538491 AGC indel g.42128796delG -chr22 42524816 TCCCCT 42538510 TCCCT indel g.42128817-42128818insC -chr22 42524817 T 42538511 C snp g.42128815C>T -chr22 42524850 T 42538544 C snp g.42128848C>T -chr22 42524905 TTAC 42538599 TC indel g.42128903-42128904insTA -chr22 42524929 tGGGGCGAAAGGGGCGAAAGGGGCGAAAGGGGCGt 42538623 tGGGGCGAAAGGGGCGt,tGGGGCGAAAGGGGCGAAAGGGGCGt indel_multi g.42128936-42128937insGGGGCGAAAGGGGCGAAA -chr22 42524929 tGGGGCGAAAGGGGCGAAAGGGGCGt 42538623 tGGGGCGAAAGGGGCGt,tGGGGCGAAAGGGGCGAAAGGGGCGAAAGGGGCGt indel_multi g.42128936-42128937insGGGGCGAAA -chr22 42524935 A 42538629 G snp g.42128933G>A -chr22 42524947 T 42538641 C snp g.42128945C>T -chr22 42525035 A 42538729 c,t snp_multi g.42129033C>A -chr22 42525035 T 42538729 c,a snp_multi g.42129033C>T -chr22 42525044 C 42538738 T gene_conversion g.42129042T>C -chr22 42525058 G 42538752 C snp g.42129056C>G -chr22 42525073 A 42538767 t,g snp_multi g.42129071T>A -chr22 42525073 G 42538767 t,a snp_multi g.42129071T>G -chr22 42525077 T 42538771 C snp g.42129075C>T -chr22 42525085 CCT 42538779 CACT indel g.42129084delA -chr22 42525089 C 42538783 G snp g.42129087G>C -chr22 42525100 C 42538794 T snp g.42129098T>C -chr22 42525115 G 42538809 A snp g.42129113A>G -chr22 42525134 T 42538828 C snp g.42129132C>T -chr22 42525176 A 42538870 C gene_conversion g.42129174C>A -chr22 42525182 T 42538876 A snp g.42129180A>T -chr22 42525185 T 42538879 C snp g.42129183C>T -chr22 42525756 A 42539427 G snp g.42129754G>A -chr22 42525761 T 42539432 C snp g.42129759C>T -chr22 42525767 CCTGGTA 42539438 TCTGGGT,TCTGGAT exon2_conversion g.42129765T>C -chr22 42525772 A 42539443 g,t snp_multi g.42129770G>A -chr22 42525781 G 42539452 A exon2_conversion g.42129779A>G -chr22 42525782 T 42539453 C snp g.42129780C>T -chr22 42525811 C 42539482 T gene_conversion g.42129809T>C -chr22 42525821 T 42539492 G gene_conversion g.42129819G>T -chr22 42525823 A 42539494 G snp g.42129821G>A -chr22 42525829 G 42539500 C snp g.42129827C>G -chr22 42525838 A 42539509 G snp g.42129836G>A -chr22 42526656 CAAG 42540327 CAG gene_conversion g.42130655-42130656insA -chr22 42526669 T 42540341 C snp g.42130667C>T -chr22 42526670 T 42540342 C snp g.42130668C>T -chr22 42526694 A 42540366 G snp g.42130692G>A -chr22 42526712 A 42540384 G snp g.42130710G>A -chr22 42526717 T 42540389 C gene_conversion g.42130715C>T -chr22 42526721 A 42540393 G snp g.42130719G>A -chr22 42526763 T 42540435 C gene_conversion g.42130761C>T -chr22 42526775 T 42540447 C snp g.42130773C>T -chr22 42526780 A 42540452 G snp g.42130778G>A diff --git a/data/full_star_table/CYP2D6_target_variant_37.txt b/data/full_star_table/CYP2D6_target_variant_37.txt deleted file mode 100644 index 46e54a9..0000000 --- a/data/full_star_table/CYP2D6_target_variant_37.txt +++ /dev/null @@ -1,80 +0,0 @@ -#chr pos_CYP2D6 base_ALT pos_CYP2D7 base_REF variant_type variant_name -22 42522613 GTCACCAGGAAA 42536326 CTCACCAGGAAA gene_conversion g.42126611C>G -22 42523459 T 42537172 C snp g.42127457C>T -22 42523475 T 42537188 C snp g.42127473C>T -22 42523516 G 42537229 A snp g.42127514A>G -22 42523525 G 42537238 A snp g.42127523A>G -22 42523535 CACC 42537248 CC indel g.42127533-42127534insAC -22 42523539 ATGAATCACGGCAGTGGTGCAGGGCATGC 42537252 ATGAATCACGGCAGTGGTGTAGGGCATGT,ATGAATCACGGCAGTGGTGTAGGGCATGC,ATGAATCACGGCAGTGGTGCAGGGCATGT snp g.42127565T>C -22 42523592 A 42537305 G snp g.42127590G>A -22 42523595 C 42537308 G snp g.42127593G>C -22 42523610 T 42537323 C snp g.42127608C>T -22 42523612 C 42537325 T snp g.42127610T>C -22 42523621 G 42537334 T snp g.42127619T>G -22 42523805 T 42537505 C snp g.42127803C>T -22 42523843 G 42537543 C snp g.42127841C>G -22 42523847 GAT 42537547 GCAC indel g.42127846delCACATCCGGATGTAGGATC -22 42523854 T 42537554 C snp g.42127852C>T -22 42523855 A 42537555 G gene_conversion g.42127853G>A -22 42523858 G 42537558 T snp g.42127856T>G -22 42523940 G 42537640 T snp g.42127938T>G -22 42523943 A 42537643 G gene_conversion g.42127941G>A -22 42523964 TC 42537664 TGC indel g.42127963delG -22 42524175 CCTCCA 42537877 CCTTCTCC indel g.42128174delCTT -22 42524183 T 42537885 A gene_conversion g.42128181A>T -22 42524187 T 42537889 C gene_conversion g.42128185C>T -22 42524202 CAGGT 42537904 CAGTCA indel g.42128201delAGTC -22 42524213 CGGGGGGGGC 42537915 CGGGGGGGC,CGGGGTGGC indel g.42128218-42128219insG -22 42524214 A 42537916 G snp g.42128212G>A -22 42524217 A 42537919 G snp g.42128215G>A -22 42524219 T 42537921 G snp g.42128217G>T -22 42524237 A 42537939 G snp g.42128235G>A -22 42524243 CGT 42537945 CTGT indel g.42128242delT -22 42524250 CAGCA 42537952 CAGTT indel g.42128249delAGTT -22 42524274 G 42537976 T snp g.42128272T>G -22 42524310 A 42538012 C snp g.42128308C>A -22 42524797 AC 42538491 AGC indel g.42128796delG -22 42524816 TCCCCT 42538510 TCCCT indel g.42128817-42128818insC -22 42524817 T 42538511 C snp g.42128815C>T -22 42524850 T 42538544 C snp g.42128848C>T -22 42524905 TTAC 42538599 TC indel g.42128903-42128904insTA -22 42524929 tGGGGCGAAAGGGGCGAAAGGGGCGAAAGGGGCGt 42538623 tGGGGCGAAAGGGGCGt,tGGGGCGAAAGGGGCGAAAGGGGCGt indel_multi g.42128936-42128937insGGGGCGAAAGGGGCGAAA -22 42524929 tGGGGCGAAAGGGGCGAAAGGGGCGt 42538623 tGGGGCGAAAGGGGCGt,tGGGGCGAAAGGGGCGAAAGGGGCGAAAGGGGCGt indel_multi g.42128936-42128937insGGGGCGAAA -22 42524935 A 42538629 G snp g.42128933G>A -22 42524947 T 42538641 C snp g.42128945C>T -22 42525035 A 42538729 c,t snp_multi g.42129033C>A -22 42525035 T 42538729 c,a snp_multi g.42129033C>T -22 42525044 C 42538738 T gene_conversion g.42129042T>C -22 42525058 G 42538752 C snp g.42129056C>G -22 42525073 A 42538767 t,g snp_multi g.42129071T>A -22 42525073 G 42538767 t,a snp_multi g.42129071T>G -22 42525077 T 42538771 C snp g.42129075C>T -22 42525085 CCT 42538779 CACT indel g.42129084delA -22 42525089 C 42538783 G snp g.42129087G>C -22 42525100 C 42538794 T snp g.42129098T>C -22 42525115 G 42538809 A snp g.42129113A>G -22 42525134 T 42538828 C snp g.42129132C>T -22 42525176 A 42538870 C gene_conversion g.42129174C>A -22 42525182 T 42538876 A snp g.42129180A>T -22 42525185 T 42538879 C snp g.42129183C>T -22 42525756 A 42539427 G snp g.42129754G>A -22 42525761 T 42539432 C snp g.42129759C>T -22 42525767 CCTGGTA 42539438 TCTGGGT,TCTGGAT exon2_conversion g.42129765T>C -22 42525772 A 42539443 g,t snp_multi g.42129770G>A -22 42525781 G 42539452 A exon2_conversion g.42129779A>G -22 42525782 T 42539453 C snp g.42129780C>T -22 42525811 C 42539482 T gene_conversion g.42129809T>C -22 42525821 T 42539492 G gene_conversion g.42129819G>T -22 42525823 A 42539494 G snp g.42129821G>A -22 42525829 G 42539500 C snp g.42129827C>G -22 42525838 A 42539509 G snp g.42129836G>A -22 42526656 CAAG 42540327 CAG gene_conversion g.42130655-42130656insA -22 42526669 T 42540341 C snp g.42130667C>T -22 42526670 T 42540342 C snp g.42130668C>T -22 42526694 A 42540366 G snp g.42130692G>A -22 42526712 A 42540384 G snp g.42130710G>A -22 42526717 T 42540389 C gene_conversion g.42130715C>T -22 42526721 A 42540393 G snp g.42130719G>A -22 42526763 T 42540435 C gene_conversion g.42130761C>T -22 42526775 T 42540447 C snp g.42130773C>T -22 42526780 A 42540452 G snp g.42130778G>A diff --git a/data/full_star_table/CYP2D6_target_variant_38.txt b/data/full_star_table/CYP2D6_target_variant_38.txt deleted file mode 100644 index 8378030..0000000 --- a/data/full_star_table/CYP2D6_target_variant_38.txt +++ /dev/null @@ -1,80 +0,0 @@ -#chr pos_CYP2D6 base_ALT pos_CYP2D7 base_REF variant_type variant_name -chr22 42126611 GTCACCAGGAAA 42140315 CTCACCAGGAAA gene_conversion g.42126611C>G -chr22 42127457 T 42141162 C snp g.42127457C>T -chr22 42127473 T 42141178 C snp g.42127473C>T -chr22 42127514 G 42141219 A snp g.42127514A>G -chr22 42127523 G 42141228 A snp g.42127523A>G -chr22 42127533 CACC 42141238 CC indel g.42127533-42127534insAC -chr22 42127537 ATGAATCACGGCAGTGGTGCAGGGCATGC 42141242 ATGAATCACGGCAGTGGTGTAGGGCATGT,ATGAATCACGGCAGTGGTGTAGGGCATGC,ATGAATCACGGCAGTGGTGCAGGGCATGT snp g.42127565T>C -chr22 42127590 A 42141295 G snp g.42127590G>A -chr22 42127593 C 42141298 G snp g.42127593G>C -chr22 42127608 T 42141313 C snp g.42127608C>T -chr22 42127610 C 42141315 T snp g.42127610T>C -chr22 42127619 G 42141324 T snp g.42127619T>G -chr22 42127803 T 42141495 C snp g.42127803C>T -chr22 42127841 G 42141533 C snp g.42127841C>G -chr22 42127845 GAT 42141537 GCAC indel g.42127846delCACATCCGGATGTAGGATC -chr22 42127852 T 42141544 C snp g.42127852C>T -chr22 42127853 A 42141545 G gene_conversion g.42127853G>A -chr22 42127856 G 42141548 T snp g.42127856T>G -chr22 42127938 G 42141630 T snp g.42127938T>G -chr22 42127941 A 42141633 G gene_conversion g.42127941G>A -chr22 42127962 TC 42141654 TGC indel g.42127963delG -chr22 42128173 CCTCCA 42141867 CCTTCTCC indel g.42128174delCTT -chr22 42128181 T 42141875 A gene_conversion g.42128181A>T -chr22 42128185 T 42141879 C gene_conversion g.42128185C>T -chr22 42128200 CAGGT 42141894 CAGTCA indel g.42128201delAGTC -chr22 42128211 CGGGGGGGGC 42141905 CGGGGGGGC,CGGGGTGGC indel g.42128218-42128219insG -chr22 42128212 A 42141906 G snp g.42128212G>A -chr22 42128215 A 42141909 G snp g.42128215G>A -chr22 42128217 T 42141911 G snp g.42128217G>T -chr22 42128235 A 42141929 G snp g.42128235G>A -chr22 42128241 CGT 42141935 CTGT indel g.42128242delT -chr22 42128248 CAGCA 42141942 CAGTT indel g.42128249delAGTT -chr22 42128272 G 42141966 T snp g.42128272T>G -chr22 42128308 A 42142002 C snp g.42128308C>A -chr22 42128795 AC 42142490 AGC indel g.42128796delG -chr22 42128814 TCCCCT 42142509 TCCCT indel g.42128817-42128818insC -chr22 42128815 T 42142510 C snp g.42128815C>T -chr22 42128848 T 42142543 C snp g.42128848C>T -chr22 42128903 TTAC 42142598 TC indel g.42128903-42128904insTA -chr22 42128927 tGGGGCGAAAGGGGCGAAAGGGGCGAAAGGGGCGt 42142622 tGGGGCGAAAGGGGCGt,tGGGGCGAAAGGGGCGAAAGGGGCGt indel_multi g.42128936-42128937insGGGGCGAAAGGGGCGAAA -chr22 42128927 tGGGGCGAAAGGGGCGAAAGGGGCGt 42142622 tGGGGCGAAAGGGGCGt,tGGGGCGAAAGGGGCGAAAGGGGCGAAAGGGGCGt indel_multi g.42128936-42128937insGGGGCGAAA -chr22 42128933 A 42142628 G snp g.42128933G>A -chr22 42128945 T 42142640 C snp g.42128945C>T -chr22 42129033 A 42142728 c,t snp_multi g.42129033C>A -chr22 42129033 T 42142728 c,a snp_multi g.42129033C>T -chr22 42129042 C 42142737 T gene_conversion g.42129042T>C -chr22 42129056 G 42142751 C snp g.42129056C>G -chr22 42129071 A 42142766 t,g snp_multi g.42129071T>A -chr22 42129071 G 42142766 t,a snp_multi g.42129071T>G -chr22 42129075 T 42142770 C snp g.42129075C>T -chr22 42129083 CCT 42142778 CACT indel g.42129084delA -chr22 42129087 C 42142782 G snp g.42129087G>C -chr22 42129098 C 42142793 T snp g.42129098T>C -chr22 42129113 G 42142808 A snp g.42129113A>G -chr22 42129132 T 42142827 C snp g.42129132C>T -chr22 42129174 A 42142869 C gene_conversion g.42129174C>A -chr22 42129180 T 42142875 A snp g.42129180A>T -chr22 42129183 T 42142878 C snp g.42129183C>T -chr22 42129754 A 42143426 G snp g.42129754G>A -chr22 42129759 T 42143431 C snp g.42129759C>T -chr22 42129765 CCTGGTA 42143437 TCTGGGT,TCTGGAT gene_conversion g.42129765T>C -chr22 42129770 A 42143442 g,t snp_multi g.42129770G>A -chr22 42129779 G 42143451 A gene_conversion g.42129779A>G -chr22 42129780 T 42143452 C snp g.42129780C>T -chr22 42129809 C 42143481 T gene_conversion g.42129809T>C -chr22 42129819 T 42143491 G gene_conversion g.42129819G>T -chr22 42129821 A 42143493 G snp g.42129821G>A -chr22 42129827 G 42143499 C snp g.42129827C>G -chr22 42129836 A 42143508 G snp g.42129836G>A -chr22 42130654 CAAG 42144326 CAG gene_conversion g.42130655-42130656insA -chr22 42130667 T 42144340 C snp g.42130667C>T -chr22 42130668 T 42144341 C snp g.42130668C>T -chr22 42130692 A 42144365 G snp g.42130692G>A -chr22 42130710 A 42144383 G snp g.42130710G>A -chr22 42130715 T 42144388 C gene_conversion g.42130715C>T -chr22 42130719 A 42144392 G snp g.42130719G>A -chr22 42130761 T 42144434 C gene_conversion g.42130761C>T -chr22 42130773 T 42144446 C snp g.42130773C>T -chr22 42130778 A 42144451 G snp g.42130778G>A diff --git a/data/full_star_table/CYP2D6_target_variant_homology_region_19.txt b/data/full_star_table/CYP2D6_target_variant_homology_region_19.txt deleted file mode 100644 index b66787e..0000000 --- a/data/full_star_table/CYP2D6_target_variant_homology_region_19.txt +++ /dev/null @@ -1,18 +0,0 @@ -#chr pos_CYP2D6 base_ALT pos_CYP2D7 base_REF variant_type variant_name -chr22 42522607 A 42536320 G snp g.42126605G>A -chr22 42522636 C 42536349 A snp g.42126634A>C -chr22 42522659 CAGTGGGCACAGTGGGCACC 42536372 CAGTGGGCACC,CGGCGGCCACG indel g.42126666-42126667insAGTGGGCAC -chr22 42522683 C 42536396 G snp g.42126681G>C -chr22 42522699 T 42536412 G snp g.42126697G>T -chr22 42522737 T 42536450 C snp g.42126735C>T -chr22 42522748 T 42536461 C snp g.42126746C>T -chr22 42522749 A 42536462 G snp g.42126747G>A -chr22 42522751 T 42536464 C snp g.42126749C>T -chr22 42522754 T 42536467 C snp g.42126752C>T -chr22 42522898 A 42536611 G snp g.42126896G>A -chr22 42522906 G 42536619 A snp g.42126904A>G -chr22 42522916 G 42536629 c,t snp_multi g.42126914C>G -chr22 42522916 T 42536629 c,g snp_multi g.42126914C>T -chr22 42522958 G 42536671 T snp g.42126956T>G -chr22 42525908 A 42539579 G snp g.42129906G>A -chr22 42525912 G 42539583 C snp g.42129910C>G diff --git a/data/full_star_table/CYP2D6_target_variant_homology_region_37.txt b/data/full_star_table/CYP2D6_target_variant_homology_region_37.txt deleted file mode 100644 index 41a489b..0000000 --- a/data/full_star_table/CYP2D6_target_variant_homology_region_37.txt +++ /dev/null @@ -1,18 +0,0 @@ -#chr pos_CYP2D6 base_ALT pos_CYP2D7 base_REF variant_type variant_name -22 42522607 A 42536320 G snp g.42126605G>A -22 42522636 C 42536349 A snp g.42126634A>C -22 42522659 CAGTGGGCACAGTGGGCACC 42536372 CAGTGGGCACC,CGGCGGCCACG indel g.42126666-42126667insAGTGGGCAC -22 42522683 C 42536396 G snp g.42126681G>C -22 42522699 T 42536412 G snp g.42126697G>T -22 42522737 T 42536450 C snp g.42126735C>T -22 42522748 T 42536461 C snp g.42126746C>T -22 42522749 A 42536462 G snp g.42126747G>A -22 42522751 T 42536464 C snp g.42126749C>T -22 42522754 T 42536467 C snp g.42126752C>T -22 42522898 A 42536611 G snp g.42126896G>A -22 42522906 G 42536619 A snp g.42126904A>G -22 42522916 G 42536629 c,t snp_multi g.42126914C>G -22 42522916 T 42536629 c,g snp_multi g.42126914C>T -22 42522958 G 42536671 T snp g.42126956T>G -22 42525908 A 42539579 G snp g.42129906G>A -22 42525912 G 42539583 C snp g.42129910C>G diff --git a/data/full_star_table/CYP2D6_target_variant_homology_region_38.txt b/data/full_star_table/CYP2D6_target_variant_homology_region_38.txt deleted file mode 100644 index 751a942..0000000 --- a/data/full_star_table/CYP2D6_target_variant_homology_region_38.txt +++ /dev/null @@ -1,18 +0,0 @@ -#chr pos_CYP2D6 base_ALT pos_CYP2D7 base_REF variant_type variant_name -chr22 42126605 A 42140309 G snp g.42126605G>A -chr22 42126634 C 42140338 A snp g.42126634A>C -chr22 42126657 CAGTGGGCACAGTGGGCACC 42140361 CAGTGGGCACC,CGGCGGCCACG indel g.42126666-42126667insAGTGGGCAC -chr22 42126681 C 42140385 G snp g.42126681G>C -chr22 42126697 T 42140401 G snp g.42126697G>T -chr22 42126735 T 42140439 C snp g.42126735C>T -chr22 42126746 T 42140450 C snp g.42126746C>T -chr22 42126747 A 42140451 G snp g.42126747G>A -chr22 42126749 T 42140453 C snp g.42126749C>T -chr22 42126752 T 42140456 C snp g.42126752C>T -chr22 42126896 A 42140600 G snp g.42126896G>A -chr22 42126904 G 42140608 A snp g.42126904A>G -chr22 42126914 G 42140618 c,t snp_multi g.42126914C>G -chr22 42126914 T 42140618 c,g snp_multi g.42126914C>T -chr22 42126956 G 42140660 T snp g.42126956T>G -chr22 42129906 A 42143578 G snp g.42129906G>A -chr22 42129910 G 42143582 C snp g.42129910C>G diff --git a/data/full_star_table/star_table.txt b/data/full_star_table/star_table.txt deleted file mode 100644 index dba10d9..0000000 --- a/data/full_star_table/star_table.txt +++ /dev/null @@ -1,115 +0,0 @@ -*1 NA Normal function -*2 g.42127941G>A g.42126611C>G Normal function -*3A g.42128242delT No function -*3B g.42129042T>C g.42128242delT No function -*4A g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T g.42126611C>G No function -*4C g.42130692G>A g.42128945C>T g.42126904A>G g.42126611C>G No function -*4D g.42130692G>A g.42128945C>T g.42126611C>G g.42129754G>A No function -*4F g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T g.42128933G>A g.42126611C>G No function -*4G g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T g.42127853G>A g.42126611C>G No function -*4H g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T g.42126914C>G g.42126611C>G No function -*4J g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T No function -*4K g.42130692G>A g.42128945C>T g.42127941G>A g.42126611C>G No function -*4M g.42129819G>T g.42129809T>C g.42128945C>T No function -*4N g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T exon9gc g.42126611C>G No function -*4P g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T g.42128215G>A No function -*6A g.42129084delA No function -*6B g.42129084delA g.42128815C>T No function -*6C g.42129084delA g.42128815C>T g.42126611C>G No function -*7 g.42127856T>G No function -*8 g.42129033C>A g.42127941G>A g.42126611C>G No function -*9 g.42128174delCTT Decreased function -*10 g.42129754G>A g.42130692G>A g.42126611C>G Decreased function -*11 g.42129910C>G g.42127941G>A g.42126611C>G No function -*12 g.42130668C>T g.42127941G>A g.42126611C>G No function -*14 g.42129033C>T g.42127941G>A g.42126611C>G Decreased function -*15 g.42130655-42130656insA No function -*17 g.42129770G>A g.42127941G>A g.42126611C>G Decreased function -*18 g.42126666-42126667insAGTGGGCAC No function -*19 g.42128249delAGTT g.42127941G>A g.42126611C>G No function -*20 g.42128817-42128818insC g.42127941G>A g.42126611C>G No function -*21 g.42128218-42128219insG g.42127941G>A g.42126611C>G No function -*22 g.42130710G>A Uncertain function -*23 g.42129836G>A Uncertain function -*24 g.42127938T>G Uncertain function -*25 g.42127593G>C Uncertain function -*26 g.42127514A>G Uncertain function -*27 g.42126938C>T Normal function -*28 g.42130773C>T g.42129087G>C g.42127941G>A g.42126611C>G Uncertain function -*29 g.42129132C>T g.42127941G>A g.42127608C>T g.42126611C>G Decreased function -*30 g.42128936-42128937insGGGGCGAAA g.42127941G>A g.42126611C>G Uncertain function -*31 g.42127941G>A g.42126749C>T g.42126611C>G No function -*32 g.42127941G>A g.42127803C>T g.42126938C>T g.42126611C>G Unknown function -*33 g.42128308C>A Normal function -*34 g.42127941G>A Normal function -*35 g.42130761C>T g.42127941G>A g.42126611C>G Normal function -*36 g.42130692G>A exon9gc g.42126611C>G g.42129754G>A No function -*37 g.42130692G>A g.42128848C>T g.42126611C>G g.42129754G>A Uncertain function -*38 g.42128201delAGTC No function -*39 g.42126611C>G Normal function -*40 g.42129770G>A g.42128936-42128937insGGGGCGAAAGGGGCGAAA g.42127941G>A g.42126611C>G No function -*41 g.42127941G>A g.42127803C>T g.42126611C>G Decreased function -*42 g.42127941G>A g.42127533-42127534insAC g.42126611C>G No function -*43 g.42130715C>T Uncertain function -*44 g.42130710G>A g.42127841C>G No function -*45 g.42129075C>T g.42127941G>A g.42126611C>G Normal function -*46 g.42130715C>T g.42129075C>T g.42127941G>A g.42126611C>G Normal function -*47 g.42130719G>A g.42130692G>A g.42126611C>G g.42129754G>A No function -*48 g.42129821G>A Normal function -*49 g.42130692G>A g.42129180A>T g.42126611C>G g.42129754G>A Decreased function -*50 g.42129071T>G Decreased function -*51 g.42127941G>A g.42127619T>G g.42126611C>G No function -*52 g.42130692G>A g.42126914C>T g.42126611C>G g.42129754G>A Uncertain function -*53 g.42129180A>T g.42129174C>A Normal function -*54 g.42130692G>A g.42128235G>A g.42126611C>G g.42129754G>A Decreased function -*55 g.42127941G>A g.42126956T>G g.42126611C>G Decreased function -*56A g.42127941G>A g.42127590G>A g.42126611C>G No function -*56B g.42130692G>A g.42127590G>A g.42126611C>G g.42129754G>A No function -*57 g.42130692G>A g.42129906G>A exon9gc g.42126611C>G g.42129754G>A No function -*58 g.42129770G>A g.42128936-42128937insGGGGCGAAA g.42127941G>A g.42126611C>G Unknown function -*59 g.42127941G>A g.42127852C>T g.42126611C>G Decreased function -*60 g.42128903-42128904insTA No function -*62 g.42126747G>A No function -*64 g.42130692G>A g.42129770G>A g.42126611C>G Uncertain function -*65 g.42130692G>A g.42127941G>A g.42126611C>G Uncertain function -*69 g.42130692G>A g.42127941G>A g.42127803C>T g.42126611C>G No function -*70 g.42129183C>T g.42129132C>T g.42127608C>T g.42126611C>G Uncertain function -*71 g.42130667C>T Uncertain function -*72 g.42130692G>A g.42127473C>T g.42126611C>G g.42129754G>A Decreased function -*73 g.42129780C>T g.42127941G>A g.42126611C>G Unknown function -*74 g.42129819G>T Unknown function -*75 g.42126746C>T Uncertain function -*81 g.42128212G>A g.42128185C>T g.42128181A>T Uncertain function -*82 g.42129819G>T g.42129809T>C g.42129779A>G g.42129765T>C Unknown function -*83 exon9gc g.42126611C>G Unknown function -*84 g.42128217G>T g.42127941G>A g.42126611C>G Decreased function -*85 g.42127941G>A g.42126634A>C g.42126611C>G Unknown function -*86 g.42128185C>T g.42128181A>T Unknown function -*87 g.42130778G>A g.42130692G>A g.42126611C>G g.42129754G>A Uncertain function -*88 g.42129779A>G g.42126611C>G Uncertain function -*89 g.42129113A>G Uncertain function -*90 g.42129098T>C Uncertain function -*91 g.42129056C>G g.42127941G>A g.42127803C>T Uncertain function -*92 g.42128796delG No function -*93 g.42128272T>G Uncertain function -*94 g.42130692G>A g.42127610T>C g.42126611C>G g.42129754G>A Uncertain function -*95 g.42130692G>A g.42127457C>T g.42126611C>G g.42129754G>A Uncertain function -*96 g.42126896G>A No function -*97 g.42126697G>T Uncertain function -*98 g.42127941G>A g.42126681G>C g.42126611C>G Uncertain function -*99 g.42130692G>A g.42129827C>G g.42126611C>G g.42129754G>A No function -*100 g.42130692G>A g.42127963delG g.42126611C>G g.42129754G>A No function -*101 g.42130692G>A g.42127846delCACATCCGGATGTAGGATC g.42126611C>G g.42129754G>A No function -*102 g.42129821G>A g.42127941G>A g.42126611C>G Unknown function -*103 g.42129821G>A g.42129042T>C g.42127941G>A g.42126611C>G Unknown function -*104 g.42129071T>A g.42127941G>A g.42126611C>G Unknown function -*105 g.42127941G>A g.42127523A>G g.42126611C>G Unknown function -*106 g.42126914C>T Uncertain function -*107 g.42129132C>T Unknown function -*108 g.42127565T>C Unknown function -*109 g.42128174delCTT g.42127608C>T Unknown function -*110 g.42126735C>T Unknown function -*111 g.42129759C>T g.42127941G>A g.42126611C>G Unknown function -*112 g.42126605G>A Unknown function -*113 g.42126752C>T Unknown function -*114 g.42130692G>A g.42129033C>T g.42127941G>A g.42126611C>G No function diff --git a/data/include_new_star_table/star_table.txt b/data/include_new_star_table/star_table.txt deleted file mode 100644 index 50a38d5..0000000 --- a/data/include_new_star_table/star_table.txt +++ /dev/null @@ -1,133 +0,0 @@ -*1 NA Normal function -*2 g.42127941G>A g.42126611C>G Normal function -*3A g.42128242delT No function -*3B g.42129042T>C g.42128242delT No function -*4A g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T g.42126611C>G No function -*4C g.42130692G>A g.42128945C>T g.42126904A>G g.42126611C>G No function -*4D g.42130692G>A g.42128945C>T g.42126611C>G g.42129754G>A No function -*4F g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T g.42128933G>A g.42126611C>G No function -*4G g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T g.42127853G>A g.42126611C>G No function -*4H g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T g.42126914C>G g.42126611C>G No function -*4J g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T No function -*4K g.42130692G>A g.42128945C>T g.42127941G>A g.42126611C>G No function -*4M g.42129819G>T g.42129809T>C g.42128945C>T No function -*4N g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T exon9gc g.42126611C>G No function -*4P g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T g.42128215G>A No function -*6A g.42129084delA No function -*6B g.42129084delA g.42128815C>T No function -*6C g.42129084delA g.42128815C>T g.42126611C>G No function -*7 g.42127856T>G No function -*8 g.42129033C>A g.42127941G>A g.42126611C>G No function -*9 g.42128174delCTT Decreased function -*10 g.42129754G>A g.42130692G>A g.42126611C>G Decreased function -*11 g.42129910C>G g.42127941G>A g.42126611C>G No function -*12 g.42130668C>T g.42127941G>A g.42126611C>G No function -*14 g.42129033C>T g.42127941G>A g.42126611C>G Decreased function -*15 g.42130655-42130656insA No function -*17 g.42129770G>A g.42127941G>A g.42126611C>G Decreased function -*18 g.42126666-42126667insAGTGGGCAC No function -*19 g.42128249delAGTT g.42127941G>A g.42126611C>G No function -*20 g.42128817-42128818insC g.42127941G>A g.42126611C>G No function -*21 g.42128218-42128219insG g.42127941G>A g.42126611C>G No function -*22 g.42130710G>A Uncertain function -*23 g.42129836G>A Uncertain function -*24 g.42127938T>G Uncertain function -*25 g.42127593G>C Uncertain function -*26 g.42127514A>G Uncertain function -*27 g.42126938C>T Normal function -*28 g.42130773C>T g.42129087G>C g.42127941G>A g.42126611C>G Uncertain function -*29 g.42129132C>T g.42127941G>A g.42127608C>T g.42126611C>G Decreased function -*30 g.42128936-42128937insGGGGCGAAA g.42127941G>A g.42126611C>G Uncertain function -*31 g.42127941G>A g.42126749C>T g.42126611C>G No function -*32 g.42127941G>A g.42127803C>T g.42126938C>T g.42126611C>G Unknown function -*33 g.42128308C>A Normal function -*34 g.42127941G>A Normal function -*35 g.42130761C>T g.42127941G>A g.42126611C>G Normal function -*36 g.42130692G>A exon9gc g.42126611C>G g.42129754G>A No function -*37 g.42130692G>A g.42128848C>T g.42126611C>G g.42129754G>A Uncertain function -*38 g.42128201delAGTC No function -*39 g.42126611C>G Normal function -*40 g.42129770G>A g.42128936-42128937insGGGGCGAAAGGGGCGAAA g.42127941G>A g.42126611C>G No function -*41 g.42127941G>A g.42127803C>T g.42126611C>G Decreased function -*42 g.42127941G>A g.42127533-42127534insAC g.42126611C>G No function -*43 g.42130715C>T Uncertain function -*44 g.42130710G>A g.42127841C>G No function -*45 g.42129075C>T g.42127941G>A g.42126611C>G Normal function -*46 g.42130715C>T g.42129075C>T g.42127941G>A g.42126611C>G Normal function -*47 g.42130719G>A g.42130692G>A g.42126611C>G g.42129754G>A No function -*48 g.42129821G>A Normal function -*49 g.42130692G>A g.42129180A>T g.42126611C>G g.42129754G>A Decreased function -*50 g.42129071T>G Decreased function -*51 g.42127941G>A g.42127619T>G g.42126611C>G No function -*52 g.42130692G>A g.42126914C>T g.42126611C>G g.42129754G>A Uncertain function -*53 g.42129180A>T g.42129174C>A Normal function -*54 g.42130692G>A g.42128235G>A g.42126611C>G g.42129754G>A Decreased function -*55 g.42127941G>A g.42126956T>G g.42126611C>G Decreased function -*56A g.42127941G>A g.42127590G>A g.42126611C>G No function -*56B g.42130692G>A g.42127590G>A g.42126611C>G g.42129754G>A No function -*57 g.42130692G>A g.42129906G>A exon9gc g.42126611C>G g.42129754G>A No function -*58 g.42129770G>A g.42128936-42128937insGGGGCGAAA g.42127941G>A g.42126611C>G Unknown function -*59 g.42127941G>A g.42127852C>T g.42126611C>G Decreased function -*60 g.42128903-42128904insTA No function -*62 g.42126747G>A No function -*64 g.42130692G>A g.42129770G>A g.42126611C>G Uncertain function -*65 g.42130692G>A g.42127941G>A g.42126611C>G Uncertain function -*69 g.42130692G>A g.42127941G>A g.42127803C>T g.42126611C>G No function -*70 g.42129183C>T g.42129132C>T g.42127608C>T g.42126611C>G Uncertain function -*71 g.42130667C>T Uncertain function -*72 g.42130692G>A g.42127473C>T g.42126611C>G g.42129754G>A Decreased function -*73 g.42129780C>T g.42127941G>A g.42126611C>G Unknown function -*74 g.42129819G>T Unknown function -*75 g.42126746C>T Uncertain function -*81 g.42128212G>A g.42128185C>T g.42128181A>T Uncertain function -*82 g.42129819G>T g.42129809T>C g.42129779A>G g.42129765T>C Unknown function -*83 exon9gc g.42126611C>G Unknown function -*84 g.42128217G>T g.42127941G>A g.42126611C>G Decreased function -*85 g.42127941G>A g.42126634A>C g.42126611C>G Unknown function -*86 g.42128185C>T g.42128181A>T Unknown function -*87 g.42130778G>A g.42130692G>A g.42126611C>G g.42129754G>A Uncertain function -*88 g.42129779A>G g.42126611C>G Uncertain function -*89 g.42129113A>G Uncertain function -*90 g.42129098T>C Uncertain function -*91 g.42129056C>G g.42127941G>A g.42127803C>T Uncertain function -*92 g.42128796delG No function -*93 g.42128272T>G Uncertain function -*94 g.42130692G>A g.42127610T>C g.42126611C>G g.42129754G>A Uncertain function -*95 g.42130692G>A g.42127457C>T g.42126611C>G g.42129754G>A Uncertain function -*96 g.42126896G>A No function -*97 g.42126697G>T Uncertain function -*98 g.42127941G>A g.42126681G>C g.42126611C>G Uncertain function -*99 g.42130692G>A g.42129827C>G g.42126611C>G g.42129754G>A No function -*100 g.42130692G>A g.42127963delG g.42126611C>G g.42129754G>A No function -*101 g.42130692G>A g.42127846delCACATCCGGATGTAGGATC g.42126611C>G g.42129754G>A No function -*102 g.42129821G>A g.42127941G>A g.42126611C>G Unknown function -*103 g.42129821G>A g.42129042T>C g.42127941G>A g.42126611C>G Unknown function -*104 g.42129071T>A g.42127941G>A g.42126611C>G Unknown function -*105 g.42127941G>A g.42127523A>G g.42126611C>G Unknown function -*106 g.42126914C>T Uncertain function -*107 g.42129132C>T Unknown function -*108 g.42127565T>C Unknown function -*109 g.42128174delCTT g.42127608C>T Unknown function -*110 g.42126735C>T Unknown function -*111 g.42129759C>T g.42127941G>A g.42126611C>G Unknown function -*112 g.42126605G>A Unknown function -*113 g.42126752C>T Unknown function -*114 g.42130692G>A g.42129033C>T g.42127941G>A g.42126611C>G No function -*115 g.42128174delCTT g.42127973T>C Unknown function -*116 g.42127631C>G Unknown function -*117 g.42127941G>A g.42127611C>T g.42126611C>G Unknown function -*118 g.42127899T>C Unknown function -*120 g.42130728delG Unknown function -*121 g.42129042T>C g.42129036A>C g.42127941G>A g.42126611C>G Unknown function -*123 g.42127941G>A g.42127922A>G g.42127803C>T g.42126611C>G Unknown function -*124 g.42126981_42126982insGA Unknown function -*125 g.42127941G>A g.42126719C>T g.42126611C>G Unknown function -*126 g.42129134G>A g.42127941G>A g.42126611C>G Unknown function -*128 g.42129887A>G g.42127941G>A g.42126611C>G Unknown function -*129 g.42129155C>T g.42127941G>A g.42126611C>G Unknown function -*130 g.42128879A>G g.42128878C>A Unknown function -*132 g.42130692G>A g.42127973T>C g.42129754G>A g.42126611C>G Unknown function -*133 g.42127941G>A g.42127602C>T g.42126611C>G Unknown function -*134 g.42127589C>T Unknown function -*137 g.42126578C>T Unknown function -*138 g.42127941G>A g.42127803C>T g.42126647C>T g.42126611C>G Unknown function diff --git a/data/known_function_star_table/CYP2D6_target_variant_19.txt b/data/known_function_star_table/CYP2D6_target_variant_19.txt deleted file mode 100644 index 6629fd7..0000000 --- a/data/known_function_star_table/CYP2D6_target_variant_19.txt +++ /dev/null @@ -1,48 +0,0 @@ -#chr pos_CYP2D6 base_ALT pos_CYP2D7 base_REF variant_type variant_name -chr22 42522613 G 42536326 C gene_conversion g.42126611C>G -chr22 42523475 T 42537188 C snp g.42127473C>T -chr22 42523535 CACC 42537248 CC indel g.42127533-42127534insAC -chr22 42523592 A 42537305 G snp g.42127590G>A -chr22 42523610 T 42537323 C snp g.42127608C>T -chr22 42523621 G 42537334 T snp g.42127619T>G -chr22 42523805 T 42537505 C snp g.42127803C>T -chr22 42523843 G 42537543 C snp g.42127841C>G -chr22 42523847 GAT 42537547 GCAC indel g.42127846delCACATCCGGATGTAGGATC -chr22 42523854 T 42537554 C snp g.42127852C>T -chr22 42523855 A 42537555 G gene_conversion g.42127853G>A -chr22 42523858 G 42537558 T snp g.42127856T>G -chr22 42523943 A 42537643 G gene_conversion g.42127941G>A -chr22 42523964 TC 42537664 TGC indel g.42127963delG -chr22 42524175 CCTCCA 42537877 CCTTCTCC indel g.42128174delCTT -chr22 42524202 CAGGT 42537904 CAGTCA indel g.42128201delAGTC -chr22 42524213 CGGGGGGGGC 42537915 CGGGGGGGC,CGGGGTGGC indel g.42128218-42128219insG -chr22 42524217 A 42537919 G snp g.42128215G>A -chr22 42524219 T 42537921 G snp g.42128217G>T -chr22 42524237 A 42537939 G snp g.42128235G>A -chr22 42524243 CGT 42537945 CTGT indel g.42128242delT -chr22 42524250 CAGCA 42537952 CAGTT indel g.42128249delAGTT -chr22 42524310 A 42538012 C snp g.42128308C>A -chr22 42524797 AC 42538491 AGC indel g.42128796delG -chr22 42524816 TCCCCT 42538510 TCCCT indel g.42128817-42128818insC -chr22 42524905 TTAC 42538599 TC indel g.42128903-42128904insTA -chr22 42524929 tGGGGCGAAAGGGGCGAAAGGGGCGAAAGGGGCGt 42538623 tGGGGCGAAAGGGGCGt,tGGGGCGAAAGGGGCGAAAGGGGCGt indel_multi g.42128936-42128937insGGGGCGAAAGGGGCGAAA -chr22 42524935 A 42538629 G snp g.42128933G>A -chr22 42524947 T 42538641 C snp g.42128945C>T -chr22 42525035 A 42538729 c,t snp_multi g.42129033C>A -chr22 42525035 T 42538729 c,a snp_multi g.42129033C>T -chr22 42525073 G 42538767 t,a snp_multi g.42129071T>G -chr22 42525077 T 42538771 C snp g.42129075C>T -chr22 42525085 CCT 42538779 CACT indel g.42129084delA -chr22 42525134 T 42538828 C snp g.42129132C>T -chr22 42525176 A 42538870 C gene_conversion g.42129174C>A -chr22 42525182 T 42538876 A snp g.42129180A>T -chr22 42525756 A 42539427 G snp g.42129754G>A -chr22 42525772 A 42539443 g,t snp_multi g.42129770G>A -chr22 42525811 C 42539482 T gene_conversion g.42129809T>C -chr22 42525821 T 42539492 G gene_conversion g.42129819G>T -chr22 42525829 G 42539500 C snp g.42129827C>G -chr22 42526656 CAAG 42540327 CAG gene_conversion g.42130655-42130656insA -chr22 42526670 T 42540342 C snp g.42130668C>T -chr22 42526694 A 42540366 G snp g.42130692G>A -chr22 42526721 A 42540393 G snp g.42130719G>A -chr22 42526763 T 42540435 C gene_conversion g.42130761C>T diff --git a/data/known_function_star_table/CYP2D6_target_variant_37.txt b/data/known_function_star_table/CYP2D6_target_variant_37.txt deleted file mode 100644 index 47ca847..0000000 --- a/data/known_function_star_table/CYP2D6_target_variant_37.txt +++ /dev/null @@ -1,48 +0,0 @@ -#chr pos_CYP2D6 base_ALT pos_CYP2D7 base_REF variant_type variant_name -22 42522613 G 42536326 C gene_conversion g.42126611C>G -22 42523475 T 42537188 C snp g.42127473C>T -22 42523535 CACC 42537248 CC indel g.42127533-42127534insAC -22 42523592 A 42537305 G snp g.42127590G>A -22 42523610 T 42537323 C snp g.42127608C>T -22 42523621 G 42537334 T snp g.42127619T>G -22 42523805 T 42537505 C snp g.42127803C>T -22 42523843 G 42537543 C snp g.42127841C>G -22 42523847 GAT 42537547 GCAC indel g.42127846delCACATCCGGATGTAGGATC -22 42523854 T 42537554 C snp g.42127852C>T -22 42523855 A 42537555 G gene_conversion g.42127853G>A -22 42523858 G 42537558 T snp g.42127856T>G -22 42523943 A 42537643 G gene_conversion g.42127941G>A -22 42523964 TC 42537664 TGC indel g.42127963delG -22 42524175 CCTCCA 42537877 CCTTCTCC indel g.42128174delCTT -22 42524202 CAGGT 42537904 CAGTCA indel g.42128201delAGTC -22 42524213 CGGGGGGGGC 42537915 CGGGGGGGC,CGGGGTGGC indel g.42128218-42128219insG -22 42524217 A 42537919 G snp g.42128215G>A -22 42524219 T 42537921 G snp g.42128217G>T -22 42524237 A 42537939 G snp g.42128235G>A -22 42524243 CGT 42537945 CTGT indel g.42128242delT -22 42524250 CAGCA 42537952 CAGTT indel g.42128249delAGTT -22 42524310 A 42538012 C snp g.42128308C>A -22 42524797 AC 42538491 AGC indel g.42128796delG -22 42524816 TCCCCT 42538510 TCCCT indel g.42128817-42128818insC -22 42524905 TTAC 42538599 TC indel g.42128903-42128904insTA -22 42524929 tGGGGCGAAAGGGGCGAAAGGGGCGAAAGGGGCGt 42538623 tGGGGCGAAAGGGGCGt,tGGGGCGAAAGGGGCGAAAGGGGCGt indel_multi g.42128936-42128937insGGGGCGAAAGGGGCGAAA -22 42524935 A 42538629 G snp g.42128933G>A -22 42524947 T 42538641 C snp g.42128945C>T -22 42525035 A 42538729 c,t snp_multi g.42129033C>A -22 42525035 T 42538729 c,a snp_multi g.42129033C>T -22 42525073 G 42538767 t,a snp_multi g.42129071T>G -22 42525077 T 42538771 C snp g.42129075C>T -22 42525085 CCT 42538779 CACT indel g.42129084delA -22 42525134 T 42538828 C snp g.42129132C>T -22 42525176 A 42538870 C gene_conversion g.42129174C>A -22 42525182 T 42538876 A snp g.42129180A>T -22 42525756 A 42539427 G snp g.42129754G>A -22 42525772 A 42539443 g,t snp_multi g.42129770G>A -22 42525811 C 42539482 T gene_conversion g.42129809T>C -22 42525821 T 42539492 G gene_conversion g.42129819G>T -22 42525829 G 42539500 C snp g.42129827C>G -22 42526656 CAAG 42540327 CAG gene_conversion g.42130655-42130656insA -22 42526670 T 42540342 C snp g.42130668C>T -22 42526694 A 42540366 G snp g.42130692G>A -22 42526721 A 42540393 G snp g.42130719G>A -22 42526763 T 42540435 C gene_conversion g.42130761C>T diff --git a/data/known_function_star_table/CYP2D6_target_variant_38.txt b/data/known_function_star_table/CYP2D6_target_variant_38.txt deleted file mode 100644 index 821c617..0000000 --- a/data/known_function_star_table/CYP2D6_target_variant_38.txt +++ /dev/null @@ -1,48 +0,0 @@ -#chr pos_CYP2D6 base_ALT pos_CYP2D7 base_REF variant_type variant_name -chr22 42126611 G 42140315 C gene_conversion g.42126611C>G -chr22 42127473 T 42141178 C snp g.42127473C>T -chr22 42127533 CACC 42141238 CC indel g.42127533-42127534insAC -chr22 42127590 A 42141295 G snp g.42127590G>A -chr22 42127608 T 42141313 C snp g.42127608C>T -chr22 42127619 G 42141324 T snp g.42127619T>G -chr22 42127803 T 42141495 C snp g.42127803C>T -chr22 42127841 G 42141533 C snp g.42127841C>G -chr22 42127845 GAT 42141537 GCAC indel g.42127846delCACATCCGGATGTAGGATC -chr22 42127852 T 42141544 C snp g.42127852C>T -chr22 42127853 A 42141545 G gene_conversion g.42127853G>A -chr22 42127856 G 42141548 T snp g.42127856T>G -chr22 42127941 A 42141633 G gene_conversion g.42127941G>A -chr22 42127962 TC 42141654 TGC indel g.42127963delG -chr22 42128173 CCTCCA 42141867 CCTTCTCC indel g.42128174delCTT -chr22 42128200 CAGGT 42141894 CAGTCA indel g.42128201delAGTC -chr22 42128211 CGGGGGGGGC 42141905 CGGGGGGGC,CGGGGTGGC indel g.42128218-42128219insG -chr22 42128215 A 42141909 G snp g.42128215G>A -chr22 42128217 T 42141911 G snp g.42128217G>T -chr22 42128235 A 42141929 G snp g.42128235G>A -chr22 42128241 CGT 42141935 CTGT indel g.42128242delT -chr22 42128248 CAGCA 42141942 CAGTT indel g.42128249delAGTT -chr22 42128308 A 42142002 C snp g.42128308C>A -chr22 42128795 AC 42142490 AGC indel g.42128796delG -chr22 42128814 TCCCCT 42142509 TCCCT indel g.42128817-42128818insC -chr22 42128903 TTAC 42142598 TC indel g.42128903-42128904insTA -chr22 42128927 tGGGGCGAAAGGGGCGAAAGGGGCGAAAGGGGCGt 42142622 tGGGGCGAAAGGGGCGt,tGGGGCGAAAGGGGCGAAAGGGGCGt indel_multi g.42128936-42128937insGGGGCGAAAGGGGCGAAA -chr22 42128933 A 42142628 G snp g.42128933G>A -chr22 42128945 T 42142640 C snp g.42128945C>T -chr22 42129033 A 42142728 c,t snp_multi g.42129033C>A -chr22 42129033 T 42142728 c,a snp_multi g.42129033C>T -chr22 42129071 G 42142766 t,a snp_multi g.42129071T>G -chr22 42129075 T 42142770 C snp g.42129075C>T -chr22 42129083 CCT 42142778 CACT indel g.42129084delA -chr22 42129132 T 42142827 C snp g.42129132C>T -chr22 42129174 A 42142869 C gene_conversion g.42129174C>A -chr22 42129180 T 42142875 A snp g.42129180A>T -chr22 42129754 A 42143426 G snp g.42129754G>A -chr22 42129770 A 42143442 g,t snp_multi g.42129770G>A -chr22 42129809 C 42143481 T gene_conversion g.42129809T>C -chr22 42129819 T 42143491 G gene_conversion g.42129819G>T -chr22 42129827 G 42143499 C snp g.42129827C>G -chr22 42130654 CAAG 42144326 CAG gene_conversion g.42130655-42130656insA -chr22 42130668 T 42144341 C snp g.42130668C>T -chr22 42130692 A 42144365 G snp g.42130692G>A -chr22 42130719 A 42144392 G snp g.42130719G>A -chr22 42130761 T 42144434 C gene_conversion g.42130761C>T diff --git a/data/known_function_star_table/CYP2D6_target_variant_homology_region_19.txt b/data/known_function_star_table/CYP2D6_target_variant_homology_region_19.txt deleted file mode 100644 index faf68ce..0000000 --- a/data/known_function_star_table/CYP2D6_target_variant_homology_region_19.txt +++ /dev/null @@ -1,10 +0,0 @@ -#chr pos_CYP2D6 base_ALT pos_CYP2D7 base_REF variant_type variant_name -chr22 42522659 CAGTGGGCACAGTGGGCACC 42536372 CAGTGGGCACC,CGGCGGCCACG indel g.42126666-42126667insAGTGGGCAC -chr22 42522749 A 42536462 G snp g.42126747G>A -chr22 42522751 T 42536464 C snp g.42126749C>T -chr22 42522898 A 42536611 G snp g.42126896G>A -chr22 42522906 G 42536619 A snp g.42126904A>G -chr22 42522916 G 42536629 c,t snp_multi g.42126914C>G -chr22 42522958 G 42536671 T snp g.42126956T>G -chr22 42525908 A 42539579 G snp g.42129906G>A -chr22 42525912 G 42539583 C snp g.42129910C>G diff --git a/data/known_function_star_table/CYP2D6_target_variant_homology_region_37.txt b/data/known_function_star_table/CYP2D6_target_variant_homology_region_37.txt deleted file mode 100644 index 80b50ce..0000000 --- a/data/known_function_star_table/CYP2D6_target_variant_homology_region_37.txt +++ /dev/null @@ -1,10 +0,0 @@ -#chr pos_CYP2D6 base_ALT pos_CYP2D7 base_REF variant_type variant_name -22 42522659 CAGTGGGCACAGTGGGCACC 42536372 CAGTGGGCACC,CGGCGGCCACG indel g.42126666-42126667insAGTGGGCAC -22 42522749 A 42536462 G snp g.42126747G>A -22 42522751 T 42536464 C snp g.42126749C>T -22 42522898 A 42536611 G snp g.42126896G>A -22 42522906 G 42536619 A snp g.42126904A>G -22 42522916 G 42536629 c,t snp_multi g.42126914C>G -22 42522958 G 42536671 T snp g.42126956T>G -22 42525908 A 42539579 G snp g.42129906G>A -22 42525912 G 42539583 C snp g.42129910C>G diff --git a/data/known_function_star_table/CYP2D6_target_variant_homology_region_38.txt b/data/known_function_star_table/CYP2D6_target_variant_homology_region_38.txt deleted file mode 100644 index 63f2b47..0000000 --- a/data/known_function_star_table/CYP2D6_target_variant_homology_region_38.txt +++ /dev/null @@ -1,10 +0,0 @@ -#chr pos_CYP2D6 base_ALT pos_CYP2D7 base_REF variant_type variant_name -chr22 42126657 CAGTGGGCACAGTGGGCACC 42140361 CAGTGGGCACC,CGGCGGCCACG indel g.42126666-42126667insAGTGGGCAC -chr22 42126747 A 42140451 G snp g.42126747G>A -chr22 42126749 T 42140453 C snp g.42126749C>T -chr22 42126896 A 42140600 G snp g.42126896G>A -chr22 42126904 G 42140608 A snp g.42126904A>G -chr22 42126914 G 42140618 c,t snp_multi g.42126914C>G -chr22 42126956 G 42140660 T snp g.42126956T>G -chr22 42129906 A 42143578 G snp g.42129906G>A -chr22 42129910 G 42143582 C snp g.42129910C>G diff --git a/data/known_function_star_table/star_table.txt b/data/known_function_star_table/star_table.txt deleted file mode 100644 index cba8726..0000000 --- a/data/known_function_star_table/star_table.txt +++ /dev/null @@ -1,65 +0,0 @@ -*1 NA Normal function -*2 g.42127941G>A g.42126611C>G Normal function -*3 g.42128242delT No function -*4A g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T g.42126611C>G No function -*4C g.42130692G>A g.42128945C>T g.42126904A>G g.42126611C>G No function -*4D g.42130692G>A g.42128945C>T g.42126611C>G g.42129754G>A No function -*4F g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T g.42128933G>A g.42126611C>G No function -*4G g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T g.42127853G>A g.42126611C>G No function -*4H g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T g.42126914C>G g.42126611C>G No function -*4J g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T No function -*4K g.42130692G>A g.42128945C>T g.42127941G>A g.42126611C>G No function -*4M g.42129819G>T g.42129809T>C g.42128945C>T No function -*4N g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T exon9gc g.42126611C>G No function -*4P g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T g.42128215G>A No function -*6A g.42129084delA No function -*6C g.42129084delA g.42126611C>G No function -*7 g.42127856T>G No function -*8 g.42129033C>A g.42127941G>A g.42126611C>G No function -*9 g.42128174delCTT Decreased function -*10 g.42129754G>A g.42130692G>A g.42126611C>G Decreased function -*11 g.42129910C>G g.42127941G>A g.42126611C>G No function -*12 g.42130668C>T g.42127941G>A g.42126611C>G No function -*14 g.42129033C>T g.42127941G>A g.42126611C>G Decreased function -*15 g.42130655-42130656insA No function -*17 g.42129770G>A g.42127941G>A g.42126611C>G Decreased function -*18 g.42126666-42126667insAGTGGGCAC No function -*19 g.42128249delAGTT g.42127941G>A g.42126611C>G No function -*20 g.42128817-42128818insC g.42127941G>A g.42126611C>G No function -*21 g.42128218-42128219insG g.42127941G>A g.42126611C>G No function -*27 g.42126938C>T Normal function -*29 g.42129132C>T g.42127941G>A g.42127608C>T g.42126611C>G Decreased function -*31 g.42127941G>A g.42126749C>T g.42126611C>G No function -*33 g.42128308C>A Normal function -*34 g.42127941G>A Normal function -*35 g.42130761C>T g.42127941G>A g.42126611C>G Normal function -*36 g.42130692G>A exon9gc g.42126611C>G g.42129754G>A No function -*38 g.42128201delAGTC No function -*39 g.42126611C>G Normal function -*40 g.42129770G>A g.42128936-42128937insGGGGCGAAAGGGGCGAAA g.42127941G>A g.42126611C>G No function -*41 g.42127941G>A g.42127803C>T g.42126611C>G Decreased function -*42 g.42127941G>A g.42127533-42127534insAC g.42126611C>G No function -*44 g.42127841C>G No function -*45 g.42129075C>T g.42127941G>A g.42126611C>G Normal function -*47 g.42130719G>A g.42130692G>A g.42126611C>G g.42129754G>A No function -*49 g.42130692G>A g.42129180A>T g.42126611C>G g.42129754G>A Decreased function -*50 g.42129071T>G Decreased function -*51 g.42127941G>A g.42127619T>G g.42126611C>G No function -*53 g.42129180A>T g.42129174C>A Normal function -*54 g.42130692G>A g.42128235G>A g.42126611C>G g.42129754G>A Decreased function -*55 g.42127941G>A g.42126956T>G g.42126611C>G Decreased function -*56A g.42127941G>A g.42127590G>A g.42126611C>G No function -*56B g.42130692G>A g.42127590G>A g.42126611C>G g.42129754G>A No function -*57 g.42130692G>A g.42129906G>A exon9gc g.42126611C>G g.42129754G>A No function -*59 g.42127941G>A g.42127852C>T g.42126611C>G Decreased function -*60 g.42128903-42128904insTA No function -*62 g.42126747G>A No function -*69 g.42130692G>A g.42127941G>A g.42127803C>T g.42126611C>G No function -*72 g.42130692G>A g.42127473C>T g.42126611C>G g.42129754G>A Decreased function -*84 g.42128217G>T g.42127941G>A g.42126611C>G Decreased function -*92 g.42128796delG No function -*96 g.42126896G>A No function -*99 g.42130692G>A g.42129827C>G g.42126611C>G g.42129754G>A No function -*100 g.42130692G>A g.42127963delG g.42126611C>G g.42129754G>A No function -*101 g.42130692G>A g.42127846delCACATCCGGATGTAGGATC g.42126611C>G g.42129754G>A No function -*114 g.42130692G>A g.42129033C>T g.42127941G>A g.42126611C>G No function diff --git a/data/star_table.txt b/data/star_table.txt new file mode 100644 index 0000000..063b631 --- /dev/null +++ b/data/star_table.txt @@ -0,0 +1,146 @@ +*1 NA Normal_function +*2 g.42127941G>A g.42126611C>G Normal_function +*3 g.42128242delT No_function +*3.002 g.42129042T>C g.42128242delT No_function +*4 g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T g.42126611C>G No_function +*4.003 g.42130692G>A g.42128945C>T g.42126904A>G g.42126611C>G No_function +*4.004 g.42126611C>G g.42128945C>T g.42129754G>A g.42129809T>C g.42129819G>T g.42130692G>A No_function +*4.005 g.42126611C>G g.42128945C>T g.42130692G>A No_function +*4.006 g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T g.42128933G>A g.42126611C>G No_function +*4.007 g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T g.42127853G>A g.42126611C>G No_function +*4.008 g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T g.42126914C>G g.42126611C>G No_function +*4.009 g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T No_function +*4.010 g.42130692G>A g.42128945C>T g.42127941G>A g.42126611C>G No_function +*4.012 g.42129819G>T g.42129809T>C g.42128945C>T No_function +*4.013 g.42130692G>A g.42129819G>T g.42129809T>C g.42128945C>T exon9gc g.42126611C>G No_function +*4.014 g.42126611C>G g.42128215G>A g.42128945C>T g.42129809T>C g.42129819G>T g.42130692G>A No_function +*4.019 g.42130692G>A g.42128945C>T g.42126611C>G g.42129754G>A No_function +*4.021 g.42126611C>G g.42127526C>T g.42128945C>T g.42129809T>C g.42129819G>T g.42130692G>A No_function +*4.027 g.42128945C>T g.42129754G>A g.42130692G>A No_function +*6 g.42129084delA No_function +*6.002 g.42129084delA g.42128815C>T No_function +*6.003 g.42129084delA g.42128815C>T g.42126611C>G No_function +*7 g.42127856T>G No_function +*8 g.42129033C>A g.42127941G>A g.42126611C>G No_function +*9 g.42128174delCTT Decreased_function +*10 g.42129754G>A g.42130692G>A g.42126611C>G Decreased_function +*11 g.42129910C>G g.42127941G>A g.42126611C>G No_function +*12 g.42130668C>T g.42127941G>A g.42126611C>G No_function +*12.002 g.42126611C>G g.42127941G>A g.42129075C>T g.42130668C>T No_function +*14 g.42129033C>T g.42127941G>A g.42126611C>G Decreased_function +*15.001 g.42130655-42130656insA No_function +*15.003 g.42130655-42130656insA g.42130715C>T No_function +*17 g.42129770G>A g.42127941G>A g.42126611C>G Decreased_function +*18 g.42126666-42126667insAGTGGGCAC No_function +*19 g.42128249delAGTT g.42127941G>A g.42126611C>G No_function +*20 g.42128817-42128818insC g.42127941G>A g.42126611C>G No_function +*21 g.42128218-42128219insG g.42127941G>A g.42126611C>G No_function +*22 g.42130710G>A Uncertain_function +*23 g.42129836G>A Uncertain_function +*24 g.42127938T>G Uncertain_function +*25 g.42127593G>C Uncertain_function +*26 g.42127514A>G Uncertain_function +*27 g.42126938C>T Normal_function +*28 g.42130773C>T g.42129087G>C g.42127941G>A g.42126611C>G Uncertain_function +*29 g.42129132C>T g.42127941G>A g.42127608C>T g.42126611C>G Decreased_function +*30 g.42128936-42128937insGGGGCGAAA g.42127941G>A g.42126611C>G Uncertain_function +*31 g.42127941G>A g.42126749C>T g.42126611C>G No_function +*32 g.42127941G>A g.42127803C>T g.42126938C>T g.42126611C>G Uncertain_function +*33 g.42128308C>A Normal_function +*34 g.42127941G>A Normal_function +*35 g.42130761C>T g.42127941G>A g.42126611C>G Normal_function +*36 g.42130692G>A exon9gc g.42126611C>G g.42129754G>A No_function +*37 g.42130692G>A g.42128848C>T g.42126611C>G g.42129754G>A Uncertain_function +*38 g.42128201delAGTC No_function +*39 g.42126611C>G Normal_function +*40 g.42129770G>A g.42128936-42128937insGGGGCGAAAGGGGCGAAA g.42127941G>A g.42126611C>G No_function +*41 g.42127941G>A g.42127803C>T g.42126611C>G Decreased_function +*42 g.42127941G>A g.42127533-42127534insAC g.42126611C>G No_function +*43 g.42130715C>T Uncertain_function +*44 g.42130710G>A g.42127841C>G No_function +*45 g.42129075C>T g.42127941G>A g.42126611C>G Normal_function +*46 g.42130715C>T g.42129075C>T g.42127941G>A g.42126611C>G Normal_function +*47 g.42130719G>A g.42130692G>A g.42126611C>G g.42129754G>A No_function +*48 g.42129821G>A Normal_function +*49 g.42130692G>A g.42129180A>T g.42126611C>G g.42129754G>A Decreased_function +*50 g.42129071T>G Decreased_function +*51 g.42127941G>A g.42127619T>G g.42126611C>G No_function +*52 g.42130692G>A g.42126914C>T g.42126611C>G g.42129754G>A Uncertain_function +*53 g.42129180A>T g.42129174C>A Normal_function +*54 g.42130692G>A g.42128235G>A g.42126611C>G g.42129754G>A Decreased_function +*55 g.42127941G>A g.42126956T>G g.42126611C>G Decreased_function +*56 g.42127941G>A g.42127590G>A g.42126611C>G No_function +*56.002 g.42130692G>A g.42127590G>A g.42126611C>G g.42129754G>A No_function +*57 g.42130692G>A g.42129906G>A exon9gc g.42126611C>G g.42129754G>A No_function +*58 g.42129770G>A g.42128936-42128937insGGGGCGAAA g.42127941G>A g.42126611C>G Unknown_function +*59 g.42127941G>A g.42127852C>T g.42126611C>G Decreased_function +*60 g.42128903-42128904insTA No_function +*62 g.42126747G>A No_function +*64 g.42130692G>A g.42129770G>A g.42126611C>G Uncertain_function +*65 g.42130692G>A g.42127941G>A g.42126611C>G Uncertain_function +*69 g.42130692G>A g.42127941G>A g.42127803C>T g.42126611C>G No_function +*70 g.42129183C>T g.42129132C>T g.42127608C>T g.42126611C>G Uncertain_function +*71 g.42130667C>T Uncertain_function +*72 g.42130692G>A g.42127473C>T g.42126611C>G g.42129754G>A Uncertain_function +*73 g.42129780C>T g.42127941G>A g.42126611C>G Unknown_function +*74 g.42129819G>T Unknown_function +*75 g.42126746C>T Uncertain_function +*81 g.42128212G>A g.42128185C>T g.42128181A>T No_function +*82 g.42129819G>T g.42129809T>C g.42129779A>G g.42129765T>C Unknown_function +*83 exon9gc g.42126611C>G Uncertain_function +*84 g.42128217G>T g.42127941G>A g.42126611C>G Uncertain_function +*85 g.42127941G>A g.42126634A>C g.42126611C>G Unknown_function +*86 g.42128185C>T g.42128181A>T Unknown_function +*87 g.42130778G>A g.42130692G>A g.42126611C>G g.42129754G>A Uncertain_function +*88 g.42129779A>G g.42126611C>G Uncertain_function +*89 g.42129113A>G Uncertain_function +*90 g.42129098T>C Uncertain_function +*91 g.42129056C>G g.42127941G>A g.42127803C>T Uncertain_function +*92 g.42128796delG No_function +*93 g.42128272T>G Uncertain_function +*94 g.42130692G>A g.42127610T>C g.42126611C>G g.42129754G>A Uncertain_function +*95 g.42130692G>A g.42127457C>T g.42126611C>G g.42129754G>A Uncertain_function +*96 g.42126896G>A No_function +*97 g.42126697G>T Uncertain_function +*98 g.42127941G>A g.42126681G>C g.42126611C>G Uncertain_function +*99 g.42130692G>A g.42129827C>G g.42126611C>G g.42129754G>A No_function +*100 g.42130692G>A g.42127963delG g.42126611C>G g.42129754G>A No_function +*101 g.42130692G>A g.42127846delCACATCCGGATGTAGGATC g.42126611C>G g.42129754G>A No_function +*102 g.42129821G>A g.42127941G>A g.42126611C>G Uncertain_function +*103 g.42129821G>A g.42129042T>C g.42127941G>A g.42126611C>G Unknown_function +*104 g.42129071T>A g.42127941G>A g.42126611C>G Unknown_function +*105 g.42127941G>A g.42127523A>G g.42126611C>G Unknown_function +*106 g.42126914C>T Uncertain_function +*107 g.42129132C>T Unknown_function +*108 g.42127556T>C g.42127565T>C Unknown_function +*109 g.42128174delCTT g.42127608C>T Uncertain_function +*110 g.42126735C>T Unknown_function +*111 g.42129759C>T g.42127941G>A g.42126611C>G Unknown_function +*112 g.42126605G>A Unknown_function +*113 g.42126752C>T Unknown_function +*114 g.42130692G>A g.42129033C>T g.42127941G>A g.42126611C>G No_function +*115 g.42128174delCTT g.42127973T>C Unknown_function +*116 g.42127631C>G Unknown_function +*117 g.42127941G>A g.42127611C>T g.42126611C>G Unknown_function +*118 g.42127899T>C Unknown_function +*119 g.42127803C>T Unknown_function +*120 g.42130728delG No_function +*121 g.42129042T>C g.42129036A>C g.42127941G>A g.42126611C>G Unknown_function +*122 g.42127512C>T Unknown_function +*123 g.42127941G>A g.42127922A>G g.42127803C>T g.42126611C>G Unknown_function +*124 g.42126981-42126982insGA No_function +*125 g.42127941G>A g.42126719C>T g.42126611C>G Unknown_function +*126 g.42129134G>A g.42127941G>A g.42126611C>G Unknown_function +*127 g.42127556T>C g.42127526C>T Unknown_function +*128 g.42129887A>G g.42127941G>A g.42126611C>G Unknown_function +*129 g.42129155C>T g.42127941G>A g.42126611C>G No_function +*130 g.42128879A>G g.42128878C>A Unknown_function +*131 g.42128325A>G Unknown_function +*132 g.42130692G>A g.42127973T>C g.42129754G>A g.42126611C>G Unknown_function +*133 g.42127941G>A g.42127602C>T g.42126611C>G Unknown_function +*134 g.42127589C>T Unknown_function +*135 g.42127941G>A g.42126926G>A g.42126611C>G Unknown_function +*136 g.42127941G>A g.42126877G>A g.42126611C>G Unknown_function +*137 g.42126578C>T Unknown_function +*138 g.42127941G>A g.42127803C>T g.42126647C>T g.42126611C>G Unknown_function +*139 g.42127526C>T Unknown_function diff --git a/depth_calling/haplotype.py b/depth_calling/haplotype.py index ac07624..0d7cbfb 100644 --- a/depth_calling/haplotype.py +++ b/depth_calling/haplotype.py @@ -20,7 +20,6 @@ from .snp_count import passing_read -from pprint import pprint def get_haplotypes_from_bam(bamfile_handle, base_db, target_positions): @@ -34,14 +33,29 @@ def get_haplotypes_from_bam(bamfile_handle, base_db, target_positions): return dhaplotype -def get_bases_per_read(bamfile_handle, base_db, target_positions, min_mapq=0): - """ - Extract bases on each read at target positions - """ +def get_haplotypes_from_bam_single_region(bamfile_handle, base_db, target_positions): + dread = {} + dread = get_bases_per_read( + bamfile_handle, base_db, target_positions, region=0, min_mapq=10 + ) + base1, base2 = get_base1_base2(base_db, target_positions) + dhaplotype = get_hap_counts(dread, base1, base2) + return dhaplotype + + +def get_bases_per_read( + bamfile_handle, base_db, target_positions, region=None, min_mapq=0 +): dread = {} nchr = base_db.nchr dindex = base_db.dindex - for dsnp in [base_db.dsnp1, base_db.dsnp2]: + dsnps = [base_db.dsnp1, base_db.dsnp2] + if region is not None: + if region == 0: + dsnps = [base_db.dsnp1] + elif region == 1: + dsnps = [base_db.dsnp2] + for dsnp in dsnps: for snp_position_ori in dsnp: dsnp_index = dindex[snp_position_ori] snp_position = int(snp_position_ori.split("_")[0]) @@ -114,7 +128,6 @@ def get_base1_base2(base_db, target_positions): dindex = base_db.dindex for pos in base_db.dsnp1: dsnp_index = dindex[pos] - snp_position = int(pos.split("_")[0]) if dsnp_index in target_positions: index = int(pos.split("_")[1]) allele1, allele2 = base_db.dsnp1[pos].split("_") diff --git a/depth_calling/snp_count.py b/depth_calling/snp_count.py index 39aac0e..1b509e8 100644 --- a/depth_calling/snp_count.py +++ b/depth_calling/snp_count.py @@ -21,11 +21,9 @@ from collections import namedtuple import pysam -from .utilities import open_alignment_file COMPLEMENT = {"A": "T", "T": "A", "C": "G", "G": "C", "N": "N"} -SITES_STRINGENT = [] # consider being more stringent for exon8 site for SMN def reverse_complement(sequence): @@ -41,7 +39,7 @@ def get_nm(ltag): return None -def get_snp_position(pos_file): +def get_snp_position(pos_file, group=None): """Get all base differences listed in the SNP location file.""" dsnp1 = {} dsnp2 = {} @@ -51,24 +49,27 @@ def get_snp_position(pos_file): counter = -1 for line in read_pos: if line[0] != "#" and line[0] != "\n": - counter += 1 split_line = line.strip().split() - reg1_name = split_line[1] + "_" + str(counter) - reg2_name = split_line[3] + "_" + str(counter) - reg1_base = split_line[2].upper() - reg2_base = split_line[4].upper() - if split_line[-1] != "-": - dsnp1.setdefault(reg1_name, "_".join([reg1_base, reg2_base])) - dsnp2.setdefault(reg2_name, "_".join([reg1_base, reg2_base])) - else: - dsnp1.setdefault( - reg1_name, "_".join([reg1_base, reverse_complement(reg2_base)]) - ) - dsnp2.setdefault( - reg2_name, "_".join([reverse_complement(reg1_base), reg2_base]) - ) - dindex.setdefault(reg1_name, counter) - dindex.setdefault(reg2_name, counter) + if group is None or split_line[-1] == group: + counter += 1 + reg1_name = split_line[1] + "_" + str(counter) + reg2_name = split_line[3] + "_" + str(counter) + reg1_base = split_line[2].upper() + reg2_base = split_line[4].upper() + if split_line[-1] != "-": + dsnp1.setdefault(reg1_name, "_".join([reg1_base, reg2_base])) + dsnp2.setdefault(reg2_name, "_".join([reg1_base, reg2_base])) + else: + dsnp1.setdefault( + reg1_name, + "_".join([reg1_base, reverse_complement(reg2_base)]), + ) + dsnp2.setdefault( + reg2_name, + "_".join([reverse_complement(reg1_base), reg2_base]), + ) + dindex.setdefault(reg1_name, counter) + dindex.setdefault(reg2_name, counter) nchr = split_line[0] snp_lookup = namedtuple("snp_lookup", "dsnp1 dsnp2 nchr dindex") dbsnp = snp_lookup(dsnp1, dsnp2, nchr, dindex) @@ -91,14 +92,16 @@ def passing_read_stringent(pileupread): number_mismatch = get_nm(pileupread.alignment.tags) align_len = pileupread.alignment.query_alignment_length read_len = len(pileupread.alignment.query_sequence) - return ( - number_mismatch <= float(align_len) * 0.08 - and pileupread.query_position > 0 - and pileupread.query_position < read_len - 1 - ) + insert_size = pileupread.alignment.template_length + # number_mismatch <= float(align_len) * 0.08 + # and pileupread.query_position > 0 + # and pileupread.query_position < read_len - 1 + return abs(insert_size) < 1000 -def get_reads_by_region(bamfile_handle, nchr, dsnp, dindex, min_mapq=0): +def get_reads_by_region( + bamfile_handle, nchr, dsnp, dindex, min_mapq=0, stringent=False +): """ Return the number of reads supporting region1 and region2, forward and reverse. """ @@ -134,10 +137,7 @@ def get_reads_by_region(bamfile_handle, nchr, dsnp, dindex, min_mapq=0): dsnp_index = dindex[snp_position_ori] read_name = read.alignment.query_name read_seq = read.alignment.query_sequence - if ( - site_position not in SITES_STRINGENT - or passing_read_stringent(read) - ): + if stringent is False or passing_read_stringent(read): reg1_allele_split = reg1_allele.split(",") reg2_allele_split = reg2_allele.split(",") start_pos = read.query_position @@ -183,12 +183,11 @@ def merge_reads(list_to_merge): return merged_reads -def get_supporting_reads(bamf, dsnp1, dsnp2, nchr, dindex, reference=None): +def get_supporting_reads(bamfile_handle, dsnp1, dsnp2, nchr, dindex): """ Return the number of supporting reads at each position in both region1 and region2. """ - bamfile_handle = open_alignment_file(bamf, reference) assert len(dsnp1) == len(dsnp2) # Go through SNP sites in both regions, # and count the number of reads supporting each gene. @@ -204,20 +203,17 @@ def get_supporting_reads(bamf, dsnp1, dsnp2, nchr, dindex, reference=None): lsnp2 = merge_reads( [lsnp2_reg1_for, lsnp2_reg1_rev, lsnp2_reg2_for, lsnp2_reg2_rev] ) - bamfile_handle.close() return [len(a) for a in lsnp1], [len(a) for a in lsnp2] -def get_supporting_reads_single_region(bamf, dsnp1, nchr, dindex, reference=None): +def get_supporting_reads_single_region(bamfile_handle, dsnp1, nchr, dindex): """ Return the number of supporting reads at each position only in region1, as well as the number of alt reads in forward and reverse. """ - bamfile_handle = open_alignment_file(bamf, reference) lsnp1_for, lsnp1_rev, lsnp2_for, lsnp2_rev = get_reads_by_region( bamfile_handle, nchr, dsnp1, dindex, 10 ) - bamfile_handle.close() lsnp1 = merge_reads([lsnp1_for, lsnp1_rev]) lsnp2 = merge_reads([lsnp2_for, lsnp2_rev]) return ( diff --git a/depth_calling/tests/test_snp_count.py b/depth_calling/tests/test_snp_count.py index 30ee47e..01ce093 100644 --- a/depth_calling/tests/test_snp_count.py +++ b/depth_calling/tests/test_snp_count.py @@ -29,6 +29,7 @@ get_supporting_reads_single_region, get_fraction, ) +from ..utilities import open_alignment_file TOTAL_NUM_SITES = 16 test_data_dir = os.path.join(os.path.dirname(__file__), "test_data") @@ -85,17 +86,19 @@ def test_get_snp_count(self): dsnp1, dsnp2, nchr, dindex = get_snp_position(snp_file) bam1 = os.path.join(test_data_dir, "NA12878.bam") - lsnp1, lsnp2 = get_supporting_reads(bam1, dsnp1, dsnp2, nchr, dindex) + bamfile1 = open_alignment_file(bam1) + lsnp1, lsnp2 = get_supporting_reads(bamfile1, dsnp1, dsnp2, nchr, dindex) assert lsnp1 == [0, 0, 0, 0, 0, 0, 29, 35, 26, 39, 29, 35, 32, 37, 39, 39] assert lsnp2 == [0, 0, 0, 0, 0, 0, 12, 39, 39, 32, 26, 55, 45, 33, 42, 18] bam2 = os.path.join(test_data_dir, "NA12885.bam") - lsnp1, lsnp2 = get_supporting_reads(bam2, dsnp1, dsnp2, nchr, dindex) + bamfile2 = open_alignment_file(bam2) + lsnp1, lsnp2 = get_supporting_reads(bamfile2, dsnp1, dsnp2, nchr, dindex) assert lsnp1 == [46, 32, 45, 36, 34, 14, 36, 54, 38, 34, 41, 41, 40, 51, 40, 37] assert lsnp2 == [35, 35, 32, 29, 35, 59, 22, 28, 32, 24, 34, 32, 33, 28, 38, 21] lsnp1, lsnp2, forward, reverse = get_supporting_reads_single_region( - bam2, dsnp1, nchr, dindex + bamfile2, dsnp1, nchr, dindex ) assert lsnp1 == [46, 32, 45, 36, 26, 14, 36, 54, 38, 34, 41, 41, 40, 51, 40, 34] assert lsnp2 == [0, 1, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] @@ -103,9 +106,8 @@ def test_get_snp_count(self): # test indels and reverse complement snp_file = os.path.join(test_data_dir, "SMN_SNP_37_test.txt") dsnp1, dsnp2, nchr, dindex = get_snp_position(snp_file) - bam2 = os.path.join(test_data_dir, "NA12885.bam") lsnp1, lsnp2, forward, reverse = get_supporting_reads_single_region( - bam2, dsnp1, nchr, dindex + bamfile2, dsnp1, nchr, dindex ) assert lsnp1 == [46, 32, 45, 36, 26, 14, 36, 54, 38, 34, 41, 41, 40, 51, 40, 19] assert lsnp2 == [0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16] diff --git a/star_caller.py b/star_caller.py index 449d858..d0e2ee3 100644 --- a/star_caller.py +++ b/star_caller.py @@ -46,15 +46,18 @@ get_normed_depth_from_count, get_read_length, ) -from caller.call_cn import ( +from caller.call_variants import ( NOISY_VAR, call_cn_snp, call_cn_var, call_cn_var_homo, - get_allele_counts_42128936, + get_allele_counts_var42128936, + update_var42128936, get_called_variants, call_exon9gc, call_var42126938, + call_var42127526_var42127556, + call_var42127803hap, ) from caller.cnv_hybrid import get_cnvtag from caller.construct_star_table import get_hap_table @@ -63,8 +66,15 @@ MAD_THRESHOLD = 0.11 EXON9_SITE1 = 7 EXON9_SITE2 = 8 -VAR42126938_SITE = 10 HIGH_CN_DEPTH_THRESHOLD = 7.5 +HAPLOTYPE_VAR = ["g.42126938C>T", "g.42127803C>T", "g.42127526C>T_g.42127556T>C"] +resource_info = namedtuple( + "resource_info", + "genome gmm_parameter region_dic snp_db var_db var_homo_db haplotype_db var_list star_combinations", +) +exon9_values = namedtuple( + "exon9_values", "exon9_cn exon9cn_in_consensus exon9_raw_site1 exon9_raw_site2" +) # Below are the SV configurations that the caller is able to call CNV_ACCEPTED = [ "star5_star5", @@ -117,20 +127,6 @@ def load_parameters(): ) parser.add_argument("-o", "--outDir", help="Output directory", required=True) parser.add_argument("-p", "--prefix", help="Prefix to output file", required=True) - parser.add_argument( - "--knownFunction", - help="Optional, only call star alleles with known functions", - required=False, - default=False, - action="store_true", - ) - parser.add_argument( - "--includeNewStar", - help="Optional, include latest uncurated star alleles", - required=False, - default=False, - action="store_true", - ) parser.add_argument( "-t", "--threads", @@ -233,45 +229,30 @@ def d6_star_caller( # D6/D7 base difference sites. Get read counts at both D6/D7 positions. snp_db = call_parameters.snp_db snp_d6, snp_d7 = get_supporting_reads( - bam, - snp_db.dsnp1, - snp_db.dsnp2, - snp_db.nchr, - snp_db.dindex, - reference=reference_fasta, + bamfile, snp_db.dsnp1, snp_db.dsnp2, snp_db.nchr, snp_db.dindex ) - site42126938 = [snp_d6[VAR42126938_SITE], snp_d7[VAR42126938_SITE]] - snp_d6.pop(VAR42126938_SITE) - snp_d6.pop(VAR42126938_SITE - 1) - snp_d7.pop(VAR42126938_SITE) - snp_d7.pop(VAR42126938_SITE - 1) + # Variants not in homology regions. Get read counts only at D6 positions. var_db = call_parameters.var_db var_alt, var_ref, var_alt_forward, var_alt_reverse = get_supporting_reads_single_region( - bam, var_db.dsnp1, var_db.nchr, var_db.dindex, reference=reference_fasta + bamfile, var_db.dsnp1, var_db.nchr, var_db.dindex ) # Look more carefully for insertions at 42128936 from reads var_list = call_parameters.var_list - ref_read, long_ins_read, short_ins_read = get_allele_counts_42128936( + ref_read, long_ins_read, short_ins_read = get_allele_counts_var42128936( bamfile, call_parameters.genome ) - if "g.42128936-42128937insGGGGCGAAAGGGGCGAAA" in var_list: - long_ins_index = var_list.index("g.42128936-42128937insGGGGCGAAAGGGGCGAAA") - var_alt[long_ins_index] = long_ins_read - var_ref[long_ins_index] = short_ins_read + ref_read - if "g.42128936-42128937insGGGGCGAAA" in var_list: - short_ins_index = var_list.index("g.42128936-42128937insGGGGCGAAA") - var_alt[short_ins_index] = short_ins_read - var_ref[short_ins_index] = long_ins_read + ref_read + var_alt, var_ref = update_var42128936( + var_list, var_alt, var_ref, ref_read, long_ins_read, short_ins_read + ) # Variants in homology regions. Get read counts at both D6/D7 positions. var_homo_db = call_parameters.var_homo_db var_homo_alt, var_homo_ref = get_supporting_reads( - bam, + bamfile, var_homo_db.dsnp1, var_homo_db.dsnp2, var_homo_db.nchr, var_homo_db.dindex, - reference=reference_fasta, ) # This ordered dictionary is for final reporting. raw_count = OrderedDict() @@ -295,7 +276,6 @@ def d6_star_caller( var_homo_ref[i - non_homology_variant_count], ), ) - raw_count.setdefault("g.42126938C>T", "%i,%i" % (site42126938[0], site42126938[1])) # no-call due to total copy number calling if raw_cn_call.d67_cn is None: @@ -369,28 +349,35 @@ def d6_star_caller( cn_call_var = call_cn_var( cnvtag, var_alt, var_ref, var_alt_forward, var_alt_reverse, var_list, var_db ) - # call g.42126938C>T - if cnvtag in ["star5", "cn2"]: - var42126938, G_haplotype = call_var42126938( - bamfile, - cnvtag, - site42126938, - snp_db, - [VAR42126938_SITE - 2, VAR42126938_SITE - 1, VAR42126938_SITE], - ) - else: - var42126938 = [] - G_haplotype = False + # call haplotypes + haplotype_db = call_parameters.haplotype_db + site42126938_count, var42126938, var42126938_G_haplotype = call_var42126938( + bamfile, raw_cn_call.d67_cn, haplotype_db["g.42126938C>T"] + ) + raw_count.setdefault( + "g.42126938C>T", "%i,%i" % (site42126938_count[1], site42126938_count[0]) + ) + + site42127526_count, site42127556_count, var42127526 = call_var42127526_var42127556( + bamfile, cnvtag, haplotype_db["g.42127526C>T_g.42127556T>C"] + ) + raw_count.setdefault( + "g.42127526C>T", "%i,%i" % (site42127526_count[1], site42127526_count[0]) + ) + raw_count.setdefault( + "g.42127556T>C", "%i,%i" % (site42127556_count[1], site42127556_count[0]) + ) + + var42127803_diff_haplotype = call_var42127803hap( + bamfile, cnvtag, haplotype_db["g.42127803C>T"] + ) # 6. Call star allele total_callset = get_called_variants(var_list, cn_call_var) called_var_homo = get_called_variants(var_list, cn_call_var_homo, len(cn_call_var)) total_callset += called_var_homo total_callset += var42126938 - - exon9_values = namedtuple( - "exon9_values", "exon9_cn exon9cn_in_consensus exon9_raw_site1 exon9_raw_site2" - ) + total_callset += var42127526 star_called = match_star( total_callset, @@ -403,21 +390,14 @@ def d6_star_caller( raw_d6_cn[EXON9_SITE1], raw_d6_cn[EXON9_SITE2], ), + var42126938_G_haplotype, + var42127803_diff_haplotype, ) genotype_filter = None # no-call due to star allele matching if "no_match" in star_called[0]: # or star_called[0] == 'more_than_one_match': final_star_allele_call = None - elif ( - star_called[0] == "more_than_one_match" and star_called[-1] == "*1/*32;*27/*41" - ): - genotype_filter = "PASS" - if G_haplotype: - # Variants are on the sample haplotype - final_star_allele_call = "*1/*32" - else: - final_star_allele_call = "*27/*41" else: final_star_allele_call = star_called[-1] if ";" in final_star_allele_call: @@ -452,34 +432,17 @@ def d6_star_caller( return sample_call -def main(): - parameters = load_parameters() - manifest = parameters.manifest - outdir = parameters.outDir +def prepare_resource(datadir, parameters): genome = parameters.genome - prefix = parameters.prefix - reference_fasta = parameters.reference - threads = parameters.threads - path_count_file = parameters.countFilePath - logging.basicConfig(level=logging.DEBUG) - - # Prepare data files - datadir = os.path.join(os.path.dirname(__file__), "data") region_file = os.path.join(datadir, "CYP2D6_region_%s.bed" % genome) snp_file = os.path.join(datadir, "CYP2D6_SNP_%s.txt" % genome) gmm_file = os.path.join(datadir, "CYP2D6_gmm.txt") - table_path = "full_star_table" - if parameters.knownFunction: - table_path = "known_function_star_table" - if parameters.includeNewStar: - table_path = "include_new_star_table" - star_table = os.path.join(datadir, table_path, "star_table.txt") - variant_file = os.path.join( - datadir, table_path, "CYP2D6_target_variant_%s.txt" % genome - ) + star_table = os.path.join(datadir, "star_table.txt") + variant_file = os.path.join(datadir, "CYP2D6_target_variant_%s.txt" % genome) variant_homology_file = os.path.join( - datadir, table_path, "CYP2D6_target_variant_homology_region_%s.txt" % genome + datadir, "CYP2D6_target_variant_homology_region_%s.txt" % genome ) + haplotype_file = os.path.join(datadir, "CYP2D6_haplotype_%s.txt" % genome) star_combinations = get_hap_table(star_table) for required_file in [ @@ -487,17 +450,18 @@ def main(): snp_file, variant_file, variant_homology_file, + haplotype_file, gmm_file, ]: if os.path.exists(required_file) == 0: raise Exception("File %s not found." % required_file) - if os.path.exists(outdir) == 0: - os.makedirs(outdir) - snp_db = get_snp_position(snp_file) var_db = get_snp_position(variant_file) var_homo_db = get_snp_position(variant_homology_file) + haplotype_db = {} + for variant in HAPLOTYPE_VAR: + haplotype_db.setdefault(variant, get_snp_position(haplotype_file, variant)) var_list = [] with open(variant_file) as f: for line in f: @@ -511,10 +475,6 @@ def main(): var_list.append(var_name) gmm_parameter = parse_gmm_file(gmm_file) region_dic = parse_region_file(region_file) - resource_info = namedtuple( - "resource_info", - "genome gmm_parameter region_dic snp_db var_db var_homo_db var_list star_combinations", - ) call_parameters = resource_info( genome, gmm_parameter, @@ -522,9 +482,29 @@ def main(): snp_db, var_db, var_homo_db, + haplotype_db, var_list, star_combinations, ) + return call_parameters + + +def main(): + parameters = load_parameters() + manifest = parameters.manifest + outdir = parameters.outDir + prefix = parameters.prefix + reference_fasta = parameters.reference + threads = parameters.threads + path_count_file = parameters.countFilePath + logging.basicConfig(level=logging.DEBUG) + + if os.path.exists(outdir) == 0: + os.makedirs(outdir) + + # Prepare data files + datadir = os.path.join(os.path.dirname(__file__), "data") + call_parameters = prepare_resource(datadir, parameters) out_json = os.path.join(outdir, prefix + ".json") out_tsv = os.path.join(outdir, prefix + ".tsv")