From a2792322a38dcef357bd8be553305af403cfe89d Mon Sep 17 00:00:00 2001 From: Ruth Eberhardt Date: Wed, 16 Aug 2023 13:02:13 +0100 Subject: [PATCH 1/3] add counts for poss contamination and for poss error --- ...ther_donors_find_best_donor_read_counts.py | 878 ++++++++++++++++++ 1 file changed, 878 insertions(+) create mode 100755 bin/find_discordant_sites_in_other_donors_find_best_donor_read_counts.py diff --git a/bin/find_discordant_sites_in_other_donors_find_best_donor_read_counts.py b/bin/find_discordant_sites_in_other_donors_find_best_donor_read_counts.py new file mode 100755 index 00000000..9700f255 --- /dev/null +++ b/bin/find_discordant_sites_in_other_donors_find_best_donor_read_counts.py @@ -0,0 +1,878 @@ +#!/usr/bin/env python3 + +#take cellSNP VCF and genotype VCF for the donors in a pool +# for each cell in the cellSNP VCF identify discordant sites (using the relaxed concordance) +# look for these sites in genotypes of all members of the pool +# output: +# cell id +# assigned donor +# cohort of assigned donor +# number of discordant sites +# total AD over discordant sites +# list of donors in the pool, how many of the discordant sites are found in the donor, cohort each belongs to +# list of discordant sites + +__date__ = '2023-07-24' +__version__ = '0.0.1' +import argparse +import pickle +import pandas as pd +import random +import numpy as np +import multiprocessing as mp +from multiprocessing import Lock +import os +import gzip +import time + + +class VCF_Loader: + + def __init__(self, vcf_file, exclude_a2g, biallelic_only=True, + sparse=False, format_list=['GT']): + self.vcf_file = vcf_file + self.exclude_a2g = exclude_a2g + self.load_sample = True + self.biallelic_only = biallelic_only + self.sparse = sparse + self.record_dict={} + self.reset() + self.format_list = format_list + self.exclusive_donor_variants = {} + self.curently_pushing =[] #this is a lock value to check if rhe curent field is updated so to avaid the race for update + self.last_count=-1 + self.reset_c() + + def reset_c(self): + self.record_times=0 + + def reset(self): + self.exclusive_donor_variants ={} + + def myfunc(self): + print(f"Hello my name is {self.biallelic_only}" ) + + def load_sample_mp(self,line,obs_ids,count,format_list): + ''' + takes VCF lines and extracts all format fields for those where GT !='.' + ''' + list_val = line.rstrip().split("\t") #[:5] #:8 + idx = find(list_val[8].split(':'),'GT')[0]#find index of GT field as GT will tell us what variants are called + if len(list_val[3]) > 1 or len(list_val[4]) > 1: + # CURRENTLY DEALS ONLY WITH BIALELIC + print(f'{idx} var not bialelic') + elif list_val[3] == 'A' and list_val[4] == 'G' and self.exclude_a2g:#remove A>G + pass + elif list_val[3] == 'T' and list_val[4] == 'C' and self.exclude_a2g:#also remove T>C + pass + else: + list_val2 = list_val[9:] + obs = pd.DataFrame(obs_ids) + lv = pd.DataFrame(list_val2) + lv_proc =lv[0].str.split(':').str[idx] + gt_exists = lv_proc[lv_proc != '.'] + idx2 = gt_exists.index + obs_with_gt = obs.loc[idx2.values] + obs_with_gt = list(obs_with_gt[0].values) + list_val_with_gt = lv.loc[idx2.values] + list_val_with_gt = list(list_val_with_gt[0].values) + random.seed(count) + c = list(zip(obs_with_gt, list_val_with_gt)) + random.shuffle(c) + obs_with_gt, list_val_with_gt = zip(*c) + # self.append_results([obs_with_gt,list_val_with_gt,idx,list_val,count]) + + return [obs_with_gt,list_val_with_gt,idx,list_val,count,format_list]#add format_list to the return value as we need this for the next step + + + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. + with open(f'tmp_{id}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}.pkl' + + + def append_results(self,result): + # exclusive_donor_variants + obs_with_gt= result[0] + list_val_with_gt= result[1] + idx = result[2] + list_val = result[3] + count = result[4] + format_list = result[5]#list of required format fields + #get indexes of required format fields (apart from GT which has already been taken care of) + additional_field_idxs = [] + for fmt in format_list: + if not fmt == 'GT': + idx_addn = find(list_val[8].split(':'), fmt)[0] + additional_field_idxs.append(idx_addn) + # print(additional_field_idxs) + # exit(0) + + count11=0 + # r = random.random() + # Issue is that this slows down after number of entries is recorded. So recoding takes longer and longer. + # every 500 itterations we push the data to a dictionary, later we combine these together. + if (count % 200 == 0): + print(f'recording and resetting memory {count}') + # self.record_dict[count]=self.exclusive_donor_variants + self.set_results(self.exclusive_donor_variants,count) + self.reset() + self.reset_c() + + for ob_id in obs_with_gt: + donor_loc_in_list = count11 + alleles = list_val_with_gt[donor_loc_in_list].split(':')[idx] + #append any additional format fields to alleles + if len(additional_field_idxs) > 0: + for idx_addnl in additional_field_idxs: + fmt_val = list_val_with_gt[donor_loc_in_list].split(':')[idx_addnl] + alleles = alleles + '_' + fmt_val + + if not alleles.startswith('.'): + ids = "_".join([list_val[x] for x in [0, 1, 3, 4]]) + donor_var = f"{ids}_{alleles}" + while ob_id in self.curently_pushing: + time.sleep(r*0.01) + self.curently_pushing.append(ob_id) + try: + self.exclusive_donor_variants[ob_id].add(donor_var) + self.record_times=self.record_times+1 + except: + self.exclusive_donor_variants[ob_id]=set() + self.exclusive_donor_variants[ob_id].add(donor_var) + self.record_times=self.record_times+1 + self.curently_pushing.remove(ob_id) + # self.exclusive_donor_variants['CTGAAACGTAAGTTCC-1'] + count11+=1 + + def combine_written_files(self):#this is for VCF loader class + to_export = self.exclusive_donor_variants + for val1 in self.record_dict.values(): + # here remove the int files. + print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + for k1 in loaded_dict.keys(): + try: + to_export[k1]=to_export[k1].union(loaded_dict[k1]) + except: + to_export[k1]=set() + to_export[k1]=to_export[k1].union(loaded_dict[k1]) + os.remove(val1) + return to_export + + + def load_VCF_batch_paralel(self): + """ + Load whole VCF file by utilising multiple cores to speed up loading of large cell files + ------------------- + Initially designed to load VCF from cellSNP output, requiring + 1) all variants have the same format list; + 2) a line starting with "#CHROM", with sample ids. + If these two requirements are satisfied, this function also supports general + VCF files, e.g., genotype for multiple samples. + + Note, it may take a large memory, please filter the VCF with bcftools first. + """ + + vcf_file = self.vcf_file + biallelic_only = self.biallelic_only + load_sample= self.load_sample + sparse = self.sparse + format_list= self.format_list + pool = mp.Pool(cpus) + + + import time + if vcf_file[-3:] == ".gz" or vcf_file[-4:] == ".bgz": + infile = gzip.open(vcf_file, "rb") + is_gzip = True + else: + infile = open(vcf_file, "r") + is_gzip = False + + FixedINFO = {} + contig_lines = [] + comment_lines = [] + var_ids, obs_ids, obs_dat = [], [], [] + count=0 #57077 + for line in infile: + count+=1 + # if count>10000: + # break + if is_gzip: + line = line.decode('utf-8') + if line.startswith("#"): + if line.startswith("##contig="): + contig_lines.append(line.rstrip()) + if line.startswith("#CHROM"): + if load_sample: + obs_ids = line.rstrip().split("\t")[9:] + for ob_id in obs_ids: + self.exclusive_donor_variants[ob_id]=set() + key_ids = line[1:].rstrip().split("\t")[:8] + for _key in key_ids: + FixedINFO[_key] = [] + else: + comment_lines.append(line.rstrip()) + else: + pool.apply_async(self.load_sample_mp, args=([line,obs_ids,count,format_list]),callback=self.append_results) + del line + self.last_count=count + pool.close() + pool.join() + + output = self.combine_written_files() + return output + + +class Concordances: + def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites): + self.reset() + self.donor_assignments_table=donor_assignments_table + self.cell_assignments_table=cell_assignments_table + self.exclusive_don_variants=exclusive_don_variants + self.exclusive_cell_variants=exclusive_cell_variants + self.donor_distinct_sites=donor_distinct_sites + self.record_dict={} + + + def norm_genotypes(self,expected_vars): + expected_vars = pd.DataFrame(expected_vars) + if len(expected_vars) > 0: + split_str=expected_vars[0].str.split("_") + expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] + expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] + expected_vars['vars'] = split_str.str[4] + expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) + expected_vars = expected_vars[expected_vars['vars']!='./.'] + expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' + expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] + return expected_vars + + + def reset(self): + self.cell_concordance_table ={} + + + def read_concordance(self, expected_vars, cell_vars): + ''' + get read level concordance using DP, AD and OTH format fields + ##FORMAT= + ##FORMAT= + ##FORMAT= + ''' + if not len(expected_vars) == len(cell_vars): + print("length mismatch between expected vars and cell vars") + exit(1) + + # print(expected_vars) + # print(cell_vars) + # exit(0) + + discordant_site_details = { + 'expect_hom_alt':{}, + 'expect_hom_ref':{}, + 'expect_het':{} + } + + total_sites = len(expected_vars) + #add cols for DP, AD< OTH + cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) + cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) + cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) + total_dp = cell_vars['DP'].sum() + total_oth = cell_vars['OTH'].sum() + total_reads = total_dp + total_oth + + # expected genotype 0/0 + expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] + hom_ref_sites = set(expected_hom_ref['ids']) + cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] + ad_hom_ref = cell_vars2['AD'].sum() + oth_hom_ref = cell_vars2['OTH'].sum() + discordant_hom_ref = ad_hom_ref + oth_hom_ref + cell_vars2_disc = cell_vars2[(cell_vars2['AD'] > 0) | (cell_vars2['OTH'] > 0)] + cell_vars2_disc_sites = set(cell_vars2_disc['ids']) + for s2 in cell_vars2_disc_sites: + sdf = cell_vars2_disc[cell_vars2_disc['ids'] == s2] + if not len(sdf) == 1: + print('error, 1 variant expected') + print(sdf) + discordant_site_details['expect_hom_ref'][s2] = {} + discordant_site_details['expect_hom_ref'][s2]['AD'] = sdf['AD'].sum() + discordant_site_details['expect_hom_ref'][s2]['found_in_other_donor'] = 'no' + # print(cell_vars2_disc) + # print(cell_vars2) + # exit(0) + + # expected genotype 0/1 or 1/0 + hets = ['0/1', '1/0'] + expected_het = expected_vars[expected_vars['vars'].isin(hets)] + het_sites = set(expected_het['ids']) + cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] + discordant_het = cell_vars3['OTH'].sum() + # cell_vars3_disc = cell_vars3[cell_vars3['OTH'] > 0] + # cell_vars3_disc_sites = set(cell_vars3_disc['ids']) + # for s3 in cell_vars3_disc_sites: + # sdf = cell_vars3_disc[cell_vars3_disc['ids'] == s3] + # if not len(sdf) == 1: + # print('error, 1 variant expected') + # print(sdf) + # discordant_site_details['expect_het'][s3] = {} + # discordant_site_details['expect_het'][s3]['OTH'] = sdf['OTH'].sum() + + # expected genotype 1/1 + expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] + hom_alt_sites = set(expected_hom_alt['ids']) + cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] + cell_vars4['non_ref_reads'] = cell_vars4['AD'] = cell_vars4['OTH'] + cell_vars4['ref_reads'] = cell_vars4['DP'] - cell_vars4['non_ref_reads'] + # DP + OTH - AD + ad_hom_alt = cell_vars4['AD'].sum() + dp_hom_alt = cell_vars4['DP'].sum() + oth_hom_alt = cell_vars4['OTH'].sum() + discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt + cell_vars4_disc = cell_vars4[(cell_vars4['ref_reads'] > 0) | (cell_vars4['OTH'] > 0)] + cell_vars4_disc_sites = set(cell_vars4_disc['ids']) + for s4 in cell_vars4_disc_sites: + sdf = cell_vars4_disc[cell_vars4_disc['ids'] == s4] + if not len(sdf) == 1: + print('error, 1 variant expected') + print(sdf) + discordant_site_details['expect_hom_alt'][s4] = {} + discordant_site_details['expect_hom_alt'][s4]['REF'] = sdf['ref_reads'].sum() + discordant_site_details['expect_hom_alt'][s4]['found_in_other_donor'] = 'no' + + discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt + + return total_sites, total_reads, discordant_reads, discordant_site_details + + + def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes): + ''' + take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant + sites and relaxed concordant sites and list of discordant sites1 + 1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype + 2) if you have a 0/0 you can not get a 1/1 or 0/1 + 3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 1/1 + So - each obversed cellsnp allele must be in the array SNP gtype + ''' + true_discordant = 0 + relaxed_concordant = 0 + discordant_vars = [] + concordant_vars = [] + for i in range(0, len(snp_gtypes)): + discordant = False + snp_data = snp_gtypes[i].split('_') + cellsnp_data = cellsnp_gtypes[i].split('_') + + # the below will no longer work due to differing length of input strings + # snp_alleles = [snp_gtypes[i][-3], snp_gtypes[i][-1]] + # cellsnp_alleles = [cellsnp_gtypes[i][-3], cellsnp_gtypes[i][-1]] + + + snp_alleles = [snp_data[4][0], snp_data[4][2]] + cellsnp_alleles = [cellsnp_data[4][0], cellsnp_data[4][2]] + + snp_alleles_set = set(snp_alleles) + cellsnp_alleles_set = set(cellsnp_alleles) + + snp_var = ('_').join(snp_data[0:4]) + cellsnp_var = ('_').join(cellsnp_data[0:4]) + + if not cellsnp_var == snp_var: + print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i]) + exit(1) + else: + for allele in cellsnp_alleles_set: + if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant + discordant = True + + if discordant == True: + # print(cellsnp_var) + # print(cellsnp_data) + # exit(0) + true_discordant+=1 + discordant_vars.append(cellsnp_var) + else: + concordant_vars.append(cellsnp_var) + relaxed_concordant+=1 + + return true_discordant, relaxed_concordant, discordant_vars,concordant_vars + + + def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): + cell_vars_norm = self.norm_genotypes(cell_vars) + if len(cell_vars_norm) > 0: + Total_Overlappin_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) + expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlappin_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlappin_sites)] + Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) + #find sites that may be discordant - this will include hets which are not truly discordant + Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) + disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) + df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') + disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') + disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] + #disc_sites = ';'.join(disc2['expected_retrieved']) + true_discordant_count, relaxed_concordant_count, discordant_vars, concordant_vars = self.get_strict_discordance(disc2['0_y'], disc2['0_x']) + total_concordant_sites = len(Concordant_Sites) + relaxed_concordant_count + #find discordant reads + total_sites, total_reads, discordant_reads, discordant_site_details = self.read_concordance(expected_vars2, cell_vars2) + + return total_sites, true_discordant_count, total_concordant_sites, total_reads, discordant_reads, discordant_site_details, discordant_vars, concordant_vars + + + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. + with open(f'tmp_{id}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}.pkl' + + def append_results_cell_concordances(self,result,cell_concordance_table): + #[cell1, donor_gt_match, donor_gt_match_cohort, total_sites, true_discordant_count, total_concordant_sites, total_reads, + # discordant_reads, discordant_vars,discordant_vars_in_pool_str, count] + count=result[10] + + try: + percent_discordant = result[4]/(result[4]+result[5])*100 + except: + percent_discordant = 0 + + try: + read_discordance = result[7]/result[3] + except: + read_discordance = 0 + + # print(count) + same_as_asigned_donor = result[12]==result[1] + cell_concordance_table[f'{result[0]} --- {result[1]}'] = {'GT 1':result[0], + 'GT 2':result[1], + 'Cohort': result[2], + 'Nr_Concordant':result[5], + 'Nr_Discordant':result[4], + 'Percent_Discordant':percent_discordant, + 'Total_sites': result[3], + 'Total_reads': result[6], + 'Discordant_reads': result[7], + 'Discordant_reads_by_n_sites': read_discordance, + 'Discordant_sites_in_pool': result[9], + 'Discordant_Site_Identities':(';').join(result[8]), + 'Lowest_Disconcordance_value_in_all_donors':result[11], + 'Donor_With_Lowest_DisConcordance':result[12], + 'Concordant_Site_Identities':result[13], + 'same_as_asigned_donor':same_as_asigned_donor + } + + # if (count % 200 == 0): + # print(f'recording and resetting memory {count}') + # # self.record_dict[count]=self.exclusive_donor_variants + # self.set_results(self.cell_concordance_table,count) + # self.reset() + # _="" + return cell_concordance_table + + def combine_written_files(self):#this one is for concordance class + to_export = self.cell_concordance_table + for val1 in self.record_dict.values(): + # here remove the int files. + print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + for k1 in loaded_dict.keys(): + to_export[k1]=loaded_dict[k1] + os.remove(val1) + return to_export + + def analyse_donor(self,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm): + donor_concordance_table = {} + for cell1 in Cells_to_keep_pre: + count+=1 + # if count>10: + # break + cell_vars = exclusive_cell_variants[cell1] + # cell_vars_dp = exclusive_cell_variants_dp[cell1] + + # self.cell_concordance_table[f'{cell1} --- {donor_gt_match}']={} + # pool.apply_async(self.concordance_dable_production, args=([expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count]),callback=self.append_results_cell_concordances) + result1 = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data) + # if (result1==None): + # _='test' + donor_concordance_table = self.append_results_cell_concordances(result1,donor_concordance_table) + # print('Done') + return donor_concordance_table + + def combine_concordances(self,result): + # print('res') + self.cell_concordance_table = self.cell_concordance_table | result + + def conc_table(self): + donor_assignments_table=self.donor_assignments_table + cell_assignments_table=self.cell_assignments_table + exclusive_don_variants=self.exclusive_don_variants + exclusive_cell_variants= self.exclusive_cell_variants + donor_list = exclusive_don_variants.keys() + + pool = mp.Pool(cpus) + count = 0 + + #create a list of variants that are on each donor genotype file + vars_per_donor_gt = {} + for don_id in donor_list: + donor_gt_vars = list(exclusive_don_variants[don_id]) + donor_gt_vars = pd.DataFrame(donor_gt_vars) + donor_gt_vars = self.norm_genotypes(donor_gt_vars) + #donor_gt_vars = donor_gt_vars[donor_gt_vars['vars'] != '0/0'] ref/ref now wanted too + donor_gt_varids = list(donor_gt_vars['ids']) + vars_per_donor_gt[don_id] = donor_gt_varids + #work out what cohort each donor belongs to + donor_cohorts = {} + for don_id in donor_list: + cohort = 'UNKNOWN' + donor_split = don_id.split("_") + if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): + cohort = 'UKB' + elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): + cohort = 'ELGH' + donor_cohorts[don_id] = cohort + + all_donor_data={} + # here we calvculate all the expected donor datasets + for row1 in exclusive_don_variants.keys(): + # donor_in_question = row1['donor_query'] + donor_gt_match = row1 + expected_vars_of_other_donor = self.exclusive_don_variants[donor_gt_match] + expected_vars_norm_of_other_donor = self.norm_genotypes(expected_vars_of_other_donor) + all_donor_data[donor_gt_match]=expected_vars_norm_of_other_donor + # print(all_donor_data) + # exit(0) + + for i,row1 in donor_assignments_table.iterrows(): + donor_in_question = row1['donor_query'] + donor_gt_match = row1['donor_gt'] + if (donor_gt_match=='NONE'): + continue + + donor_gt_match_cohort = donor_cohorts[donor_gt_match] + Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell'])) + expected_vars = exclusive_don_variants[donor_gt_match] + expected_vars_norm = self.norm_genotypes(expected_vars) + try: + # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor. + dds = self.donor_distinct_sites[donor_gt_match] + except: + continue + if cpus==1: + result_conc = self.analyse_donor(Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm) + self.combine_concordances(result_conc) + else: + pool.apply_async(self.analyse_donor, args=([Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm]),callback=self.combine_concordances) + + pool.close() + pool.join() + output = self.combine_written_files() + return output + + + def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match, donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data): + #Nr_donor_distinct_sites = len(dds) + total_sites, true_discordant_count, total_concordant_sites, total_reads, discordant_reads, discordant_site_details, discordant_vars, concordant_vars = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) + #Nr_Concordant = len(Concordant_Sites) + #Nr_Relaxed_concordant = Nr_Concordant + relaxed_concordant_count + #Nr_Discordant = len(Discordant_sites) + #Nr_Total_Overlapping_sites = len(Total_Overlappin_sites) + #Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Concordant_Sites))) + #Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) + #find if the discordant vars are in any of the other donors + discordant_vars_in_pool = [] + donor_table_of_concordances = [] + # print(discordant_site_details) + # exit(0) + for donor in vars_per_donor_gt: + # if not donor == donor_gt_match: + # donor_cohort = donor_cohorts[donor] + # donor_vars = vars_per_donor_gt[donor] + # common_vars = list(set(discordant_vars) & set(donor_vars)) + # common_var_count = str(len(common_vars)) + # print(common_vars) + # print(discordant_site_details) + # exit(0) + #deal with expected hom alt + #deal with expected hom ref + #deal with expected het: discordant = OTH + # donor_cohort_common = donor + ":" + donor_cohort + ":" + common_var_count + # discordant_vars_in_pool.append(donor_cohort_common) + + # Here we want to calculate the number of discordant sites in other donors and see if in terms of concordance the same donor is picked as per GT assignment. + # We do this to investigate the potential of a cell coming from this other donor. + + expected_vars_norm_of_other_donor = all_donor_data[donor] + total_sites_otherDonor, true_discordant_count_otherDonor, total_concordant_sites_otherDonor, total_reads_otherDonor, discordant_reads_otherDonor, discordant_site_detail_otherDonor, discordant_vars_otherDonor, concordant_vars_otherDonor = self.retrieve_concordant_discordant_sites(expected_vars_norm_of_other_donor,cell_vars) + concordant_percent_in_other_donor= total_concordant_sites_otherDonor/total_sites_otherDonor*100 + discordant_percent_in_other_donor= true_discordant_count_otherDonor/total_sites_otherDonor*100 + donor_table_of_concordances.append({'donor':donor,'concordant_percent_in_other_donor':concordant_percent_in_other_donor,'discordant_percent_in_other_donor':discordant_percent_in_other_donor}) + + if not donor == donor_gt_match: + print(donor) + #print(expected_vars_norm_of_other_donor) + donor_cohort = donor_cohorts[donor] + donor_vars = vars_per_donor_gt[donor] + common_vars = list(set(discordant_vars) & set(donor_vars)) + #common_var_count = str(len(common_vars)) + common_var_count = 0 + ####deal with expected hom alt + print('looking at home alts') + expected_hom_alts = list(discordant_site_details['expect_hom_alt'].keys()) + # print(expected_hom_alts) + # exit(0) + mask = expected_vars_norm_of_other_donor['ids'].isin(expected_hom_alts) + expected_hom_alts_in_other_donors = expected_vars_norm_of_other_donor[mask] + #expected_hom_alts_in_other_donors = expected_vars_norm_of_other_donor[expected_vars_norm_of_other_donor['ids'] in expected_hom_alts] + # print(expected_hom_alts_in_other_donors) + #can any other donors contribute a ref read? + refgts = ['0/0', '0/1', '1/0'] + mask2 = expected_hom_alts_in_other_donors['vars'].isin(refgts) + expected_hom_alts_in_other_donors_has_ref = expected_hom_alts_in_other_donors[mask2] + if (len(expected_hom_alts_in_other_donors_has_ref) > 0): + has_ref_vars = list(expected_hom_alts_in_other_donors_has_ref['ids']) + for v in has_ref_vars: + discordant_site_details['expect_hom_alt'][v]['found_in_other_donor'] = 'yes' + common_var_count += 1 + ####deal with expected hom ref + print('looking at hom refs') + expected_hom_refs = list(discordant_site_details['expect_hom_ref'].keys()) + # print(expected_hom_refs) + mask3 = expected_vars_norm_of_other_donor['ids'].isin(expected_hom_refs) + expected_hom_refs_in_other_donors = expected_vars_norm_of_other_donor[mask3] + # print(expected_hom_refs_in_other_donors) + #can any other donors contribute and alt read? + altgts = ['0/1', '1/0', '1/1'] + mask4 = expected_hom_refs_in_other_donors['vars'].isin(altgts) + expected_hom_refs_in_other_donors_has_alt = expected_hom_refs_in_other_donors[mask4] + print(expected_hom_refs_in_other_donors_has_alt) + if (len(expected_hom_refs_in_other_donors_has_alt) > 0): + has_alt_vars = list(expected_hom_refs_in_other_donors_has_alt['ids']) + # print(has_alt_vars) + for v in has_alt_vars: + discordant_site_details['expect_hom_ref'][v]['found_in_other_donor'] = 'yes' + common_var_count += 1 + # print(common_var_count) + ###deal with expected het: discordant = OTH + #skip hets for now + print('one') + print(donor) + print(common_var_count) + print(donor_cohort) + donor_cohort_common = donor + ":" + donor_cohort + ":" + common_var_count + print(donor_cohort_common) + print('two') + discordant_vars_in_pool.append(donor_cohort_common) + print('three') + #work out how many ref/alt reads could be contamination and how many must be errors + print(discordant_site_details) + discordant_ref_expected_error = 0 + discordant_alt_expected_error = 0 + discordant_ref_expected_contamination = 0 + discordant_alt_expected_contamination = 0 + for v1 in discordant_site_details['expect_hom_alt'].keys(): + if discordant_site_details['expect_hom_alt'][v1]['found_in_other_donor'] == 'no': + discordant_alt_expected_error = discordant_alt_expected_error + discordant_site_details['expect_hom_alt'][v1]['REF'] + else: + discordant_alt_expected_contamination = discordant_alt_expected_contamination + discordant_site_details['expect_hom_alt'][v1]['REF'] + + for v2 in discordant_site_details['expect_hom_ref'].keys(): + if discordant_site_details['expect_hom_ref'][v2]['found_in_other_donor'] == 'no': + discordant_ref_expected_error = discordant_ref_expected_error + discordant_site_details['expect_hom_ref'][v2]['REF'] + else: + discordant_ref_expected_contamination = discordant_ref_expected_contamination + discordant_site_details['expect_hom_ref'][v2]['REF'] + + + discordant_vars_in_pool_str = (";").join(discordant_vars_in_pool) + concordant_vars_in_pool_str = (";").join(concordant_vars) + + #print(discordant_vars_in_pool_str) + print(discordant_ref_expected_error) + print(discordant_alt_expected_error) + print(discordant_ref_expected_contamination) + print(discordant_alt_expected_contamination) + print(discordant_site_details) + exit(0) + + DF = pd.DataFrame(donor_table_of_concordances) + Donor_With_Lowest_DisConcordance = DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['donor'].values[0] + Lowest_Disconcordance_value_in_all_donors= DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['discordant_percent_in_other_donor'].values[0] + + return [cell1, donor_gt_match, donor_gt_match_cohort, total_sites, true_discordant_count, total_concordant_sites, total_reads, discordant_reads, discordant_vars,discordant_vars_in_pool_str, count,Lowest_Disconcordance_value_in_all_donors,Donor_With_Lowest_DisConcordance,concordant_vars_in_pool_str] + #return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,Nr_Relaxed_concordant, Nr_strict_discordant, relaxed_concordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites, + # Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,discordant_sites, total_sites, total_reads, discordant_reads] + + +def find(lst, a): + return [i for i, x in enumerate(lst) if x==a ] + + +def norm_genotypes(expected_vars): + expected_vars = pd.DataFrame(expected_vars) + split_str=expected_vars[0].str.split("_") + expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] + expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] + expected_vars['vars'] = split_str.str[4] + expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) + expected_vars = expected_vars[expected_vars['vars']!='./.'] + expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' + expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] + return expected_vars + + +def donor_exclusive_sites(exclusive_don_variants2): + # Here we generate a function for determining the sites that are donor exclusive + donor_distinct_sites = {} + for col1 in exclusive_don_variants2.keys(): + comparisons =[] + to_compare = [] + for col2 in exclusive_don_variants2.keys(): + if col1==col2: + # we set this as the unique entry + # print('1') + to_compare = set(exclusive_don_variants2[col2]) + else: + # We combine all the variants in one list + comparisons+=list(exclusive_don_variants2[col2]) + # print('2') + # print('comparison') + comparisons_all = set(comparisons) + comparisons_all_norm = norm_genotypes(comparisons_all) + comparisons_all=set(comparisons_all_norm['combo']) + + to_compare = set(to_compare) + to_compare_norm = norm_genotypes(to_compare) + to_compare=set(to_compare_norm['combo']) + # Make sure we account for hap types - phased/unphased + distinct_donor_sites = to_compare - comparisons_all + donor_distinct_sites[col1]=distinct_donor_sites + # Perform the distinct set function. + return donor_distinct_sites + + +def get_options(): + ''' + Get options from the command line + ''' + parser = argparse.ArgumentParser() + parser.add_argument('--cpus', action='store', required=True, type=int) + parser.add_argument('--cell_vcf', action='store', required=True) + parser.add_argument('--cell_assignments', action='store', required=True) + parser.add_argument('--donor_assignments', action='store', required=True) + parser.add_argument('--gt_match_vcf', action='store', required=True) + parser.add_argument('--expected_vcf', action='store', required=True) + parser.add_argument('--outfile', action='store', required=True) + parser.add_argument('--exclude_a2g', action='store_true') + parser.add_argument('--debug', action='store_true') + args = parser.parse_args() + + return args + + +if __name__ == "__main__": + + options = get_options() + cpus = options.cpus + cell_vcf=options.cell_vcf + donor_assignments=options.donor_assignments + gt_match_vcf=options.gt_match_vcf + expected_vcf=options.expected_vcf + cell_assignments=options.cell_assignments + outfile = options.outfile + exclude_a2g = options.exclude_a2g + + exclusive_donor_variants = {} #This is where results are populated when mp process i used. + curently_pushing =[] #this is a lock value to check if rhe curent field is updated so to avaid the race for update + All_Results={} + cell_concordance_table = {} + + donor_assignments_table = pd.read_csv(donor_assignments) + cell_assignments_table = pd.read_csv(cell_assignments,sep='\t') + + if options.debug: + with open('tmp_GT_Expected_variants.pkl', 'rb') as f: + GT_Expected_variants = pickle.load(f) + with open('tmp_GT_Matched_variants.pkl', 'rb') as f: + GT_Matched_variants = pickle.load(f) + with open('tmp_exclusive_cell_variants.pkl', 'rb') as f: + exclusive_cell_variants = pickle.load(f) + with open('tmp_donor_distinct_sites.pkl', 'rb') as f: + donor_distinct_sites = pickle.load(f) + with open('tmp_exclusive_don_variants.pkl', 'rb') as f: + exclusive_don_variants = pickle.load(f) + else: + print('---Loading genotype VCF----') + if (os.path.exists(gt_match_vcf)): + loader2 = VCF_Loader(gt_match_vcf, exclude_a2g, biallelic_only=True, + sparse=False, format_list=['GT']) + GT_Matched_variants = loader2.load_VCF_batch_paralel() + del loader2 + else: + GT_Matched_variants = {} + + with open(f'tmp_GT_Matched_variants.pkl', 'wb') as f: + pickle.dump(GT_Matched_variants, f) + + print('---Loading cell VCF----') + tic = time.perf_counter() + loader1 = VCF_Loader(cell_vcf, exclude_a2g, biallelic_only=True, + sparse=False, format_list=['GT', 'DP', 'AD', 'OTH']) + exclusive_cell_variants = loader1.load_VCF_batch_paralel() + del loader1 + toc = time.perf_counter() + + with open(f'tmp_exclusive_cell_variants.pkl', 'wb') as f: + pickle.dump(exclusive_cell_variants, f) + print(f"Loading took {toc - tic:0.4f} seconds") + + print('---Loading expected VCF----') + loader3 = VCF_Loader(expected_vcf, exclude_a2g, biallelic_only=True, + sparse=False, format_list=['GT']) + GT_Expected_variants = loader3.load_VCF_batch_paralel() + del loader3 + + with open(f'tmp_GT_Expected_variants.pkl', 'wb') as f: + pickle.dump(GT_Expected_variants, f) + + print('---Variant files loaded----') + + exclusive_don_variants = GT_Expected_variants.keys() + content = [x for x in exclusive_don_variants if not x.startswith('donor')] + GT_Expected_variants = {key: GT_Expected_variants[key] for key in content} + + exclusive_don_variants = GT_Matched_variants.keys() + content = [x for x in exclusive_don_variants if not x.startswith('donor')] + GT_Matched_variants = {key: GT_Matched_variants[key] for key in content} + + exclusive_don_variants = GT_Expected_variants + for key in GT_Matched_variants.keys(): + if key in exclusive_don_variants.keys(): + _='' + else: + exclusive_don_variants[key]=GT_Matched_variants[key] + + with open(f'tmp_exclusive_don_variants.pkl', 'wb') as f: + pickle.dump(exclusive_don_variants, f) + donor_distinct_sites = donor_exclusive_sites(exclusive_don_variants) + with open(f'tmp_donor_distinct_sites.pkl', 'wb') as f: + pickle.dump(donor_distinct_sites, f) + + print('---donor_distinct_sites calculated----') + + + conc1 = Concordances(donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites) + cell_concordance_table = conc1.conc_table() + + result = pd.DataFrame(cell_concordance_table).T + result.to_csv(outfile,sep='\t') + site_identities = result[['Concordant_Site_Identities','Discordant_Site_Identities']] + result.drop(columns=['Concordant_Site_Identities'],inplace=True) + result.to_csv(outfile,sep='\t') + site_identities.to_csv(f"site_identities_{outfile}",sep='\t') + print('Processing Done') \ No newline at end of file From 22cd7cda2aa2f1778c987015a0235f40b1d8a3de Mon Sep 17 00:00:00 2001 From: Ruth Eberhardt Date: Thu, 17 Aug 2023 13:37:29 +0100 Subject: [PATCH 2/3] read counts - error vs contamination --- ...ther_donors_find_best_donor_read_counts.py | 50 ++++++------------- 1 file changed, 15 insertions(+), 35 deletions(-) diff --git a/bin/find_discordant_sites_in_other_donors_find_best_donor_read_counts.py b/bin/find_discordant_sites_in_other_donors_find_best_donor_read_counts.py index 9700f255..34764c9a 100755 --- a/bin/find_discordant_sites_in_other_donors_find_best_donor_read_counts.py +++ b/bin/find_discordant_sites_in_other_donors_find_best_donor_read_counts.py @@ -302,6 +302,7 @@ def read_concordance(self, expected_vars, cell_vars): print(sdf) discordant_site_details['expect_hom_ref'][s2] = {} discordant_site_details['expect_hom_ref'][s2]['AD'] = sdf['AD'].sum() + discordant_site_details['expect_hom_ref'][s2]['OTH'] = sdf['OTH'].sum() discordant_site_details['expect_hom_ref'][s2]['found_in_other_donor'] = 'no' # print(cell_vars2_disc) # print(cell_vars2) @@ -343,6 +344,7 @@ def read_concordance(self, expected_vars, cell_vars): print(sdf) discordant_site_details['expect_hom_alt'][s4] = {} discordant_site_details['expect_hom_alt'][s4]['REF'] = sdf['ref_reads'].sum() + discordant_site_details['expect_hom_alt'][s4]['OTH'] = sdf['OTH'].sum() discordant_site_details['expect_hom_alt'][s4]['found_in_other_donor'] = 'no' discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt @@ -434,6 +436,7 @@ def set_results(self,to_set,id): def append_results_cell_concordances(self,result,cell_concordance_table): #[cell1, donor_gt_match, donor_gt_match_cohort, total_sites, true_discordant_count, total_concordant_sites, total_reads, # discordant_reads, discordant_vars,discordant_vars_in_pool_str, count] + # print(result) count=result[10] try: @@ -463,7 +466,11 @@ def append_results_cell_concordances(self,result,cell_concordance_table): 'Lowest_Disconcordance_value_in_all_donors':result[11], 'Donor_With_Lowest_DisConcordance':result[12], 'Concordant_Site_Identities':result[13], - 'same_as_asigned_donor':same_as_asigned_donor + 'same_as_asigned_donor':same_as_asigned_donor, + 'discordant_reads_error_expected_ref':result[14], + 'discordant_reads_error_expected_alt':result[15], + 'discordant_reads_contamination_expected_ref':result[16], + 'discordant_reads_contamination_expected_alt':result[17] } # if (count % 200 == 0): @@ -615,22 +622,17 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g donor_table_of_concordances.append({'donor':donor,'concordant_percent_in_other_donor':concordant_percent_in_other_donor,'discordant_percent_in_other_donor':discordant_percent_in_other_donor}) if not donor == donor_gt_match: - print(donor) - #print(expected_vars_norm_of_other_donor) donor_cohort = donor_cohorts[donor] donor_vars = vars_per_donor_gt[donor] common_vars = list(set(discordant_vars) & set(donor_vars)) #common_var_count = str(len(common_vars)) common_var_count = 0 ####deal with expected hom alt - print('looking at home alts') + # print('looking at home alts') expected_hom_alts = list(discordant_site_details['expect_hom_alt'].keys()) # print(expected_hom_alts) - # exit(0) mask = expected_vars_norm_of_other_donor['ids'].isin(expected_hom_alts) expected_hom_alts_in_other_donors = expected_vars_norm_of_other_donor[mask] - #expected_hom_alts_in_other_donors = expected_vars_norm_of_other_donor[expected_vars_norm_of_other_donor['ids'] in expected_hom_alts] - # print(expected_hom_alts_in_other_donors) #can any other donors contribute a ref read? refgts = ['0/0', '0/1', '1/0'] mask2 = expected_hom_alts_in_other_donors['vars'].isin(refgts) @@ -641,37 +643,24 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g discordant_site_details['expect_hom_alt'][v]['found_in_other_donor'] = 'yes' common_var_count += 1 ####deal with expected hom ref - print('looking at hom refs') + # print('looking at hom refs') expected_hom_refs = list(discordant_site_details['expect_hom_ref'].keys()) - # print(expected_hom_refs) mask3 = expected_vars_norm_of_other_donor['ids'].isin(expected_hom_refs) expected_hom_refs_in_other_donors = expected_vars_norm_of_other_donor[mask3] - # print(expected_hom_refs_in_other_donors) #can any other donors contribute and alt read? altgts = ['0/1', '1/0', '1/1'] mask4 = expected_hom_refs_in_other_donors['vars'].isin(altgts) expected_hom_refs_in_other_donors_has_alt = expected_hom_refs_in_other_donors[mask4] - print(expected_hom_refs_in_other_donors_has_alt) if (len(expected_hom_refs_in_other_donors_has_alt) > 0): has_alt_vars = list(expected_hom_refs_in_other_donors_has_alt['ids']) - # print(has_alt_vars) for v in has_alt_vars: discordant_site_details['expect_hom_ref'][v]['found_in_other_donor'] = 'yes' common_var_count += 1 - # print(common_var_count) - ###deal with expected het: discordant = OTH + ###deal with expected het: discordant = OTH - leaving this for nnow #skip hets for now - print('one') - print(donor) - print(common_var_count) - print(donor_cohort) - donor_cohort_common = donor + ":" + donor_cohort + ":" + common_var_count - print(donor_cohort_common) - print('two') + donor_cohort_common = (':').join([donor, donor_cohort, str(common_var_count)]) discordant_vars_in_pool.append(donor_cohort_common) - print('three') #work out how many ref/alt reads could be contamination and how many must be errors - print(discordant_site_details) discordant_ref_expected_error = 0 discordant_alt_expected_error = 0 discordant_ref_expected_contamination = 0 @@ -684,27 +673,18 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g for v2 in discordant_site_details['expect_hom_ref'].keys(): if discordant_site_details['expect_hom_ref'][v2]['found_in_other_donor'] == 'no': - discordant_ref_expected_error = discordant_ref_expected_error + discordant_site_details['expect_hom_ref'][v2]['REF'] + discordant_ref_expected_error = discordant_ref_expected_error + discordant_site_details['expect_hom_ref'][v2]['AD'] else: - discordant_ref_expected_contamination = discordant_ref_expected_contamination + discordant_site_details['expect_hom_ref'][v2]['REF'] - + discordant_ref_expected_contamination = discordant_ref_expected_contamination + discordant_site_details['expect_hom_ref'][v2]['AD'] discordant_vars_in_pool_str = (";").join(discordant_vars_in_pool) concordant_vars_in_pool_str = (";").join(concordant_vars) - #print(discordant_vars_in_pool_str) - print(discordant_ref_expected_error) - print(discordant_alt_expected_error) - print(discordant_ref_expected_contamination) - print(discordant_alt_expected_contamination) - print(discordant_site_details) - exit(0) - DF = pd.DataFrame(donor_table_of_concordances) Donor_With_Lowest_DisConcordance = DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['donor'].values[0] Lowest_Disconcordance_value_in_all_donors= DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['discordant_percent_in_other_donor'].values[0] - return [cell1, donor_gt_match, donor_gt_match_cohort, total_sites, true_discordant_count, total_concordant_sites, total_reads, discordant_reads, discordant_vars,discordant_vars_in_pool_str, count,Lowest_Disconcordance_value_in_all_donors,Donor_With_Lowest_DisConcordance,concordant_vars_in_pool_str] + return [cell1, donor_gt_match, donor_gt_match_cohort, total_sites, true_discordant_count, total_concordant_sites, total_reads, discordant_reads, discordant_vars,discordant_vars_in_pool_str, count,Lowest_Disconcordance_value_in_all_donors,Donor_With_Lowest_DisConcordance,concordant_vars_in_pool_str, discordant_ref_expected_error, discordant_alt_expected_error, discordant_ref_expected_contamination, discordant_alt_expected_contamination] #return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,Nr_Relaxed_concordant, Nr_strict_discordant, relaxed_concordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites, # Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,discordant_sites, total_sites, total_reads, discordant_reads] From 0dfb0d38c01901d7ffcb971a72569785f120db1e Mon Sep 17 00:00:00 2001 From: Ruth Eberhardt Date: Fri, 29 Sep 2023 09:31:23 +0100 Subject: [PATCH 3/3] new metrics - discordance at informative and uninformative sites --- ...ations_donor_exclusive_read_level_noA2G.py | 123 +++++++++++++----- 1 file changed, 94 insertions(+), 29 deletions(-) diff --git a/bin/concordance_calculations_donor_exclusive_read_level_noA2G.py b/bin/concordance_calculations_donor_exclusive_read_level_noA2G.py index d711e054..cc9ebc64 100755 --- a/bin/concordance_calculations_donor_exclusive_read_level_noA2G.py +++ b/bin/concordance_calculations_donor_exclusive_read_level_noA2G.py @@ -72,6 +72,8 @@ def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes): true_discordant = 0 relaxed_concordant = 0 relaxed_concordant_informative = 0 + relaxed_concordant_uninformative = 0 + true_discordant_informative = 0 true_discordant_uninformative = 0 for i in range(0, len(snp_gtypes)): @@ -105,12 +107,16 @@ def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes): true_discordant+=1 if snp_var in self.uninformative_sites: true_discordant_uninformative+=1 + elif snp_var in self.informative_sites: + true_discordant_informative+=1 else: relaxed_concordant+=1 - if snp_var in self.informative_sites: + if snp_var in self.uninformative_sites: + relaxed_concordant_uninformative+=1 + elif snp_var in self.informative_sites: relaxed_concordant_informative+=1 - return true_discordant, relaxed_concordant, relaxed_concordant_informative, true_discordant_uninformative + return true_discordant, relaxed_concordant, relaxed_concordant_informative, relaxed_concordant_uninformative, true_discordant_informative, true_discordant_uninformative def read_condordance(self, expected_vars, cell_vars): @@ -129,38 +135,76 @@ def read_condordance(self, expected_vars, cell_vars): cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) + #split to informative and uninformative sites + mask_i = cell_vars['ids'].isin(self.informative_sites) + cell_vars_informative = cell_vars[mask_i] + mask_u = cell_vars['ids'].isin(self.uninformative_sites) + cell_vars_uninformative = cell_vars[mask_u] + informative_sites = len(cell_vars_informative) + uninformative_sites = len(cell_vars_uninformative) + total_dp = cell_vars['DP'].sum() total_oth = cell_vars['OTH'].sum() total_reads = total_dp + total_oth + total_dp_inf = cell_vars_informative['DP'].sum() + total_oth_inf = cell_vars_informative['OTH'].sum() + total_reads_informative = total_dp_inf + total_oth_inf + total_dp_uninf = cell_vars_uninformative['DP'].sum() + total_oth_uninf = cell_vars_uninformative['OTH'].sum() + total_reads_uninformative = total_dp_uninf + total_oth_uninf # expected genotype 0/0 expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] hom_ref_sites = set(expected_hom_ref['ids']) cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] + cell_vars_inf_2 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_ref_sites)] + cell_vars_uninf_2 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_ref_sites)] ad_hom_ref = cell_vars2['AD'].sum() oth_hom_ref = cell_vars2['OTH'].sum() discordant_hom_ref = ad_hom_ref + oth_hom_ref + ad_hom_ref_inf = cell_vars_inf_2['AD'].sum() + oth_hom_ref_inf = cell_vars_inf_2['OTH'].sum() + discordant_hom_ref_informative = ad_hom_ref_inf + oth_hom_ref_inf + ad_hom_ref_uninf = cell_vars_uninf_2['AD'].sum() + oth_hom_ref_uninf = cell_vars_uninf_2['OTH'].sum() + discordant_hom_ref_uninformative = ad_hom_ref_uninf + oth_hom_ref_uninf # expected genotype 0/1 or 1/0 hets = ['0/1', '1/0'] expected_het = expected_vars[expected_vars['vars'].isin(hets)] het_sites = set(expected_het['ids']) cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] + cell_vars_inf_3 = cell_vars_informative[cell_vars_informative['ids'].isin(het_sites)] + cell_vars_uninf_3 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(het_sites)] discordant_het = cell_vars3['OTH'].sum() + discordant_het_informative = cell_vars_inf_3['OTH'].sum() + discordant_het_uninformative = cell_vars_uninf_3['OTH'].sum() # expected genotype 1/1 expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] hom_alt_sites = set(expected_hom_alt['ids']) cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] + cell_vars_inf_4 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_alt_sites)] + cell_vars_uninf_4 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_alt_sites)] # DP + OTH - AD ad_hom_alt = cell_vars4['AD'].sum() dp_hom_alt = cell_vars4['DP'].sum() oth_hom_alt = cell_vars4['OTH'].sum() discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt + ad_hom_alt_inf = cell_vars_inf_4['AD'].sum() + dp_hom_alt_inf = cell_vars_inf_4['DP'].sum() + oth_hom_alt_inf = cell_vars_inf_4['OTH'].sum() + discordant_hom_alt_informative = (dp_hom_alt_inf + oth_hom_alt_inf) - ad_hom_alt_inf + ad_hom_alt_uninf = cell_vars_uninf_4['AD'].sum() + dp_hom_alt_uninf = cell_vars_uninf_4['DP'].sum() + oth_hom_alt_uninf = cell_vars_uninf_4['OTH'].sum() + discordant_hom_alt_uninformative = (dp_hom_alt_uninf + oth_hom_alt_uninf) - ad_hom_alt_uninf discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt + discordant_reads_informative = discordant_hom_ref_informative + discordant_het_informative + discordant_hom_alt_informative + discordant_reads_uninformative = discordant_hom_ref_uninformative + discordant_het_uninformative + discordant_hom_alt_uninformative - return total_sites, total_reads, discordant_reads + return total_sites, informative_sites, uninformative_sites, total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): @@ -171,28 +215,28 @@ def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): cell_vars_norm = self.norm_genotypes(cell_vars) if len(cell_vars_norm) > 0: - Total_Overlappin_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) - expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlappin_sites)] - cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlappin_sites)] + Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) + expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] # print(cell_vars_norm) # print(expected_vars2) # print(cell_vars2) # exit(0) Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) - Discodrant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) - disc = pd.DataFrame(Discodrant_sites,columns=['combo_x']) + Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) + disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] disc_sites = ';'.join(disc2['expected_retrieved']) #find truly discordant sites - true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, true_discordant_uninformative_count = self.get_strict_discordance(disc2['0_y'], disc2['0_x']) + true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count = self.get_strict_discordance(disc2['0_y'], disc2['0_x']) #find discordant reads - total_sites, total_reads, discordant_reads = self.read_condordance(expected_vars2, cell_vars2) + total_sites, informative_sites, uninformative_sites, total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative = self.read_condordance(expected_vars2, cell_vars2) else: - Total_Overlappin_sites = set() + Total_Overlapping_sites = set() Concordant_Sites = set() - Discodrant_sites = set() + Discordant_sites = set() disc_sites = '' true_discordant_count = 0 relaxed_concordant_count = 0 @@ -200,7 +244,7 @@ def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): discordant_reads = 0 - return Concordant_Sites, Discodrant_sites, Total_Overlappin_sites, disc_sites,cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, true_discordant_uninformative_count, total_sites, total_reads, discordant_reads + return Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites,cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, total_sites, informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative def set_results(self,to_set,id): @@ -210,7 +254,7 @@ def set_results(self,to_set,id): self.record_dict[id]=f'tmp_{id}.pkl' def append_results_cell_concordances(self,result): - count=result[11] + count=result[13] try: percent_concordant = result[2]/(result[3]+result[2])*100 except: @@ -232,13 +276,22 @@ def append_results_cell_concordances(self,result): percent_strict_discordant = 0 try: - read_discordance = result[15]/result[13] + read_discordance = result[21]/result[15] except: read_discordance = 0 + donor = result[1] + cohort = 'UNKNOWN' + donor_split = donor.split("_") + if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): + cohort = 'UKB' + elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): + cohort = 'ELGH' + print(count) self.cell_concordance_table[f'{result[0]} --- {result[1]}'] = {'GT 1':result[0], 'GT 2':result[1], + 'cohort': cohort, 'Nr_Concordant':result[2], 'Nr_Discordant':result[3], 'Nr_Relaxed_concordant':result[4], @@ -248,14 +301,22 @@ def append_results_cell_concordances(self,result): 'Percent_relaxed_concordant': percent_relaxed_concordant, 'Percent_strict_discordant': percent_strict_discordant, 'Nr_concordant_informative': result[6], - 'Nr_discordant_uninformative': result[7], - 'NrTotal_Overlapping_sites_between_two_genotypes':result[8], - 'Nr_donor_distinct_sites_within_pool_individuals':result[10], - 'Number_of_sites_that_are_donor_concordant_and_exclusive':result[9], - 'Discordant_Site_Identities':result[12], - 'Total_sites': result[13], - 'Total_reads': result[14], - 'Discordant_reads': result[15], + 'Nr_concordant_uninformative': result[7], + 'Nr_discordant_informative': result[8], + 'Nr_discordant_uninformative': result[9], + 'NrTotal_Overlapping_sites_between_two_genotypes':result[10], + 'Nr_donor_distinct_sites_within_pool_individuals':result[12], + 'Number_of_sites_that_are_donor_concordant_and_exclusive':result[11], + 'Discordant_Site_Identities':result[14], + 'Total_sites': result[15], + 'Total_informative_sites': result[16], + 'Total_uninformative_sites': result[17], + 'Total_reads': result[18], + 'Total_reads_informative': result[19], + 'Total_reads_uninformative': result[20], + 'Discordant_reads': result[21], + 'Discordant_reads_informtive': result[22], + 'Discordant_reads_uninformtive': result[23], 'Discordant_reads_by_n_sites': read_discordance } @@ -320,16 +381,18 @@ def conc_table(self): def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count): Nr_donor_distinct_sites = len(dds) - Concordant_Sites, Discodrant_sites, Total_Overlappin_sites,discordant_sites,cell_vars_norm, Nr_strict_discordant, relaxed_concordant_count, relaxed_concordant_informative_count, true_discordant_uninformative_count, total_sites, total_reads, discordant_reads = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) + Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites, cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, total_sites, informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) Nr_Concordant = len(Concordant_Sites) Nr_Relaxed_concordant = Nr_Concordant + relaxed_concordant_count - Nr_Discordant = len(Discodrant_sites) - Nr_Total_Overlapping_sites = len(Total_Overlappin_sites) + Nr_Discordant = len(Discordant_sites) + Nr_Total_Overlapping_sites = len(Total_Overlapping_sites) Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Concordant_Sites))) Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) - return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,Nr_Relaxed_concordant, Nr_strict_discordant, relaxed_concordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites, - Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,discordant_sites, total_sites, total_reads, discordant_reads] + return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,Nr_Relaxed_concordant, true_discordant_count, relaxed_concordant_informative_count, + relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites, + Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,disc_sites, total_sites, informative_sites, + uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative] class VCF_Loader: @@ -549,6 +612,7 @@ def get_options(): parser.add_argument('--expected_vcf', action='store', required=True) parser.add_argument('--informative_sites', action='store', required=True) parser.add_argument('--uninformative_sites', action='store', required=True) + parser.add_argument('--outfile', action='store', required=True) args = parser.parse_args() return args @@ -621,6 +685,7 @@ def donor_exclusive_sites(exclusive_don_variants2): options = get_options() cpus = options.cpus + outfile = options.outfile cell_vcf=options.cell_vcf donor_assignments=options.donor_assignments gt_match_vcf=options.gt_match_vcf @@ -715,6 +780,6 @@ def donor_exclusive_sites(exclusive_don_variants2): # cell_concordance_table = conc_table(donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants) result = pd.DataFrame(cell_concordance_table).T - result.to_csv('cell_concordance_table_noA2G.tsv',sep='\t') + result.to_csv(outfile,sep='\t') print('Processing Done')