From a2792322a38dcef357bd8be553305af403cfe89d Mon Sep 17 00:00:00 2001
From: Ruth Eberhardt <re3@sanger.ac.uk>
Date: Wed, 16 Aug 2023 13:02:13 +0100
Subject: [PATCH 1/3] add counts for poss contamination and for poss error

---
 ...ther_donors_find_best_donor_read_counts.py | 878 ++++++++++++++++++
 1 file changed, 878 insertions(+)
 create mode 100755 bin/find_discordant_sites_in_other_donors_find_best_donor_read_counts.py

diff --git a/bin/find_discordant_sites_in_other_donors_find_best_donor_read_counts.py b/bin/find_discordant_sites_in_other_donors_find_best_donor_read_counts.py
new file mode 100755
index 00000000..9700f255
--- /dev/null
+++ b/bin/find_discordant_sites_in_other_donors_find_best_donor_read_counts.py
@@ -0,0 +1,878 @@
+#!/usr/bin/env python3
+
+#take cellSNP VCF and genotype VCF for the donors in a pool
+# for each cell in the cellSNP VCF identify discordant sites (using the relaxed concordance)
+# look for these sites in genotypes of all members of the pool
+# output:
+#   cell id
+#   assigned donor
+#   cohort of assigned donor
+#   number of discordant sites
+#   total AD over discordant sites
+#   list of donors in the pool, how many of the discordant sites are found in the donor, cohort each belongs to
+#   list of discordant sites
+
+__date__ = '2023-07-24'
+__version__ = '0.0.1'
+import argparse
+import pickle 
+import pandas as pd
+import random
+import numpy as np
+import multiprocessing as mp
+from multiprocessing import Lock
+import os
+import gzip
+import time
+
+
+class VCF_Loader:
+    
+    def __init__(self, vcf_file, exclude_a2g, biallelic_only=True,
+                        sparse=False, format_list=['GT']):
+        self.vcf_file = vcf_file
+        self.exclude_a2g = exclude_a2g
+        self.load_sample = True
+        self.biallelic_only = biallelic_only
+        self.sparse = sparse
+        self.record_dict={}
+        self.reset()
+        self.format_list = format_list
+        self.exclusive_donor_variants = {}
+        self.curently_pushing =[] #this is a lock value to check if rhe curent field is updated so to avaid the race for update
+        self.last_count=-1
+        self.reset_c()
+    
+    def reset_c(self):
+        self.record_times=0
+        
+    def reset(self):
+        self.exclusive_donor_variants ={}
+                
+    def myfunc(self):
+        print(f"Hello my name is {self.biallelic_only}" )
+        
+    def load_sample_mp(self,line,obs_ids,count,format_list):
+        '''
+        takes VCF lines and extracts all format fields for those where GT !='.'
+        '''
+        list_val = line.rstrip().split("\t") #[:5] #:8
+        idx = find(list_val[8].split(':'),'GT')[0]#find index of GT field as GT will tell us what variants are called
+        if len(list_val[3]) > 1 or len(list_val[4]) > 1:
+            # CURRENTLY DEALS ONLY WITH BIALELIC
+            print(f'{idx} var not bialelic')
+        elif list_val[3] == 'A' and list_val[4] == 'G' and self.exclude_a2g:#remove A>G
+            pass
+        elif list_val[3] == 'T' and list_val[4] == 'C' and self.exclude_a2g:#also remove T>C
+            pass
+        else:
+            list_val2 = list_val[9:]
+            obs = pd.DataFrame(obs_ids)
+            lv = pd.DataFrame(list_val2)
+            lv_proc =lv[0].str.split(':').str[idx]
+            gt_exists = lv_proc[lv_proc != '.']
+            idx2 = gt_exists.index
+            obs_with_gt = obs.loc[idx2.values]
+            obs_with_gt = list(obs_with_gt[0].values)
+            list_val_with_gt = lv.loc[idx2.values]
+            list_val_with_gt = list(list_val_with_gt[0].values)
+            random.seed(count)
+            c = list(zip(obs_with_gt, list_val_with_gt))
+            random.shuffle(c)
+            obs_with_gt, list_val_with_gt = zip(*c)
+            # self.append_results([obs_with_gt,list_val_with_gt,idx,list_val,count])
+
+        return [obs_with_gt,list_val_with_gt,idx,list_val,count,format_list]#add format_list to the return value as we need this for the next step
+
+
+    def set_results(self,to_set,id):
+        # Recod to disk to save the loading mmeory time.
+        with open(f'tmp_{id}.pkl', 'wb') as f:
+            pickle.dump(to_set, f)
+        self.record_dict[id]=f'tmp_{id}.pkl'
+    
+
+    def append_results(self,result):
+        # exclusive_donor_variants
+        obs_with_gt= result[0]
+        list_val_with_gt= result[1]
+        idx = result[2]
+        list_val = result[3]
+        count = result[4]
+        format_list = result[5]#list of required format fields
+        #get indexes of required format fields (apart from GT which has already been taken care of)
+        additional_field_idxs = []
+        for fmt in format_list:
+            if not fmt == 'GT':
+                idx_addn = find(list_val[8].split(':'), fmt)[0]
+                additional_field_idxs.append(idx_addn)
+        # print(additional_field_idxs)
+        # exit(0)
+
+        count11=0
+        # r = random.random()
+        # Issue is that this slows down after number of entries is recorded. So recoding takes longer and longer.
+        # every 500 itterations we push the data to a dictionary, later we combine these together.
+        if (count % 200 == 0):
+            print(f'recording and resetting memory {count}')
+            # self.record_dict[count]=self.exclusive_donor_variants
+            self.set_results(self.exclusive_donor_variants,count)
+            self.reset()  
+            self.reset_c()        
+        
+        for ob_id in obs_with_gt:
+            donor_loc_in_list = count11
+            alleles = list_val_with_gt[donor_loc_in_list].split(':')[idx]
+            #append any additional format fields to alleles
+            if len(additional_field_idxs) > 0:
+                for idx_addnl in additional_field_idxs:
+                    fmt_val = list_val_with_gt[donor_loc_in_list].split(':')[idx_addnl]
+                    alleles = alleles + '_' + fmt_val
+
+            if not alleles.startswith('.'):
+                ids = "_".join([list_val[x] for x in [0, 1, 3, 4]])
+                donor_var = f"{ids}_{alleles}"
+                while ob_id in self.curently_pushing:
+                    time.sleep(r*0.01)
+                self.curently_pushing.append(ob_id)           
+                try:
+                    self.exclusive_donor_variants[ob_id].add(donor_var)
+                    self.record_times=self.record_times+1
+                except:
+                    self.exclusive_donor_variants[ob_id]=set()
+                    self.exclusive_donor_variants[ob_id].add(donor_var)
+                    self.record_times=self.record_times+1
+                self.curently_pushing.remove(ob_id)
+                # self.exclusive_donor_variants['CTGAAACGTAAGTTCC-1']
+            count11+=1 
+
+    def combine_written_files(self):#this is for VCF loader class
+        to_export = self.exclusive_donor_variants
+        for val1 in self.record_dict.values():
+            # here remove the int files.
+            print(f"merging temp file: {val1}")
+            with open(val1, 'rb') as f:
+                loaded_dict = pickle.load(f)
+                for k1 in loaded_dict.keys():
+                    try:
+                        to_export[k1]=to_export[k1].union(loaded_dict[k1])
+                    except:
+                        to_export[k1]=set()
+                        to_export[k1]=to_export[k1].union(loaded_dict[k1])
+            os.remove(val1)
+        return to_export
+    
+    
+    def load_VCF_batch_paralel(self):
+        """
+        Load whole VCF file by utilising multiple cores to speed up loading of large cell files
+        -------------------
+        Initially designed to load VCF from cellSNP output, requiring 
+        1) all variants have the same format list;
+        2) a line starting with "#CHROM", with sample ids.
+        If these two requirements are satisfied, this function also supports general
+        VCF files, e.g., genotype for multiple samples.
+
+        Note, it may take a large memory, please filter the VCF with bcftools first.
+        """
+        
+        vcf_file = self.vcf_file
+        biallelic_only = self.biallelic_only
+        load_sample= self.load_sample
+        sparse = self.sparse
+        format_list= self.format_list
+        pool = mp.Pool(cpus)
+        
+        
+        import time
+        if vcf_file[-3:] == ".gz" or vcf_file[-4:] == ".bgz":
+            infile = gzip.open(vcf_file, "rb")
+            is_gzip = True
+        else:
+            infile = open(vcf_file, "r")
+            is_gzip = False
+        
+        FixedINFO = {}
+        contig_lines = []
+        comment_lines = []
+        var_ids, obs_ids, obs_dat = [], [], []
+        count=0 #57077    
+        for line in infile:
+            count+=1
+            # if count>10000:
+            #     break
+            if is_gzip:
+                line = line.decode('utf-8')
+            if line.startswith("#"):
+                if line.startswith("##contig="):
+                    contig_lines.append(line.rstrip())
+                if line.startswith("#CHROM"):
+                    if load_sample:
+                        obs_ids = line.rstrip().split("\t")[9:]
+                        for ob_id in obs_ids:
+                            self.exclusive_donor_variants[ob_id]=set()
+                    key_ids = line[1:].rstrip().split("\t")[:8]
+                    for _key in key_ids:
+                        FixedINFO[_key] = []
+                else:
+                    comment_lines.append(line.rstrip())
+            else:
+                pool.apply_async(self.load_sample_mp, args=([line,obs_ids,count,format_list]),callback=self.append_results)
+                del line
+        self.last_count=count
+        pool.close()
+        pool.join()
+        
+        output = self.combine_written_files()
+        return output
+    
+
+class Concordances:
+    def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites):
+        self.reset()
+        self.donor_assignments_table=donor_assignments_table
+        self.cell_assignments_table=cell_assignments_table
+        self.exclusive_don_variants=exclusive_don_variants
+        self.exclusive_cell_variants=exclusive_cell_variants
+        self.donor_distinct_sites=donor_distinct_sites
+        self.record_dict={}
+
+
+    def norm_genotypes(self,expected_vars):
+        expected_vars = pd.DataFrame(expected_vars)
+        if len(expected_vars) > 0:
+            split_str=expected_vars[0].str.split("_")
+            expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3]
+            expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1]
+            expected_vars['vars'] = split_str.str[4]
+            expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False)
+            expected_vars = expected_vars[expected_vars['vars']!='./.']
+            expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0'
+            expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars']
+        return expected_vars
+        
+
+    def reset(self):
+        self.cell_concordance_table ={}
+
+
+    def read_concordance(self, expected_vars, cell_vars):
+        '''
+        get read level concordance using DP, AD and OTH format fields
+        ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="total counts for ALT and REF">
+        ##FORMAT=<ID=AD,Number=1,Type=Integer,Description="total counts for ALT">
+        ##FORMAT=<ID=OTH,Number=1,Type=Integer,Description="total counts for other bases from REF and ALT">
+        '''
+        if not len(expected_vars) == len(cell_vars):
+            print("length mismatch between expected vars and cell vars")
+            exit(1)
+
+        # print(expected_vars)
+        # print(cell_vars)
+        # exit(0)
+
+        discordant_site_details = {
+            'expect_hom_alt':{},
+            'expect_hom_ref':{},
+            'expect_het':{}
+        }
+
+        total_sites = len(expected_vars)
+        #add cols for DP, AD< OTH
+        cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int)
+        cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int)
+        cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int)
+        total_dp = cell_vars['DP'].sum()
+        total_oth = cell_vars['OTH'].sum()
+        total_reads = total_dp + total_oth
+
+        # expected genotype 0/0
+        expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0']
+        hom_ref_sites = set(expected_hom_ref['ids'])
+        cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)]
+        ad_hom_ref = cell_vars2['AD'].sum()
+        oth_hom_ref = cell_vars2['OTH'].sum() 
+        discordant_hom_ref = ad_hom_ref + oth_hom_ref
+        cell_vars2_disc = cell_vars2[(cell_vars2['AD'] > 0) | (cell_vars2['OTH'] > 0)]
+        cell_vars2_disc_sites = set(cell_vars2_disc['ids'])
+        for s2 in cell_vars2_disc_sites:
+            sdf = cell_vars2_disc[cell_vars2_disc['ids'] == s2]
+            if not len(sdf) == 1:
+                print('error, 1 variant expected')
+                print(sdf)
+            discordant_site_details['expect_hom_ref'][s2] = {}
+            discordant_site_details['expect_hom_ref'][s2]['AD'] = sdf['AD'].sum()
+            discordant_site_details['expect_hom_ref'][s2]['found_in_other_donor'] = 'no'
+        # print(cell_vars2_disc)
+        # print(cell_vars2)
+        # exit(0)
+
+        # expected genotype 0/1 or 1/0
+        hets = ['0/1', '1/0']
+        expected_het = expected_vars[expected_vars['vars'].isin(hets)]
+        het_sites = set(expected_het['ids'])
+        cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)]
+        discordant_het = cell_vars3['OTH'].sum()
+        # cell_vars3_disc = cell_vars3[cell_vars3['OTH'] > 0]
+        # cell_vars3_disc_sites = set(cell_vars3_disc['ids'])
+        # for s3 in cell_vars3_disc_sites:
+        #     sdf = cell_vars3_disc[cell_vars3_disc['ids'] == s3]
+        #     if not len(sdf) == 1:
+        #         print('error, 1 variant expected')
+        #         print(sdf)
+            # discordant_site_details['expect_het'][s3] = {}
+            # discordant_site_details['expect_het'][s3]['OTH'] = sdf['OTH'].sum() 
+
+        # expected genotype 1/1
+        expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1']
+        hom_alt_sites = set(expected_hom_alt['ids'])
+        cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)]
+        cell_vars4['non_ref_reads'] = cell_vars4['AD'] = cell_vars4['OTH']
+        cell_vars4['ref_reads'] = cell_vars4['DP'] - cell_vars4['non_ref_reads']
+        # DP + OTH - AD
+        ad_hom_alt = cell_vars4['AD'].sum()
+        dp_hom_alt = cell_vars4['DP'].sum()
+        oth_hom_alt = cell_vars4['OTH'].sum()
+        discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt
+        cell_vars4_disc = cell_vars4[(cell_vars4['ref_reads'] > 0) | (cell_vars4['OTH'] > 0)]
+        cell_vars4_disc_sites = set(cell_vars4_disc['ids'])
+        for s4 in cell_vars4_disc_sites:
+            sdf = cell_vars4_disc[cell_vars4_disc['ids'] == s4]
+            if not len(sdf) == 1:
+                print('error, 1 variant expected')
+                print(sdf)
+            discordant_site_details['expect_hom_alt'][s4] = {}
+            discordant_site_details['expect_hom_alt'][s4]['REF'] = sdf['ref_reads'].sum()
+            discordant_site_details['expect_hom_alt'][s4]['found_in_other_donor'] = 'no' 
+
+        discordant_reads =  discordant_hom_ref + discordant_het + discordant_hom_alt
+
+        return total_sites, total_reads, discordant_reads, discordant_site_details
+    
+
+    def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes):
+        '''
+        take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant 
+        sites and relaxed concordant sites and list of discordant sites1
+        1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype
+        2) if you have a 0/0 you can not get a 1/1 or 0/1
+        3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 1/1
+        So - each obversed cellsnp allele must be in the array SNP gtype
+        '''
+        true_discordant = 0
+        relaxed_concordant = 0
+        discordant_vars = []
+        concordant_vars = []
+        for i in range(0, len(snp_gtypes)):
+            discordant = False
+            snp_data = snp_gtypes[i].split('_')
+            cellsnp_data = cellsnp_gtypes[i].split('_')
+
+            # the below will no longer work due to differing length of input strings
+            # snp_alleles = [snp_gtypes[i][-3], snp_gtypes[i][-1]]
+            # cellsnp_alleles = [cellsnp_gtypes[i][-3], cellsnp_gtypes[i][-1]]
+
+
+            snp_alleles = [snp_data[4][0], snp_data[4][2]]
+            cellsnp_alleles = [cellsnp_data[4][0], cellsnp_data[4][2]]
+
+            snp_alleles_set = set(snp_alleles)
+            cellsnp_alleles_set = set(cellsnp_alleles)
+            
+            snp_var = ('_').join(snp_data[0:4])
+            cellsnp_var = ('_').join(cellsnp_data[0:4])
+
+            if not cellsnp_var == snp_var:
+                print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i])
+                exit(1)
+            else:
+                for allele in cellsnp_alleles_set:
+                    if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant
+                        discordant = True
+            
+            if discordant == True:
+                # print(cellsnp_var)
+                # print(cellsnp_data)
+                # exit(0)
+                true_discordant+=1
+                discordant_vars.append(cellsnp_var)
+            else:
+                concordant_vars.append(cellsnp_var)
+                relaxed_concordant+=1
+
+        return true_discordant, relaxed_concordant, discordant_vars,concordant_vars
+    
+
+    def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars):            
+        cell_vars_norm = self.norm_genotypes(cell_vars)
+        if len(cell_vars_norm) > 0:
+            Total_Overlappin_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids']))
+            expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlappin_sites)]
+            cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlappin_sites)]
+            Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo']))
+            #find sites that may be discordant - this will include hets which are not truly discordant
+            Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo'])
+            disc = pd.DataFrame(Discordant_sites,columns=['combo_x'])
+            df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos')
+            disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x')
+            disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y']
+            #disc_sites = ';'.join(disc2['expected_retrieved'])
+            true_discordant_count, relaxed_concordant_count, discordant_vars, concordant_vars = self.get_strict_discordance(disc2['0_y'], disc2['0_x'])
+            total_concordant_sites = len(Concordant_Sites) + relaxed_concordant_count
+            #find discordant reads
+            total_sites, total_reads, discordant_reads, discordant_site_details = self.read_concordance(expected_vars2, cell_vars2)
+
+            return total_sites, true_discordant_count, total_concordant_sites, total_reads, discordant_reads, discordant_site_details, discordant_vars, concordant_vars
+
+
+    def set_results(self,to_set,id):
+        # Recod to disk to save the loading mmeory time.
+        with open(f'tmp_{id}.pkl', 'wb') as f:
+            pickle.dump(to_set, f)
+        self.record_dict[id]=f'tmp_{id}.pkl'
+        
+    def append_results_cell_concordances(self,result,cell_concordance_table):
+        #[cell1, donor_gt_match, donor_gt_match_cohort, total_sites, true_discordant_count, total_concordant_sites, total_reads, 
+        # discordant_reads, discordant_vars,discordant_vars_in_pool_str, count]
+        count=result[10]
+        
+        try:
+            percent_discordant = result[4]/(result[4]+result[5])*100
+        except:
+            percent_discordant = 0
+
+        try:
+            read_discordance = result[7]/result[3]
+        except:
+            read_discordance = 0
+
+        # print(count)
+        same_as_asigned_donor = result[12]==result[1]
+        cell_concordance_table[f'{result[0]} --- {result[1]}'] = {'GT 1':result[0],
+                                                                'GT 2':result[1],
+                                                                'Cohort': result[2],
+                                                                'Nr_Concordant':result[5],
+                                                                'Nr_Discordant':result[4],
+                                                                'Percent_Discordant':percent_discordant,
+                                                                'Total_sites': result[3],
+                                                                'Total_reads': result[6],
+                                                                'Discordant_reads': result[7],
+                                                                'Discordant_reads_by_n_sites': read_discordance,
+                                                                'Discordant_sites_in_pool': result[9],
+                                                                'Discordant_Site_Identities':(';').join(result[8]),
+                                                                'Lowest_Disconcordance_value_in_all_donors':result[11],
+                                                                'Donor_With_Lowest_DisConcordance':result[12],
+                                                                'Concordant_Site_Identities':result[13],
+                                                                'same_as_asigned_donor':same_as_asigned_donor
+                                                                }   
+        
+        # if (count % 200 == 0):
+        #     print(f'recording and resetting memory {count}')
+        #     # self.record_dict[count]=self.exclusive_donor_variants
+        #     self.set_results(self.cell_concordance_table,count)
+        #     self.reset()  
+        # _=""
+        return cell_concordance_table
+        
+    def combine_written_files(self):#this one is for concordance class
+        to_export = self.cell_concordance_table
+        for val1 in self.record_dict.values():
+            # here remove the int files.
+            print(f"merging temp file: {val1}")
+            with open(val1, 'rb') as f:
+                loaded_dict = pickle.load(f)
+                for k1 in loaded_dict.keys():
+                    to_export[k1]=loaded_dict[k1]
+            os.remove(val1)
+        return to_export
+
+    def analyse_donor(self,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm):
+        donor_concordance_table = {}
+        for cell1 in Cells_to_keep_pre:
+            count+=1
+            # if count>10:
+            #     break
+            cell_vars = exclusive_cell_variants[cell1]
+            # cell_vars_dp = exclusive_cell_variants_dp[cell1]
+
+            # self.cell_concordance_table[f'{cell1} --- {donor_gt_match}']={}
+            # pool.apply_async(self.concordance_dable_production, args=([expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count]),callback=self.append_results_cell_concordances)          
+            result1 = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data)
+            # if (result1==None):
+            #     _='test'
+            donor_concordance_table = self.append_results_cell_concordances(result1,donor_concordance_table)
+            # print('Done')
+        return donor_concordance_table
+
+    def combine_concordances(self,result):
+        # print('res')
+        self.cell_concordance_table = self.cell_concordance_table | result
+
+    def conc_table(self):
+        donor_assignments_table=self.donor_assignments_table
+        cell_assignments_table=self.cell_assignments_table
+        exclusive_don_variants=self.exclusive_don_variants
+        exclusive_cell_variants= self.exclusive_cell_variants
+        donor_list = exclusive_don_variants.keys()
+        
+        pool = mp.Pool(cpus)
+        count = 0
+
+        #create a list of variants that are on each donor genotype file
+        vars_per_donor_gt = {}
+        for don_id in donor_list:
+            donor_gt_vars = list(exclusive_don_variants[don_id])
+            donor_gt_vars = pd.DataFrame(donor_gt_vars)
+            donor_gt_vars = self.norm_genotypes(donor_gt_vars)
+            #donor_gt_vars = donor_gt_vars[donor_gt_vars['vars'] != '0/0'] ref/ref now wanted too
+            donor_gt_varids = list(donor_gt_vars['ids'])
+            vars_per_donor_gt[don_id] = donor_gt_varids
+        #work out what cohort each donor belongs to
+        donor_cohorts = {}
+        for don_id in donor_list:
+            cohort = 'UNKNOWN'
+            donor_split = don_id.split("_")
+            if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]):
+                cohort = 'UKB'
+            elif (len(donor_split) == 3) and (len(donor_split[0]) == 14):
+                cohort = 'ELGH'
+            donor_cohorts[don_id] = cohort
+
+        all_donor_data={}
+        # here we calvculate all the expected donor datasets
+        for row1 in exclusive_don_variants.keys():
+            # donor_in_question = row1['donor_query']
+            donor_gt_match = row1
+            expected_vars_of_other_donor = self.exclusive_don_variants[donor_gt_match]
+            expected_vars_norm_of_other_donor = self.norm_genotypes(expected_vars_of_other_donor)
+            all_donor_data[donor_gt_match]=expected_vars_norm_of_other_donor
+        # print(all_donor_data)
+        # exit(0)
+
+        for i,row1 in donor_assignments_table.iterrows():
+            donor_in_question = row1['donor_query']
+            donor_gt_match = row1['donor_gt']
+            if (donor_gt_match=='NONE'):
+                continue
+            
+            donor_gt_match_cohort = donor_cohorts[donor_gt_match]
+            Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell']))
+            expected_vars = exclusive_don_variants[donor_gt_match]
+            expected_vars_norm = self.norm_genotypes(expected_vars)
+            try:
+                # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor.
+                dds = self.donor_distinct_sites[donor_gt_match]
+            except:
+                continue
+            if cpus==1:
+                result_conc = self.analyse_donor(Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm)
+                self.combine_concordances(result_conc)
+            else:
+                pool.apply_async(self.analyse_donor, args=([Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm]),callback=self.combine_concordances)          
+                
+        pool.close()
+        pool.join()
+        output = self.combine_written_files()
+        return output
+    
+        
+    def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match, donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data):
+        #Nr_donor_distinct_sites = len(dds)
+        total_sites, true_discordant_count, total_concordant_sites, total_reads, discordant_reads, discordant_site_details, discordant_vars, concordant_vars = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars)
+        #Nr_Concordant = len(Concordant_Sites)
+        #Nr_Relaxed_concordant = Nr_Concordant + relaxed_concordant_count
+        #Nr_Discordant = len(Discordant_sites)
+        #Nr_Total_Overlapping_sites = len(Total_Overlappin_sites)
+        #Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Concordant_Sites)))
+        #Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos'])
+        #find if the discordant vars are in any of the other donors
+        discordant_vars_in_pool = []
+        donor_table_of_concordances = []
+        # print(discordant_site_details)
+        # exit(0)
+        for donor in vars_per_donor_gt:
+            # if not donor == donor_gt_match:
+            #     donor_cohort = donor_cohorts[donor]
+            #     donor_vars = vars_per_donor_gt[donor]
+            #     common_vars = list(set(discordant_vars) & set(donor_vars))
+            #     common_var_count = str(len(common_vars))
+            #     print(common_vars)
+            #     print(discordant_site_details)
+            #     exit(0)
+                #deal with expected hom alt
+                #deal with expected hom ref
+                #deal with expected het: discordant = OTH
+                # donor_cohort_common = donor + ":" + donor_cohort + ":" + common_var_count
+                # discordant_vars_in_pool.append(donor_cohort_common)
+                
+            # Here we want to calculate the number of discordant sites in other donors and see if in terms of concordance the same donor is picked as per GT assignment.
+            # We do this to investigate the potential of a cell coming from this other donor.
+            
+            expected_vars_norm_of_other_donor = all_donor_data[donor]
+            total_sites_otherDonor, true_discordant_count_otherDonor, total_concordant_sites_otherDonor, total_reads_otherDonor, discordant_reads_otherDonor, discordant_site_detail_otherDonor, discordant_vars_otherDonor, concordant_vars_otherDonor = self.retrieve_concordant_discordant_sites(expected_vars_norm_of_other_donor,cell_vars)
+            concordant_percent_in_other_donor= total_concordant_sites_otherDonor/total_sites_otherDonor*100
+            discordant_percent_in_other_donor= true_discordant_count_otherDonor/total_sites_otherDonor*100
+            donor_table_of_concordances.append({'donor':donor,'concordant_percent_in_other_donor':concordant_percent_in_other_donor,'discordant_percent_in_other_donor':discordant_percent_in_other_donor})
+            
+            if not donor == donor_gt_match:
+                print(donor)
+                #print(expected_vars_norm_of_other_donor)
+                donor_cohort = donor_cohorts[donor]
+                donor_vars = vars_per_donor_gt[donor]
+                common_vars = list(set(discordant_vars) & set(donor_vars))
+                #common_var_count = str(len(common_vars))
+                common_var_count = 0
+                ####deal with expected hom alt
+                print('looking at home alts')
+                expected_hom_alts = list(discordant_site_details['expect_hom_alt'].keys())
+                # print(expected_hom_alts)
+                # exit(0)
+                mask = expected_vars_norm_of_other_donor['ids'].isin(expected_hom_alts)
+                expected_hom_alts_in_other_donors = expected_vars_norm_of_other_donor[mask]
+                #expected_hom_alts_in_other_donors = expected_vars_norm_of_other_donor[expected_vars_norm_of_other_donor['ids'] in expected_hom_alts]
+                # print(expected_hom_alts_in_other_donors)
+                #can any other donors contribute a ref read?
+                refgts = ['0/0', '0/1', '1/0']
+                mask2 = expected_hom_alts_in_other_donors['vars'].isin(refgts)
+                expected_hom_alts_in_other_donors_has_ref = expected_hom_alts_in_other_donors[mask2]
+                if (len(expected_hom_alts_in_other_donors_has_ref) > 0):
+                    has_ref_vars = list(expected_hom_alts_in_other_donors_has_ref['ids'])
+                    for v in has_ref_vars:
+                        discordant_site_details['expect_hom_alt'][v]['found_in_other_donor'] = 'yes'
+                        common_var_count += 1
+                ####deal with expected hom ref
+                print('looking at hom refs')
+                expected_hom_refs = list(discordant_site_details['expect_hom_ref'].keys())
+                # print(expected_hom_refs)
+                mask3 = expected_vars_norm_of_other_donor['ids'].isin(expected_hom_refs)
+                expected_hom_refs_in_other_donors = expected_vars_norm_of_other_donor[mask3]
+                # print(expected_hom_refs_in_other_donors)
+                #can any other donors contribute and alt read?
+                altgts = ['0/1', '1/0', '1/1']
+                mask4 = expected_hom_refs_in_other_donors['vars'].isin(altgts)
+                expected_hom_refs_in_other_donors_has_alt = expected_hom_refs_in_other_donors[mask4]
+                print(expected_hom_refs_in_other_donors_has_alt)
+                if (len(expected_hom_refs_in_other_donors_has_alt) > 0):
+                    has_alt_vars = list(expected_hom_refs_in_other_donors_has_alt['ids'])
+                    # print(has_alt_vars)
+                    for v in has_alt_vars:
+                        discordant_site_details['expect_hom_ref'][v]['found_in_other_donor'] = 'yes'
+                        common_var_count += 1
+                        # print(common_var_count)
+                ###deal with expected het: discordant = OTH
+                #skip hets for now
+                print('one')
+                print(donor)
+                print(common_var_count)
+                print(donor_cohort)
+                donor_cohort_common = donor + ":" + donor_cohort + ":" + common_var_count
+                print(donor_cohort_common)
+                print('two')
+                discordant_vars_in_pool.append(donor_cohort_common)
+                print('three')
+                #work out how many ref/alt reads could be contamination and how many must be errors
+        print(discordant_site_details)
+        discordant_ref_expected_error = 0
+        discordant_alt_expected_error = 0
+        discordant_ref_expected_contamination = 0
+        discordant_alt_expected_contamination = 0
+        for v1 in discordant_site_details['expect_hom_alt'].keys():
+            if discordant_site_details['expect_hom_alt'][v1]['found_in_other_donor'] == 'no':
+                discordant_alt_expected_error = discordant_alt_expected_error + discordant_site_details['expect_hom_alt'][v1]['REF']
+            else:
+                discordant_alt_expected_contamination = discordant_alt_expected_contamination + discordant_site_details['expect_hom_alt'][v1]['REF']
+
+        for v2 in discordant_site_details['expect_hom_ref'].keys():
+            if discordant_site_details['expect_hom_ref'][v2]['found_in_other_donor'] == 'no':
+                discordant_ref_expected_error = discordant_ref_expected_error + discordant_site_details['expect_hom_ref'][v2]['REF']
+            else:
+                discordant_ref_expected_contamination = discordant_ref_expected_contamination + discordant_site_details['expect_hom_ref'][v2]['REF']
+
+
+        discordant_vars_in_pool_str = (";").join(discordant_vars_in_pool)
+        concordant_vars_in_pool_str = (";").join(concordant_vars)
+
+        #print(discordant_vars_in_pool_str)
+        print(discordant_ref_expected_error)
+        print(discordant_alt_expected_error)
+        print(discordant_ref_expected_contamination)
+        print(discordant_alt_expected_contamination)
+        print(discordant_site_details)
+        exit(0)
+
+        DF = pd.DataFrame(donor_table_of_concordances)
+        Donor_With_Lowest_DisConcordance = DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['donor'].values[0]
+        Lowest_Disconcordance_value_in_all_donors= DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['discordant_percent_in_other_donor'].values[0]
+        
+        return [cell1, donor_gt_match, donor_gt_match_cohort, total_sites, true_discordant_count, total_concordant_sites, total_reads, discordant_reads, discordant_vars,discordant_vars_in_pool_str, count,Lowest_Disconcordance_value_in_all_donors,Donor_With_Lowest_DisConcordance,concordant_vars_in_pool_str]
+        #return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,Nr_Relaxed_concordant, Nr_strict_discordant, relaxed_concordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites,
+        #        Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,discordant_sites, total_sites, total_reads, discordant_reads]
+    
+
+def find(lst, a):
+    return [i for i, x in enumerate(lst) if x==a ]
+    
+    
+def norm_genotypes(expected_vars):
+    expected_vars = pd.DataFrame(expected_vars)
+    split_str=expected_vars[0].str.split("_")
+    expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3]
+    expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1]
+    expected_vars['vars'] = split_str.str[4]
+    expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False)
+    expected_vars = expected_vars[expected_vars['vars']!='./.']
+    expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0'
+    expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars']
+    return expected_vars
+
+
+def donor_exclusive_sites(exclusive_don_variants2):
+    # Here we generate a function for determining the sites that are donor exclusive
+    donor_distinct_sites = {}
+    for col1 in exclusive_don_variants2.keys():
+        comparisons =[]
+        to_compare = []
+        for col2 in exclusive_don_variants2.keys():
+            if col1==col2:
+                # we set this as the unique entry
+                # print('1')
+                to_compare = set(exclusive_don_variants2[col2])
+            else:
+                # We combine all the variants in one list
+                comparisons+=list(exclusive_don_variants2[col2])
+                # print('2')
+        # print('comparison')  
+        comparisons_all = set(comparisons) 
+        comparisons_all_norm = norm_genotypes(comparisons_all)
+        comparisons_all=set(comparisons_all_norm['combo'])
+        
+        to_compare = set(to_compare)
+        to_compare_norm = norm_genotypes(to_compare)
+        to_compare=set(to_compare_norm['combo'])
+        # Make sure we account for hap types - phased/unphased 
+        distinct_donor_sites = to_compare - comparisons_all
+        donor_distinct_sites[col1]=distinct_donor_sites
+        # Perform the distinct set function.
+    return donor_distinct_sites   
+
+
+def get_options():
+    '''
+    Get options from the command line
+    '''
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--cpus', action='store', required=True, type=int)
+    parser.add_argument('--cell_vcf', action='store', required=True)
+    parser.add_argument('--cell_assignments', action='store', required=True)
+    parser.add_argument('--donor_assignments', action='store', required=True)
+    parser.add_argument('--gt_match_vcf', action='store', required=True)
+    parser.add_argument('--expected_vcf', action='store', required=True)
+    parser.add_argument('--outfile', action='store', required=True)
+    parser.add_argument('--exclude_a2g', action='store_true')
+    parser.add_argument('--debug', action='store_true')
+    args = parser.parse_args()
+
+    return args
+
+
+if __name__ == "__main__":
+
+    options = get_options()
+    cpus = options.cpus
+    cell_vcf=options.cell_vcf
+    donor_assignments=options.donor_assignments
+    gt_match_vcf=options.gt_match_vcf
+    expected_vcf=options.expected_vcf
+    cell_assignments=options.cell_assignments
+    outfile = options.outfile
+    exclude_a2g = options.exclude_a2g
+
+    exclusive_donor_variants = {} #This is where results are populated when mp process i used.
+    curently_pushing =[] #this is a lock value to check if rhe curent field is updated so to avaid the race for update
+    All_Results={}
+    cell_concordance_table = {}
+
+    donor_assignments_table = pd.read_csv(donor_assignments)
+    cell_assignments_table = pd.read_csv(cell_assignments,sep='\t')
+
+    if options.debug:
+        with open('tmp_GT_Expected_variants.pkl', 'rb') as f:
+            GT_Expected_variants = pickle.load(f)
+        with open('tmp_GT_Matched_variants.pkl', 'rb') as f:
+            GT_Matched_variants = pickle.load(f)   
+        with open('tmp_exclusive_cell_variants.pkl', 'rb') as f:
+            exclusive_cell_variants = pickle.load(f) 
+        with open('tmp_donor_distinct_sites.pkl', 'rb') as f:
+            donor_distinct_sites = pickle.load(f)  
+        with open('tmp_exclusive_don_variants.pkl', 'rb') as f:
+            exclusive_don_variants = pickle.load(f) 
+    else:  
+        print('---Loading genotype VCF----')   
+        if (os.path.exists(gt_match_vcf)):
+            loader2 = VCF_Loader(gt_match_vcf, exclude_a2g, biallelic_only=True,
+                            sparse=False, format_list=['GT'])
+            GT_Matched_variants = loader2.load_VCF_batch_paralel()
+            del loader2
+        else:
+            GT_Matched_variants = {}
+        
+        with open(f'tmp_GT_Matched_variants.pkl', 'wb') as f:
+            pickle.dump(GT_Matched_variants, f)
+        
+        print('---Loading cell VCF----')
+        tic = time.perf_counter()
+        loader1 = VCF_Loader(cell_vcf, exclude_a2g, biallelic_only=True,
+                            sparse=False, format_list=['GT', 'DP', 'AD', 'OTH'])
+        exclusive_cell_variants = loader1.load_VCF_batch_paralel()
+        del loader1
+        toc = time.perf_counter()
+            
+        with open(f'tmp_exclusive_cell_variants.pkl', 'wb') as f:
+            pickle.dump(exclusive_cell_variants, f)
+        print(f"Loading took {toc - tic:0.4f} seconds")
+
+        print('---Loading expected VCF----')
+        loader3 = VCF_Loader(expected_vcf, exclude_a2g, biallelic_only=True,
+                        sparse=False, format_list=['GT'])
+        GT_Expected_variants = loader3.load_VCF_batch_paralel()
+        del loader3
+
+        with open(f'tmp_GT_Expected_variants.pkl', 'wb') as f:
+            pickle.dump(GT_Expected_variants, f)
+ 
+        print('---Variant files loaded----')       
+        
+        exclusive_don_variants = GT_Expected_variants.keys()
+        content = [x for x in exclusive_don_variants if not x.startswith('donor')]
+        GT_Expected_variants = {key: GT_Expected_variants[key] for key in content}
+        
+        exclusive_don_variants = GT_Matched_variants.keys()
+        content = [x for x in exclusive_don_variants if not x.startswith('donor')]
+        GT_Matched_variants = {key: GT_Matched_variants[key] for key in content}
+        
+        exclusive_don_variants = GT_Expected_variants
+        for key in GT_Matched_variants.keys():
+            if key in exclusive_don_variants.keys():
+                _=''
+            else:
+                exclusive_don_variants[key]=GT_Matched_variants[key]
+        
+        with open(f'tmp_exclusive_don_variants.pkl', 'wb') as f:
+            pickle.dump(exclusive_don_variants, f)
+        donor_distinct_sites = donor_exclusive_sites(exclusive_don_variants)
+        with open(f'tmp_donor_distinct_sites.pkl', 'wb') as f:
+            pickle.dump(donor_distinct_sites, f)
+        
+    print('---donor_distinct_sites calculated----')
+
+
+    conc1 = Concordances(donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites)
+    cell_concordance_table = conc1.conc_table()
+    
+    result = pd.DataFrame(cell_concordance_table).T
+    result.to_csv(outfile,sep='\t')
+    site_identities = result[['Concordant_Site_Identities','Discordant_Site_Identities']]
+    result.drop(columns=['Concordant_Site_Identities'],inplace=True)
+    result.to_csv(outfile,sep='\t')
+    site_identities.to_csv(f"site_identities_{outfile}",sep='\t')
+    print('Processing Done')
\ No newline at end of file

From 22cd7cda2aa2f1778c987015a0235f40b1d8a3de Mon Sep 17 00:00:00 2001
From: Ruth Eberhardt <re3@sanger.ac.uk>
Date: Thu, 17 Aug 2023 13:37:29 +0100
Subject: [PATCH 2/3] read counts - error vs contamination

---
 ...ther_donors_find_best_donor_read_counts.py | 50 ++++++-------------
 1 file changed, 15 insertions(+), 35 deletions(-)

diff --git a/bin/find_discordant_sites_in_other_donors_find_best_donor_read_counts.py b/bin/find_discordant_sites_in_other_donors_find_best_donor_read_counts.py
index 9700f255..34764c9a 100755
--- a/bin/find_discordant_sites_in_other_donors_find_best_donor_read_counts.py
+++ b/bin/find_discordant_sites_in_other_donors_find_best_donor_read_counts.py
@@ -302,6 +302,7 @@ def read_concordance(self, expected_vars, cell_vars):
                 print(sdf)
             discordant_site_details['expect_hom_ref'][s2] = {}
             discordant_site_details['expect_hom_ref'][s2]['AD'] = sdf['AD'].sum()
+            discordant_site_details['expect_hom_ref'][s2]['OTH'] = sdf['OTH'].sum()
             discordant_site_details['expect_hom_ref'][s2]['found_in_other_donor'] = 'no'
         # print(cell_vars2_disc)
         # print(cell_vars2)
@@ -343,6 +344,7 @@ def read_concordance(self, expected_vars, cell_vars):
                 print(sdf)
             discordant_site_details['expect_hom_alt'][s4] = {}
             discordant_site_details['expect_hom_alt'][s4]['REF'] = sdf['ref_reads'].sum()
+            discordant_site_details['expect_hom_alt'][s4]['OTH'] = sdf['OTH'].sum()
             discordant_site_details['expect_hom_alt'][s4]['found_in_other_donor'] = 'no' 
 
         discordant_reads =  discordant_hom_ref + discordant_het + discordant_hom_alt
@@ -434,6 +436,7 @@ def set_results(self,to_set,id):
     def append_results_cell_concordances(self,result,cell_concordance_table):
         #[cell1, donor_gt_match, donor_gt_match_cohort, total_sites, true_discordant_count, total_concordant_sites, total_reads, 
         # discordant_reads, discordant_vars,discordant_vars_in_pool_str, count]
+        # print(result)
         count=result[10]
         
         try:
@@ -463,7 +466,11 @@ def append_results_cell_concordances(self,result,cell_concordance_table):
                                                                 'Lowest_Disconcordance_value_in_all_donors':result[11],
                                                                 'Donor_With_Lowest_DisConcordance':result[12],
                                                                 'Concordant_Site_Identities':result[13],
-                                                                'same_as_asigned_donor':same_as_asigned_donor
+                                                                'same_as_asigned_donor':same_as_asigned_donor,
+                                                                'discordant_reads_error_expected_ref':result[14],
+                                                                'discordant_reads_error_expected_alt':result[15],
+                                                                'discordant_reads_contamination_expected_ref':result[16],
+                                                                'discordant_reads_contamination_expected_alt':result[17]
                                                                 }   
         
         # if (count % 200 == 0):
@@ -615,22 +622,17 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g
             donor_table_of_concordances.append({'donor':donor,'concordant_percent_in_other_donor':concordant_percent_in_other_donor,'discordant_percent_in_other_donor':discordant_percent_in_other_donor})
             
             if not donor == donor_gt_match:
-                print(donor)
-                #print(expected_vars_norm_of_other_donor)
                 donor_cohort = donor_cohorts[donor]
                 donor_vars = vars_per_donor_gt[donor]
                 common_vars = list(set(discordant_vars) & set(donor_vars))
                 #common_var_count = str(len(common_vars))
                 common_var_count = 0
                 ####deal with expected hom alt
-                print('looking at home alts')
+                # print('looking at home alts')
                 expected_hom_alts = list(discordant_site_details['expect_hom_alt'].keys())
                 # print(expected_hom_alts)
-                # exit(0)
                 mask = expected_vars_norm_of_other_donor['ids'].isin(expected_hom_alts)
                 expected_hom_alts_in_other_donors = expected_vars_norm_of_other_donor[mask]
-                #expected_hom_alts_in_other_donors = expected_vars_norm_of_other_donor[expected_vars_norm_of_other_donor['ids'] in expected_hom_alts]
-                # print(expected_hom_alts_in_other_donors)
                 #can any other donors contribute a ref read?
                 refgts = ['0/0', '0/1', '1/0']
                 mask2 = expected_hom_alts_in_other_donors['vars'].isin(refgts)
@@ -641,37 +643,24 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g
                         discordant_site_details['expect_hom_alt'][v]['found_in_other_donor'] = 'yes'
                         common_var_count += 1
                 ####deal with expected hom ref
-                print('looking at hom refs')
+                # print('looking at hom refs')
                 expected_hom_refs = list(discordant_site_details['expect_hom_ref'].keys())
-                # print(expected_hom_refs)
                 mask3 = expected_vars_norm_of_other_donor['ids'].isin(expected_hom_refs)
                 expected_hom_refs_in_other_donors = expected_vars_norm_of_other_donor[mask3]
-                # print(expected_hom_refs_in_other_donors)
                 #can any other donors contribute and alt read?
                 altgts = ['0/1', '1/0', '1/1']
                 mask4 = expected_hom_refs_in_other_donors['vars'].isin(altgts)
                 expected_hom_refs_in_other_donors_has_alt = expected_hom_refs_in_other_donors[mask4]
-                print(expected_hom_refs_in_other_donors_has_alt)
                 if (len(expected_hom_refs_in_other_donors_has_alt) > 0):
                     has_alt_vars = list(expected_hom_refs_in_other_donors_has_alt['ids'])
-                    # print(has_alt_vars)
                     for v in has_alt_vars:
                         discordant_site_details['expect_hom_ref'][v]['found_in_other_donor'] = 'yes'
                         common_var_count += 1
-                        # print(common_var_count)
-                ###deal with expected het: discordant = OTH
+                ###deal with expected het: discordant = OTH - leaving this for nnow
                 #skip hets for now
-                print('one')
-                print(donor)
-                print(common_var_count)
-                print(donor_cohort)
-                donor_cohort_common = donor + ":" + donor_cohort + ":" + common_var_count
-                print(donor_cohort_common)
-                print('two')
+                donor_cohort_common = (':').join([donor, donor_cohort, str(common_var_count)])
                 discordant_vars_in_pool.append(donor_cohort_common)
-                print('three')
                 #work out how many ref/alt reads could be contamination and how many must be errors
-        print(discordant_site_details)
         discordant_ref_expected_error = 0
         discordant_alt_expected_error = 0
         discordant_ref_expected_contamination = 0
@@ -684,27 +673,18 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g
 
         for v2 in discordant_site_details['expect_hom_ref'].keys():
             if discordant_site_details['expect_hom_ref'][v2]['found_in_other_donor'] == 'no':
-                discordant_ref_expected_error = discordant_ref_expected_error + discordant_site_details['expect_hom_ref'][v2]['REF']
+                discordant_ref_expected_error = discordant_ref_expected_error + discordant_site_details['expect_hom_ref'][v2]['AD']
             else:
-                discordant_ref_expected_contamination = discordant_ref_expected_contamination + discordant_site_details['expect_hom_ref'][v2]['REF']
-
+                discordant_ref_expected_contamination = discordant_ref_expected_contamination + discordant_site_details['expect_hom_ref'][v2]['AD']
 
         discordant_vars_in_pool_str = (";").join(discordant_vars_in_pool)
         concordant_vars_in_pool_str = (";").join(concordant_vars)
 
-        #print(discordant_vars_in_pool_str)
-        print(discordant_ref_expected_error)
-        print(discordant_alt_expected_error)
-        print(discordant_ref_expected_contamination)
-        print(discordant_alt_expected_contamination)
-        print(discordant_site_details)
-        exit(0)
-
         DF = pd.DataFrame(donor_table_of_concordances)
         Donor_With_Lowest_DisConcordance = DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['donor'].values[0]
         Lowest_Disconcordance_value_in_all_donors= DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['discordant_percent_in_other_donor'].values[0]
         
-        return [cell1, donor_gt_match, donor_gt_match_cohort, total_sites, true_discordant_count, total_concordant_sites, total_reads, discordant_reads, discordant_vars,discordant_vars_in_pool_str, count,Lowest_Disconcordance_value_in_all_donors,Donor_With_Lowest_DisConcordance,concordant_vars_in_pool_str]
+        return [cell1, donor_gt_match, donor_gt_match_cohort, total_sites, true_discordant_count, total_concordant_sites, total_reads, discordant_reads, discordant_vars,discordant_vars_in_pool_str, count,Lowest_Disconcordance_value_in_all_donors,Donor_With_Lowest_DisConcordance,concordant_vars_in_pool_str, discordant_ref_expected_error, discordant_alt_expected_error, discordant_ref_expected_contamination, discordant_alt_expected_contamination]
         #return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,Nr_Relaxed_concordant, Nr_strict_discordant, relaxed_concordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites,
         #        Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,discordant_sites, total_sites, total_reads, discordant_reads]
     

From 0dfb0d38c01901d7ffcb971a72569785f120db1e Mon Sep 17 00:00:00 2001
From: Ruth Eberhardt <re3@sanger.ac.uk>
Date: Fri, 29 Sep 2023 09:31:23 +0100
Subject: [PATCH 3/3] new metrics - discordance at informative and
 uninformative sites

---
 ...ations_donor_exclusive_read_level_noA2G.py | 123 +++++++++++++-----
 1 file changed, 94 insertions(+), 29 deletions(-)

diff --git a/bin/concordance_calculations_donor_exclusive_read_level_noA2G.py b/bin/concordance_calculations_donor_exclusive_read_level_noA2G.py
index d711e054..cc9ebc64 100755
--- a/bin/concordance_calculations_donor_exclusive_read_level_noA2G.py
+++ b/bin/concordance_calculations_donor_exclusive_read_level_noA2G.py
@@ -72,6 +72,8 @@ def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes):
             true_discordant = 0
             relaxed_concordant = 0
             relaxed_concordant_informative = 0
+            relaxed_concordant_uninformative = 0
+            true_discordant_informative = 0
             true_discordant_uninformative = 0
 
             for i in range(0, len(snp_gtypes)):
@@ -105,12 +107,16 @@ def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes):
                     true_discordant+=1
                     if snp_var in self.uninformative_sites:
                         true_discordant_uninformative+=1
+                    elif snp_var in self.informative_sites:
+                        true_discordant_informative+=1
                 else:
                     relaxed_concordant+=1
-                    if snp_var in self.informative_sites:
+                    if snp_var in self.uninformative_sites:
+                        relaxed_concordant_uninformative+=1
+                    elif snp_var in self.informative_sites:
                         relaxed_concordant_informative+=1
 
-            return true_discordant, relaxed_concordant, relaxed_concordant_informative, true_discordant_uninformative
+            return true_discordant, relaxed_concordant, relaxed_concordant_informative, relaxed_concordant_uninformative, true_discordant_informative, true_discordant_uninformative
 
 
         def read_condordance(self, expected_vars, cell_vars):
@@ -129,38 +135,76 @@ def read_condordance(self, expected_vars, cell_vars):
             cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int)
             cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int)
             cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int)
+            #split to informative and uninformative sites
+            mask_i = cell_vars['ids'].isin(self.informative_sites)
+            cell_vars_informative = cell_vars[mask_i]
+            mask_u = cell_vars['ids'].isin(self.uninformative_sites)
+            cell_vars_uninformative = cell_vars[mask_u]
+            informative_sites = len(cell_vars_informative)
+            uninformative_sites = len(cell_vars_uninformative)
+
             total_dp = cell_vars['DP'].sum()
             total_oth = cell_vars['OTH'].sum()
             total_reads = total_dp + total_oth
+            total_dp_inf = cell_vars_informative['DP'].sum()
+            total_oth_inf = cell_vars_informative['OTH'].sum()
+            total_reads_informative = total_dp_inf + total_oth_inf
+            total_dp_uninf = cell_vars_uninformative['DP'].sum()
+            total_oth_uninf = cell_vars_uninformative['OTH'].sum()
+            total_reads_uninformative = total_dp_uninf + total_oth_uninf            
 
             # expected genotype 0/0
             expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0']
             hom_ref_sites = set(expected_hom_ref['ids'])
             cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)]
+            cell_vars_inf_2 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_ref_sites)]
+            cell_vars_uninf_2 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_ref_sites)]
             ad_hom_ref = cell_vars2['AD'].sum()
             oth_hom_ref = cell_vars2['OTH'].sum() 
             discordant_hom_ref = ad_hom_ref + oth_hom_ref
+            ad_hom_ref_inf = cell_vars_inf_2['AD'].sum()
+            oth_hom_ref_inf = cell_vars_inf_2['OTH'].sum() 
+            discordant_hom_ref_informative = ad_hom_ref_inf + oth_hom_ref_inf
+            ad_hom_ref_uninf = cell_vars_uninf_2['AD'].sum()
+            oth_hom_ref_uninf = cell_vars_uninf_2['OTH'].sum() 
+            discordant_hom_ref_uninformative = ad_hom_ref_uninf + oth_hom_ref_uninf
 
             # expected genotype 0/1 or 1/0
             hets = ['0/1', '1/0']
             expected_het = expected_vars[expected_vars['vars'].isin(hets)]
             het_sites = set(expected_het['ids'])
             cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)]
+            cell_vars_inf_3 = cell_vars_informative[cell_vars_informative['ids'].isin(het_sites)]
+            cell_vars_uninf_3 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(het_sites)]
             discordant_het = cell_vars3['OTH'].sum()
+            discordant_het_informative = cell_vars_inf_3['OTH'].sum()
+            discordant_het_uninformative = cell_vars_uninf_3['OTH'].sum()
 
             # expected genotype 1/1
             expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1']
             hom_alt_sites = set(expected_hom_alt['ids'])
             cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)]
+            cell_vars_inf_4 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_alt_sites)]
+            cell_vars_uninf_4 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_alt_sites)]
             # DP + OTH - AD
             ad_hom_alt = cell_vars4['AD'].sum()
             dp_hom_alt = cell_vars4['DP'].sum()
             oth_hom_alt = cell_vars4['OTH'].sum()
             discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt
+            ad_hom_alt_inf = cell_vars_inf_4['AD'].sum()
+            dp_hom_alt_inf = cell_vars_inf_4['DP'].sum()
+            oth_hom_alt_inf = cell_vars_inf_4['OTH'].sum()
+            discordant_hom_alt_informative = (dp_hom_alt_inf + oth_hom_alt_inf) - ad_hom_alt_inf
+            ad_hom_alt_uninf = cell_vars_uninf_4['AD'].sum()
+            dp_hom_alt_uninf = cell_vars_uninf_4['DP'].sum()
+            oth_hom_alt_uninf = cell_vars_uninf_4['OTH'].sum()
+            discordant_hom_alt_uninformative = (dp_hom_alt_uninf + oth_hom_alt_uninf) - ad_hom_alt_uninf
 
             discordant_reads =  discordant_hom_ref + discordant_het + discordant_hom_alt
+            discordant_reads_informative =  discordant_hom_ref_informative + discordant_het_informative + discordant_hom_alt_informative
+            discordant_reads_uninformative =  discordant_hom_ref_uninformative + discordant_het_uninformative + discordant_hom_alt_uninformative
 
-            return total_sites, total_reads, discordant_reads
+            return total_sites, informative_sites, uninformative_sites, total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative
         
 
         def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars):
@@ -171,28 +215,28 @@ def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars):
             cell_vars_norm = self.norm_genotypes(cell_vars)
 
             if len(cell_vars_norm) > 0:
-                Total_Overlappin_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids']))
-                expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlappin_sites)]
-                cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlappin_sites)]
+                Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids']))
+                expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)]
+                cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)]
                 # print(cell_vars_norm)
                 # print(expected_vars2)
                 # print(cell_vars2)
                 # exit(0)
                 Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo']))
-                Discodrant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo'])
-                disc = pd.DataFrame(Discodrant_sites,columns=['combo_x'])
+                Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo'])
+                disc = pd.DataFrame(Discordant_sites,columns=['combo_x'])
                 df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos')
                 disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x')
                 disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y']
                 disc_sites = ';'.join(disc2['expected_retrieved'])
                 #find truly discordant sites
-                true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, true_discordant_uninformative_count = self.get_strict_discordance(disc2['0_y'], disc2['0_x'])
+                true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count = self.get_strict_discordance(disc2['0_y'], disc2['0_x'])
                 #find discordant reads
-                total_sites, total_reads, discordant_reads = self.read_condordance(expected_vars2, cell_vars2)
+                total_sites, informative_sites, uninformative_sites, total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative = self.read_condordance(expected_vars2, cell_vars2)
             else:
-                Total_Overlappin_sites = set()
+                Total_Overlapping_sites = set()
                 Concordant_Sites = set()
-                Discodrant_sites = set()
+                Discordant_sites = set()
                 disc_sites = ''
                 true_discordant_count = 0
                 relaxed_concordant_count = 0
@@ -200,7 +244,7 @@ def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars):
       
                 discordant_reads = 0
 
-            return Concordant_Sites, Discodrant_sites, Total_Overlappin_sites, disc_sites,cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, true_discordant_uninformative_count, total_sites, total_reads, discordant_reads
+            return Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites,cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, total_sites, informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative
         
 
         def set_results(self,to_set,id):
@@ -210,7 +254,7 @@ def set_results(self,to_set,id):
             self.record_dict[id]=f'tmp_{id}.pkl'
         
         def append_results_cell_concordances(self,result):
-            count=result[11]
+            count=result[13]
             try:
                 percent_concordant = result[2]/(result[3]+result[2])*100
             except:
@@ -232,13 +276,22 @@ def append_results_cell_concordances(self,result):
                 percent_strict_discordant = 0
 
             try:
-                read_discordance = result[15]/result[13]
+                read_discordance = result[21]/result[15]
             except:
                 read_discordance = 0
 
+            donor = result[1]
+            cohort = 'UNKNOWN'
+            donor_split = donor.split("_")
+            if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]):
+                cohort = 'UKB'
+            elif (len(donor_split) == 3) and (len(donor_split[0]) == 14):
+                cohort = 'ELGH'
+
             print(count)
             self.cell_concordance_table[f'{result[0]} --- {result[1]}'] = {'GT 1':result[0],
                                                                     'GT 2':result[1],
+                                                                    'cohort': cohort,
                                                                     'Nr_Concordant':result[2],
                                                                     'Nr_Discordant':result[3],
                                                                     'Nr_Relaxed_concordant':result[4],
@@ -248,14 +301,22 @@ def append_results_cell_concordances(self,result):
                                                                     'Percent_relaxed_concordant': percent_relaxed_concordant,
                                                                     'Percent_strict_discordant': percent_strict_discordant,
                                                                     'Nr_concordant_informative': result[6],
-                                                                    'Nr_discordant_uninformative': result[7],
-                                                                    'NrTotal_Overlapping_sites_between_two_genotypes':result[8],
-                                                                    'Nr_donor_distinct_sites_within_pool_individuals':result[10],
-                                                                    'Number_of_sites_that_are_donor_concordant_and_exclusive':result[9],
-                                                                    'Discordant_Site_Identities':result[12],
-                                                                    'Total_sites': result[13],
-                                                                    'Total_reads': result[14],
-                                                                    'Discordant_reads': result[15],
+                                                                    'Nr_concordant_uninformative': result[7],
+                                                                    'Nr_discordant_informative': result[8],
+                                                                    'Nr_discordant_uninformative': result[9],
+                                                                    'NrTotal_Overlapping_sites_between_two_genotypes':result[10],
+                                                                    'Nr_donor_distinct_sites_within_pool_individuals':result[12],
+                                                                    'Number_of_sites_that_are_donor_concordant_and_exclusive':result[11],
+                                                                    'Discordant_Site_Identities':result[14],
+                                                                    'Total_sites': result[15],
+                                                                    'Total_informative_sites': result[16],
+                                                                    'Total_uninformative_sites': result[17],
+                                                                    'Total_reads': result[18],
+                                                                    'Total_reads_informative': result[19],
+                                                                    'Total_reads_uninformative': result[20],
+                                                                    'Discordant_reads': result[21],
+                                                                    'Discordant_reads_informtive': result[22],
+                                                                    'Discordant_reads_uninformtive': result[23],
                                                                     'Discordant_reads_by_n_sites': read_discordance
                                                                     }   
             
@@ -320,16 +381,18 @@ def conc_table(self):
         
         def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count):
             Nr_donor_distinct_sites = len(dds)
-            Concordant_Sites, Discodrant_sites, Total_Overlappin_sites,discordant_sites,cell_vars_norm, Nr_strict_discordant, relaxed_concordant_count, relaxed_concordant_informative_count, true_discordant_uninformative_count, total_sites, total_reads, discordant_reads = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars)
+            Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites, cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, total_sites, informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars)
             Nr_Concordant = len(Concordant_Sites)
             Nr_Relaxed_concordant = Nr_Concordant + relaxed_concordant_count
-            Nr_Discordant = len(Discodrant_sites)
-            Nr_Total_Overlapping_sites = len(Total_Overlappin_sites)
+            Nr_Discordant = len(Discordant_sites)
+            Nr_Total_Overlapping_sites = len(Total_Overlapping_sites)
             Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Concordant_Sites)))
             Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos'])
 
-            return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,Nr_Relaxed_concordant, Nr_strict_discordant, relaxed_concordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites,
-                    Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,discordant_sites, total_sites, total_reads, discordant_reads]
+            return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,Nr_Relaxed_concordant, true_discordant_count, relaxed_concordant_informative_count, 
+                    relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites,
+                    Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,disc_sites, total_sites, informative_sites, 
+                    uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative]
         
         
 class VCF_Loader:
@@ -549,6 +612,7 @@ def get_options():
     parser.add_argument('--expected_vcf', action='store', required=True)
     parser.add_argument('--informative_sites', action='store', required=True)
     parser.add_argument('--uninformative_sites', action='store', required=True)
+    parser.add_argument('--outfile', action='store', required=True)
     args = parser.parse_args()
 
     return args
@@ -621,6 +685,7 @@ def donor_exclusive_sites(exclusive_don_variants2):
 
     options = get_options()
     cpus = options.cpus
+    outfile = options.outfile
     cell_vcf=options.cell_vcf
     donor_assignments=options.donor_assignments
     gt_match_vcf=options.gt_match_vcf
@@ -715,6 +780,6 @@ def donor_exclusive_sites(exclusive_don_variants2):
     
     # cell_concordance_table = conc_table(donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants)
     result = pd.DataFrame(cell_concordance_table).T
-    result.to_csv('cell_concordance_table_noA2G.tsv',sep='\t')
+    result.to_csv(outfile,sep='\t')
     print('Processing Done')