From 14f817a4ec0441f62545b924ae57eced2b643b4b Mon Sep 17 00:00:00 2001 From: Ruth Eberhardt Date: Tue, 17 Oct 2023 15:18:12 +0100 Subject: [PATCH 1/7] bug fix and add subsample of informative sites --- ...ance_calculations_subsample_informative.py | 910 ++++++++++++++++++ 1 file changed, 910 insertions(+) create mode 100755 bin/concordance_calculations_subsample_informative.py diff --git a/bin/concordance_calculations_subsample_informative.py b/bin/concordance_calculations_subsample_informative.py new file mode 100755 index 00000000..4aae389a --- /dev/null +++ b/bin/concordance_calculations_subsample_informative.py @@ -0,0 +1,910 @@ +#!/usr/bin/env python3 + +__date__ = '2023-05-10' +__version__ = '0.0.1' +import argparse +import sys +import importlib.util +import random +import pickle +import pandas as pd +import gzip +import numpy as np +import time +import multiprocessing as mp +from multiprocessing import Lock +import logging +import os + + +class Concordances: + def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites,informative_sites, uninformative_sites): + self.reset() + self.donor_assignments_table=donor_assignments_table + self.cell_assignments_table=cell_assignments_table + self.exclusive_don_variants=exclusive_don_variants + self.exclusive_cell_variants=exclusive_cell_variants + self.donor_distinct_sites=donor_distinct_sites + self.informative_sites = informative_sites + self.uninformative_sites = uninformative_sites + self.record_dict={} + + def norm_genotypes(self,expected_vars): + expected_vars = pd.DataFrame(expected_vars) + if len(expected_vars) > 0: + split_str=expected_vars[0].str.split("_") + expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] + expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] + expected_vars['vars'] = split_str.str[4] + expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) + expected_vars = expected_vars[expected_vars['vars']!='./.'] + expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' + expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] + return expected_vars + + def reset(self): + self.cell_concordance_table ={} + + # def get_sites_from_tsv(self, sites_file): + # """ + # get sites frm a tsv file where cols are chrom, pos, id, ref, alt + # assumes no multiallelics + # """ + # sites = set() + # with open(sites_file, 'r') as f: + # lines = f.readlines() + # for l in lines: + # linedata = l.split('\t') + # var = ('_').join([linedata[0], linedata[1], linedata[3], linedata[4]]) + # sites.add(var) + # return sites + + + def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes): + ''' + take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant + sites and relaxed concordant sites + 1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype + 2) if you have a 0/0 you can not get a 1/1 or 0/1 + 3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 
1/1 + So - each obversed cellsnp allele must be in the array SNP gtype + ''' + true_discordant = 0 + relaxed_concordant = 0 + relaxed_concordant_informative = 0 + relaxed_concordant_uninformative = 0 + true_discordant_informative = 0 + true_discordant_uninformative = 0 + subset_informative_concordant = 0 + subset_informative_discordant = 0 + + #print(self.uninformative_sites) + #print(self.informative_sites) + + #create sets of the ids (chrom, pos, ref, alt) in each set of genotypes. Filter to the ids present in both + #then filter to informative and uninformative. If uninformative >0 then create a subset of informative + # with the same number of vars (at random) + split_snp_gts=snp_gtypes.str.split("_") + snp_gtypes_ids = set(split_snp_gts.str[0]+'_'+split_snp_gts.str[1]+'_'+split_snp_gts.str[2]+'_'+split_snp_gts.str[3]) + + split_cellsnp_gts=cellsnp_gtypes.str.split("_") + cellsnp_gtypes_ids = set(split_cellsnp_gts.str[0]+'_'+split_cellsnp_gts.str[1]+'_'+split_cellsnp_gts.str[2]+'_'+split_cellsnp_gts.str[3]) + + shared_gts = snp_gtypes_ids.intersection(cellsnp_gtypes_ids) + + shared_informative = shared_gts.intersection(self.informative_sites) + shared_uninformative = shared_gts.intersection(self.uninformative_sites) + # print("shared informative " + str(len(shared_informative))) + # print("shared uninformative " + str(len(shared_uninformative))) + + #store the numbers of informative and uninformative sites shared between cellSNP and gt data as these + #are the sites used for concordance + self.informative_covered = len(shared_informative) + self.uninformative_covered = len(shared_uninformative) + + if len(shared_uninformative) > 0: + #print(len(shared_uninformative)) + # print(len(shared_informative)) + if len(shared_uninformative) <= len(shared_informative): + informative_subset = set(random.sample(shared_informative, len(shared_uninformative))) + else: + informative_subset = set()#if there are more shared uninformative than shared informative we will not subset + # print(informative_subset) + # exit(0) + else: + informative_subset = set() + + # print(informative_subset) + self.informative_subset = informative_subset + + snp_gtypes_set = set(snp_gtypes) + snp_gtypes_set = sorted(snp_gtypes_set) + + cellsnp_gtypes_set = set(cellsnp_gtypes) + cellsnp_gtypes_set = sorted(cellsnp_gtypes_set) + + #for i in range(0, len(snp_gtypes)): + for i in range(0, len(snp_gtypes_set)): + discordant = False + # snp_data = snp_gtypes[i].split('_') + # cellsnp_data = cellsnp_gtypes[i].split('_') + snp_data = snp_gtypes_set[i].split('_') + cellsnp_data = cellsnp_gtypes_set[i].split('_') + + # the below will no longer work due to differing length of input strings + # snp_alleles = [snp_gtypes[i][-3], snp_gtypes[i][-1]] + # cellsnp_alleles = [cellsnp_gtypes[i][-3], cellsnp_gtypes[i][-1]] + + + snp_alleles = [snp_data[4][0], snp_data[4][2]] + cellsnp_alleles = [cellsnp_data[4][0], cellsnp_data[4][2]] + + snp_alleles_set = set(snp_alleles) + cellsnp_alleles_set = set(cellsnp_alleles) + + snp_var = ('_').join(snp_data[0:4]) + cellsnp_var = ('_').join(cellsnp_data[0:4]) + + if not cellsnp_var == snp_var: + print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i]) + exit(1) + else: + for allele in cellsnp_alleles_set: + if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant + discordant = True + + if discordant == True: + true_discordant+=1 + if snp_var in self.uninformative_sites: + true_discordant_uninformative+=1 + 
elif snp_var in self.informative_sites: + true_discordant_informative+=1 + else: + relaxed_concordant+=1 + if snp_var in self.uninformative_sites: + relaxed_concordant_uninformative+=1 + elif snp_var in self.informative_sites: + relaxed_concordant_informative+=1 + + + if len(shared_uninformative) > 0: + if snp_var in informative_subset: + if discordant == True: + subset_informative_discordant+=1 + else: + subset_informative_concordant+=1 + + # print("conc inf " + str(relaxed_concordant_informative)) + # print("disc inf " + str(true_discordant_informative)) + + return true_discordant, relaxed_concordant, relaxed_concordant_informative, relaxed_concordant_uninformative, true_discordant_informative, true_discordant_uninformative, subset_informative_concordant, subset_informative_discordant + + + def read_condordance(self, expected_vars, cell_vars): + ''' + get read level concordance using DP, AD and OTH format fields + ##FORMAT= + ##FORMAT= + ##FORMAT= + ''' + # print(len(expected_vars)) + # print(len(cell_vars)) + + if not len(expected_vars) == len(cell_vars): + print("length mismatch between expected vars and cell vars") + exit(1) + + total_sites = len(expected_vars) + #add cols for DP, AD< OTH + cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) + cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) + cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) + #split to informative and uninformative sites + mask_i = cell_vars['ids'].isin(self.informative_sites) + cell_vars_informative = cell_vars[mask_i] + mask_u = cell_vars['ids'].isin(self.uninformative_sites) + cell_vars_uninformative = cell_vars[mask_u] + informative_sites = len(cell_vars_informative) + uninformative_sites = len(cell_vars_uninformative) + mask_s = cell_vars['ids'].isin(self.informative_subset) + cell_vars_informative_subset = cell_vars[mask_s] + informative_subset_sites = len(cell_vars_informative_subset) + # print("Informative sites " + str(len(self.informative_sites))) + # print("uninformative sites " + str(len(self.uninformative_sites))) + # print("informative sites in cell vars " + str(len(cell_vars_informative))) + # print("uninformative sites in cell vars " + str(len(cell_vars_uninformative))) + # print("Informative subset " + str(informative_subset_sites)) + # print(cell_vars_informative_subset) + # exit(0) + + total_dp = cell_vars['DP'].sum() + total_oth = cell_vars['OTH'].sum() + total_reads = total_dp + total_oth + total_dp_inf = cell_vars_informative['DP'].sum() + total_oth_inf = cell_vars_informative['OTH'].sum() + total_reads_informative = total_dp_inf + total_oth_inf + total_dp_uninf = cell_vars_uninformative['DP'].sum() + total_oth_uninf = cell_vars_uninformative['OTH'].sum() + total_reads_uninformative = total_dp_uninf + total_oth_uninf + total_dp_inf_subset = cell_vars_informative_subset['DP'].sum() + total_oth_inf_subset = cell_vars_informative_subset['OTH'].sum() + total_reads_informative_subset = total_dp_inf_subset + total_oth_inf_subset + + # expected genotype 0/0 + expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] + hom_ref_sites = set(expected_hom_ref['ids']) + cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] + cell_vars_inf_2 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_ref_sites)] + cell_vars_uninf_2 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_ref_sites)] + cell_vars_inf_subset_2 = cell_vars_informative_subset[cell_vars_informative_subset['ids'].isin(hom_ref_sites)] + ad_hom_ref = 
cell_vars2['AD'].sum() + oth_hom_ref = cell_vars2['OTH'].sum() + discordant_hom_ref = ad_hom_ref + oth_hom_ref + ad_hom_ref_inf = cell_vars_inf_2['AD'].sum() + oth_hom_ref_inf = cell_vars_inf_2['OTH'].sum() + discordant_hom_ref_informative = ad_hom_ref_inf + oth_hom_ref_inf + ad_hom_ref_uninf = cell_vars_uninf_2['AD'].sum() + oth_hom_ref_uninf = cell_vars_uninf_2['OTH'].sum() + discordant_hom_ref_uninformative = ad_hom_ref_uninf + oth_hom_ref_uninf + ad_hom_ref_inf_subset = cell_vars_inf_subset_2['AD'].sum() + oth_hom_ref_inf_subset = cell_vars_inf_subset_2['OTH'].sum() + discordant_hom_ref_informative_subset = ad_hom_ref_inf_subset + oth_hom_ref_inf_subset + + # expected genotype 0/1 or 1/0 + hets = ['0/1', '1/0'] + expected_het = expected_vars[expected_vars['vars'].isin(hets)] + het_sites = set(expected_het['ids']) + cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] + cell_vars_inf_3 = cell_vars_informative[cell_vars_informative['ids'].isin(het_sites)] + cell_vars_uninf_3 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(het_sites)] + cell_vars_inf_subset_3 = cell_vars_informative_subset[cell_vars_informative_subset['ids'].isin(het_sites)] + discordant_het = cell_vars3['OTH'].sum() + discordant_het_informative = cell_vars_inf_3['OTH'].sum() + discordant_het_uninformative = cell_vars_uninf_3['OTH'].sum() + discordant_het_informative_subset = cell_vars_inf_subset_3['OTH'].sum() + + # expected genotype 1/1 + expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] + hom_alt_sites = set(expected_hom_alt['ids']) + cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] + cell_vars_inf_4 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_alt_sites)] + cell_vars_uninf_4 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_alt_sites)] + cell_vars_inf_subset_4 = cell_vars_informative_subset[cell_vars_informative_subset['ids'].isin(hom_alt_sites)] + # DP + OTH - AD + ad_hom_alt = cell_vars4['AD'].sum() + dp_hom_alt = cell_vars4['DP'].sum() + oth_hom_alt = cell_vars4['OTH'].sum() + discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt + ad_hom_alt_inf = cell_vars_inf_4['AD'].sum() + dp_hom_alt_inf = cell_vars_inf_4['DP'].sum() + oth_hom_alt_inf = cell_vars_inf_4['OTH'].sum() + discordant_hom_alt_informative = (dp_hom_alt_inf + oth_hom_alt_inf) - ad_hom_alt_inf + ad_hom_alt_uninf = cell_vars_uninf_4['AD'].sum() + dp_hom_alt_uninf = cell_vars_uninf_4['DP'].sum() + oth_hom_alt_uninf = cell_vars_uninf_4['OTH'].sum() + discordant_hom_alt_uninformative = (dp_hom_alt_uninf + oth_hom_alt_uninf) - ad_hom_alt_uninf + ad_hom_alt_inf_subset = cell_vars_inf_subset_4['AD'].sum() + dp_hom_alt_inf_subset = cell_vars_inf_subset_4['DP'].sum() + oth_hom_alt_inf_subset = cell_vars_inf_subset_4['OTH'].sum() + discordant_hom_alt_informative_subset = (dp_hom_alt_inf_subset + oth_hom_alt_inf_subset) - ad_hom_alt_inf_subset + + discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt + discordant_reads_informative = discordant_hom_ref_informative + discordant_het_informative + discordant_hom_alt_informative + discordant_reads_uninformative = discordant_hom_ref_uninformative + discordant_het_uninformative + discordant_hom_alt_uninformative + discordant_reads_informative_subset = discordant_hom_ref_informative_subset + discordant_het_informative_subset + discordant_hom_alt_informative_subset + + return total_sites, self.informative_covered, self.uninformative_covered, total_reads, discordant_reads, total_reads_informative, 
discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative, informative_subset_sites, total_reads_informative_subset, discordant_reads_informative_subset + + + + def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): + # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. + # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. + # Author: M.Ozols + + cell_vars_norm = self.norm_genotypes(cell_vars) + + if len(cell_vars_norm) > 0: + Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) + expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] + # print(cell_vars_norm) + # print(expected_vars2) + # print(cell_vars2) + # exit(0) + Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) + Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) + disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) + df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') + disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') + # print(len(disc2)) + # exit(0) + disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] + disc_sites = ';'.join(disc2['expected_retrieved']) + #find truly discordant sites + #true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count = self.get_strict_discordance(disc2['0_y'], disc2['0_x']) + true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count = self.get_strict_discordance(expected_vars2[0], cell_vars2[0]) + #find discordant reads + total_sites, informative_sites, uninformative_sites, total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative, informative_subset_sites, total_reads_informative_subset, discordant_reads_informative_subset = self.read_condordance(expected_vars2, cell_vars2) + else: + Total_Overlapping_sites = set() + Concordant_Sites = set() + Discordant_sites = set() + disc_sites = '' + true_discordant_count = 0 + relaxed_concordant_count = 0 + total_sites = 0 + discordant_reads = 0 + + informative_subset_sites = 0 + subset_informative_sites_concordant_count = 0 + subset_informative_sites_discordant_count = 0 + total_reads_informative_subset = 0 + discordant_reads_informative_subset = 0 + relaxed_concordant_informative_count = 0 + relaxed_concordant_uninformative_count = 0 + true_discordant_informative_count = 0 + true_discordant_uninformative_count = 0 + total_reads = 0 + total_reads_informative = 0 + total_reads_uninformative = 0 + discordant_reads = 0 + discordant_reads_informative = 0 + discordant_reads_uninformative = 0 + informative_sites = 0 + 
uninformative_sites = 0 + + #print(total_sites, informative_sites, uninformative_sites, relaxed_concordant_informative_count, true_discordant_informative_count, self.informative_covered, self.uninformative_covered) + #exit(0) + + return Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites,cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, total_sites, informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative, informative_subset_sites, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset + + + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. + with open(f'tmp_{id}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}.pkl' + + def append_results_cell_concordances(self,result): + count=result[13] + try: + percent_concordant = result[2]/(result[3]+result[2])*100 + except: + percent_concordant = 0 + + try: + percent_discordant = result[3]/(result[3]+result[2])*100 + except: + percent_discordant = 0 + + try: + percent_relaxed_concordant = result[4]/(result[4]+result[5])*100 + except: + percent_relaxed_concordant = 0 + + try: + percent_strict_discordant = result[5]/(result[4]+result[5])*100 + except: + percent_strict_discordant = 0 + + try: + read_discordance = result[21]/result[15] + except: + read_discordance = 0 + + donor = result[1] + cohort = 'UNKNOWN' + donor_split = donor.split("_") + if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): + cohort = 'UKB' + elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): + cohort = 'ELGH' + + print(count) + self.cell_concordance_table[f'{result[0]} --- {result[1]}'] = {'GT 1':result[0], + 'GT 2':result[1], + 'cohort': cohort, + 'Nr_Concordant':result[2], + 'Nr_Discordant':result[3], + 'Nr_Relaxed_concordant':result[4], + 'Nr_strict_discordant':result[5], + 'Percent Concordant':percent_concordant, + 'Percent Discordant':percent_discordant, + 'Percent_relaxed_concordant': percent_relaxed_concordant, + 'Percent_strict_discordant': percent_strict_discordant, + 'Nr_concordant_informative': result[6], + 'Nr_concordant_uninformative': result[7], + 'Nr_discordant_informative': result[8], + 'Nr_discordant_uninformative': result[9], + 'NrTotal_Overlapping_sites_between_two_genotypes':result[10], + 'Nr_donor_distinct_sites_within_pool_individuals':result[12], + 'Number_of_sites_that_are_donor_concordant_and_exclusive':result[11], + 'Discordant_Site_Identities':result[14], + 'Total_sites': result[15], + 'Total_informative_sites': result[16], + 'Total_uninformative_sites': result[17], + 'Total_reads': result[18], + 'Total_reads_informative': result[19], + 'Total_reads_uninformative': result[20], + 'Discordant_reads': result[21], + 'Discordant_reads_informtive': result[22], + 'Discordant_reads_uninformtive': result[23], + 'Discordant_reads_by_n_sites': read_discordance, + 'informative_subset_sites': result[24], + 'subset_informative_sites_concordant_count': result[25], + 'subset_informative_sites_discordant_count': result[26], + 'total_reads_informative_subset': result[27], + 'discordant_reads_informative_subset': result[28] + } + #informative_subset_sites, 
subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset] + if (count % 200 == 0): + print(f'recording and resetting memory {count}') + # self.record_dict[count]=self.exclusive_donor_variants + self.set_results(self.cell_concordance_table,count) + self.reset() + _="" + + def combine_written_files(self):#this one is for concordance class + to_export = self.cell_concordance_table + for val1 in self.record_dict.values(): + # here remove the int files. + print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + for k1 in loaded_dict.keys(): + to_export[k1]=loaded_dict[k1] + os.remove(val1) + return to_export + + + def conc_table(self): + donor_assignments_table=self.donor_assignments_table + cell_assignments_table=self.cell_assignments_table + exclusive_don_variants=self.exclusive_don_variants + exclusive_cell_variants= self.exclusive_cell_variants + + pool = mp.Pool(cpus) + count = 0 + for i,row1 in donor_assignments_table.iterrows(): + donor_in_question = row1['donor_query'] + donor_gt_match = row1['donor_gt'] + if (donor_gt_match=='NONE'): + continue + Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell'])) + try: + expected_vars = exclusive_don_variants[donor_gt_match] + except: + _='here we have specifically excluded the donor that has been assigned as it is not expected genotype, because of this we can not calculate the concordances' + continue + expected_vars_norm = self.norm_genotypes(expected_vars) + try: + # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor. + dds = self.donor_distinct_sites[donor_gt_match] + except: + continue + + for cell1 in Cells_to_keep_pre: + count+=1 + # if count>800: + # break + cell_vars = exclusive_cell_variants[cell1] + # cell_vars_dp = exclusive_cell_variants_dp[cell1] + + self.cell_concordance_table[f'{cell1} --- {donor_gt_match}']={} + # pool.apply_async(self.concordance_dable_production, args=([expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count]),callback=self.append_results_cell_concordances) + result1 = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count) + self.append_results_cell_concordances(result1) + + pool.close() + pool.join() + output = self.combine_written_files() + return output + + def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count): + Nr_donor_distinct_sites = len(dds) + Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites, cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, total_sites, informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative, informative_subset_sites, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) + Nr_Concordant = len(Concordant_Sites) + #Nr_Relaxed_concordant = Nr_Concordant + relaxed_concordant_count + Nr_Discordant = len(Discordant_sites) + 
Nr_Total_Overlapping_sites = len(Total_Overlapping_sites) + Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Concordant_Sites))) + #Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) + + return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,relaxed_concordant_count, true_discordant_count, relaxed_concordant_informative_count, + relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites, + Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,disc_sites, total_sites, informative_sites, + uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative, + informative_subset_sites, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset] + + +class VCF_Loader: + + def __init__(self, vcf_file, biallelic_only=True, + sparse=False, format_list=['GT']): + self.vcf_file = vcf_file + self.load_sample = True + self.biallelic_only = biallelic_only + self.sparse = sparse + self.record_dict={} + self.reset() + self.format_list = format_list + self.exclusive_donor_variants = {} + self.curently_pushing =[] #this is a lock value to check if rhe curent field is updated so to avaid the race for update + self.last_count=-1 + self.reset_c() + + def reset_c(self): + self.record_times=0 + + def reset(self): + self.exclusive_donor_variants ={} + + def myfunc(self): + print(f"Hello my name is {self.biallelic_only}" ) + + def load_sample_mp(self,line,obs_ids,count,format_list): + ''' + takes VCF lines and extracts all format fields for those where GT !='.' + ''' + list_val = line.rstrip().split("\t") #[:5] #:8 + idx = find(list_val[8].split(':'),'GT')[0]#find index of GT field as GT will tell us what variants are called + if len(list_val[3]) > 1 or len(list_val[4]) > 1: + # CURRENTLY DEALS ONLY WITH BIALELIC + print(f'{idx} var not bialelic') + elif list_val[3] == 'A' and list_val[4] == 'G':#remove A>G + pass + elif list_val[3] == 'T' and list_val[4] == 'C':#also remove T>C + + pass + else: + list_val2 = list_val[9:] + obs = pd.DataFrame(obs_ids) + lv = pd.DataFrame(list_val2) + lv_proc =lv[0].str.split(':').str[idx] + gt_exists = lv_proc[lv_proc != '.'] + idx2 = gt_exists.index + obs_with_gt = obs.loc[idx2.values] + obs_with_gt = list(obs_with_gt[0].values) + list_val_with_gt = lv.loc[idx2.values] + list_val_with_gt = list(list_val_with_gt[0].values) + random.seed(count) + c = list(zip(obs_with_gt, list_val_with_gt)) + random.shuffle(c) + obs_with_gt, list_val_with_gt = zip(*c) + # self.append_results([obs_with_gt,list_val_with_gt,idx,list_val,count]) + + return [obs_with_gt,list_val_with_gt,idx,list_val,count,format_list]#add format_list to the return value as we need this for the next step + + + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. 
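+        # Descriptive note: partial results are pickled to a temporary file named
+        # tmp_{id}.pkl and the path is remembered in self.record_dict; the per-chunk
+        # files are later merged back into one dictionary by combine_written_files().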
+        with open(f'tmp_{id}.pkl', 'wb') as f:
+            pickle.dump(to_set, f)
+        self.record_dict[id]=f'tmp_{id}.pkl'
+
+
+    def append_results(self,result):
+        # exclusive_donor_variants
+        obs_with_gt= result[0]
+        list_val_with_gt= result[1]
+        idx = result[2]
+        list_val = result[3]
+        count = result[4]
+        format_list = result[5]  # list of required format fields
+        # get indexes of required format fields (apart from GT, which has already been taken care of)
+        additional_field_idxs = []
+        for fmt in format_list:
+            if not fmt == 'GT':
+                idx_addn = find(list_val[8].split(':'), fmt)[0]
+                additional_field_idxs.append(idx_addn)
+        # print(additional_field_idxs)
+        # exit(0)
+
+        count11=0
+        r = random.random()
+        # The issue is that this slows down as the number of recorded entries grows, so recording takes longer and longer.
+        # Every 200 iterations we push the data to a dictionary; later we combine these together.
+        if (count % 200 == 0):
+            print(f'recording and resetting memory {count}')
+            # self.record_dict[count]=self.exclusive_donor_variants
+            self.set_results(self.exclusive_donor_variants,count)
+            self.reset()
+            self.reset_c()
+
+        for ob_id in obs_with_gt:
+            donor_loc_in_list = count11
+            alleles = list_val_with_gt[donor_loc_in_list].split(':')[idx]
+            # append any additional format fields to alleles
+            if len(additional_field_idxs) > 0:
+                for idx_addnl in additional_field_idxs:
+                    fmt_val = list_val_with_gt[donor_loc_in_list].split(':')[idx_addnl]
+                    alleles = alleles + '_' + fmt_val
+
+            if not alleles.startswith('.'):
+                ids = "_".join([list_val[x] for x in [0, 1, 3, 4]])
+                donor_var = f"{ids}_{alleles}"
+                while ob_id in self.curently_pushing:
+                    time.sleep(r*0.01)
+                self.curently_pushing.append(ob_id)
+                try:
+                    self.exclusive_donor_variants[ob_id].add(donor_var)
+                    self.record_times=self.record_times+1
+                except:
+                    self.exclusive_donor_variants[ob_id]=set()
+                    self.exclusive_donor_variants[ob_id].add(donor_var)
+                    self.record_times=self.record_times+1
+                self.curently_pushing.remove(ob_id)
+            # self.exclusive_donor_variants['CTGAAACGTAAGTTCC-1']
+            count11+=1
+
+    def combine_written_files(self):  # this one is for the VCF loader class
+        to_export = self.exclusive_donor_variants
+        for val1 in self.record_dict.values():
+            # here remove the intermediate files.
+            print(f"merging temp file: {val1}")
+            with open(val1, 'rb') as f:
+                loaded_dict = pickle.load(f)
+                for k1 in loaded_dict.keys():
+                    try:
+                        to_export[k1]=to_export[k1].union(loaded_dict[k1])
+                    except:
+                        to_export[k1]=set()
+                        to_export[k1]=to_export[k1].union(loaded_dict[k1])
+            os.remove(val1)
+        return to_export
+
+
+    def load_VCF_batch_paralel(self):
+        """
+        Load a whole VCF file, utilising multiple cores to speed up loading of large cell files
+        -------------------
+        Initially designed to load VCF from cellSNP output, requiring
+        1) all variants have the same format list;
+        2) a line starting with "#CHROM", with sample ids.
+        If these two requirements are satisfied, this function also supports general
+        VCF files, e.g., genotype for multiple samples.
+
+        Note, it may take a lot of memory, please filter the VCF with bcftools first.
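+
+        A minimal usage sketch (the file name below is a placeholder, and the
+        module-level `cpus` variable must already be set, as done in __main__):
+            loader = VCF_Loader('cellSNP.cells.vcf.gz', biallelic_only=True,
+                                sparse=False, format_list=['GT', 'DP', 'AD', 'OTH'])
+            cell_variants = loader.load_VCF_batch_paralel()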
+ """ + + vcf_file = self.vcf_file + biallelic_only = self.biallelic_only + load_sample= self.load_sample + sparse = self.sparse + format_list= self.format_list + pool = mp.Pool(cpus) + + + import time + if vcf_file[-3:] == ".gz" or vcf_file[-4:] == ".bgz": + infile = gzip.open(vcf_file, "rb") + is_gzip = True + else: + infile = open(vcf_file, "r") + is_gzip = False + + FixedINFO = {} + contig_lines = [] + comment_lines = [] + var_ids, obs_ids, obs_dat = [], [], [] + count=0 #57077 + for line in infile: + count+=1 + # if count>10000: + # break + if is_gzip: + line = line.decode('utf-8') + if line.startswith("#"): + if line.startswith("##contig="): + contig_lines.append(line.rstrip()) + if line.startswith("#CHROM"): + if load_sample: + obs_ids = line.rstrip().split("\t")[9:] + for ob_id in obs_ids: + self.exclusive_donor_variants[ob_id]=set() + key_ids = line[1:].rstrip().split("\t")[:8] + for _key in key_ids: + FixedINFO[_key] = [] + else: + comment_lines.append(line.rstrip()) + else: + pool.apply_async(self.load_sample_mp, args=([line,obs_ids,count,format_list]),callback=self.append_results) + del line + self.last_count=count + pool.close() + pool.join() + + output = self.combine_written_files() + return output + + +"""Run CLI.""" + +def get_options(): + ''' + Get options from the command line + ''' + parser = argparse.ArgumentParser() + parser.add_argument('--version', action='version', version='%(prog)s {version}'.format(version=__version__)) + parser.add_argument('--cpus', action='store', required=True, type=int) + parser.add_argument('--cell_vcf', action='store', required=True) + parser.add_argument('--cell_assignments', action='store', required=True) + parser.add_argument('--donor_assignments', action='store', required=True) + parser.add_argument('--gt_match_vcf', action='store', required=True) + parser.add_argument('--expected_vcf', action='store', required=True) + parser.add_argument('--informative_sites', action='store', required=True) + parser.add_argument('--uninformative_sites', action='store', required=True) + parser.add_argument('--outfile', action='store', required=True) + args = parser.parse_args() + + return args + + +def get_sites_from_tsv(sites_file): + """ + get sites frm a tsv file where cols are chrom, pos, id, ref, alt + assumes no multiallelics + """ + sites = set() + with open(sites_file, 'r') as f: + lines = f.readlines() + for l in lines: + linedata = l.split('\t') + var = ('_').join([linedata[0], linedata[1], linedata[3], linedata[4]]) + sites.add(var) + return sites + + +def find(lst, a): + return [i for i, x in enumerate(lst) if x==a ] + + +def norm_genotypes(expected_vars): + expected_vars = pd.DataFrame(expected_vars) + split_str=expected_vars[0].str.split("_") + expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] + expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] + expected_vars['vars'] = split_str.str[4] + expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) + expected_vars = expected_vars[expected_vars['vars']!='./.'] + expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' + expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] + return expected_vars + + +def donor_exclusive_sites(exclusive_don_variants2): + # Here we generate a function for determining the sites that are donor exclusive + donor_distinct_sites = {} + for col1 in exclusive_don_variants2.keys(): + comparisons =[] + to_compare = [] + for col2 in exclusive_don_variants2.keys(): + 
if col1==col2: + # we set this as the unique entry + # print('1') + to_compare = set(exclusive_don_variants2[col2]) + else: + # We combine all the variants in one list + comparisons+=list(exclusive_don_variants2[col2]) + # print('2') + # print('comparison') + comparisons_all = set(comparisons) + comparisons_all_norm = norm_genotypes(comparisons_all) + comparisons_all=set(comparisons_all_norm['combo']) + + to_compare = set(to_compare) + to_compare_norm = norm_genotypes(to_compare) + to_compare=set(to_compare_norm['combo']) + # Make sure we account for hap types - phased/unphased + distinct_donor_sites = to_compare - comparisons_all + donor_distinct_sites[col1]=distinct_donor_sites + # Perform the distinct set function. + return donor_distinct_sites + +debug=False + +if __name__ == "__main__": + + options = get_options() + cpus = options.cpus + outfile = options.outfile + cell_vcf=options.cell_vcf + donor_assignments=options.donor_assignments + gt_match_vcf=options.gt_match_vcf + expected_vcf=options.expected_vcf + cell_assignments=options.cell_assignments + informative_sites_file = options.informative_sites + uninformative_sites_file = options.uninformative_sites + + informative_sites = get_sites_from_tsv(informative_sites_file) + uninformative_sites = get_sites_from_tsv(uninformative_sites_file) + + exclusive_donor_variants = {} #This is where results are populated when mp process i used. + curently_pushing =[] #this is a lock value to check if rhe curent field is updated so to avaid the race for update + All_Results={} + cell_concordance_table = {} + + donor_assignments_table = pd.read_csv(donor_assignments) + cell_assignments_table = pd.read_csv(cell_assignments,sep='\t') + + if debug: + with open('tmp_GT_Expected_variants.pkl', 'rb') as f: + GT_Expected_variants = pickle.load(f) + with open('tmp_GT_Matched_variants.pkl', 'rb') as f: + GT_Matched_variants = pickle.load(f) + with open('tmp_exclusive_cell_variants.pkl', 'rb') as f: + exclusive_cell_variants = pickle.load(f) + with open('tmp_donor_distinct_sites.pkl', 'rb') as f: + donor_distinct_sites = pickle.load(f) + with open('tmp_exclusive_don_variants.pkl', 'rb') as f: + exclusive_don_variants = pickle.load(f) + else: + print('---Loading genotype VCF----') + if (os.path.exists(gt_match_vcf)): + loader2 = VCF_Loader(gt_match_vcf, biallelic_only=True, + sparse=False, format_list=['GT']) + GT_Matched_variants = loader2.load_VCF_batch_paralel() + del loader2 + else: + GT_Matched_variants = {} + + with open(f'tmp_GT_Matched_variants.pkl', 'wb') as f: + pickle.dump(GT_Matched_variants, f) + + print('---Loading cell VCF----') + tic = time.perf_counter() + loader1 = VCF_Loader(cell_vcf, biallelic_only=True, + sparse=False, format_list=['GT', 'DP', 'AD', 'OTH']) + exclusive_cell_variants = loader1.load_VCF_batch_paralel() + del loader1 + toc = time.perf_counter() + + with open(f'tmp_exclusive_cell_variants.pkl', 'wb') as f: + pickle.dump(exclusive_cell_variants, f) + print(f"Loading took {toc - tic:0.4f} seconds") + + print('---Loading expected VCF----') + loader3 = VCF_Loader(expected_vcf, biallelic_only=True, + sparse=False, format_list=['GT']) + GT_Expected_variants = loader3.load_VCF_batch_paralel() + del loader3 + + with open(f'tmp_GT_Expected_variants.pkl', 'wb') as f: + pickle.dump(GT_Expected_variants, f) + + print('---Variant files loaded----') + + exclusive_don_variants = GT_Expected_variants.keys() + content = [x for x in exclusive_don_variants if not x.startswith('donor')] + GT_Expected_variants = {key: GT_Expected_variants[key] 
for key in content} + + exclusive_don_variants = GT_Matched_variants.keys() + content = [x for x in exclusive_don_variants if not x.startswith('donor')] + GT_Matched_variants = {key: GT_Matched_variants[key] for key in content} + + exclusive_don_variants = GT_Expected_variants + for key in GT_Matched_variants.keys(): + if key in exclusive_don_variants.keys(): + _='' + else: + exclusive_don_variants[key]=GT_Matched_variants[key] + + with open(f'tmp_exclusive_don_variants.pkl', 'wb') as f: + pickle.dump(exclusive_don_variants, f) + donor_distinct_sites = donor_exclusive_sites(exclusive_don_variants) + with open(f'tmp_donor_distinct_sites.pkl', 'wb') as f: + pickle.dump(donor_distinct_sites, f) + + print('---donor_distinct_sites calculated----') + + conc1 = Concordances(donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites, informative_sites, uninformative_sites) + cell_concordance_table = conc1.conc_table() + + # cell_concordance_table = conc_table(donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants) + result = pd.DataFrame(cell_concordance_table).T + + if len(result)>0: + result.to_csv(outfile,sep='\t') + print('Processing Done') + From 3c0ea91a5cf7ce9c73c5627b032ca808ab001d12 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Tue, 14 Nov 2023 17:53:43 +0000 Subject: [PATCH 2/7] combined concordance calculations in one base file --- bin/concordance_calculations.py | 1220 +++++++++++++++++ ...calculations_donor_exclusive_read_level.py | 598 ++++---- ...ations_donor_exclusive_read_level_noA2G.py | 735 +++++----- ...ance_calculations_subsample_informative.py | 947 ++++++------- ...t_sites_in_other_donors_find_best_donor.py | 39 +- ..._discordant_sites_in_other_donors_noA2G.py | 715 ++++++---- 6 files changed, 2799 insertions(+), 1455 deletions(-) create mode 100644 bin/concordance_calculations.py diff --git a/bin/concordance_calculations.py b/bin/concordance_calculations.py new file mode 100644 index 00000000..8d0c41f2 --- /dev/null +++ b/bin/concordance_calculations.py @@ -0,0 +1,1220 @@ +#!/usr/bin/env python3 + +#take cellSNP VCF and genotype VCF for the donors in a pool +# for each cell in the cellSNP VCF identify discordant sites (using the relaxed concordance) +# look for these sites in genotypes of all members of the pool +# output: +# cell id +# assigned donor +# cohort of assigned donor +# number of discordant sites +# total AD over discordant sites +# list of donors in the pool, how many of the discordant sites are found in the donor, cohort each belongs to +# list of discordant sites + +__date__ = '2023-14-11' +__version__ = '0.0.1' +import argparse +import sys +import importlib.util +import pickle +import pandas as pd +import gzip +import random +import numpy as np +import time +import multiprocessing as mp +from multiprocessing import Lock +import logging +import os +import gzip +import time +pd.options.mode.chained_assignment = None + +class Concordances: + + def reset(self): + self.cell_concordance_table ={} + + def reset2(self): + self.other_donor_comp =[] + + def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites,informative_sites, uninformative_sites): + self.reset() + self.reset2() + self.donor_assignments_table=donor_assignments_table + self.cell_assignments_table=cell_assignments_table + self.exclusive_don_variants=exclusive_don_variants + self.exclusive_cell_variants=exclusive_cell_variants + 
self.donor_distinct_sites=donor_distinct_sites + self.informative_sites = informative_sites + self.uninformative_sites = uninformative_sites + self.record_dict={} + + def norm_genotypes(self,expected_vars): + expected_vars = pd.DataFrame(expected_vars) + if len(expected_vars) > 0: + split_str=expected_vars[0].str.split("_") + expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] + expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] + expected_vars['vars'] = split_str.str[4] + expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) + expected_vars = expected_vars[expected_vars['vars']!='./.'] + expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' + expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] + return expected_vars + + + + def get_strict_discordance(self, expected_vars, cell_vars): + ''' + take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant + sites and relaxed concordant sites and list of discordant sites1 + 1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype + 2) if you have a 0/0 you can not get a 1/1 or 0/1 + 3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 1/1 + So - each obversed cellsnp allele must be in the array SNP gtype + ''' + snp_gtypes = expected_vars[0] + cellsnp_gtypes = cell_vars[0] + true_discordant = 0 + relaxed_concordant = 0 + relaxed_concordant_informative = 0 + relaxed_concordant_informative_ids = [] + relaxed_concordant_uninformative_ids = [] + true_discordant_uninformative_ids =[] + true_discordant_informative_ids=[] + relaxed_concordant_uninformative = 0 + true_discordant_informative = 0 + true_discordant_uninformative = 0 + discordant_vars = [] + concordant_vars = [] + subset_informative_concordant = 0 + subset_informative_discordant = 0 + + #print(self.uninformative_sites) + #print(self.informative_sites) + + #create sets of the ids (chrom, pos, ref, alt) in each set of genotypes. Filter to the ids present in both + #then filter to informative and uninformative. 
If uninformative >0 then create a subset of informative + # with the same number of vars (at random) + split_snp_gts=snp_gtypes.str.split("_") + snp_gtypes_ids = set(split_snp_gts.str[0]+'_'+split_snp_gts.str[1]+'_'+split_snp_gts.str[2]+'_'+split_snp_gts.str[3]) + + split_cellsnp_gts=cellsnp_gtypes.str.split("_") + cellsnp_gtypes_ids = set(split_cellsnp_gts.str[0]+'_'+split_cellsnp_gts.str[1]+'_'+split_cellsnp_gts.str[2]+'_'+split_cellsnp_gts.str[3]) + + shared_gts = snp_gtypes_ids.intersection(cellsnp_gtypes_ids) + + shared_informative = shared_gts.intersection(self.informative_sites) + shared_uninformative = shared_gts.intersection(self.uninformative_sites) + # print("shared informative " + str(len(shared_informative))) + # print("shared uninformative " + str(len(shared_uninformative))) + + #store the numbers of informative and uninformative sites shared between cellSNP and gt data as these + #are the sites used for concordance + self.informative_covered = len(shared_informative) + self.uninformative_covered = len(shared_uninformative) + + if len(shared_uninformative) > 0: + #print(len(shared_uninformative)) + # print(len(shared_informative)) + if len(shared_uninformative) <= len(shared_informative): + informative_subset = set(random.sample(shared_informative, len(shared_uninformative))) + else: + informative_subset = set()#if there are more shared uninformative than shared informative we will not subset + # print(informative_subset) + # exit(0) + else: + informative_subset = set() + + # print(informative_subset) + self.informative_subset = informative_subset + + snp_gtypes_set = set(snp_gtypes) + snp_gtypes_set = sorted(snp_gtypes_set) + + cellsnp_gtypes_set = set(cellsnp_gtypes) + cellsnp_gtypes_set = sorted(cellsnp_gtypes_set) + + #for i in range(0, len(snp_gtypes)): + for i in range(0, len(snp_gtypes_set)): + discordant = False + # snp_data = snp_gtypes[i].split('_') + # cellsnp_data = cellsnp_gtypes[i].split('_') + snp_data = snp_gtypes_set[i].split('_') + cellsnp_data = cellsnp_gtypes_set[i].split('_') + + # the below will no longer work due to differing length of input strings + # snp_alleles = [snp_gtypes[i][-3], snp_gtypes[i][-1]] + # cellsnp_alleles = [cellsnp_gtypes[i][-3], cellsnp_gtypes[i][-1]] + + snp_alleles = [snp_data[4][0], snp_data[4][2]] + cellsnp_alleles = [cellsnp_data[4][0], cellsnp_data[4][2]] + + snp_alleles_set = set(snp_alleles) + cellsnp_alleles_set = set(cellsnp_alleles) + + snp_var = ('_').join(snp_data[0:4]) + cellsnp_var = ('_').join(cellsnp_data[0:4]) + + if not cellsnp_var == snp_var: + print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i]) + exit(1) + else: + for allele in cellsnp_alleles_set: + if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant + discordant = True + + if discordant == True: + true_discordant+=1 + discordant_vars.append(cellsnp_var) + if snp_var in self.uninformative_sites: + true_discordant_uninformative+=1 + true_discordant_uninformative_ids.append(snp_var) + elif snp_var in self.informative_sites: + true_discordant_informative+=1 + true_discordant_informative_ids.append(snp_var) + else: + relaxed_concordant+=1 + concordant_vars.append(cellsnp_var) + if snp_var in self.uninformative_sites: + relaxed_concordant_uninformative+=1 + relaxed_concordant_uninformative_ids.append(snp_var) + elif snp_var in self.informative_sites: + relaxed_concordant_informative+=1 + relaxed_concordant_informative_ids.append(snp_var) + if 
len(shared_uninformative) > 0: + if snp_var in informative_subset: + if discordant == True: + subset_informative_discordant+=1 + else: + subset_informative_concordant+=1 + # true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count + cell_vars2 = cell_vars.set_index('ids') + disc = pd.DataFrame(set(cell_vars2.loc[discordant_vars]['combo']),columns=['combo_x']) + df_cd = pd.merge(cell_vars, expected_vars, how='inner', on = 'pos') + disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') + disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] + disc_sites_string = ';'.join(disc2['expected_retrieved']) + return true_discordant, relaxed_concordant, relaxed_concordant_informative_ids, relaxed_concordant_uninformative_ids, true_discordant_informative_ids, true_discordant_uninformative_ids, discordant_vars, concordant_vars, disc_sites_string + + def read_concordance_calc(self,expected_vars,cell_vars): + + # This is a wrapper to add up the discordant reads in the cellsnp file. + + # expected genotype 0/0 + expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] + hom_ref_sites = set(expected_hom_ref['ids']) + cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] + ad_hom_ref = cell_vars2['AD'].sum() + oth_hom_ref = cell_vars2['OTH'].sum() + discordant_hom_ref = ad_hom_ref + oth_hom_ref + + # expected genotype 0/1 or 1/0 + hets = ['0/1', '1/0'] + expected_het = expected_vars[expected_vars['vars'].isin(hets)] + het_sites = set(expected_het['ids']) + cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] + discordant_het = cell_vars3['OTH'].sum() + + # expected genotype 1/1 + expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] + hom_alt_sites = set(expected_hom_alt['ids']) + cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] + # DP + OTH - AD + ad_hom_alt = cell_vars4['AD'].sum() + dp_hom_alt = cell_vars4['DP'].sum() + oth_hom_alt = cell_vars4['OTH'].sum() + discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt + + # Total analysis + discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt + total_dp = cell_vars['DP'].sum() + total_oth = cell_vars['OTH'].sum() + total_reads = total_dp + total_oth + + return total_reads,total_dp,total_oth,discordant_reads + + def read_condordance(self, expected_vars, cell_vars,discordant_vars, concordant_vars): + ''' + get read level concordance using DP, AD and OTH format fields + ##FORMAT= + ##FORMAT= + ##FORMAT= + ''' + if not len(expected_vars) == len(cell_vars): + print("length mismatch between expected vars and cell vars") + exit(1) + + total_sites = len(expected_vars) + #add cols for DP, AD< OTH + cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) + cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) + cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) + + + + # Total + total_reads,total_dp,total_oth,discordant_reads = self.read_concordance_calc(expected_vars,cell_vars) + + # uninformative + cell_vars_uninformative = cell_vars[cell_vars['ids'].isin(self.uninformative_sites)] + total_reads_uninformative,total_dp_uninformative,total_oth_uninformative,discordant_reads_uninformative = self.read_concordance_calc(expected_vars,cell_vars_uninformative) + + # informative + cell_vars_informative = 
cell_vars[cell_vars['ids'].isin(self.informative_sites)] + total_reads_informative,total_dp_informative,total_oth_informative,discordant_reads_informative = self.read_concordance_calc(expected_vars,cell_vars_informative) + + # Split into concordant and discordant sites + # concordant + concordant_sites = cell_vars[cell_vars['ids'].isin(set(concordant_vars))] + total_reads_for_concordant_sites,total_dp_for_concordant_sites,total_oth_for_concordant_sites,discordant_reads_for_concordant_sites = self.read_concordance_calc(expected_vars,concordant_sites) + + # discordant + discordant_sites = cell_vars[cell_vars['ids'].isin(set(discordant_vars))] + total_reads_for_discconcordant_sites,total_dp_for_discconcordant_sites,total_oth_for_discconcordant_sites,discordant_reads_for_discconcordant_sites = self.read_concordance_calc(expected_vars,discordant_sites) + + # Subset analysis + cell_vars_informative_subset = cell_vars[cell_vars['ids'].isin(self.informative_subset)] + total_reads_informative_subset,total_dp_informative_subset,total_oth_informative_subset,discordant_reads_informative_subset = self.read_concordance_calc(expected_vars,cell_vars_informative_subset) + + return total_sites, \ + self.informative_covered, \ + self.uninformative_covered, \ + total_reads, \ + discordant_reads, \ + total_reads_informative, \ + discordant_reads_informative, \ + total_reads_uninformative, \ + discordant_reads_uninformative, \ + total_reads_informative_subset, \ + discordant_reads_informative_subset, \ + total_reads_for_concordant_sites, \ + discordant_reads_for_concordant_sites, \ + total_reads_for_discconcordant_sites, \ + discordant_reads_for_discconcordant_sites + + + + def get_discordance(self,expected_vars2,cell_vars2): + Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) + Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) + disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) + df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') + disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') + disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] + disc_sites = ';'.join(disc2['expected_retrieved']) + return Concordant_Sites,Discordant_sites,disc_sites + + + def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): + # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. + # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. 
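+        # For reference, the read-level rule applied in read_concordance_calc() above, illustrated
+        # with made-up counts: at an expected 0/0 site, discordant reads = AD + OTH (e.g. AD=2, OTH=1 -> 3);
+        # at an expected het site, discordant reads = OTH only; at an expected 1/1 site,
+        # discordant reads = (DP + OTH) - AD (e.g. DP=10, AD=8, OTH=1 -> 3).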
+ # Author: M.Ozols + + cell_vars_norm = self.norm_genotypes(cell_vars) + + if len(cell_vars_norm) > 0: + Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) + expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] + + # Find exact discordant sites + Concordant_Sites, Discordant_sites, _ = self.get_discordance(expected_vars2, cell_vars2) + #find truly discordant sites + true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, discordant_vars, concordant_vars, disc_sites_string = self.get_strict_discordance(expected_vars2, cell_vars2) + #find discordant reads + total_sites, \ + informative_sites, \ + uninformative_sites, \ + total_reads, \ + discordant_reads, \ + total_reads_informative, \ + discordant_reads_informative, \ + total_reads_uninformative, \ + discordant_reads_uninformative, \ + total_reads_informative_subset, \ + discordant_reads_informative_subset, \ + total_reads_for_concordant_sites, \ + discordant_reads_for_concordant_sites, \ + total_reads_for_discconcordant_sites, \ + discordant_reads_for_discconcordant_sites = self.read_condordance(expected_vars2, cell_vars2, discordant_vars, concordant_vars) + + discordant_read_fraction_in_concordant_sites = f"{discordant_reads_for_concordant_sites}/{total_reads_for_concordant_sites}" + discordant_read_fraction_in_discordant_sites = f"{discordant_reads_for_discconcordant_sites}/{total_reads_for_discconcordant_sites}" + discordant_reads_uninformative_fraction = f"{discordant_reads_uninformative}/{total_reads_uninformative}" + discordant_reads_informative_fraction = f"{discordant_reads_informative}/{total_reads_informative}" + + # sanity checks + if total_reads != total_reads_for_concordant_sites+total_reads_for_discconcordant_sites: + print("Error: total reads dont add up ") + exit(1) + if discordant_reads != discordant_reads_for_concordant_sites+discordant_reads_for_discconcordant_sites: + print("Error: discordant reads dont add up ") + exit(1) + + + else: + Total_Overlapping_sites = set() + Concordant_Sites = set() + Discordant_sites = set() + disc_sites = '' + true_discordant_count = 0 + relaxed_concordant_count = 0 + total_sites = 0 + + discordant_reads = 0 + + return Concordant_Sites, \ + Discordant_sites, \ + Total_Overlapping_sites, \ + disc_sites_string, \ + cell_vars_norm, \ + true_discordant_count, \ + relaxed_concordant_count, \ + relaxed_concordant_informative_count, \ + relaxed_concordant_uninformative_count, \ + true_discordant_informative_count, \ + true_discordant_uninformative_count, \ + total_sites, \ + informative_sites, \ + uninformative_sites, \ + total_reads, \ + total_reads_informative, \ + total_reads_uninformative, \ + discordant_reads, \ + discordant_reads_informative, \ + discordant_reads_uninformative, \ + discordant_vars, \ + concordant_vars, \ + discordant_read_fraction_in_concordant_sites, \ + discordant_read_fraction_in_discordant_sites, discordant_reads_uninformative_fraction, discordant_reads_informative_fraction + + + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. 
+ with open(f'tmp_{id}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}.pkl' + return + + # def append_results_cell_concordances(self,result): + def append_results_cell_concordances(self,result,cell_concordance_table,other_donor_concordances,other_donor_concordance_table): + other_donor_concordance_table = other_donor_concordance_table + other_donor_concordances + count=result['count'] + try: + percent_concordant = result['Nr_Concordant']/(result['Nr_Discordant']+result['Nr_Concordant'])*100 + except: + percent_concordant = 0 + + try: + percent_discordant = result['Nr_Discordant']/(result['Nr_Discordant']+result['Nr_Concordant'])*100 + except: + percent_discordant = 0 + + try: + percent_relaxed_concordant = result['Nr_Relaxed_concordant']/(result['Nr_Relaxed_concordant']+result['true_discordant_count'])*100 + except: + percent_relaxed_concordant = 0 + + try: + percent_strict_discordant = result['true_discordant_count']/(result['Nr_Relaxed_concordant']+result['true_discordant_count'])*100 + except: + percent_strict_discordant = 0 + + try: + read_discordance = result['discordant_reads']/result['total_sites'] + except: + read_discordance = 0 + + cohort = 'UNKNOWN' + donor_split = result['donor_gt_match'].split("_") + if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): + cohort = 'UKB' + elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): + cohort = 'ELGH' + + same_as_asigned_donor = result['donor_gt_match'] in result['Donor_With_Highest_Concordance'] + if not same_as_asigned_donor: + same_as_asigned_donor = result['donor_gt_match'] in result['Donor_With_Lowest_DisConcordance'] + + cell_concordance_table[f"{result['cell1']} --- {result['donor_gt_match']}"] = {'GT 1':result['cell1'], + 'GT 2':result['donor_gt_match'], + 'cohort': cohort, + + 'Nr_Concordant':result['Nr_Concordant'], + 'Nr_Discordant':result['Nr_Discordant'], + 'Nr_Relaxed_concordant':result['Nr_Relaxed_concordant'], + 'Nr_strict_discordant':result['true_discordant_count'], + 'Percent Concordant':percent_concordant, + 'Percent Discordant':percent_discordant, + 'Percent_relaxed_concordant': percent_relaxed_concordant, + 'Percent_strict_discordant': percent_strict_discordant, + 'Nr_concordant_informative': len(result['relaxed_concordant_informative_count']), + 'Nr_concordant_uninformative': len(result['relaxed_concordant_uninformative_count']), + 'Nr_discordant_informative': len(result['true_discordant_informative_count']), + 'Nr_discordant_uninformative': len(result['true_discordant_uninformative_count']), + 'NrTotal_Overlapping_sites_between_two_genotypes':result['Nr_Total_Overlapping_sites'], + 'Nr_donor_distinct_sites_within_pool_individuals':result['Nr_donor_distinct_sites'], + 'Number_of_sites_that_are_donor_concordant_and_exclusive':result['Number_of_sites_that_are_donor_concordant_and_exclusive'], + 'Total_sites': result['total_sites'], + 'Total_informative_sites': result['informative_sites'], + 'Total_uninformative_sites': result['uninformative_sites'], + 'Total_reads': result['total_reads'], + 'Total_reads_informative': result['total_reads_informative'], + 'Total_reads_uninformative': result['total_reads_uninformative'], + 'Discordant_reads': result['discordant_reads'], + 'Discordant_reads_informtive': result['discordant_reads_informative'], + 'Discordant_reads_uninformtive': result['discordant_reads_uninformative'], + 'Discordant_reads_by_n_sites': read_discordance, + + 'Discordant_sites_in_pool': len(result['Discordant_sites_in_pool']), + 
'Lowest_Disconcordance_value_in_all_donors':result['Lowest_Disconcordance_value_in_all_donors'], + 'Donor_With_Lowest_DisConcordance':result['Donor_With_Lowest_DisConcordance'], + 'Concordant_Site_Identities':result['Concordant_Site_Identities'], + 'Donor_With_Highest_Concordance':result['Donor_With_Highest_Concordance'], + 'Highest_Concordance_value_in_all_donors':result['Highest_Concordance_value_in_all_donors'], + 'same_as_asigned_donor':same_as_asigned_donor, + 'Total_sites_other_donor (if same_as_asigned_donor=False)':result['Total_sites_other_donor'], + 'Total_reads_other_donor (if same_as_asigned_donor=False)':result['Total_reads_other_donor'], + 'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['total_discordant_sites_that_are_concordant_with_other_donors_in_pool'], + 'discordant_read_fraction_in_concordant_site':result['discordant_read_fraction_in_concordant_sites'], + 'discordant_read_fraction_in_discordant_sites':result['discordant_read_fraction_in_discordant_sites'], + 'Discordant_Site_Identities':result['discordant_sites'], + } + + return [cell_concordance_table,other_donor_concordance_table] + + # def combine_written_files(self):#this one is for concordance class + # to_export = self.cell_concordance_table + # for val1 in self.record_dict.values(): + # # here remove the int files. + # print(f"merging temp file: {val1}") + # with open(val1, 'rb') as f: + # loaded_dict = pickle.load(f) + # for k1 in loaded_dict.keys(): + # to_export[k1]=loaded_dict[k1] + # os.remove(val1) + # return to_export + + + def combine_written_lists(self,exclusive_donor_variants,record_dict):#this is for VCF loader class + to_export = exclusive_donor_variants + for val1 in record_dict.values(): + # here remove the int files. + print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + self.other_donor_comp = self.other_donor_comp+ loaded_dict + os.remove(val1) + return self.other_donor_comp + + def combine_written_files(self,exclusive_donor_variants,record_dict):#this is for VCF loader class + to_export = exclusive_donor_variants + for val1 in record_dict.values(): + # here remove the int files. + print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + for k1 in loaded_dict.keys(): + try: + to_export[k1]=to_export[k1].union(loaded_dict[k1]) + except: + to_export[k1]=set() + to_export[k1]=to_export[k1].union(loaded_dict[k1]) + os.remove(val1) + return to_export + + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. 
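+        # NOTE: this re-definition shadows the earlier set_results() above (Python keeps the
+        # last definition in a class body); both versions write one tmp_<id>.pkl chunk and
+        # record its path in self.record_dict for the later merge step.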
+ with open(f'tmp_{id}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}.pkl' + + def analyse_donor(self,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm): + donor_concordance_table = {} + other_donor_concordance_table = [] + for cell1 in Cells_to_keep_pre: + count+=1 + + cell_vars = exclusive_cell_variants[cell1] + result1, other_donor_concordances = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data) + cell_concordance_table,other_donor_concordance_table = self.append_results_cell_concordances(result1,donor_concordance_table,other_donor_concordances,other_donor_concordance_table) + if count>300: + break + # here we should write these independently to the files + if (count % 50 == 0): + self.set_results(other_donor_concordance_table,f"{count}--{donor_gt_match}") + other_donor_concordance_table = [] + + + self.set_results(other_donor_concordance_table,f"{count}--{donor_gt_match}") + output2 = self.combine_written_lists(self.other_donor_comp,self.record_dict) + pd.DataFrame(output2).sort_values(by=['cell']).to_csv(f'{donor_gt_match}--each_cells_comparison_with_other_donor.tsv',sep='\t',index=False) + del output2 + return donor_concordance_table + + def combine_concordances(self,result): + + self.cell_concordance_table = {**self.cell_concordance_table, **result} + + + def conc_table(self): + donor_assignments_table=self.donor_assignments_table + cell_assignments_table=self.cell_assignments_table + exclusive_don_variants=self.exclusive_don_variants + exclusive_cell_variants= self.exclusive_cell_variants + donor_list = exclusive_don_variants.keys() + pool = mp.Pool(cpus) + count = 0 + + + #create a list of variants that are on each donor genotype file + vars_per_donor_gt = {} + for don_id in donor_list: + donor_gt_vars = list(exclusive_don_variants[don_id]) + donor_gt_vars = pd.DataFrame(donor_gt_vars) + donor_gt_vars = self.norm_genotypes(donor_gt_vars) + donor_gt_vars = donor_gt_vars[donor_gt_vars['vars'] != '0/0'] + donor_gt_varids = list(donor_gt_vars['ids']) + vars_per_donor_gt[don_id] = donor_gt_varids + + #work out what cohort each donor belongs to + donor_cohorts = {} + for don_id in donor_list: + cohort = 'UNKNOWN' + donor_split = don_id.split("_") + if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): + cohort = 'UKB' + elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): + cohort = 'ELGH' + donor_cohorts[don_id] = cohort + + all_donor_data={} + # here we calvculate all the expected donor datasets + for row1 in exclusive_don_variants.keys(): + # donor_in_question = row1['donor_query'] + donor_gt_match = row1 + expected_vars_of_other_donor = self.exclusive_don_variants[donor_gt_match] + expected_vars_norm_of_other_donor = self.norm_genotypes(expected_vars_of_other_donor) + all_donor_data[donor_gt_match]=expected_vars_norm_of_other_donor + + for i,row1 in donor_assignments_table.iterrows(): + donor_in_question = row1['donor_query'] + donor_gt_match = row1['donor_gt'] + # if i>4: + # continue + if (donor_gt_match=='NONE'): + continue + try: + donor_gt_match_cohort = donor_cohorts[donor_gt_match] + except: + continue + Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell'])) + expected_vars = exclusive_don_variants[donor_gt_match] + expected_vars_norm = self.norm_genotypes(expected_vars) + 
try: + # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor. + dds = self.donor_distinct_sites[donor_gt_match] + except: + continue + if cpus==1: + result = self.analyse_donor(Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm) + self.combine_concordances(result) + else: + pool.apply_async(self.analyse_donor, args=([Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm]),callback=self.combine_concordances) + + pool.close() + pool.join() + + # output = self.combine_written_files(self.cell_concordance_table,self.record_dict) + + return self.cell_concordance_table + + + def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match, donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data): + + Concordant_Sites, \ + Discordant_sites, \ + Total_Overlapping_sites, \ + discordant_sites, \ + cell_vars_norm, \ + true_discordant_count, \ + relaxed_concordant_count, \ + relaxed_concordant_informative_count, \ + relaxed_concordant_uninformative_count, \ + true_discordant_informative_count, \ + true_discordant_uninformative_count, \ + total_sites, \ + informative_sites, \ + uninformative_sites, \ + total_reads, \ + total_reads_informative, \ + total_reads_uninformative, \ + discordant_reads, \ + discordant_reads_informative, \ + discordant_reads_uninformative, \ + discordant_vars, \ + concordant_vars, \ + discordant_read_fraction_in_concordant_sites, \ + discordant_read_fraction_in_discordant_sites, \ + discordant_reads_uninformative_fraction, \ + discordant_reads_informative_fraction = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) + + total_concordant_sites = len(Concordant_Sites) + relaxed_concordant_count + dds = self.donor_distinct_sites[donor_gt_match] + Nr_donor_distinct_sites = len(dds) + Nr_Concordant = len(Concordant_Sites) + Nr_Relaxed_concordant = Nr_Concordant + relaxed_concordant_count + Nr_Discordant = len(Discordant_sites) + Nr_Total_Overlapping_sites = len(Total_Overlapping_sites) + Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Discordant_sites))) + Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) + #Quantify donor variation in other donors + discordant_vars_in_pool = [] + donor_table_of_concordances = [] + total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() + informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() + total_cordant_sites_that_are_concordant_with_other_donors_in_pool = set() + for donor in vars_per_donor_gt: + + expected_vars_norm_of_other_donor = all_donor_data[donor] + + Concordant_Sites_otherDonor, \ + Discordant_sites_otherDonor, \ + Total_Overlapping_sites_otherDonor, \ + discordant_sites_otherDonor, \ + cell_vars_norm_otherDonor, \ + true_discordant_count_otherDonor, \ + relaxed_concordant_count_otherDonor, \ + relaxed_concordant_informative_count_otherDonor, \ + relaxed_concordant_uninformative_count_otherDonor, \ + true_discordant_informative_count_otherDonor, \ + true_discordant_uninformative_count_otherDonor, \ + total_sites_otherDonor, \ + informative_sites_otherDonor, \ + uninformative_sites_otherDonor, \ + total_reads_otherDonor, \ + total_reads_informative_otherDonor, \ + 
total_reads_uninformative_otherDonor, \ + discordant_reads_otherDonor, \ + discordant_reads_informative_otherDonor, \ + discordant_reads_uninformative_otherDonor, \ + discordant_vars_otherDonor, \ + concordant_vars_otherDonor, \ + discordant_read_fraction_in_concordant_sites_otherDonor, \ + discordant_read_fraction_in_discordant_sites_otherDonor, \ + discordant_reads_uninformative_fraction_otherDonor, \ + discordant_reads_informative_fraction_otherDonor = self.retrieve_concordant_discordant_sites(expected_vars_norm_of_other_donor,cell_vars) + + # here we also need to know : + # how many reads of the desired donor discordant sites could be yielded + + + total_concordant_sites_otherDonor = relaxed_concordant_count_otherDonor + concordant_percent_in_other_donor= total_concordant_sites_otherDonor/total_sites_otherDonor*100 + discordant_percent_in_other_donor= true_discordant_count_otherDonor/total_sites_otherDonor*100 + DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(discordant_vars).intersection(set(concordant_vars_otherDonor)) + Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(true_discordant_informative_count).intersection(set(relaxed_concordant_informative_count_otherDonor)) + DonorCordant_Sites_that_are_atributed_to_other_donor = set(concordant_vars).intersection(set(concordant_vars_otherDonor)) + + # We now count the concordant reads that may contribute to particular cell at this cell. + # to do this we take the discordant sites that have been deamed to be concordant with the other donor and quantify the reads thta are concordant. + Total_Overlapping_sites = set(DonorDiscordant_Sites_that_are_atributed_to_other_donor) + expected_vars2 = expected_vars_norm_of_other_donor[expected_vars_norm_of_other_donor['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2['DP'] = cell_vars2[0].str.split("_").str[5].astype(int) + cell_vars2['AD'] = cell_vars2[0].str.split("_").str[6].astype(int) + cell_vars2['OTH'] = cell_vars2[0].str.split("_").str[7].astype(int) + + total_reads_for_discordant_sites_that_are_concordant_with_other_donor,total_dp_for_discordant_sites_that_are_concordant_with_other_donor,total_oth_for_discordant_sites_that_are_concordant_with_other_donor,discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor = self.read_concordance_calc(expected_vars2,cell_vars2) + concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor = total_reads_for_discordant_sites_that_are_concordant_with_other_donor - discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor + + try: + donor_cohort = donor_cohorts[donor] + donor_vars = vars_per_donor_gt[donor] + except: + continue + if not donor == donor_gt_match: + # We want to kow how many of these discordant site + + total_discordant_sites_that_are_concordant_with_other_donors_in_pool = total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(DonorDiscordant_Sites_that_are_atributed_to_other_donor)) + # to get the total reads that can be atributed to the other donor i have to check if site is already covered in the total_discordant_sites_that_are_concordant_with_other_donors_in_pool. + # the ones that havent, i have to add the reads up for them. 
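+                # Accumulating with set.union() counts each site only once, even when it is
+                # concordant with several other donors, e.g. (illustrative only):
+                #     {'1_12345_A_G'} | {'1_12345_A_G', '2_6789_C_T'} -> {'1_12345_A_G', '2_6789_C_T'}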
+ informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor)) + + total_cordant_sites_that_are_concordant_with_other_donors_in_pool = total_cordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(DonorCordant_Sites_that_are_atributed_to_other_donor)) + + + common_vars = list(set(discordant_vars) & set(donor_vars)) + common_var_count = str(len(common_vars)) + donor_cohort_common = donor + ":" + donor_cohort + ":" + common_var_count + discordant_vars_in_pool.append(donor_cohort_common) + + # Here we want to calculate the number of discordant sites in other donors and see if in terms of concordance the same donor is picked as per GT assignment. + # We do this to investigate the potential of a cell coming from this other donor. + + donor_table_of_concordances.append({'donor':donor, 'cell':cell1, 'donor_cohort':donor_cohort, \ + 'gt matched donor':donor == donor_gt_match, \ + 'DonorCordant_Sites_that_are_atributed_to_other_donor':len(DonorCordant_Sites_that_are_atributed_to_other_donor), \ + 'DonorCordant_Sites_that_are_atributed_to_other_donor/total':f"{len(DonorCordant_Sites_that_are_atributed_to_other_donor)}/{len(concordant_vars)}", \ + 'DonorDiscordant_Sites_that_are_atributed_to_other_donor':len(DonorDiscordant_Sites_that_are_atributed_to_other_donor), \ + 'DonorDiscordant_Sites_that_are_atributed_to_other_donor/total':f"{len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)}/{len(discordant_vars)}", \ + 'concordant_percent_in_other_donor':concordant_percent_in_other_donor, \ + 'discordant_percent_in_other_donor':discordant_percent_in_other_donor, \ + 'discordant_reads_otherDonor':discordant_reads_otherDonor, \ + 'discordant_sites_otherDonor':len(discordant_vars_otherDonor), \ + 'concordant_sites_otherDonor':len(concordant_vars_otherDonor), \ + 'total_sites_otherDonor':total_sites_otherDonor, \ + 'discordant_reads_otherDonor':discordant_reads_otherDonor, \ + 'total_reads_otherDonor':total_reads_otherDonor, \ + 'discordant_read_fraction_in_concordant_sites_otherDonor':discordant_read_fraction_in_concordant_sites_otherDonor, \ + 'discordant_read_fraction_in_discordant_sites_otherDonor':discordant_read_fraction_in_discordant_sites_otherDonor, \ + 'concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor':concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor + }) + + discordant_vars_in_pool_str = (";").join(discordant_vars_in_pool) + concordant_vars_in_pool_str = (";").join(concordant_vars) + DF = pd.DataFrame(donor_table_of_concordances) + + Donor_With_Lowest_DisConcordance = ';'.join(DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['donor'].values) + Lowest_Disconcordance_value_in_all_donors= DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['discordant_percent_in_other_donor'].values[0] + + Donor_With_Highest_Concordance = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['donor'].values) + Highest_Concordance_value_in_all_donors= DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['concordant_percent_in_other_donor'].values[0] + Total_sites_other_donor = 
';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_sites_otherDonor'].astype(str).values) + Total_reads_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_reads_otherDonor'].astype(str).values) + + return [{ + 'cell1':cell1, + 'donor_gt_match':donor_gt_match, + 'Nr_Concordant':Nr_Concordant, + 'Nr_Discordant':Nr_Discordant, + 'Nr_Relaxed_concordant':Nr_Relaxed_concordant, + 'true_discordant_count':true_discordant_count, + 'relaxed_concordant_informative_count':relaxed_concordant_informative_count, + 'relaxed_concordant_uninformative_count':relaxed_concordant_uninformative_count, + 'true_discordant_informative_count':true_discordant_informative_count, + 'true_discordant_uninformative_count':true_discordant_uninformative_count, + 'Nr_Total_Overlapping_sites':Nr_Total_Overlapping_sites, + 'Number_of_sites_that_are_donor_concordant_and_exclusive':Number_of_sites_that_are_donor_concordant_and_exclusive, + 'Nr_donor_distinct_sites':Nr_donor_distinct_sites, + 'count':count, + 'discordant_sites':discordant_sites, + 'total_sites':total_sites, + 'informative_sites':informative_sites, + 'uninformative_sites':uninformative_sites, + 'total_reads':total_reads, + 'total_reads_informative':total_reads_informative, + 'total_reads_uninformative':total_reads_uninformative, + 'discordant_reads':discordant_reads, + 'discordant_reads_informative':discordant_reads_informative, + 'discordant_reads_uninformative':discordant_reads_uninformative, + 'Discordant_sites_in_pool': discordant_vars, + 'Lowest_Disconcordance_value_in_all_donors':Lowest_Disconcordance_value_in_all_donors, + 'Donor_With_Lowest_DisConcordance':Donor_With_Lowest_DisConcordance, + 'Concordant_Site_Identities':concordant_vars_in_pool_str, + 'Donor_With_Highest_Concordance':Donor_With_Highest_Concordance, + 'Highest_Concordance_value_in_all_donors':Highest_Concordance_value_in_all_donors, + 'Total_sites_other_donor':Total_sites_other_donor, + 'Total_reads_other_donor':Total_reads_other_donor, + 'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(discordant_vars)}", + 'informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(true_discordant_informative_count)}", + 'discordant_read_fraction_in_concordant_sites':discordant_read_fraction_in_concordant_sites, \ + 'discordant_read_fraction_in_discordant_sites':discordant_read_fraction_in_discordant_sites + }, donor_table_of_concordances] + + +class VCF_Loader: + + def __init__(self, vcf_file, biallelic_only=True, + sparse=False, format_list=['GT']): + self.vcf_file = vcf_file + self.load_sample = True + self.biallelic_only = biallelic_only + self.sparse = sparse + self.record_dict={} + self.reset() + self.format_list = format_list + self.exclusive_donor_variants = {} + self.curently_pushing =[] #this is a lock value to check if rhe curent field is updated so to avaid the race for update + self.last_count=-1 + self.reset_c() + + def reset_c(self): + self.record_times=0 + + def reset(self): + self.exclusive_donor_variants ={} + + def myfunc(self): + print(f"Hello my name is {self.biallelic_only}" ) + + def load_sample_mp(self,line,obs_ids,count,format_list): + ''' + takes VCF lines and extracts all format fields for those where GT !='.' 
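+        Returns [obs_with_gt, list_val_with_gt, idx, list_val, count, format_list]:
+        obs_with_gt are the sample ids whose GT field is not '.', list_val_with_gt the
+        matching sample columns, shuffled together (seeded on the line count) so the
+        pairing is preserved. append_results() later encodes each call as an
+        underscore-joined string, e.g. (illustrative) '1_12345_A_G_0/1_12_5_0' for
+        chrom_pos_ref_alt_GT[_DP_AD_OTH], which norm_genotypes() splits back apart.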
+ ''' + list_val = line.rstrip().split("\t") #[:5] #:8 + idx = find(list_val[8].split(':'),'GT')[0]#find index of GT field as GT will tell us what variants are called + if remove_AG: + if list_val[3] == 'A' and list_val[4] == 'G':#remove A>G + pass + elif list_val[3] == 'T' and list_val[4] == 'C':#also remove T>C + pass + + if len(list_val[3]) > 1 or len(list_val[4]) > 1: + # CURRENTLY DEALS ONLY WITH BIALELIC + print(f'{idx} var not bialelic') + pass + else: + list_val2 = list_val[9:] + obs = pd.DataFrame(obs_ids) + lv = pd.DataFrame(list_val2) + lv_proc =lv[0].str.split(':').str[idx] + gt_exists = lv_proc[lv_proc != '.'] + idx2 = gt_exists.index + obs_with_gt = obs.loc[idx2.values] + obs_with_gt = list(obs_with_gt[0].values) + list_val_with_gt = lv.loc[idx2.values] + list_val_with_gt = list(list_val_with_gt[0].values) + random.seed(count) + c = list(zip(obs_with_gt, list_val_with_gt)) + random.shuffle(c) + obs_with_gt, list_val_with_gt = zip(*c) + # self.append_results([obs_with_gt,list_val_with_gt,idx,list_val,count]) + + return [obs_with_gt,list_val_with_gt,idx,list_val,count,format_list]#add format_list to the return value as we need this for the next step + + + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. + with open(f'tmp_{id}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}.pkl' + + + + def append_results(self,result): + # exclusive_donor_variants + obs_with_gt= result[0] + list_val_with_gt= result[1] + idx = result[2] + list_val = result[3] + count = result[4] + format_list = result[5]#list of required format fields + #get indexes of required format fields (apart from GT which has already been taken care of) + additional_field_idxs = [] + for fmt in format_list: + if not fmt == 'GT': + idx_addn = find(list_val[8].split(':'), fmt)[0] + additional_field_idxs.append(idx_addn) + # print(additional_field_idxs) + # exit(0) + + count11=0 + # r = random.random() + # Issue is that this slows down after number of entries is recorded. So recoding takes longer and longer. + # every 500 itterations we push the data to a dictionary, later we combine these together. + if (count % 200 == 0): + print(f'recording and resetting memory {count}') + # self.record_dict[count]=self.exclusive_donor_variants + self.set_results(self.exclusive_donor_variants,count) + self.reset() + self.reset_c() + + for ob_id in obs_with_gt: + donor_loc_in_list = count11 + alleles = list_val_with_gt[donor_loc_in_list].split(':')[idx] + #append any additional format fields to alleles + if len(additional_field_idxs) > 0: + for idx_addnl in additional_field_idxs: + fmt_val = list_val_with_gt[donor_loc_in_list].split(':')[idx_addnl] + alleles = alleles + '_' + fmt_val + + if not alleles.startswith('.'): + ids = "_".join([list_val[x] for x in [0, 1, 3, 4]]) + donor_var = f"{ids}_{alleles}" + while ob_id in self.curently_pushing: + time.sleep(r*0.01) + self.curently_pushing.append(ob_id) + try: + self.exclusive_donor_variants[ob_id].add(donor_var) + self.record_times=self.record_times+1 + except: + self.exclusive_donor_variants[ob_id]=set() + self.exclusive_donor_variants[ob_id].add(donor_var) + self.record_times=self.record_times+1 + self.curently_pushing.remove(ob_id) + # self.exclusive_donor_variants['CTGAAACGTAAGTTCC-1'] + count11+=1 + + def combine_written_files(self,exclusive_donor_variants,record_dict):#this is for VCF loader class + to_export = exclusive_donor_variants + for val1 in record_dict.values(): + # here remove the int files. 
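+            # Each temporary pickle holds a {sample_id: set(variant_strings)} dict for the
+            # ~200 VCF lines processed before the last flush; union-ing per key rebuilds the
+            # complete per-sample variant sets, and the try/except below covers sample ids
+            # first seen in a later chunk.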
+ print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + for k1 in loaded_dict.keys(): + try: + to_export[k1]=to_export[k1].union(loaded_dict[k1]) + except: + to_export[k1]=set() + to_export[k1]=to_export[k1].union(loaded_dict[k1]) + os.remove(val1) + return to_export + + + def load_VCF_batch_paralel(self): + """ + Load whole VCF file by utilising multiple cores to speed up loading of large cell files + ------------------- + Initially designed to load VCF from cellSNP output, requiring + 1) all variants have the same format list; + 2) a line starting with "#CHROM", with sample ids. + If these two requirements are satisfied, this function also supports general + VCF files, e.g., genotype for multiple samples. + + Note, it may take a large memory, please filter the VCF with bcftools first. + """ + + vcf_file = self.vcf_file + biallelic_only = self.biallelic_only + load_sample= self.load_sample + sparse = self.sparse + format_list= self.format_list + pool = mp.Pool(cpus) + + + import time + if vcf_file[-3:] == ".gz" or vcf_file[-4:] == ".bgz": + infile = gzip.open(vcf_file, "rb") + is_gzip = True + else: + infile = open(vcf_file, "r") + is_gzip = False + + FixedINFO = {} + contig_lines = [] + comment_lines = [] + var_ids, obs_ids, obs_dat = [], [], [] + count=0 #57077 + for line in infile: + count+=1 + # if count>10000: + # break + if is_gzip: + line = line.decode('utf-8') + if line.startswith("#"): + if line.startswith("##contig="): + contig_lines.append(line.rstrip()) + if line.startswith("#CHROM"): + if load_sample: + obs_ids = line.rstrip().split("\t")[9:] + for ob_id in obs_ids: + self.exclusive_donor_variants[ob_id]=set() + key_ids = line[1:].rstrip().split("\t")[:8] + for _key in key_ids: + FixedINFO[_key] = [] + else: + comment_lines.append(line.rstrip()) + else: + pool.apply_async(self.load_sample_mp, args=([line,obs_ids,count,format_list]),callback=self.append_results) + del line + self.last_count=count + pool.close() + pool.join() + + output = self.combine_written_files(self.exclusive_donor_variants,self.record_dict) + + return output + +"""Run CLI.""" + +def get_options(): + ''' + Get options from the command line + ''' + parser = argparse.ArgumentParser() + parser.add_argument('--version', action='version', version='%(prog)s {version}'.format(version=__version__)) + parser.add_argument('--cpus', action='store', required=True, type=int) + parser.add_argument('--cell_vcf', action='store', required=True) + parser.add_argument('--cell_assignments', action='store', required=True) + parser.add_argument('--donor_assignments', action='store', required=True) + parser.add_argument('--gt_match_vcf', action='store', required=True) + parser.add_argument('--expected_vcf', action='store', required=True) + parser.add_argument('--informative_sites', action='store', required=True) + parser.add_argument('--uninformative_sites', action='store', required=True) + parser.add_argument('--outfile', action='store', required=True) + parser.add_argument('--debug', action='store_true') + parser.add_argument('--remove_AG', action='store_true') + args = parser.parse_args() + + return args + + +def get_sites_from_tsv(sites_file): + """ + get sites frm a tsv file where cols are chrom, pos, id, ref, alt + assumes no multiallelics + """ + sites = set() + with open(sites_file, 'r') as f: + lines = f.readlines() + for l in lines: + linedata = l.split('\t') + var = ('_').join([linedata[0], linedata[1], linedata[3], linedata[4]]) + sites.add(var) + return sites + + +def 
find(lst, a): + return [i for i, x in enumerate(lst) if x==a ] +def norm_genotypes(expected_vars): + expected_vars = pd.DataFrame(expected_vars) + split_str=expected_vars[0].str.split("_") + expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] + expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] + expected_vars['vars'] = split_str.str[4] + expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) + expected_vars = expected_vars[expected_vars['vars']!='./.'] + expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' + expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] + return expected_vars + + +def donor_exclusive_sites(exclusive_don_variants2): + # Here we generate a function for determining the sites that are donor exclusive + donor_distinct_sites = {} + for col1 in exclusive_don_variants2.keys(): + comparisons =[] + to_compare = [] + for col2 in exclusive_don_variants2.keys(): + if col1==col2: + # we set this as the unique entry + # print('1') + to_compare = set(exclusive_don_variants2[col2]) + else: + # We combine all the variants in one list + comparisons+=list(exclusive_don_variants2[col2]) + # print('2') + # print('comparison') + comparisons_all = set(comparisons) + comparisons_all_norm = norm_genotypes(comparisons_all) + comparisons_all=set(comparisons_all_norm['combo']) + + to_compare = set(to_compare) + to_compare_norm = norm_genotypes(to_compare) + to_compare=set(to_compare_norm['combo']) + # Make sure we account for hap types - phased/unphased + distinct_donor_sites = to_compare - comparisons_all + donor_distinct_sites[col1]=distinct_donor_sites + # Perform the distinct set function. + return donor_distinct_sites + + + + +if __name__ == "__main__": + + options = get_options() + cpus = options.cpus + outfile = options.outfile + cell_vcf=options.cell_vcf + donor_assignments=options.donor_assignments + gt_match_vcf=options.gt_match_vcf + expected_vcf=options.expected_vcf + cell_assignments=options.cell_assignments + informative_sites_file = options.informative_sites + uninformative_sites_file = options.uninformative_sites + + informative_sites = get_sites_from_tsv(informative_sites_file) + uninformative_sites = get_sites_from_tsv(uninformative_sites_file) + + exclusive_donor_variants = {} #This is where results are populated when mp process i used. 
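+    # Main flow from here: load the three VCFs (or the cached tmp_*.pkl files when --debug is
+    # given), merge the expected and GT-matched donor variants, derive donor-distinct sites,
+    # then run Concordances(...).conc_table() and write the per-cell table to --outfile.
+    # Typical invocation (illustrative only; file names are placeholders):
+    #
+    #   concordance_calculations_subsample_informative.py --cpus 4 \
+    #       --cell_vcf cellSNP.cells.vcf.gz --cell_assignments cell_assignments.tsv \
+    #       --donor_assignments donor_assignments.csv --gt_match_vcf gt_match.vcf.gz \
+    #       --expected_vcf expected.vcf.gz --informative_sites informative_sites.tsv \
+    #       --uninformative_sites uninformative_sites.tsv --outfile concordances.tsv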
+ curently_pushing =[] #this is a lock value to check if rhe curent field is updated so to avaid the race for update + All_Results={} + cell_concordance_table = {} + + donor_assignments_table = pd.read_csv(donor_assignments) + cell_assignments_table = pd.read_csv(cell_assignments,sep='\t') + remove_AG = options.remove_AG + + if options.debug: + with open('tmp_GT_Expected_variants.pkl', 'rb') as f: + GT_Expected_variants = pickle.load(f) + with open('tmp_GT_Matched_variants.pkl', 'rb') as f: + GT_Matched_variants = pickle.load(f) + with open('tmp_exclusive_cell_variants.pkl', 'rb') as f: + exclusive_cell_variants = pickle.load(f) + with open('tmp_donor_distinct_sites.pkl', 'rb') as f: + donor_distinct_sites = pickle.load(f) + with open('tmp_exclusive_don_variants.pkl', 'rb') as f: + exclusive_don_variants = pickle.load(f) + else: + print('---Loading genotype VCF----') + if (os.path.exists(gt_match_vcf)): + loader2 = VCF_Loader(gt_match_vcf, biallelic_only=True, + sparse=False, format_list=['GT']) + GT_Matched_variants = loader2.load_VCF_batch_paralel() + del loader2 + else: + GT_Matched_variants = {} + + with open(f'tmp_GT_Matched_variants.pkl', 'wb') as f: + pickle.dump(GT_Matched_variants, f) + + print('---Loading cell VCF----') + loader1 = VCF_Loader(cell_vcf, biallelic_only=True, + sparse=False, format_list=['GT', 'DP', 'AD', 'OTH']) + exclusive_cell_variants = loader1.load_VCF_batch_paralel() + del loader1 + with open(f'tmp_exclusive_cell_variants.pkl', 'wb') as f: + pickle.dump(exclusive_cell_variants, f) + + print('---Loading expected VCF----') + loader3 = VCF_Loader(expected_vcf, biallelic_only=True, + sparse=False, format_list=['GT']) + GT_Expected_variants = loader3.load_VCF_batch_paralel() + del loader3 + + with open(f'tmp_GT_Expected_variants.pkl', 'wb') as f: + pickle.dump(GT_Expected_variants, f) + + print('---Variant files loaded----') + + exclusive_don_variants = GT_Expected_variants.keys() + content = [x for x in exclusive_don_variants if not x.startswith('donor')] + GT_Expected_variants = {key: GT_Expected_variants[key] for key in content} + + exclusive_don_variants = GT_Matched_variants.keys() + content = [x for x in exclusive_don_variants if not x.startswith('donor')] + GT_Matched_variants = {key: GT_Matched_variants[key] for key in content} + + exclusive_don_variants = GT_Expected_variants + for key in GT_Matched_variants.keys(): + if key in exclusive_don_variants.keys(): + _='' + else: + exclusive_don_variants[key]=GT_Matched_variants[key] + + with open(f'tmp_exclusive_don_variants.pkl', 'wb') as f: + pickle.dump(exclusive_don_variants, f) + donor_distinct_sites = donor_exclusive_sites(exclusive_don_variants) + with open(f'tmp_donor_distinct_sites.pkl', 'wb') as f: + pickle.dump(donor_distinct_sites, f) + + cell_concordance_table = Concordances(donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites, informative_sites, uninformative_sites).conc_table() + + + result = pd.DataFrame(cell_concordance_table).T + try: + site_identities = result[['Concordant_Site_Identities','Discordant_Site_Identities']] + result.drop(columns=['Concordant_Site_Identities','Discordant_Site_Identities'],inplace=True) + site_identities.to_csv(f"site_identities_{outfile}",sep='\t') + except: + _='sample_hasnt_matched_any_gt --- most likely too little cells assigned' + result.to_csv(outfile,sep='\t') + + print('Processing Done') \ No newline at end of file diff --git a/bin/concordance_calculations_donor_exclusive_read_level.py 
b/bin/concordance_calculations_donor_exclusive_read_level.py index 3b336c3c..5497e7e3 100755 --- a/bin/concordance_calculations_donor_exclusive_read_level.py +++ b/bin/concordance_calculations_donor_exclusive_read_level.py @@ -18,323 +18,317 @@ class Concordances: - def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites,informative_sites, uninformative_sites): - self.reset() - self.donor_assignments_table=donor_assignments_table - self.cell_assignments_table=cell_assignments_table - self.exclusive_don_variants=exclusive_don_variants - self.exclusive_cell_variants=exclusive_cell_variants - self.donor_distinct_sites=donor_distinct_sites - self.informative_sites = informative_sites - self.uninformative_sites = uninformative_sites - self.record_dict={} - - def norm_genotypes(self,expected_vars): - expected_vars = pd.DataFrame(expected_vars) - if len(expected_vars) > 0: - split_str=expected_vars[0].str.split("_") - expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] - expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] - expected_vars['vars'] = split_str.str[4] - expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) - expected_vars = expected_vars[expected_vars['vars']!='./.'] - expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' - expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] - return expected_vars - - def reset(self): - self.cell_concordance_table ={} - - # def get_sites_from_tsv(self, sites_file): - # """ - # get sites frm a tsv file where cols are chrom, pos, id, ref, alt - # assumes no multiallelics - # """ - # sites = set() - # with open(sites_file, 'r') as f: - # lines = f.readlines() - # for l in lines: - # linedata = l.split('\t') - # var = ('_').join([linedata[0], linedata[1], linedata[3], linedata[4]]) - # sites.add(var) - # return sites - - - def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes): - ''' - take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant - sites and relaxed concordant sites - 1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype - 2) if you have a 0/0 you can not get a 1/1 or 0/1 - 3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 
1/1 - So - each obversed cellsnp allele must be in the array SNP gtype - ''' - true_discordant = 0 - relaxed_concordant = 0 - relaxed_concordant_informative = 0 - true_discordant_uninformative = 0 - - for i in range(0, len(snp_gtypes)): - discordant = False - snp_data = snp_gtypes[i].split('_') - cellsnp_data = cellsnp_gtypes[i].split('_') - - # the below will no longer work due to differing length of input strings - # snp_alleles = [snp_gtypes[i][-3], snp_gtypes[i][-1]] - # cellsnp_alleles = [cellsnp_gtypes[i][-3], cellsnp_gtypes[i][-1]] - - - snp_alleles = [snp_data[4][0], snp_data[4][2]] - cellsnp_alleles = [cellsnp_data[4][0], cellsnp_data[4][2]] - - snp_alleles_set = set(snp_alleles) - cellsnp_alleles_set = set(cellsnp_alleles) - - snp_var = ('_').join(snp_data[0:4]) - cellsnp_var = ('_').join(cellsnp_data[0:4]) - - if not cellsnp_var == snp_var: - print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i]) - exit(1) - else: - for allele in cellsnp_alleles_set: - if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant - discordant = True - - if discordant == True: - true_discordant+=1 - if snp_var in self.uninformative_sites: - true_discordant_uninformative+=1 - else: - relaxed_concordant+=1 - if snp_var in self.informative_sites: - relaxed_concordant_informative+=1 - - return true_discordant, relaxed_concordant, relaxed_concordant_informative, true_discordant_uninformative - - - def read_condordance(self, expected_vars, cell_vars): - ''' - get read level concordance using DP, AD and OTH format fields - ##FORMAT= - ##FORMAT= - ##FORMAT= - ''' - if not len(expected_vars) == len(cell_vars): - print("length mismatch between expected vars and cell vars") - exit(1) + def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites,informative_sites, uninformative_sites): + self.reset() + self.donor_assignments_table=donor_assignments_table + self.cell_assignments_table=cell_assignments_table + self.exclusive_don_variants=exclusive_don_variants + self.exclusive_cell_variants=exclusive_cell_variants + self.donor_distinct_sites=donor_distinct_sites + self.informative_sites = informative_sites + self.uninformative_sites = uninformative_sites + self.record_dict={} + + def norm_genotypes(self,expected_vars): + expected_vars = pd.DataFrame(expected_vars) + if len(expected_vars) > 0: + split_str=expected_vars[0].str.split("_") + expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] + expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] + expected_vars['vars'] = split_str.str[4] + expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) + expected_vars = expected_vars[expected_vars['vars']!='./.'] + expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' + expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] + return expected_vars + + def reset(self): + self.cell_concordance_table ={} + + # def get_sites_from_tsv(self, sites_file): + # """ + # get sites frm a tsv file where cols are chrom, pos, id, ref, alt + # assumes no multiallelics + # """ + # sites = set() + # with open(sites_file, 'r') as f: + # lines = f.readlines() + # for l in lines: + # linedata = l.split('\t') + # var = ('_').join([linedata[0], linedata[1], linedata[3], linedata[4]]) + # sites.add(var) + # return sites + + + def get_strict_discordance(self, snp_gtypes, 
cellsnp_gtypes): + ''' + take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant + sites and relaxed concordant sites + 1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype + 2) if you have a 0/0 you can not get a 1/1 or 0/1 + 3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 1/1 + So - each obversed cellsnp allele must be in the array SNP gtype + ''' + true_discordant = 0 + relaxed_concordant = 0 + relaxed_concordant_informative = 0 + true_discordant_uninformative = 0 + + for i in range(0, len(snp_gtypes)): + discordant = False + snp_data = snp_gtypes[i].split('_') + cellsnp_data = cellsnp_gtypes[i].split('_') + + # the below will no longer work due to differing length of input strings + # snp_alleles = [snp_gtypes[i][-3], snp_gtypes[i][-1]] + # cellsnp_alleles = [cellsnp_gtypes[i][-3], cellsnp_gtypes[i][-1]] - total_sites = len(expected_vars) - #add cols for DP, AD< OTH - cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) - cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) - cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) - total_dp = cell_vars['DP'].sum() - total_oth = cell_vars['OTH'].sum() - total_reads = total_dp + total_oth - - # expected genotype 0/0 - expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] - hom_ref_sites = set(expected_hom_ref['ids']) - cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] - ad_hom_ref = cell_vars2['AD'].sum() - oth_hom_ref = cell_vars2['OTH'].sum() - discordant_hom_ref = ad_hom_ref + oth_hom_ref - - # expected genotype 0/1 or 1/0 - hets = ['0/1', '1/0'] - expected_het = expected_vars[expected_vars['vars'].isin(hets)] - het_sites = set(expected_het['ids']) - cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] - discordant_het = cell_vars3['OTH'].sum() - - # expected genotype 1/1 - expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] - hom_alt_sites = set(expected_hom_alt['ids']) - cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] - # DP + OTH - AD - ad_hom_alt = cell_vars4['AD'].sum() - dp_hom_alt = cell_vars4['DP'].sum() - oth_hom_alt = cell_vars4['OTH'].sum() - discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt - - discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt - - return total_sites, total_reads, discordant_reads - - def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): - # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. - # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. 
- # Author: M.Ozols + snp_alleles = [snp_data[4][0], snp_data[4][2]] + cellsnp_alleles = [cellsnp_data[4][0], cellsnp_data[4][2]] + + snp_alleles_set = set(snp_alleles) + cellsnp_alleles_set = set(cellsnp_alleles) - cell_vars_norm = self.norm_genotypes(cell_vars) - - if len(cell_vars_norm) > 0: - Total_Overlappin_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) - expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlappin_sites)] - cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlappin_sites)] - # print(cell_vars_norm) - # print(expected_vars2) - # print(cell_vars2) - # exit(0) - Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) - Discodrant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) - disc = pd.DataFrame(Discodrant_sites,columns=['combo_x']) - df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') - disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') - disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] - disc_sites = ';'.join(disc2['expected_retrieved']) - #find truly discordant sites - true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, true_discordant_uninformative_count = self.get_strict_discordance(disc2['0_y'], disc2['0_x']) - #find discordant reads - total_sites, total_reads, discordant_reads = self.read_condordance(expected_vars2, cell_vars2) + snp_var = ('_').join(snp_data[0:4]) + cellsnp_var = ('_').join(cellsnp_data[0:4]) + + if not cellsnp_var == snp_var: + print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i]) + exit(1) else: - Total_Overlappin_sites = set() - Concordant_Sites = set() - Discodrant_sites = set() - disc_sites = '' - true_discordant_count = 0 - relaxed_concordant_count = 0 - total_sites = 0 - total_reads = 0 - discordant_reads = 0 - - return Concordant_Sites, Discodrant_sites, Total_Overlappin_sites, disc_sites,cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, true_discordant_uninformative_count, total_sites, total_reads, discordant_reads - + for allele in cellsnp_alleles_set: + if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant + discordant = True + + if discordant == True: + true_discordant+=1 + if snp_var in self.uninformative_sites: + true_discordant_uninformative+=1 + else: + relaxed_concordant+=1 + if snp_var in self.informative_sites: + relaxed_concordant_informative+=1 + + return true_discordant, relaxed_concordant, relaxed_concordant_informative, true_discordant_uninformative + + + def read_condordance(self, expected_vars, cell_vars): + ''' + get read level concordance using DP, AD and OTH format fields + ##FORMAT= + ##FORMAT= + ##FORMAT= + ''' + if not len(expected_vars) == len(cell_vars): + print("length mismatch between expected vars and cell vars") + exit(1) + + total_sites = len(expected_vars) + #add cols for DP, AD< OTH + cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) + cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) + cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) + total_dp = cell_vars['DP'].sum() + total_oth = cell_vars['OTH'].sum() + total_reads = total_dp + total_oth + + # expected genotype 0/0 + expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] + hom_ref_sites = set(expected_hom_ref['ids']) + cell_vars2 = 
cell_vars[cell_vars['ids'].isin(hom_ref_sites)] + ad_hom_ref = cell_vars2['AD'].sum() + oth_hom_ref = cell_vars2['OTH'].sum() + discordant_hom_ref = ad_hom_ref + oth_hom_ref + + # expected genotype 0/1 or 1/0 + hets = ['0/1', '1/0'] + expected_het = expected_vars[expected_vars['vars'].isin(hets)] + het_sites = set(expected_het['ids']) + cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] + discordant_het = cell_vars3['OTH'].sum() + + # expected genotype 1/1 + expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] + hom_alt_sites = set(expected_hom_alt['ids']) + cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] + # DP + OTH - AD + ad_hom_alt = cell_vars4['AD'].sum() + dp_hom_alt = cell_vars4['DP'].sum() + oth_hom_alt = cell_vars4['OTH'].sum() + discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt + + discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt + + return total_sites, total_reads, discordant_reads + - def set_results(self,to_set,id): - # Recod to disk to save the loading mmeory time. - with open(f'tmp_{id}.pkl', 'wb') as f: - pickle.dump(to_set, f) - self.record_dict[id]=f'tmp_{id}.pkl' + def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): + # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. + # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. + # Author: M.Ozols - def append_results_cell_concordances(self,result): - count=result[11] - try: - percent_concordant = result[2]/(result[3]+result[2])*100 - except: - percent_concordant = 0 - - try: - percent_discordant = result[3]/(result[3]+result[2])*100 - except: - percent_discordant = 0 + cell_vars_norm = self.norm_genotypes(cell_vars) + + if len(cell_vars_norm) > 0: + Total_Overlappin_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) + expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlappin_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlappin_sites)] + + Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) + Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) + disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) + df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') + disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') + disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] + disc_sites = ';'.join(disc2['expected_retrieved']) + #find truly discordant sites + true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, true_discordant_uninformative_count = self.get_strict_discordance(disc2['0_y'], disc2['0_x']) + #find discordant reads + total_sites, total_reads, discordant_reads = self.read_condordance(expected_vars2, cell_vars2) + else: + Total_Overlappin_sites = set() + Concordant_Sites = set() + Discordant_sites = set() + disc_sites = '' + true_discordant_count = 0 + relaxed_concordant_count = 0 + total_sites = 0 + total_reads = 0 + discordant_reads = 0 + + return Concordant_Sites, Discordant_sites, Total_Overlappin_sites, disc_sites,cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, 
true_discordant_uninformative_count, total_sites, total_reads, discordant_reads + + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. + with open(f'tmp_{id}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}.pkl' + + def append_results_cell_concordances(self,result): + count=result[11] + try: + percent_concordant = result[2]/(result[3]+result[2])*100 + except: + percent_concordant = 0 + + try: + percent_discordant = result[3]/(result[3]+result[2])*100 + except: + percent_discordant = 0 + + try: + percent_relaxed_concordant = result[4]/(result[4]+result[5])*100 + except: + percent_relaxed_concordant = 0 + + try: + percent_strict_discordant = result[5]/(result[4]+result[5])*100 + except: + percent_strict_discordant = 0 + + try: + read_discordance = result[15]/result[13] + except: + read_discordance = 0 + + print(count) + self.cell_concordance_table[f'{result[0]} --- {result[1]}'] = {'GT 1':result[0], + 'GT 2':result[1], + 'Nr_Concordant':result[2], + 'Nr_Discordant':result[3], + 'Nr_Relaxed_concordant':result[4], + 'Nr_strict_discordant':result[5], + 'Percent Concordant':percent_concordant, + 'Percent Discordant':percent_discordant, + 'Percent_relaxed_concordant': percent_relaxed_concordant, + 'Percent_strict_discordant': percent_strict_discordant, + 'Nr_concordant_informative': result[6], + 'Nr_discordant_uninformative': result[7], + 'NrTotal_Overlapping_sites_between_two_genotypes':result[8], + 'Nr_donor_distinct_sites_within_pool_individuals':result[10], + 'Number_of_sites_that_are_donor_concordant_and_exclusive':result[9], + 'Discordant_Site_Identities':result[12], + 'Total_sites': result[13], + 'Total_reads': result[14], + 'Discordant_reads': result[15], + 'Discordant_reads_by_n_sites': read_discordance + } + + if (count % 200 == 0): + print(f'recording and resetting memory {count}') + # self.record_dict[count]=self.exclusive_donor_variants + self.set_results(self.cell_concordance_table,count) + self.reset() + _="" + + def combine_written_files(self):#this one is for concordance class + to_export = self.cell_concordance_table + for val1 in self.record_dict.values(): + # here remove the int files. 
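+            # Merge each per-count chunk back into the in-memory concordance table (later
+            # keys simply overwrite earlier ones), then delete the temporary pickle.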
+ print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + for k1 in loaded_dict.keys(): + to_export[k1]=loaded_dict[k1] + os.remove(val1) + return to_export + + + def conc_table(self): + donor_assignments_table=self.donor_assignments_table + cell_assignments_table=self.cell_assignments_table + exclusive_don_variants=self.exclusive_don_variants + exclusive_cell_variants= self.exclusive_cell_variants + + pool = mp.Pool(cpus) + count = 0 + for i,row1 in donor_assignments_table.iterrows(): + donor_in_question = row1['donor_query'] + donor_gt_match = row1['donor_gt'] + if (donor_gt_match=='NONE'): + continue + Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell'])) try: - percent_relaxed_concordant = result[4]/(result[4]+result[5])*100 + expected_vars = exclusive_don_variants[donor_gt_match] except: - percent_relaxed_concordant = 0 + _='here we have specifically excluded the donor that has been assigned as it is not expected genotype, because of this we can not calculate the concordances' + continue + expected_vars_norm = self.norm_genotypes(expected_vars) try: - percent_strict_discordant = result[5]/(result[4]+result[5])*100 - except: - percent_strict_discordant = 0 - - try: - read_discordance = result[15]/result[13] + # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor. + dds = self.donor_distinct_sites[donor_gt_match] except: - read_discordance = 0 - - print(count) - self.cell_concordance_table[f'{result[0]} --- {result[1]}'] = {'GT 1':result[0], - 'GT 2':result[1], - 'Nr_Concordant':result[2], - 'Nr_Discordant':result[3], - 'Nr_Relaxed_concordant':result[4], - 'Nr_strict_discordant':result[5], - 'Percent Concordant':percent_concordant, - 'Percent Discordant':percent_discordant, - 'Percent_relaxed_concordant': percent_relaxed_concordant, - 'Percent_strict_discordant': percent_strict_discordant, - 'Nr_concordant_informative': result[6], - 'Nr_discordant_uninformative': result[7], - 'NrTotal_Overlapping_sites_between_two_genotypes':result[8], - 'Nr_donor_distinct_sites_within_pool_individuals':result[10], - 'Number_of_sites_that_are_donor_concordant_and_exclusive':result[9], - 'Discordant_Site_Identities':result[12], - 'Total_sites': result[13], - 'Total_reads': result[14], - 'Discordant_reads': result[15], - 'Discordant_reads_by_n_sites': read_discordance - } + continue - if (count % 200 == 0): - print(f'recording and resetting memory {count}') - # self.record_dict[count]=self.exclusive_donor_variants - self.set_results(self.cell_concordance_table,count) - self.reset() - _="" - - def combine_written_files(self):#this one is for concordance class - to_export = self.cell_concordance_table - for val1 in self.record_dict.values(): - # here remove the int files. 
- print(f"merging temp file: {val1}") - with open(val1, 'rb') as f: - loaded_dict = pickle.load(f) - for k1 in loaded_dict.keys(): - to_export[k1]=loaded_dict[k1] - os.remove(val1) - return to_export - - - def conc_table(self): - donor_assignments_table=self.donor_assignments_table - cell_assignments_table=self.cell_assignments_table - exclusive_don_variants=self.exclusive_don_variants - exclusive_cell_variants= self.exclusive_cell_variants - - pool = mp.Pool(cpus) - count = 0 - for i,row1 in donor_assignments_table.iterrows(): - donor_in_question = row1['donor_query'] - donor_gt_match = row1['donor_gt'] - if (donor_gt_match=='NONE'): - continue - Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell'])) - try: - expected_vars = exclusive_don_variants[donor_gt_match] - except: - _='here we have specifically excluded the donor that has been assigned as it is not expected genotype, because of this we can not calculate the concordances' - continue + for cell1 in Cells_to_keep_pre: + count+=1 + cell_vars = exclusive_cell_variants[cell1] + self.cell_concordance_table[f'{cell1} --- {donor_gt_match}']={} + # pool.apply_async(self.concordance_dable_production, args=([expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count]),callback=self.append_results_cell_concordances) + result1 = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count) + self.append_results_cell_concordances(result1) - expected_vars_norm = self.norm_genotypes(expected_vars) - try: - # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor. - dds = self.donor_distinct_sites[donor_gt_match] - except: - continue - - for cell1 in Cells_to_keep_pre: - count+=1 - # if count>800: - # break - cell_vars = exclusive_cell_variants[cell1] - # cell_vars_dp = exclusive_cell_variants_dp[cell1] - - self.cell_concordance_table[f'{cell1} --- {donor_gt_match}']={} - # pool.apply_async(self.concordance_dable_production, args=([expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count]),callback=self.append_results_cell_concordances) - result1 = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count) - self.append_results_cell_concordances(result1) - - pool.close() - pool.join() - output = self.combine_written_files() - return output + pool.close() + pool.join() + output = self.combine_written_files() + return output + + def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count): + Nr_donor_distinct_sites = len(dds) + Concordant_Sites, Discordant_sites, Total_Overlappin_sites,discordant_sites,cell_vars_norm, Nr_strict_discordant, relaxed_concordant_count, relaxed_concordant_informative_count, true_discordant_uninformative_count, total_sites, total_reads, discordant_reads = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) - def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count): - Nr_donor_distinct_sites = len(dds) - Concordant_Sites, Discodrant_sites, Total_Overlappin_sites,discordant_sites,cell_vars_norm, Nr_strict_discordant, relaxed_concordant_count, relaxed_concordant_informative_count, true_discordant_uninformative_count, total_sites, total_reads, discordant_reads = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) - Nr_Concordant = len(Concordant_Sites) - Nr_Relaxed_concordant = Nr_Concordant + 
relaxed_concordant_count - Nr_Discordant = len(Discodrant_sites) - Nr_Total_Overlapping_sites = len(Total_Overlappin_sites) - Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Concordant_Sites))) - Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) - - return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,Nr_Relaxed_concordant, Nr_strict_discordant, relaxed_concordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites, - Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,discordant_sites, total_sites, total_reads, discordant_reads] + Nr_Concordant = len(Concordant_Sites) + Nr_Relaxed_concordant = Nr_Concordant + relaxed_concordant_count + Nr_Discordant = len(Discordant_sites) + Nr_Total_Overlapping_sites = len(Total_Overlappin_sites) + Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Concordant_Sites))) + Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) + + return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,Nr_Relaxed_concordant, Nr_strict_discordant, relaxed_concordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites, + Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,discordant_sites, total_sites, total_reads, discordant_reads] class VCF_Loader: diff --git a/bin/concordance_calculations_donor_exclusive_read_level_noA2G.py b/bin/concordance_calculations_donor_exclusive_read_level_noA2G.py index 6a066d85..f859605a 100755 --- a/bin/concordance_calculations_donor_exclusive_read_level_noA2G.py +++ b/bin/concordance_calculations_donor_exclusive_read_level_noA2G.py @@ -18,387 +18,383 @@ class Concordances: - def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites,informative_sites, uninformative_sites): - self.reset() - self.donor_assignments_table=donor_assignments_table - self.cell_assignments_table=cell_assignments_table - self.exclusive_don_variants=exclusive_don_variants - self.exclusive_cell_variants=exclusive_cell_variants - self.donor_distinct_sites=donor_distinct_sites - self.informative_sites = informative_sites - self.uninformative_sites = uninformative_sites - self.record_dict={} - - def norm_genotypes(self,expected_vars): - expected_vars = pd.DataFrame(expected_vars) - if len(expected_vars) > 0: - split_str=expected_vars[0].str.split("_") - expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] - expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] - expected_vars['vars'] = split_str.str[4] - expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) - expected_vars = expected_vars[expected_vars['vars']!='./.'] - expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' - expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] - return expected_vars - - def reset(self): - self.cell_concordance_table ={} - - # def get_sites_from_tsv(self, sites_file): - # """ - # get sites frm a tsv file where cols are chrom, pos, id, ref, alt - # assumes no multiallelics - # """ - # sites = set() - # with open(sites_file, 'r') as f: - # lines = f.readlines() - # for l in lines: - # linedata = l.split('\t') - # var = ('_').join([linedata[0], linedata[1], linedata[3], linedata[4]]) - # 
sites.add(var) - # return sites - - - def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes): - ''' - take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant - sites and relaxed concordant sites - 1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype - 2) if you have a 0/0 you can not get a 1/1 or 0/1 - 3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 1/1 - So - each obversed cellsnp allele must be in the array SNP gtype - ''' - true_discordant = 0 - relaxed_concordant = 0 - relaxed_concordant_informative = 0 - relaxed_concordant_uninformative = 0 - true_discordant_informative = 0 - true_discordant_uninformative = 0 - - for i in range(0, len(snp_gtypes)): - discordant = False - snp_data = snp_gtypes[i].split('_') - cellsnp_data = cellsnp_gtypes[i].split('_') - - # the below will no longer work due to differing length of input strings - # snp_alleles = [snp_gtypes[i][-3], snp_gtypes[i][-1]] - # cellsnp_alleles = [cellsnp_gtypes[i][-3], cellsnp_gtypes[i][-1]] - - - snp_alleles = [snp_data[4][0], snp_data[4][2]] - cellsnp_alleles = [cellsnp_data[4][0], cellsnp_data[4][2]] - - snp_alleles_set = set(snp_alleles) - cellsnp_alleles_set = set(cellsnp_alleles) - - snp_var = ('_').join(snp_data[0:4]) - cellsnp_var = ('_').join(cellsnp_data[0:4]) - - if not cellsnp_var == snp_var: - print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i]) - exit(1) - else: - for allele in cellsnp_alleles_set: - if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant - discordant = True - - if discordant == True: - true_discordant+=1 - if snp_var in self.uninformative_sites: - true_discordant_uninformative+=1 - elif snp_var in self.informative_sites: - true_discordant_informative+=1 - else: - relaxed_concordant+=1 - if snp_var in self.uninformative_sites: - relaxed_concordant_uninformative+=1 - elif snp_var in self.informative_sites: - relaxed_concordant_informative+=1 - - return true_discordant, relaxed_concordant, relaxed_concordant_informative, relaxed_concordant_uninformative, true_discordant_informative, true_discordant_uninformative - - - def read_condordance(self, expected_vars, cell_vars): - ''' - get read level concordance using DP, AD and OTH format fields - ##FORMAT= - ##FORMAT= - ##FORMAT= - ''' - if not len(expected_vars) == len(cell_vars): - print("length mismatch between expected vars and cell vars") - exit(1) + def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites,informative_sites, uninformative_sites): + self.reset() + self.donor_assignments_table=donor_assignments_table + self.cell_assignments_table=cell_assignments_table + self.exclusive_don_variants=exclusive_don_variants + self.exclusive_cell_variants=exclusive_cell_variants + self.donor_distinct_sites=donor_distinct_sites + self.informative_sites = informative_sites + self.uninformative_sites = uninformative_sites + self.record_dict={} - total_sites = len(expected_vars) - #add cols for DP, AD< OTH - cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) - cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) - cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) - #split to informative and uninformative sites - mask_i = cell_vars['ids'].isin(self.informative_sites) - cell_vars_informative = cell_vars[mask_i] - mask_u = 
cell_vars['ids'].isin(self.uninformative_sites) - cell_vars_uninformative = cell_vars[mask_u] - informative_sites = len(cell_vars_informative) - uninformative_sites = len(cell_vars_uninformative) - - total_dp = cell_vars['DP'].sum() - total_oth = cell_vars['OTH'].sum() - total_reads = total_dp + total_oth - total_dp_inf = cell_vars_informative['DP'].sum() - total_oth_inf = cell_vars_informative['OTH'].sum() - total_reads_informative = total_dp_inf + total_oth_inf - total_dp_uninf = cell_vars_uninformative['DP'].sum() - total_oth_uninf = cell_vars_uninformative['OTH'].sum() - total_reads_uninformative = total_dp_uninf + total_oth_uninf - - # expected genotype 0/0 - expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] - hom_ref_sites = set(expected_hom_ref['ids']) - cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] - cell_vars_inf_2 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_ref_sites)] - cell_vars_uninf_2 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_ref_sites)] - ad_hom_ref = cell_vars2['AD'].sum() - oth_hom_ref = cell_vars2['OTH'].sum() - discordant_hom_ref = ad_hom_ref + oth_hom_ref - ad_hom_ref_inf = cell_vars_inf_2['AD'].sum() - oth_hom_ref_inf = cell_vars_inf_2['OTH'].sum() - discordant_hom_ref_informative = ad_hom_ref_inf + oth_hom_ref_inf - ad_hom_ref_uninf = cell_vars_uninf_2['AD'].sum() - oth_hom_ref_uninf = cell_vars_uninf_2['OTH'].sum() - discordant_hom_ref_uninformative = ad_hom_ref_uninf + oth_hom_ref_uninf - - # expected genotype 0/1 or 1/0 - hets = ['0/1', '1/0'] - expected_het = expected_vars[expected_vars['vars'].isin(hets)] - het_sites = set(expected_het['ids']) - cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] - cell_vars_inf_3 = cell_vars_informative[cell_vars_informative['ids'].isin(het_sites)] - cell_vars_uninf_3 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(het_sites)] - discordant_het = cell_vars3['OTH'].sum() - discordant_het_informative = cell_vars_inf_3['OTH'].sum() - discordant_het_uninformative = cell_vars_uninf_3['OTH'].sum() - - # expected genotype 1/1 - expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] - hom_alt_sites = set(expected_hom_alt['ids']) - cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] - cell_vars_inf_4 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_alt_sites)] - cell_vars_uninf_4 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_alt_sites)] - # DP + OTH - AD - ad_hom_alt = cell_vars4['AD'].sum() - dp_hom_alt = cell_vars4['DP'].sum() - oth_hom_alt = cell_vars4['OTH'].sum() - discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt - ad_hom_alt_inf = cell_vars_inf_4['AD'].sum() - dp_hom_alt_inf = cell_vars_inf_4['DP'].sum() - oth_hom_alt_inf = cell_vars_inf_4['OTH'].sum() - discordant_hom_alt_informative = (dp_hom_alt_inf + oth_hom_alt_inf) - ad_hom_alt_inf - ad_hom_alt_uninf = cell_vars_uninf_4['AD'].sum() - dp_hom_alt_uninf = cell_vars_uninf_4['DP'].sum() - oth_hom_alt_uninf = cell_vars_uninf_4['OTH'].sum() - discordant_hom_alt_uninformative = (dp_hom_alt_uninf + oth_hom_alt_uninf) - ad_hom_alt_uninf - - discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt - discordant_reads_informative = discordant_hom_ref_informative + discordant_het_informative + discordant_hom_alt_informative - discordant_reads_uninformative = discordant_hom_ref_uninformative + discordant_het_uninformative + discordant_hom_alt_uninformative - - return total_sites, informative_sites, uninformative_sites, 
total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative - + def norm_genotypes(self,expected_vars): + expected_vars = pd.DataFrame(expected_vars) + if len(expected_vars) > 0: + split_str=expected_vars[0].str.split("_") + expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] + expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] + expected_vars['vars'] = split_str.str[4] + expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) + expected_vars = expected_vars[expected_vars['vars']!='./.'] + expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' + expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] + return expected_vars + + def reset(self): + self.cell_concordance_table ={} + + # def get_sites_from_tsv(self, sites_file): + # """ + # get sites frm a tsv file where cols are chrom, pos, id, ref, alt + # assumes no multiallelics + # """ + # sites = set() + # with open(sites_file, 'r') as f: + # lines = f.readlines() + # for l in lines: + # linedata = l.split('\t') + # var = ('_').join([linedata[0], linedata[1], linedata[3], linedata[4]]) + # sites.add(var) + # return sites + + + def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes): + ''' + take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant + sites and relaxed concordant sites + 1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype + 2) if you have a 0/0 you can not get a 1/1 or 0/1 + 3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 1/1 + So - each obversed cellsnp allele must be in the array SNP gtype + ''' + true_discordant = 0 + relaxed_concordant = 0 + relaxed_concordant_informative = 0 + relaxed_concordant_uninformative = 0 + true_discordant_informative = 0 + true_discordant_uninformative = 0 + + for i in range(0, len(snp_gtypes)): + discordant = False + snp_data = snp_gtypes[i].split('_') + cellsnp_data = cellsnp_gtypes[i].split('_') + + # the below will no longer work due to differing length of input strings + # snp_alleles = [snp_gtypes[i][-3], snp_gtypes[i][-1]] + # cellsnp_alleles = [cellsnp_gtypes[i][-3], cellsnp_gtypes[i][-1]] - def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): - # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. - # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. 
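norm_genotypes above relies on every variant being encoded as one underscore-delimited string, chrom_pos_ref_alt_genotype (cellSNP records carry extra DP/AD/OTH fields after the genotype). From that string it builds an 'ids' key (chrom_pos_ref_alt), a 'pos' key (chrom_pos) and a normalised genotype in which '|' becomes '/', './.' rows are dropped and '0/1' is folded into '1/0', so that 'combo' (ids plus genotype) can later be compared with plain set operations. A small worked example under that assumption (the two input strings are invented):

import pandas as pd

records = pd.DataFrame(['1_12345_A_G_0|1', '1_99999_C_T_./.'])

split_str = records[0].str.split('_')
records['ids'] = split_str.str[0] + '_' + split_str.str[1] + '_' + split_str.str[2] + '_' + split_str.str[3]
records['pos'] = split_str.str[0] + '_' + split_str.str[1]
records['vars'] = split_str.str[4].str.replace('|', '/', regex=False)
records = records[records['vars'] != './.']             # drop missing genotypes
records.loc[records['vars'] == '0/1', 'vars'] = '1/0'   # treat 0/1 and 1/0 as the same het call
records['combo'] = records['ids'] + '_' + records['vars']
print(records['combo'].tolist())                        # ['1_12345_A_G_1/0']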
- # Author: M.Ozols + + snp_alleles = [snp_data[4][0], snp_data[4][2]] + cellsnp_alleles = [cellsnp_data[4][0], cellsnp_data[4][2]] + + snp_alleles_set = set(snp_alleles) + cellsnp_alleles_set = set(cellsnp_alleles) - cell_vars_norm = self.norm_genotypes(cell_vars) - - if len(cell_vars_norm) > 0: - Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) - expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] - cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] - # print(cell_vars_norm) - # print(expected_vars2) - # print(cell_vars2) - # exit(0) - Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) - Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) - disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) - df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') - disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') - disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] - disc_sites = ';'.join(disc2['expected_retrieved']) - #find truly discordant sites - true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count = self.get_strict_discordance(disc2['0_y'], disc2['0_x']) - #find discordant reads - total_sites, informative_sites, uninformative_sites, total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative = self.read_condordance(expected_vars2, cell_vars2) + snp_var = ('_').join(snp_data[0:4]) + cellsnp_var = ('_').join(cellsnp_data[0:4]) + + if not cellsnp_var == snp_var: + print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i]) + exit(1) else: - Total_Overlapping_sites = set() - Concordant_Sites = set() - Discordant_sites = set() - disc_sites = '' - true_discordant_count = 0 - relaxed_concordant_count = 0 - total_sites = 0 - - discordant_reads = 0 - - return Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites,cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, total_sites, informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative - + for allele in cellsnp_alleles_set: + if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant + discordant = True + + if discordant == True: + true_discordant+=1 + if snp_var in self.uninformative_sites: + true_discordant_uninformative+=1 + elif snp_var in self.informative_sites: + true_discordant_informative+=1 + else: + relaxed_concordant+=1 + if snp_var in self.uninformative_sites: + relaxed_concordant_uninformative+=1 + elif snp_var in self.informative_sites: + relaxed_concordant_informative+=1 + + return true_discordant, relaxed_concordant, relaxed_concordant_informative, relaxed_concordant_uninformative, true_discordant_informative, true_discordant_uninformative - def set_results(self,to_set,id): - # Recod to disk to save the loading mmeory time. 
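The allele-subset test in get_strict_discordance encodes the relaxed-concordance rule from the docstring: a cellSNP call is kept as (relaxed) concordant as long as every allele observed by cellSNP is also present in the array genotype, and counted as truly discordant otherwise. A compact restatement of that rule (a sketch, not the function used by the script):

def is_relaxed_concordant(array_gt: str, cellsnp_gt: str) -> bool:
    # array 0/1 vs cellSNP 1/1 -> True  (a het donor can legitimately yield only ALT reads)
    # array 0/0 vs cellSNP 0/1 -> False (an ALT allele was observed that the donor cannot carry)
    array_alleles = set(array_gt.replace('|', '/').split('/'))
    cellsnp_alleles = set(cellsnp_gt.replace('|', '/').split('/'))
    return cellsnp_alleles.issubset(array_alleles)

assert is_relaxed_concordant('0/1', '1/1')
assert not is_relaxed_concordant('0/0', '0/1')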
- with open(f'tmp_{id}.pkl', 'wb') as f: - pickle.dump(to_set, f) - self.record_dict[id]=f'tmp_{id}.pkl' + + def read_condordance(self, expected_vars, cell_vars): + ''' + get read level concordance using DP, AD and OTH format fields + ##FORMAT= + ##FORMAT= + ##FORMAT= + ''' + if not len(expected_vars) == len(cell_vars): + print("length mismatch between expected vars and cell vars") + exit(1) + + total_sites = len(expected_vars) + #add cols for DP, AD< OTH + cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) + cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) + cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) + #split to informative and uninformative sites + mask_i = cell_vars['ids'].isin(self.informative_sites) + cell_vars_informative = cell_vars[mask_i] + mask_u = cell_vars['ids'].isin(self.uninformative_sites) + cell_vars_uninformative = cell_vars[mask_u] + informative_sites = len(cell_vars_informative) + uninformative_sites = len(cell_vars_uninformative) + + total_dp = cell_vars['DP'].sum() + total_oth = cell_vars['OTH'].sum() + total_reads = total_dp + total_oth + total_dp_inf = cell_vars_informative['DP'].sum() + total_oth_inf = cell_vars_informative['OTH'].sum() + total_reads_informative = total_dp_inf + total_oth_inf + total_dp_uninf = cell_vars_uninformative['DP'].sum() + total_oth_uninf = cell_vars_uninformative['OTH'].sum() + total_reads_uninformative = total_dp_uninf + total_oth_uninf + + # expected genotype 0/0 + expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] + hom_ref_sites = set(expected_hom_ref['ids']) + cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] + cell_vars_inf_2 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_ref_sites)] + cell_vars_uninf_2 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_ref_sites)] + ad_hom_ref = cell_vars2['AD'].sum() + oth_hom_ref = cell_vars2['OTH'].sum() + discordant_hom_ref = ad_hom_ref + oth_hom_ref + ad_hom_ref_inf = cell_vars_inf_2['AD'].sum() + oth_hom_ref_inf = cell_vars_inf_2['OTH'].sum() + discordant_hom_ref_informative = ad_hom_ref_inf + oth_hom_ref_inf + ad_hom_ref_uninf = cell_vars_uninf_2['AD'].sum() + oth_hom_ref_uninf = cell_vars_uninf_2['OTH'].sum() + discordant_hom_ref_uninformative = ad_hom_ref_uninf + oth_hom_ref_uninf + + # expected genotype 0/1 or 1/0 + hets = ['0/1', '1/0'] + expected_het = expected_vars[expected_vars['vars'].isin(hets)] + het_sites = set(expected_het['ids']) + cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] + cell_vars_inf_3 = cell_vars_informative[cell_vars_informative['ids'].isin(het_sites)] + cell_vars_uninf_3 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(het_sites)] + discordant_het = cell_vars3['OTH'].sum() + discordant_het_informative = cell_vars_inf_3['OTH'].sum() + discordant_het_uninformative = cell_vars_uninf_3['OTH'].sum() + + # expected genotype 1/1 + expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] + hom_alt_sites = set(expected_hom_alt['ids']) + cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] + cell_vars_inf_4 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_alt_sites)] + cell_vars_uninf_4 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_alt_sites)] + # DP + OTH - AD + ad_hom_alt = cell_vars4['AD'].sum() + dp_hom_alt = cell_vars4['DP'].sum() + oth_hom_alt = cell_vars4['OTH'].sum() + discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt + ad_hom_alt_inf = cell_vars_inf_4['AD'].sum() + dp_hom_alt_inf 
= cell_vars_inf_4['DP'].sum() + oth_hom_alt_inf = cell_vars_inf_4['OTH'].sum() + discordant_hom_alt_informative = (dp_hom_alt_inf + oth_hom_alt_inf) - ad_hom_alt_inf + ad_hom_alt_uninf = cell_vars_uninf_4['AD'].sum() + dp_hom_alt_uninf = cell_vars_uninf_4['DP'].sum() + oth_hom_alt_uninf = cell_vars_uninf_4['OTH'].sum() + discordant_hom_alt_uninformative = (dp_hom_alt_uninf + oth_hom_alt_uninf) - ad_hom_alt_uninf + + discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt + discordant_reads_informative = discordant_hom_ref_informative + discordant_het_informative + discordant_hom_alt_informative + discordant_reads_uninformative = discordant_hom_ref_uninformative + discordant_het_uninformative + discordant_hom_alt_uninformative + + return total_sites, informative_sites, uninformative_sites, total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative + + + def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): + # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. + # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. + # Author: M.Ozols - def append_results_cell_concordances(self,result): - count=result[13] - try: - percent_concordant = result[2]/(result[3]+result[2])*100 - except: - percent_concordant = 0 - - try: - percent_discordant = result[3]/(result[3]+result[2])*100 - except: - percent_discordant = 0 + cell_vars_norm = self.norm_genotypes(cell_vars) + + if len(cell_vars_norm) > 0: + Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) + expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] + Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) + Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) + disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) + df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') + disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') + disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] + disc_sites = ';'.join(disc2['expected_retrieved']) + #find truly discordant sites + true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count = self.get_strict_discordance(disc2['0_y'], disc2['0_x']) + #find discordant reads + total_sites, informative_sites, uninformative_sites, total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative = self.read_condordance(expected_vars2, cell_vars2) + else: + Total_Overlapping_sites = set() + Concordant_Sites = set() + Discordant_sites = set() + disc_sites = '' + true_discordant_count = 0 + relaxed_concordant_count = 0 + total_sites = 0 + + discordant_reads = 0 - try: - percent_relaxed_concordant = result[4]/(result[4]+result[5])*100 - except: - percent_relaxed_concordant = 0 - - try: - percent_strict_discordant = 
result[5]/(result[4]+result[5])*100 - except: - percent_strict_discordant = 0 + return Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites,cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, total_sites, informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative + - try: - read_discordance = result[21]/result[15] - except: - read_discordance = 0 - - donor = result[1] - cohort = 'UNKNOWN' - donor_split = donor.split("_") - if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): - cohort = 'UKB' - elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): - cohort = 'ELGH' - - print(count) - self.cell_concordance_table[f'{result[0]} --- {result[1]}'] = {'GT 1':result[0], - 'GT 2':result[1], - 'cohort': cohort, - 'Nr_Concordant':result[2], - 'Nr_Discordant':result[3], - 'Nr_Relaxed_concordant':result[4], - 'Nr_strict_discordant':result[5], - 'Percent Concordant':percent_concordant, - 'Percent Discordant':percent_discordant, - 'Percent_relaxed_concordant': percent_relaxed_concordant, - 'Percent_strict_discordant': percent_strict_discordant, - 'Nr_concordant_informative': result[6], - 'Nr_concordant_uninformative': result[7], - 'Nr_discordant_informative': result[8], - 'Nr_discordant_uninformative': result[9], - 'NrTotal_Overlapping_sites_between_two_genotypes':result[10], - 'Nr_donor_distinct_sites_within_pool_individuals':result[12], - 'Number_of_sites_that_are_donor_concordant_and_exclusive':result[11], - 'Discordant_Site_Identities':result[14], - 'Total_sites': result[15], - 'Total_informative_sites': result[16], - 'Total_uninformative_sites': result[17], - 'Total_reads': result[18], - 'Total_reads_informative': result[19], - 'Total_reads_uninformative': result[20], - 'Discordant_reads': result[21], - 'Discordant_reads_informtive': result[22], - 'Discordant_reads_uninformtive': result[23], - 'Discordant_reads_by_n_sites': read_discordance - } - - if (count % 200 == 0): - print(f'recording and resetting memory {count}') - # self.record_dict[count]=self.exclusive_donor_variants - self.set_results(self.cell_concordance_table,count) - self.reset() - _="" + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. + with open(f'tmp_{id}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}.pkl' + + def append_results_cell_concordances(self,result): + count=result[13] + try: + percent_concordant = result[2]/(result[3]+result[2])*100 + except: + percent_concordant = 0 - def combine_written_files(self):#this one is for concordance class - to_export = self.cell_concordance_table - for val1 in self.record_dict.values(): - # here remove the int files. 
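The cohort label recorded above is inferred purely from the shape of the matched donor identifier: two identical underscore-separated tokens are treated as a UKB-style id, three tokens whose first part is 14 characters long as an ELGH-style id, and anything else falls back to UNKNOWN. The ids in the example below are invented purely to illustrate the heuristic:

def infer_cohort(donor_id: str) -> str:
    parts = donor_id.split('_')
    if len(parts) == 2 and parts[0] == parts[1]:
        return 'UKB'
    if len(parts) == 3 and len(parts[0]) == 14:
        return 'ELGH'
    return 'UNKNOWN'

print(infer_cohort('S2023001_S2023001'))     # UKB     (two identical tokens)
print(infer_cohort('ABCDEFGHIJKLMN_01_A1'))  # ELGH    (first token is 14 characters long)
print(infer_cohort('donor0'))                # UNKNOWN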
- print(f"merging temp file: {val1}") - with open(val1, 'rb') as f: - loaded_dict = pickle.load(f) - for k1 in loaded_dict.keys(): - to_export[k1]=loaded_dict[k1] - os.remove(val1) - return to_export + try: + percent_discordant = result[3]/(result[3]+result[2])*100 + except: + percent_discordant = 0 + + try: + percent_relaxed_concordant = result[4]/(result[4]+result[5])*100 + except: + percent_relaxed_concordant = 0 + try: + percent_strict_discordant = result[5]/(result[4]+result[5])*100 + except: + percent_strict_discordant = 0 + + try: + read_discordance = result[21]/result[15] + except: + read_discordance = 0 + + donor = result[1] + cohort = 'UNKNOWN' + donor_split = donor.split("_") + if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): + cohort = 'UKB' + elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): + cohort = 'ELGH' + + print(count) + self.cell_concordance_table[f'{result[0]} --- {result[1]}'] = {'GT 1':result[0], + 'GT 2':result[1], + 'cohort': cohort, + 'Nr_Concordant':result[2], + 'Nr_Discordant':result[3], + 'Nr_Relaxed_concordant':result[4], + 'Nr_strict_discordant':result[5], + 'Percent Concordant':percent_concordant, + 'Percent Discordant':percent_discordant, + 'Percent_relaxed_concordant': percent_relaxed_concordant, + 'Percent_strict_discordant': percent_strict_discordant, + 'Nr_concordant_informative': result[6], + 'Nr_concordant_uninformative': result[7], + 'Nr_discordant_informative': result[8], + 'Nr_discordant_uninformative': result[9], + 'NrTotal_Overlapping_sites_between_two_genotypes':result[10], + 'Nr_donor_distinct_sites_within_pool_individuals':result[12], + 'Number_of_sites_that_are_donor_concordant_and_exclusive':result[11], + 'Discordant_Site_Identities':result[14], + 'Total_sites': result[15], + 'Total_informative_sites': result[16], + 'Total_uninformative_sites': result[17], + 'Total_reads': result[18], + 'Total_reads_informative': result[19], + 'Total_reads_uninformative': result[20], + 'Discordant_reads': result[21], + 'Discordant_reads_informtive': result[22], + 'Discordant_reads_uninformtive': result[23], + 'Discordant_reads_by_n_sites': read_discordance + } + + if (count % 200 == 0): + print(f'recording and resetting memory {count}') + # self.record_dict[count]=self.exclusive_donor_variants + self.set_results(self.cell_concordance_table,count) + self.reset() + _="" + + def combine_written_files(self):#this one is for concordance class + to_export = self.cell_concordance_table + for val1 in self.record_dict.values(): + # here remove the int files. 
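The try/except blocks around the percentage metrics above are only there to avoid a ZeroDivisionError when a cell has no sites in the relevant category; each metric is the numerator expressed as a share of the corresponding site total, or 0 when that total is 0. An equivalent guarded helper (hypothetical, not part of the script) makes the intent explicit:

def percent_of_total(numerator, *denominator_terms):
    total = sum(denominator_terms)
    return numerator / total * 100 if total > 0 else 0

# e.g. Percent_relaxed_concordant for 90 relaxed-concordant and 10 strictly discordant sites:
print(percent_of_total(90, 90, 10))   # 90.0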
+ print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + for k1 in loaded_dict.keys(): + to_export[k1]=loaded_dict[k1] + os.remove(val1) + return to_export + + + def conc_table(self): + donor_assignments_table=self.donor_assignments_table + cell_assignments_table=self.cell_assignments_table + exclusive_don_variants=self.exclusive_don_variants + exclusive_cell_variants= self.exclusive_cell_variants - def conc_table(self): - donor_assignments_table=self.donor_assignments_table - cell_assignments_table=self.cell_assignments_table - exclusive_don_variants=self.exclusive_don_variants - exclusive_cell_variants= self.exclusive_cell_variants + pool = mp.Pool(cpus) + count = 0 + for i,row1 in donor_assignments_table.iterrows(): + donor_in_question = row1['donor_query'] + donor_gt_match = row1['donor_gt'] + if (donor_gt_match=='NONE'): + continue + Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell'])) + try: + expected_vars = exclusive_don_variants[donor_gt_match] + except: + _='here we have specifically excluded the donor that has been assigned as it is not expected genotype, because of this we can not calculate the concordances' + continue + expected_vars_norm = self.norm_genotypes(expected_vars) + try: + # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor. + dds = self.donor_distinct_sites[donor_gt_match] + except: + continue - pool = mp.Pool(cpus) - count = 0 - for i,row1 in donor_assignments_table.iterrows(): - donor_in_question = row1['donor_query'] - donor_gt_match = row1['donor_gt'] - if (donor_gt_match=='NONE'): - continue - Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell'])) - try: - expected_vars = exclusive_don_variants[donor_gt_match] - except: - _='here we have specifically excluded the donor that has been assigned as it is not expected genotype, because of this we can not calculate the concordances' - continue - expected_vars_norm = self.norm_genotypes(expected_vars) - try: - # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor. 
- dds = self.donor_distinct_sites[donor_gt_match] - except: - continue + for cell1 in Cells_to_keep_pre: + count+=1 + # if count>800: + # break + cell_vars = exclusive_cell_variants[cell1] + # cell_vars_dp = exclusive_cell_variants_dp[cell1] + + self.cell_concordance_table[f'{cell1} --- {donor_gt_match}']={} + # pool.apply_async(self.concordance_dable_production, args=([expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count]),callback=self.append_results_cell_concordances) + result1 = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count) + self.append_results_cell_concordances(result1) - for cell1 in Cells_to_keep_pre: - count+=1 - # if count>800: - # break - cell_vars = exclusive_cell_variants[cell1] - # cell_vars_dp = exclusive_cell_variants_dp[cell1] - - self.cell_concordance_table[f'{cell1} --- {donor_gt_match}']={} - # pool.apply_async(self.concordance_dable_production, args=([expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count]),callback=self.append_results_cell_concordances) - result1 = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count) - self.append_results_cell_concordances(result1) - - pool.close() - pool.join() - output = self.combine_written_files() - return output - - def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count): - Nr_donor_distinct_sites = len(dds) - Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites, cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, total_sites, informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) - Nr_Concordant = len(Concordant_Sites) - Nr_Relaxed_concordant = Nr_Concordant + relaxed_concordant_count - Nr_Discordant = len(Discordant_sites) - Nr_Total_Overlapping_sites = len(Total_Overlapping_sites) - Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Concordant_Sites))) - Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) - - return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,Nr_Relaxed_concordant, true_discordant_count, relaxed_concordant_informative_count, - relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites, - Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,disc_sites, total_sites, informative_sites, - uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative] - - + pool.close() + pool.join() + output = self.combine_written_files() + return output + + def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count): + Nr_donor_distinct_sites = len(dds) + Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites, cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, total_sites, 
informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) + Nr_Concordant = len(Concordant_Sites) + Nr_Relaxed_concordant = Nr_Concordant + relaxed_concordant_count + Nr_Discordant = len(Discordant_sites) + Nr_Total_Overlapping_sites = len(Total_Overlapping_sites) + Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Concordant_Sites))) + Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) + + return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,Nr_Relaxed_concordant, true_discordant_count, relaxed_concordant_informative_count, + relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites, + Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,disc_sites, total_sites, informative_sites, + uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative] + + class VCF_Loader: def __init__(self, vcf_file, biallelic_only=True, @@ -436,7 +432,6 @@ def load_sample_mp(self,line,obs_ids,count,format_list): elif list_val[3] == 'A' and list_val[4] == 'G':#remove A>G pass elif list_val[3] == 'T' and list_val[4] == 'C':#also remove T>C - pass else: list_val2 = list_val[9:] @@ -787,5 +782,5 @@ def donor_exclusive_sites(exclusive_don_variants2): if len(result)>0: result.to_csv(outfile,sep='\t') - print('Processing Done') + print('Processing Done') \ No newline at end of file diff --git a/bin/concordance_calculations_subsample_informative.py b/bin/concordance_calculations_subsample_informative.py index 4aae389a..b2605971 100755 --- a/bin/concordance_calculations_subsample_informative.py +++ b/bin/concordance_calculations_subsample_informative.py @@ -18,506 +18,507 @@ class Concordances: - def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites,informative_sites, uninformative_sites): - self.reset() - self.donor_assignments_table=donor_assignments_table - self.cell_assignments_table=cell_assignments_table - self.exclusive_don_variants=exclusive_don_variants - self.exclusive_cell_variants=exclusive_cell_variants - self.donor_distinct_sites=donor_distinct_sites - self.informative_sites = informative_sites - self.uninformative_sites = uninformative_sites - self.record_dict={} - - def norm_genotypes(self,expected_vars): - expected_vars = pd.DataFrame(expected_vars) - if len(expected_vars) > 0: - split_str=expected_vars[0].str.split("_") - expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] - expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] - expected_vars['vars'] = split_str.str[4] - expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) - expected_vars = expected_vars[expected_vars['vars']!='./.'] - expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' - expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] - return expected_vars - - def reset(self): - self.cell_concordance_table ={} - - # def get_sites_from_tsv(self, sites_file): - # """ - # get sites frm a tsv file where cols are chrom, pos, id, ref, alt - # assumes no 
multiallelics - # """ - # sites = set() - # with open(sites_file, 'r') as f: - # lines = f.readlines() - # for l in lines: - # linedata = l.split('\t') - # var = ('_').join([linedata[0], linedata[1], linedata[3], linedata[4]]) - # sites.add(var) - # return sites - - - def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes): - ''' - take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant - sites and relaxed concordant sites - 1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype - 2) if you have a 0/0 you can not get a 1/1 or 0/1 - 3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 1/1 - So - each obversed cellsnp allele must be in the array SNP gtype - ''' - true_discordant = 0 - relaxed_concordant = 0 - relaxed_concordant_informative = 0 - relaxed_concordant_uninformative = 0 - true_discordant_informative = 0 - true_discordant_uninformative = 0 - subset_informative_concordant = 0 - subset_informative_discordant = 0 - - #print(self.uninformative_sites) - #print(self.informative_sites) - - #create sets of the ids (chrom, pos, ref, alt) in each set of genotypes. Filter to the ids present in both - #then filter to informative and uninformative. If uninformative >0 then create a subset of informative - # with the same number of vars (at random) - split_snp_gts=snp_gtypes.str.split("_") - snp_gtypes_ids = set(split_snp_gts.str[0]+'_'+split_snp_gts.str[1]+'_'+split_snp_gts.str[2]+'_'+split_snp_gts.str[3]) - - split_cellsnp_gts=cellsnp_gtypes.str.split("_") - cellsnp_gtypes_ids = set(split_cellsnp_gts.str[0]+'_'+split_cellsnp_gts.str[1]+'_'+split_cellsnp_gts.str[2]+'_'+split_cellsnp_gts.str[3]) - - shared_gts = snp_gtypes_ids.intersection(cellsnp_gtypes_ids) - - shared_informative = shared_gts.intersection(self.informative_sites) - shared_uninformative = shared_gts.intersection(self.uninformative_sites) - # print("shared informative " + str(len(shared_informative))) - # print("shared uninformative " + str(len(shared_uninformative))) - - #store the numbers of informative and uninformative sites shared between cellSNP and gt data as these - #are the sites used for concordance - self.informative_covered = len(shared_informative) - self.uninformative_covered = len(shared_uninformative) + def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites,informative_sites, uninformative_sites): + self.reset() + self.donor_assignments_table=donor_assignments_table + self.cell_assignments_table=cell_assignments_table + self.exclusive_don_variants=exclusive_don_variants + self.exclusive_cell_variants=exclusive_cell_variants + self.donor_distinct_sites=donor_distinct_sites + self.informative_sites = informative_sites + self.uninformative_sites = uninformative_sites + self.record_dict={} - if len(shared_uninformative) > 0: - #print(len(shared_uninformative)) - # print(len(shared_informative)) - if len(shared_uninformative) <= len(shared_informative): - informative_subset = set(random.sample(shared_informative, len(shared_uninformative))) - else: - informative_subset = set()#if there are more shared uninformative than shared informative we will not subset - # print(informative_subset) - # exit(0) - else: - informative_subset = set() + def norm_genotypes(self,expected_vars): + expected_vars = pd.DataFrame(expected_vars) + if len(expected_vars) > 0: + split_str=expected_vars[0].str.split("_") + expected_vars['ids'] = 
split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] + expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] + expected_vars['vars'] = split_str.str[4] + expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) + expected_vars = expected_vars[expected_vars['vars']!='./.'] + expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' + expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] + return expected_vars + def reset(self): + self.cell_concordance_table ={} + + # def get_sites_from_tsv(self, sites_file): + # """ + # get sites frm a tsv file where cols are chrom, pos, id, ref, alt + # assumes no multiallelics + # """ + # sites = set() + # with open(sites_file, 'r') as f: + # lines = f.readlines() + # for l in lines: + # linedata = l.split('\t') + # var = ('_').join([linedata[0], linedata[1], linedata[3], linedata[4]]) + # sites.add(var) + # return sites + + + def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes): + ''' + take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant + sites and relaxed concordant sites + 1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype + 2) if you have a 0/0 you can not get a 1/1 or 0/1 + 3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 1/1 + So - each obversed cellsnp allele must be in the array SNP gtype + ''' + true_discordant = 0 + relaxed_concordant = 0 + relaxed_concordant_informative = 0 + relaxed_concordant_uninformative = 0 + true_discordant_informative = 0 + true_discordant_uninformative = 0 + subset_informative_concordant = 0 + subset_informative_discordant = 0 + + #print(self.uninformative_sites) + #print(self.informative_sites) + + #create sets of the ids (chrom, pos, ref, alt) in each set of genotypes. Filter to the ids present in both + #then filter to informative and uninformative. 
If uninformative >0 then create a subset of informative + # with the same number of vars (at random) + split_snp_gts=snp_gtypes.str.split("_") + snp_gtypes_ids = set(split_snp_gts.str[0]+'_'+split_snp_gts.str[1]+'_'+split_snp_gts.str[2]+'_'+split_snp_gts.str[3]) + + split_cellsnp_gts=cellsnp_gtypes.str.split("_") + cellsnp_gtypes_ids = set(split_cellsnp_gts.str[0]+'_'+split_cellsnp_gts.str[1]+'_'+split_cellsnp_gts.str[2]+'_'+split_cellsnp_gts.str[3]) + + shared_gts = snp_gtypes_ids.intersection(cellsnp_gtypes_ids) + + shared_informative = shared_gts.intersection(self.informative_sites) + shared_uninformative = shared_gts.intersection(self.uninformative_sites) + # print("shared informative " + str(len(shared_informative))) + # print("shared uninformative " + str(len(shared_uninformative))) + + #store the numbers of informative and uninformative sites shared between cellSNP and gt data as these + #are the sites used for concordance + self.informative_covered = len(shared_informative) + self.uninformative_covered = len(shared_uninformative) + + if len(shared_uninformative) > 0: + #print(len(shared_uninformative)) + # print(len(shared_informative)) + if len(shared_uninformative) <= len(shared_informative): + informative_subset = set(random.sample(shared_informative, len(shared_uninformative))) + else: + informative_subset = set()#if there are more shared uninformative than shared informative we will not subset # print(informative_subset) - self.informative_subset = informative_subset + # exit(0) + else: + informative_subset = set() - snp_gtypes_set = set(snp_gtypes) - snp_gtypes_set = sorted(snp_gtypes_set) + # print(informative_subset) + self.informative_subset = informative_subset - cellsnp_gtypes_set = set(cellsnp_gtypes) - cellsnp_gtypes_set = sorted(cellsnp_gtypes_set) + snp_gtypes_set = set(snp_gtypes) + snp_gtypes_set = sorted(snp_gtypes_set) - #for i in range(0, len(snp_gtypes)): - for i in range(0, len(snp_gtypes_set)): - discordant = False - # snp_data = snp_gtypes[i].split('_') - # cellsnp_data = cellsnp_gtypes[i].split('_') - snp_data = snp_gtypes_set[i].split('_') - cellsnp_data = cellsnp_gtypes_set[i].split('_') + cellsnp_gtypes_set = set(cellsnp_gtypes) + cellsnp_gtypes_set = sorted(cellsnp_gtypes_set) - # the below will no longer work due to differing length of input strings - # snp_alleles = [snp_gtypes[i][-3], snp_gtypes[i][-1]] - # cellsnp_alleles = [cellsnp_gtypes[i][-3], cellsnp_gtypes[i][-1]] + #for i in range(0, len(snp_gtypes)): + for i in range(0, len(snp_gtypes_set)): + discordant = False + # snp_data = snp_gtypes[i].split('_') + # cellsnp_data = cellsnp_gtypes[i].split('_') + snp_data = snp_gtypes_set[i].split('_') + cellsnp_data = cellsnp_gtypes_set[i].split('_') + # the below will no longer work due to differing length of input strings + # snp_alleles = [snp_gtypes[i][-3], snp_gtypes[i][-1]] + # cellsnp_alleles = [cellsnp_gtypes[i][-3], cellsnp_gtypes[i][-1]] - snp_alleles = [snp_data[4][0], snp_data[4][2]] - cellsnp_alleles = [cellsnp_data[4][0], cellsnp_data[4][2]] - snp_alleles_set = set(snp_alleles) - cellsnp_alleles_set = set(cellsnp_alleles) - - snp_var = ('_').join(snp_data[0:4]) - cellsnp_var = ('_').join(cellsnp_data[0:4]) + snp_alleles = [snp_data[4][0], snp_data[4][2]] + cellsnp_alleles = [cellsnp_data[4][0], cellsnp_data[4][2]] - if not cellsnp_var == snp_var: - print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i]) - exit(1) - else: - for allele in cellsnp_alleles_set: - if not allele in snp_alleles_set:#if a 
cellSNP allele is found that is not in the array data this is discordant - discordant = True - - if discordant == True: - true_discordant+=1 - if snp_var in self.uninformative_sites: - true_discordant_uninformative+=1 - elif snp_var in self.informative_sites: - true_discordant_informative+=1 - else: - relaxed_concordant+=1 - if snp_var in self.uninformative_sites: - relaxed_concordant_uninformative+=1 - elif snp_var in self.informative_sites: - relaxed_concordant_informative+=1 + snp_alleles_set = set(snp_alleles) + cellsnp_alleles_set = set(cellsnp_alleles) + + snp_var = ('_').join(snp_data[0:4]) + cellsnp_var = ('_').join(cellsnp_data[0:4]) - - if len(shared_uninformative) > 0: - if snp_var in informative_subset: - if discordant == True: - subset_informative_discordant+=1 - else: - subset_informative_concordant+=1 - - # print("conc inf " + str(relaxed_concordant_informative)) - # print("disc inf " + str(true_discordant_informative)) - - return true_discordant, relaxed_concordant, relaxed_concordant_informative, relaxed_concordant_uninformative, true_discordant_informative, true_discordant_uninformative, subset_informative_concordant, subset_informative_discordant - - - def read_condordance(self, expected_vars, cell_vars): - ''' - get read level concordance using DP, AD and OTH format fields - ##FORMAT= - ##FORMAT= - ##FORMAT= - ''' - # print(len(expected_vars)) - # print(len(cell_vars)) - - if not len(expected_vars) == len(cell_vars): - print("length mismatch between expected vars and cell vars") + if not cellsnp_var == snp_var: + print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i]) exit(1) + else: + for allele in cellsnp_alleles_set: + if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant + discordant = True + + if discordant == True: + true_discordant+=1 + if snp_var in self.uninformative_sites: + true_discordant_uninformative+=1 + elif snp_var in self.informative_sites: + true_discordant_informative+=1 + else: + relaxed_concordant+=1 + if snp_var in self.uninformative_sites: + relaxed_concordant_uninformative+=1 + elif snp_var in self.informative_sites: + relaxed_concordant_informative+=1 - total_sites = len(expected_vars) - #add cols for DP, AD< OTH - cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) - cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) - cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) - #split to informative and uninformative sites - mask_i = cell_vars['ids'].isin(self.informative_sites) - cell_vars_informative = cell_vars[mask_i] - mask_u = cell_vars['ids'].isin(self.uninformative_sites) - cell_vars_uninformative = cell_vars[mask_u] - informative_sites = len(cell_vars_informative) - uninformative_sites = len(cell_vars_uninformative) - mask_s = cell_vars['ids'].isin(self.informative_subset) - cell_vars_informative_subset = cell_vars[mask_s] - informative_subset_sites = len(cell_vars_informative_subset) - # print("Informative sites " + str(len(self.informative_sites))) - # print("uninformative sites " + str(len(self.uninformative_sites))) - # print("informative sites in cell vars " + str(len(cell_vars_informative))) - # print("uninformative sites in cell vars " + str(len(cell_vars_uninformative))) - # print("Informative subset " + str(informative_subset_sites)) - # print(cell_vars_informative_subset) - # exit(0) + + if len(shared_uninformative) > 0: + if snp_var in informative_subset: + if discordant == True: + 
subset_informative_discordant+=1 + else: + subset_informative_concordant+=1 - total_dp = cell_vars['DP'].sum() - total_oth = cell_vars['OTH'].sum() - total_reads = total_dp + total_oth - total_dp_inf = cell_vars_informative['DP'].sum() - total_oth_inf = cell_vars_informative['OTH'].sum() - total_reads_informative = total_dp_inf + total_oth_inf - total_dp_uninf = cell_vars_uninformative['DP'].sum() - total_oth_uninf = cell_vars_uninformative['OTH'].sum() - total_reads_uninformative = total_dp_uninf + total_oth_uninf - total_dp_inf_subset = cell_vars_informative_subset['DP'].sum() - total_oth_inf_subset = cell_vars_informative_subset['OTH'].sum() - total_reads_informative_subset = total_dp_inf_subset + total_oth_inf_subset - - # expected genotype 0/0 - expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] - hom_ref_sites = set(expected_hom_ref['ids']) - cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] - cell_vars_inf_2 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_ref_sites)] - cell_vars_uninf_2 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_ref_sites)] - cell_vars_inf_subset_2 = cell_vars_informative_subset[cell_vars_informative_subset['ids'].isin(hom_ref_sites)] - ad_hom_ref = cell_vars2['AD'].sum() - oth_hom_ref = cell_vars2['OTH'].sum() - discordant_hom_ref = ad_hom_ref + oth_hom_ref - ad_hom_ref_inf = cell_vars_inf_2['AD'].sum() - oth_hom_ref_inf = cell_vars_inf_2['OTH'].sum() - discordant_hom_ref_informative = ad_hom_ref_inf + oth_hom_ref_inf - ad_hom_ref_uninf = cell_vars_uninf_2['AD'].sum() - oth_hom_ref_uninf = cell_vars_uninf_2['OTH'].sum() - discordant_hom_ref_uninformative = ad_hom_ref_uninf + oth_hom_ref_uninf - ad_hom_ref_inf_subset = cell_vars_inf_subset_2['AD'].sum() - oth_hom_ref_inf_subset = cell_vars_inf_subset_2['OTH'].sum() - discordant_hom_ref_informative_subset = ad_hom_ref_inf_subset + oth_hom_ref_inf_subset - - # expected genotype 0/1 or 1/0 - hets = ['0/1', '1/0'] - expected_het = expected_vars[expected_vars['vars'].isin(hets)] - het_sites = set(expected_het['ids']) - cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] - cell_vars_inf_3 = cell_vars_informative[cell_vars_informative['ids'].isin(het_sites)] - cell_vars_uninf_3 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(het_sites)] - cell_vars_inf_subset_3 = cell_vars_informative_subset[cell_vars_informative_subset['ids'].isin(het_sites)] - discordant_het = cell_vars3['OTH'].sum() - discordant_het_informative = cell_vars_inf_3['OTH'].sum() - discordant_het_uninformative = cell_vars_uninf_3['OTH'].sum() - discordant_het_informative_subset = cell_vars_inf_subset_3['OTH'].sum() - - # expected genotype 1/1 - expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] - hom_alt_sites = set(expected_hom_alt['ids']) - cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] - cell_vars_inf_4 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_alt_sites)] - cell_vars_uninf_4 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_alt_sites)] - cell_vars_inf_subset_4 = cell_vars_informative_subset[cell_vars_informative_subset['ids'].isin(hom_alt_sites)] - # DP + OTH - AD - ad_hom_alt = cell_vars4['AD'].sum() - dp_hom_alt = cell_vars4['DP'].sum() - oth_hom_alt = cell_vars4['OTH'].sum() - discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt - ad_hom_alt_inf = cell_vars_inf_4['AD'].sum() - dp_hom_alt_inf = cell_vars_inf_4['DP'].sum() - oth_hom_alt_inf = cell_vars_inf_4['OTH'].sum() - discordant_hom_alt_informative 
= (dp_hom_alt_inf + oth_hom_alt_inf) - ad_hom_alt_inf - ad_hom_alt_uninf = cell_vars_uninf_4['AD'].sum() - dp_hom_alt_uninf = cell_vars_uninf_4['DP'].sum() - oth_hom_alt_uninf = cell_vars_uninf_4['OTH'].sum() - discordant_hom_alt_uninformative = (dp_hom_alt_uninf + oth_hom_alt_uninf) - ad_hom_alt_uninf - ad_hom_alt_inf_subset = cell_vars_inf_subset_4['AD'].sum() - dp_hom_alt_inf_subset = cell_vars_inf_subset_4['DP'].sum() - oth_hom_alt_inf_subset = cell_vars_inf_subset_4['OTH'].sum() - discordant_hom_alt_informative_subset = (dp_hom_alt_inf_subset + oth_hom_alt_inf_subset) - ad_hom_alt_inf_subset - - discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt - discordant_reads_informative = discordant_hom_ref_informative + discordant_het_informative + discordant_hom_alt_informative - discordant_reads_uninformative = discordant_hom_ref_uninformative + discordant_het_uninformative + discordant_hom_alt_uninformative - discordant_reads_informative_subset = discordant_hom_ref_informative_subset + discordant_het_informative_subset + discordant_hom_alt_informative_subset - - return total_sites, self.informative_covered, self.uninformative_covered, total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative, informative_subset_sites, total_reads_informative_subset, discordant_reads_informative_subset - - + # print("conc inf " + str(relaxed_concordant_informative)) + # print("disc inf " + str(true_discordant_informative)) - def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): - # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. - # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. 
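read_condordance turns the per-site DP/AD/OTH counts into discordant-read totals with a different rule for each expected genotype: for an expected 0/0 every ALT or other-allele read is discordant (AD + OTH), for an expected het only other-allele reads are discordant (OTH), and for an expected 1/1 everything that is not an ALT read is discordant (DP + OTH - AD). A per-site restatement of those rules, assuming DP counts REF plus ALT reads as in the cellSNP FORMAT fields quoted above:

def discordant_reads_at_site(expected_gt: str, dp: int, ad: int, oth: int) -> int:
    if expected_gt == '0/0':
        return ad + oth              # any ALT or third-allele read contradicts hom-ref
    if expected_gt in ('0/1', '1/0'):
        return oth                   # REF and ALT reads are both compatible with a het
    if expected_gt == '1/1':
        return (dp + oth) - ad       # everything except ALT reads contradicts hom-alt
    raise ValueError(f'unexpected genotype: {expected_gt}')

# expected 1/1 with DP=10 (2 REF + 8 ALT reads) and OTH=1  ->  3 discordant reads
print(discordant_reads_at_site('1/1', dp=10, ad=8, oth=1))   # 3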
- # Author: M.Ozols - - cell_vars_norm = self.norm_genotypes(cell_vars) - - if len(cell_vars_norm) > 0: - Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) - expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] - cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] - # print(cell_vars_norm) - # print(expected_vars2) - # print(cell_vars2) - # exit(0) - Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) - Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) - disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) - df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') - disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') - # print(len(disc2)) - # exit(0) - disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] - disc_sites = ';'.join(disc2['expected_retrieved']) - #find truly discordant sites - #true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count = self.get_strict_discordance(disc2['0_y'], disc2['0_x']) - true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count = self.get_strict_discordance(expected_vars2[0], cell_vars2[0]) - #find discordant reads - total_sites, informative_sites, uninformative_sites, total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative, informative_subset_sites, total_reads_informative_subset, discordant_reads_informative_subset = self.read_condordance(expected_vars2, cell_vars2) - else: - Total_Overlapping_sites = set() - Concordant_Sites = set() - Discordant_sites = set() - disc_sites = '' - true_discordant_count = 0 - relaxed_concordant_count = 0 - total_sites = 0 - discordant_reads = 0 - - informative_subset_sites = 0 - subset_informative_sites_concordant_count = 0 - subset_informative_sites_discordant_count = 0 - total_reads_informative_subset = 0 - discordant_reads_informative_subset = 0 - relaxed_concordant_informative_count = 0 - relaxed_concordant_uninformative_count = 0 - true_discordant_informative_count = 0 - true_discordant_uninformative_count = 0 - total_reads = 0 - total_reads_informative = 0 - total_reads_uninformative = 0 - discordant_reads = 0 - discordant_reads_informative = 0 - discordant_reads_uninformative = 0 - informative_sites = 0 - uninformative_sites = 0 - - #print(total_sites, informative_sites, uninformative_sites, relaxed_concordant_informative_count, true_discordant_informative_count, self.informative_covered, self.uninformative_covered) - #exit(0) + return true_discordant, relaxed_concordant, relaxed_concordant_informative, relaxed_concordant_uninformative, true_discordant_informative, true_discordant_uninformative, subset_informative_concordant, subset_informative_discordant - return Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites,cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, 
true_discordant_informative_count, true_discordant_uninformative_count, total_sites, informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative, informative_subset_sites, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset - - def set_results(self,to_set,id): - # Recod to disk to save the loading mmeory time. - with open(f'tmp_{id}.pkl', 'wb') as f: - pickle.dump(to_set, f) - self.record_dict[id]=f'tmp_{id}.pkl' + def read_condordance(self, expected_vars, cell_vars): + ''' + get read level concordance using DP, AD and OTH format fields + ##FORMAT= + ##FORMAT= + ##FORMAT= + ''' + # print(len(expected_vars)) + # print(len(cell_vars)) + + if not len(expected_vars) == len(cell_vars): + print("length mismatch between expected vars and cell vars") + exit(1) + + total_sites = len(expected_vars) + #add cols for DP, AD< OTH + cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) + cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) + cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) + #split to informative and uninformative sites + mask_i = cell_vars['ids'].isin(self.informative_sites) + cell_vars_informative = cell_vars[mask_i] + mask_u = cell_vars['ids'].isin(self.uninformative_sites) + cell_vars_uninformative = cell_vars[mask_u] + informative_sites = len(cell_vars_informative) + uninformative_sites = len(cell_vars_uninformative) + mask_s = cell_vars['ids'].isin(self.informative_subset) + cell_vars_informative_subset = cell_vars[mask_s] + informative_subset_sites = len(cell_vars_informative_subset) + # print("Informative sites " + str(len(self.informative_sites))) + # print("uninformative sites " + str(len(self.uninformative_sites))) + # print("informative sites in cell vars " + str(len(cell_vars_informative))) + # print("uninformative sites in cell vars " + str(len(cell_vars_uninformative))) + # print("Informative subset " + str(informative_subset_sites)) + # print(cell_vars_informative_subset) + # exit(0) + + total_dp = cell_vars['DP'].sum() + total_oth = cell_vars['OTH'].sum() + total_reads = total_dp + total_oth + total_dp_inf = cell_vars_informative['DP'].sum() + total_oth_inf = cell_vars_informative['OTH'].sum() + total_reads_informative = total_dp_inf + total_oth_inf + total_dp_uninf = cell_vars_uninformative['DP'].sum() + total_oth_uninf = cell_vars_uninformative['OTH'].sum() + total_reads_uninformative = total_dp_uninf + total_oth_uninf + total_dp_inf_subset = cell_vars_informative_subset['DP'].sum() + total_oth_inf_subset = cell_vars_informative_subset['OTH'].sum() + total_reads_informative_subset = total_dp_inf_subset + total_oth_inf_subset + + + # expected genotype 0/0 + expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] + hom_ref_sites = set(expected_hom_ref['ids']) + cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] + cell_vars_inf_2 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_ref_sites)] + cell_vars_uninf_2 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_ref_sites)] + cell_vars_inf_subset_2 = cell_vars_informative_subset[cell_vars_informative_subset['ids'].isin(hom_ref_sites)] + ad_hom_ref = cell_vars2['AD'].sum() + oth_hom_ref = cell_vars2['OTH'].sum() + discordant_hom_ref = ad_hom_ref + oth_hom_ref + ad_hom_ref_inf = cell_vars_inf_2['AD'].sum() + 
oth_hom_ref_inf = cell_vars_inf_2['OTH'].sum() + discordant_hom_ref_informative = ad_hom_ref_inf + oth_hom_ref_inf + ad_hom_ref_uninf = cell_vars_uninf_2['AD'].sum() + oth_hom_ref_uninf = cell_vars_uninf_2['OTH'].sum() + discordant_hom_ref_uninformative = ad_hom_ref_uninf + oth_hom_ref_uninf + ad_hom_ref_inf_subset = cell_vars_inf_subset_2['AD'].sum() + oth_hom_ref_inf_subset = cell_vars_inf_subset_2['OTH'].sum() + discordant_hom_ref_informative_subset = ad_hom_ref_inf_subset + oth_hom_ref_inf_subset + + # expected genotype 0/1 or 1/0 + hets = ['0/1', '1/0'] + expected_het = expected_vars[expected_vars['vars'].isin(hets)] + het_sites = set(expected_het['ids']) + cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] + cell_vars_inf_3 = cell_vars_informative[cell_vars_informative['ids'].isin(het_sites)] + cell_vars_uninf_3 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(het_sites)] + cell_vars_inf_subset_3 = cell_vars_informative_subset[cell_vars_informative_subset['ids'].isin(het_sites)] + discordant_het = cell_vars3['OTH'].sum() + discordant_het_informative = cell_vars_inf_3['OTH'].sum() + discordant_het_uninformative = cell_vars_uninf_3['OTH'].sum() + discordant_het_informative_subset = cell_vars_inf_subset_3['OTH'].sum() + + # expected genotype 1/1 + expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] + hom_alt_sites = set(expected_hom_alt['ids']) + cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] + cell_vars_inf_4 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_alt_sites)] + cell_vars_uninf_4 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_alt_sites)] + cell_vars_inf_subset_4 = cell_vars_informative_subset[cell_vars_informative_subset['ids'].isin(hom_alt_sites)] + # DP + OTH - AD + ad_hom_alt = cell_vars4['AD'].sum() + dp_hom_alt = cell_vars4['DP'].sum() + oth_hom_alt = cell_vars4['OTH'].sum() + discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt + ad_hom_alt_inf = cell_vars_inf_4['AD'].sum() + dp_hom_alt_inf = cell_vars_inf_4['DP'].sum() + oth_hom_alt_inf = cell_vars_inf_4['OTH'].sum() + discordant_hom_alt_informative = (dp_hom_alt_inf + oth_hom_alt_inf) - ad_hom_alt_inf + ad_hom_alt_uninf = cell_vars_uninf_4['AD'].sum() + dp_hom_alt_uninf = cell_vars_uninf_4['DP'].sum() + oth_hom_alt_uninf = cell_vars_uninf_4['OTH'].sum() + discordant_hom_alt_uninformative = (dp_hom_alt_uninf + oth_hom_alt_uninf) - ad_hom_alt_uninf + ad_hom_alt_inf_subset = cell_vars_inf_subset_4['AD'].sum() + dp_hom_alt_inf_subset = cell_vars_inf_subset_4['DP'].sum() + oth_hom_alt_inf_subset = cell_vars_inf_subset_4['OTH'].sum() + discordant_hom_alt_informative_subset = (dp_hom_alt_inf_subset + oth_hom_alt_inf_subset) - ad_hom_alt_inf_subset + + + discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt + discordant_reads_informative = discordant_hom_ref_informative + discordant_het_informative + discordant_hom_alt_informative + discordant_reads_uninformative = discordant_hom_ref_uninformative + discordant_het_uninformative + discordant_hom_alt_uninformative + discordant_reads_informative_subset = discordant_hom_ref_informative_subset + discordant_het_informative_subset + discordant_hom_alt_informative_subset + + return total_sites, self.informative_covered, self.uninformative_covered, total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative, informative_subset_sites, total_reads_informative_subset, 
discordant_reads_informative_subset + + + + def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): + # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. + # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. + # Author: M.Ozols - def append_results_cell_concordances(self,result): - count=result[13] - try: - percent_concordant = result[2]/(result[3]+result[2])*100 - except: - percent_concordant = 0 + cell_vars_norm = self.norm_genotypes(cell_vars) + + if len(cell_vars_norm) > 0: + Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) + expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] + # print(cell_vars_norm) + # print(expected_vars2) + # print(cell_vars2) + # exit(0) + Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) + Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) + disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) + df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') + disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') + # print(len(disc2)) + # exit(0) + disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] + disc_sites = ';'.join(disc2['expected_retrieved']) + #find truly discordant sites + true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count = self.get_strict_discordance(expected_vars2[0], cell_vars2[0]) + #find discordant reads + total_sites, informative_sites, uninformative_sites, total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative, informative_subset_sites, total_reads_informative_subset, discordant_reads_informative_subset = self.read_condordance(expected_vars2, cell_vars2) + else: + Total_Overlapping_sites = set() + Concordant_Sites = set() + Discordant_sites = set() + disc_sites = '' + true_discordant_count = 0 + relaxed_concordant_count = 0 + total_sites = 0 + discordant_reads = 0 + + informative_subset_sites = 0 + subset_informative_sites_concordant_count = 0 + subset_informative_sites_discordant_count = 0 + total_reads_informative_subset = 0 + discordant_reads_informative_subset = 0 + relaxed_concordant_informative_count = 0 + relaxed_concordant_uninformative_count = 0 + true_discordant_informative_count = 0 + true_discordant_uninformative_count = 0 + total_reads = 0 + total_reads_informative = 0 + total_reads_uninformative = 0 + discordant_reads = 0 + discordant_reads_informative = 0 + discordant_reads_uninformative = 0 + informative_sites = 0 + uninformative_sites = 0 - try: - percent_discordant = result[3]/(result[3]+result[2])*100 - except: - percent_discordant = 0 + #print(total_sites, informative_sites, uninformative_sites, relaxed_concordant_informative_count, true_discordant_informative_count, self.informative_covered, self.uninformative_covered) + #exit(0) - try: - 
percent_relaxed_concordant = result[4]/(result[4]+result[5])*100 - except: - percent_relaxed_concordant = 0 - - try: - percent_strict_discordant = result[5]/(result[4]+result[5])*100 - except: - percent_strict_discordant = 0 + return Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites,cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, total_sites, informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative, informative_subset_sites, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset + - try: - read_discordance = result[21]/result[15] - except: - read_discordance = 0 - - donor = result[1] - cohort = 'UNKNOWN' - donor_split = donor.split("_") - if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): - cohort = 'UKB' - elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): - cohort = 'ELGH' - - print(count) - self.cell_concordance_table[f'{result[0]} --- {result[1]}'] = {'GT 1':result[0], - 'GT 2':result[1], - 'cohort': cohort, - 'Nr_Concordant':result[2], - 'Nr_Discordant':result[3], - 'Nr_Relaxed_concordant':result[4], - 'Nr_strict_discordant':result[5], - 'Percent Concordant':percent_concordant, - 'Percent Discordant':percent_discordant, - 'Percent_relaxed_concordant': percent_relaxed_concordant, - 'Percent_strict_discordant': percent_strict_discordant, - 'Nr_concordant_informative': result[6], - 'Nr_concordant_uninformative': result[7], - 'Nr_discordant_informative': result[8], - 'Nr_discordant_uninformative': result[9], - 'NrTotal_Overlapping_sites_between_two_genotypes':result[10], - 'Nr_donor_distinct_sites_within_pool_individuals':result[12], - 'Number_of_sites_that_are_donor_concordant_and_exclusive':result[11], - 'Discordant_Site_Identities':result[14], - 'Total_sites': result[15], - 'Total_informative_sites': result[16], - 'Total_uninformative_sites': result[17], - 'Total_reads': result[18], - 'Total_reads_informative': result[19], - 'Total_reads_uninformative': result[20], - 'Discordant_reads': result[21], - 'Discordant_reads_informtive': result[22], - 'Discordant_reads_uninformtive': result[23], - 'Discordant_reads_by_n_sites': read_discordance, - 'informative_subset_sites': result[24], - 'subset_informative_sites_concordant_count': result[25], - 'subset_informative_sites_discordant_count': result[26], - 'total_reads_informative_subset': result[27], - 'discordant_reads_informative_subset': result[28] - } - #informative_subset_sites, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset] - if (count % 200 == 0): - print(f'recording and resetting memory {count}') - # self.record_dict[count]=self.exclusive_donor_variants - self.set_results(self.cell_concordance_table,count) - self.reset() - _="" + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. 
+ with open(f'tmp_{id}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}.pkl' + + def append_results_cell_concordances(self,result): + count=result[13] + try: + percent_concordant = result[2]/(result[3]+result[2])*100 + except: + percent_concordant = 0 - def combine_written_files(self):#this one is for concordance class - to_export = self.cell_concordance_table - for val1 in self.record_dict.values(): - # here remove the int files. - print(f"merging temp file: {val1}") - with open(val1, 'rb') as f: - loaded_dict = pickle.load(f) - for k1 in loaded_dict.keys(): - to_export[k1]=loaded_dict[k1] - os.remove(val1) - return to_export + try: + percent_discordant = result[3]/(result[3]+result[2])*100 + except: + percent_discordant = 0 + + try: + percent_relaxed_concordant = result[4]/(result[4]+result[5])*100 + except: + percent_relaxed_concordant = 0 + try: + percent_strict_discordant = result[5]/(result[4]+result[5])*100 + except: + percent_strict_discordant = 0 + + try: + read_discordance = result[21]/result[15] + except: + read_discordance = 0 + + donor = result[1] + cohort = 'UNKNOWN' + donor_split = donor.split("_") + if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): + cohort = 'UKB' + elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): + cohort = 'ELGH' + + print(count) + self.cell_concordance_table[f'{result[0]} --- {result[1]}'] = {'GT 1':result[0], + 'GT 2':result[1], + 'cohort': cohort, + 'Nr_Concordant':result[2], + 'Nr_Discordant':result[3], + 'Nr_Relaxed_concordant':result[4], + 'Nr_strict_discordant':result[5], + 'Percent Concordant':percent_concordant, + 'Percent Discordant':percent_discordant, + 'Percent_relaxed_concordant': percent_relaxed_concordant, + 'Percent_strict_discordant': percent_strict_discordant, + 'Nr_concordant_informative': result[6], + 'Nr_concordant_uninformative': result[7], + 'Nr_discordant_informative': result[8], + 'Nr_discordant_uninformative': result[9], + 'NrTotal_Overlapping_sites_between_two_genotypes':result[10], + 'Nr_donor_distinct_sites_within_pool_individuals':result[12], + 'Number_of_sites_that_are_donor_concordant_and_exclusive':result[11], + 'Discordant_Site_Identities':result[14], + 'Total_sites': result[15], + 'Total_informative_sites': result[16], + 'Total_uninformative_sites': result[17], + 'Total_reads': result[18], + 'Total_reads_informative': result[19], + 'Total_reads_uninformative': result[20], + 'Discordant_reads': result[21], + 'Discordant_reads_informtive': result[22], + 'Discordant_reads_uninformtive': result[23], + 'Discordant_reads_by_n_sites': read_discordance, + 'informative_subset_sites': result[24], + 'subset_informative_sites_concordant_count': result[25], + 'subset_informative_sites_discordant_count': result[26], + 'total_reads_informative_subset': result[27], + 'discordant_reads_informative_subset': result[28] + } +#informative_subset_sites, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset] + if (count % 200 == 0): + print(f'recording and resetting memory {count}') + # self.record_dict[count]=self.exclusive_donor_variants + self.set_results(self.cell_concordance_table,count) + self.reset() + _="" + + def combine_written_files(self):#this one is for concordance class + to_export = self.cell_concordance_table + for val1 in self.record_dict.values(): + # here remove the int files. 
+ print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + for k1 in loaded_dict.keys(): + to_export[k1]=loaded_dict[k1] + os.remove(val1) + return to_export + + + def conc_table(self): + donor_assignments_table=self.donor_assignments_table + cell_assignments_table=self.cell_assignments_table + exclusive_don_variants=self.exclusive_don_variants + exclusive_cell_variants= self.exclusive_cell_variants - def conc_table(self): - donor_assignments_table=self.donor_assignments_table - cell_assignments_table=self.cell_assignments_table - exclusive_don_variants=self.exclusive_don_variants - exclusive_cell_variants= self.exclusive_cell_variants + pool = mp.Pool(cpus) + count = 0 + for i,row1 in donor_assignments_table.iterrows(): + donor_in_question = row1['donor_query'] + donor_gt_match = row1['donor_gt'] + if (donor_gt_match=='NONE'): + continue + Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell'])) + try: + expected_vars = exclusive_don_variants[donor_gt_match] + except: + _='here we have specifically excluded the donor that has been assigned as it is not expected genotype, because of this we can not calculate the concordances' + continue + expected_vars_norm = self.norm_genotypes(expected_vars) + try: + # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor. + dds = self.donor_distinct_sites[donor_gt_match] + except: + continue - pool = mp.Pool(cpus) - count = 0 - for i,row1 in donor_assignments_table.iterrows(): - donor_in_question = row1['donor_query'] - donor_gt_match = row1['donor_gt'] - if (donor_gt_match=='NONE'): - continue - Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell'])) - try: - expected_vars = exclusive_don_variants[donor_gt_match] - except: - _='here we have specifically excluded the donor that has been assigned as it is not expected genotype, because of this we can not calculate the concordances' - continue - expected_vars_norm = self.norm_genotypes(expected_vars) - try: - # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor. 
- dds = self.donor_distinct_sites[donor_gt_match] - except: - continue + for cell1 in Cells_to_keep_pre: + count+=1 + # if count>800: + # break + cell_vars = exclusive_cell_variants[cell1] + # cell_vars_dp = exclusive_cell_variants_dp[cell1] + + self.cell_concordance_table[f'{cell1} --- {donor_gt_match}']={} + # pool.apply_async(self.concordance_dable_production, args=([expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count]),callback=self.append_results_cell_concordances) + result1 = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count) + self.append_results_cell_concordances(result1) - for cell1 in Cells_to_keep_pre: - count+=1 - # if count>800: - # break - cell_vars = exclusive_cell_variants[cell1] - # cell_vars_dp = exclusive_cell_variants_dp[cell1] - - self.cell_concordance_table[f'{cell1} --- {donor_gt_match}']={} - # pool.apply_async(self.concordance_dable_production, args=([expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count]),callback=self.append_results_cell_concordances) - result1 = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count) - self.append_results_cell_concordances(result1) - - pool.close() - pool.join() - output = self.combine_written_files() - return output - - def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count): - Nr_donor_distinct_sites = len(dds) - Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites, cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, total_sites, informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative, informative_subset_sites, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) - Nr_Concordant = len(Concordant_Sites) - #Nr_Relaxed_concordant = Nr_Concordant + relaxed_concordant_count - Nr_Discordant = len(Discordant_sites) - Nr_Total_Overlapping_sites = len(Total_Overlapping_sites) - Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Concordant_Sites))) - #Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) - - return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,relaxed_concordant_count, true_discordant_count, relaxed_concordant_informative_count, - relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites, - Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,disc_sites, total_sites, informative_sites, - uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative, - informative_subset_sites, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset] - - + pool.close() + pool.join() + output = self.combine_written_files() + return output + + def 
concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count): + Nr_donor_distinct_sites = len(dds) + Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites, cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, total_sites, informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative, informative_subset_sites, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) + Nr_Concordant = len(Concordant_Sites) + #Nr_Relaxed_concordant = Nr_Concordant + relaxed_concordant_count + Nr_Discordant = len(Discordant_sites) + Nr_Total_Overlapping_sites = len(Total_Overlapping_sites) + Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Concordant_Sites))) + #Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) + + return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,relaxed_concordant_count, true_discordant_count, relaxed_concordant_informative_count, + relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites, + Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,disc_sites, total_sites, informative_sites, + uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative, + informative_subset_sites, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset] + + class VCF_Loader: def __init__(self, vcf_file, biallelic_only=True, diff --git a/bin/find_discordant_sites_in_other_donors_find_best_donor.py b/bin/find_discordant_sites_in_other_donors_find_best_donor.py index 2cc45b21..becc8733 100755 --- a/bin/find_discordant_sites_in_other_donors_find_best_donor.py +++ b/bin/find_discordant_sites_in_other_donors_find_best_donor.py @@ -221,6 +221,22 @@ def load_VCF_batch_paralel(self): output = self.combine_written_files() return output +def get_options(): + ''' + Get options from the command line + ''' + parser = argparse.ArgumentParser() + parser.add_argument('--cpus', action='store', required=True, type=int) + parser.add_argument('--cell_vcf', action='store', required=True) + parser.add_argument('--cell_assignments', action='store', required=True) + parser.add_argument('--donor_assignments', action='store', required=True) + parser.add_argument('--gt_match_vcf', action='store', required=True) + parser.add_argument('--expected_vcf', action='store', required=True) + parser.add_argument('--outfile', action='store', required=True) + parser.add_argument('--debug', action='store_true') + args = parser.parse_args() + + return args class Concordances: def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites): @@ -232,7 +248,6 @@ def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_ 
self.donor_distinct_sites=donor_distinct_sites self.record_dict={} - def norm_genotypes(self,expected_vars): expected_vars = pd.DataFrame(expected_vars) if len(expected_vars) > 0: @@ -245,7 +260,6 @@ def norm_genotypes(self,expected_vars): expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] return expected_vars - def reset(self): self.cell_concordance_table ={} @@ -580,8 +594,6 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g def find(lst, a): return [i for i, x in enumerate(lst) if x==a ] - - def norm_genotypes(expected_vars): expected_vars = pd.DataFrame(expected_vars) split_str=expected_vars[0].str.split("_") @@ -625,22 +637,7 @@ def donor_exclusive_sites(exclusive_don_variants2): return donor_distinct_sites -def get_options(): - ''' - Get options from the command line - ''' - parser = argparse.ArgumentParser() - parser.add_argument('--cpus', action='store', required=True, type=int) - parser.add_argument('--cell_vcf', action='store', required=True) - parser.add_argument('--cell_assignments', action='store', required=True) - parser.add_argument('--donor_assignments', action='store', required=True) - parser.add_argument('--gt_match_vcf', action='store', required=True) - parser.add_argument('--expected_vcf', action='store', required=True) - parser.add_argument('--outfile', action='store', required=True) - parser.add_argument('--debug', action='store_true') - args = parser.parse_args() - return args if __name__ == "__main__": @@ -731,13 +728,11 @@ def get_options(): pickle.dump(donor_distinct_sites, f) print('---donor_distinct_sites calculated----') - - + conc1 = Concordances(donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites) cell_concordance_table = conc1.conc_table() result = pd.DataFrame(cell_concordance_table).T - # result.to_csv(outfile,sep='\t') try: site_identities = result[['Concordant_Site_Identities','Discordant_Site_Identities']] result.drop(columns=['Concordant_Site_Identities'],inplace=True) diff --git a/bin/find_discordant_sites_in_other_donors_noA2G.py b/bin/find_discordant_sites_in_other_donors_noA2G.py index b747a749..10de94c8 100755 --- a/bin/find_discordant_sites_in_other_donors_noA2G.py +++ b/bin/find_discordant_sites_in_other_donors_noA2G.py @@ -15,228 +15,34 @@ __date__ = '2023-07-24' __version__ = '0.0.1' import argparse +import sys +import importlib.util import pickle import pandas as pd +import gzip import random import numpy as np +import time import multiprocessing as mp from multiprocessing import Lock +import logging import os import gzip import time - -class VCF_Loader: - - def __init__(self, vcf_file, biallelic_only=True, - sparse=False, format_list=['GT']): - self.vcf_file = vcf_file - self.load_sample = True - self.biallelic_only = biallelic_only - self.sparse = sparse - self.record_dict={} - self.reset() - self.format_list = format_list - self.exclusive_donor_variants = {} - self.curently_pushing =[] #this is a lock value to check if rhe curent field is updated so to avaid the race for update - self.last_count=-1 - self.reset_c() - - def reset_c(self): - self.record_times=0 - - def reset(self): - self.exclusive_donor_variants ={} - - def myfunc(self): - print(f"Hello my name is {self.biallelic_only}" ) - - def load_sample_mp(self,line,obs_ids,count,format_list): - ''' - takes VCF lines and extracts all format fields for those where GT !='.' 
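One body line is turned into per-cell strings of the form chrom_pos_ref_alt_GT_DP_AD_OTH, keeping only samples whose GT is called. A stripped-down sketch of that parsing, assuming the loader is run with format_list ['GT', 'DP', 'AD', 'OTH'] (the field order the read counting expects); the line and barcodes are invented:

# A single cellSNP-style VCF body line (tab-separated), with two cell barcodes as samples.
line = "1\t100\t.\tA\tC\t.\tPASS\t.\tGT:AD:DP:OTH\t0/1:3:7:0\t.:.:.:."
fields = line.rstrip().split("\t")
fmt = fields[8].split(":")
gt_idx, ad_idx, dp_idx, oth_idx = (fmt.index(k) for k in ("GT", "AD", "DP", "OTH"))

barcodes = ["AAACCTG-1", "TTTGGTA-1"]                  # invented ids from the #CHROM header
site_id = "_".join(fields[x] for x in (0, 1, 3, 4))    # chrom_pos_ref_alt

per_cell = {}
for barcode, sample in zip(barcodes, fields[9:]):
    vals = sample.split(":")
    if vals[gt_idx] == ".":                            # skip cells with no genotype call here
        continue
    per_cell[barcode] = f"{site_id}_{vals[gt_idx]}_{vals[dp_idx]}_{vals[ad_idx]}_{vals[oth_idx]}"

print(per_cell)   # {'AAACCTG-1': '1_100_A_C_0/1_7_3_0'}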
- ''' - list_val = line.rstrip().split("\t") #[:5] #:8 - idx = find(list_val[8].split(':'),'GT')[0]#find index of GT field as GT will tell us what variants are called - if len(list_val[3]) > 1 or len(list_val[4]) > 1: - # CURRENTLY DEALS ONLY WITH BIALELIC - print(f'{idx} var not bialelic') - elif list_val[3] == 'A' and list_val[4] == 'G':#remove A>G - pass - elif list_val[3] == 'T' and list_val[4] == 'C':#also remove T>C - pass - else: - list_val2 = list_val[9:] - obs = pd.DataFrame(obs_ids) - lv = pd.DataFrame(list_val2) - lv_proc =lv[0].str.split(':').str[idx] - gt_exists = lv_proc[lv_proc != '.'] - idx2 = gt_exists.index - obs_with_gt = obs.loc[idx2.values] - obs_with_gt = list(obs_with_gt[0].values) - list_val_with_gt = lv.loc[idx2.values] - list_val_with_gt = list(list_val_with_gt[0].values) - random.seed(count) - c = list(zip(obs_with_gt, list_val_with_gt)) - random.shuffle(c) - obs_with_gt, list_val_with_gt = zip(*c) - # self.append_results([obs_with_gt,list_val_with_gt,idx,list_val,count]) - - return [obs_with_gt,list_val_with_gt,idx,list_val,count,format_list]#add format_list to the return value as we need this for the next step - - - def set_results(self,to_set,id): - # Recod to disk to save the loading mmeory time. - with open(f'tmp_{id}.pkl', 'wb') as f: - pickle.dump(to_set, f) - self.record_dict[id]=f'tmp_{id}.pkl' - - - def append_results(self,result): - # exclusive_donor_variants - obs_with_gt= result[0] - list_val_with_gt= result[1] - idx = result[2] - list_val = result[3] - count = result[4] - format_list = result[5]#list of required format fields - #get indexes of required format fields (apart from GT which has already been taken care of) - additional_field_idxs = [] - for fmt in format_list: - if not fmt == 'GT': - idx_addn = find(list_val[8].split(':'), fmt)[0] - additional_field_idxs.append(idx_addn) - # print(additional_field_idxs) - # exit(0) - - count11=0 - # r = random.random() - # Issue is that this slows down after number of entries is recorded. So recoding takes longer and longer. - # every 500 itterations we push the data to a dictionary, later we combine these together. - if (count % 200 == 0): - print(f'recording and resetting memory {count}') - # self.record_dict[count]=self.exclusive_donor_variants - self.set_results(self.exclusive_donor_variants,count) - self.reset() - self.reset_c() - - for ob_id in obs_with_gt: - donor_loc_in_list = count11 - alleles = list_val_with_gt[donor_loc_in_list].split(':')[idx] - #append any additional format fields to alleles - if len(additional_field_idxs) > 0: - for idx_addnl in additional_field_idxs: - fmt_val = list_val_with_gt[donor_loc_in_list].split(':')[idx_addnl] - alleles = alleles + '_' + fmt_val - - if not alleles.startswith('.'): - ids = "_".join([list_val[x] for x in [0, 1, 3, 4]]) - donor_var = f"{ids}_{alleles}" - while ob_id in self.curently_pushing: - time.sleep(r*0.01) - self.curently_pushing.append(ob_id) - try: - self.exclusive_donor_variants[ob_id].add(donor_var) - self.record_times=self.record_times+1 - except: - self.exclusive_donor_variants[ob_id]=set() - self.exclusive_donor_variants[ob_id].add(donor_var) - self.record_times=self.record_times+1 - self.curently_pushing.remove(ob_id) - # self.exclusive_donor_variants['CTGAAACGTAAGTTCC-1'] - count11+=1 - - def combine_written_files(self):#this is for VCF loader class - to_export = self.exclusive_donor_variants - for val1 in self.record_dict.values(): - # here remove the int files. 
- print(f"merging temp file: {val1}") - with open(val1, 'rb') as f: - loaded_dict = pickle.load(f) - for k1 in loaded_dict.keys(): - try: - to_export[k1]=to_export[k1].union(loaded_dict[k1]) - except: - to_export[k1]=set() - to_export[k1]=to_export[k1].union(loaded_dict[k1]) - os.remove(val1) - return to_export - - - def load_VCF_batch_paralel(self): - """ - Load whole VCF file by utilising multiple cores to speed up loading of large cell files - ------------------- - Initially designed to load VCF from cellSNP output, requiring - 1) all variants have the same format list; - 2) a line starting with "#CHROM", with sample ids. - If these two requirements are satisfied, this function also supports general - VCF files, e.g., genotype for multiple samples. - - Note, it may take a large memory, please filter the VCF with bcftools first. - """ - - vcf_file = self.vcf_file - biallelic_only = self.biallelic_only - load_sample= self.load_sample - sparse = self.sparse - format_list= self.format_list - pool = mp.Pool(cpus) - - - import time - if vcf_file[-3:] == ".gz" or vcf_file[-4:] == ".bgz": - infile = gzip.open(vcf_file, "rb") - is_gzip = True - else: - infile = open(vcf_file, "r") - is_gzip = False - - FixedINFO = {} - contig_lines = [] - comment_lines = [] - var_ids, obs_ids, obs_dat = [], [], [] - count=0 #57077 - for line in infile: - count+=1 - # if count>10000: - # break - if is_gzip: - line = line.decode('utf-8') - if line.startswith("#"): - if line.startswith("##contig="): - contig_lines.append(line.rstrip()) - if line.startswith("#CHROM"): - if load_sample: - obs_ids = line.rstrip().split("\t")[9:] - for ob_id in obs_ids: - self.exclusive_donor_variants[ob_id]=set() - key_ids = line[1:].rstrip().split("\t")[:8] - for _key in key_ids: - FixedINFO[_key] = [] - else: - comment_lines.append(line.rstrip()) - else: - pool.apply_async(self.load_sample_mp, args=([line,obs_ids,count,format_list]),callback=self.append_results) - del line - self.last_count=count - pool.close() - pool.join() - - output = self.combine_written_files() - return output - - +remove_ag=True class Concordances: - def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites): + def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites,informative_sites, uninformative_sites): self.reset() self.donor_assignments_table=donor_assignments_table self.cell_assignments_table=cell_assignments_table self.exclusive_don_variants=exclusive_don_variants self.exclusive_cell_variants=exclusive_cell_variants self.donor_distinct_sites=donor_distinct_sites + self.informative_sites = informative_sites + self.uninformative_sites = uninformative_sites self.record_dict={} - def norm_genotypes(self,expected_vars): expected_vars = pd.DataFrame(expected_vars) if len(expected_vars) > 0: @@ -249,7 +55,6 @@ def norm_genotypes(self,expected_vars): expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] return expected_vars - def reset(self): self.cell_concordance_table ={} @@ -271,38 +76,76 @@ def read_condordance(self, expected_vars, cell_vars): cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) + #split to informative and uninformative sites + mask_i = 
cell_vars['ids'].isin(self.informative_sites) + cell_vars_informative = cell_vars[mask_i] + mask_u = cell_vars['ids'].isin(self.uninformative_sites) + cell_vars_uninformative = cell_vars[mask_u] + informative_sites = len(cell_vars_informative) + uninformative_sites = len(cell_vars_uninformative) + total_dp = cell_vars['DP'].sum() total_oth = cell_vars['OTH'].sum() total_reads = total_dp + total_oth + total_dp_inf = cell_vars_informative['DP'].sum() + total_oth_inf = cell_vars_informative['OTH'].sum() + total_reads_informative = total_dp_inf + total_oth_inf + total_dp_uninf = cell_vars_uninformative['DP'].sum() + total_oth_uninf = cell_vars_uninformative['OTH'].sum() + total_reads_uninformative = total_dp_uninf + total_oth_uninf # expected genotype 0/0 expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] hom_ref_sites = set(expected_hom_ref['ids']) cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] + cell_vars_inf_2 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_ref_sites)] + cell_vars_uninf_2 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_ref_sites)] ad_hom_ref = cell_vars2['AD'].sum() oth_hom_ref = cell_vars2['OTH'].sum() discordant_hom_ref = ad_hom_ref + oth_hom_ref + ad_hom_ref_inf = cell_vars_inf_2['AD'].sum() + oth_hom_ref_inf = cell_vars_inf_2['OTH'].sum() + discordant_hom_ref_informative = ad_hom_ref_inf + oth_hom_ref_inf + ad_hom_ref_uninf = cell_vars_uninf_2['AD'].sum() + oth_hom_ref_uninf = cell_vars_uninf_2['OTH'].sum() + discordant_hom_ref_uninformative = ad_hom_ref_uninf + oth_hom_ref_uninf # expected genotype 0/1 or 1/0 hets = ['0/1', '1/0'] expected_het = expected_vars[expected_vars['vars'].isin(hets)] het_sites = set(expected_het['ids']) cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] + cell_vars_inf_3 = cell_vars_informative[cell_vars_informative['ids'].isin(het_sites)] + cell_vars_uninf_3 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(het_sites)] discordant_het = cell_vars3['OTH'].sum() + discordant_het_informative = cell_vars_inf_3['OTH'].sum() + discordant_het_uninformative = cell_vars_uninf_3['OTH'].sum() # expected genotype 1/1 expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] hom_alt_sites = set(expected_hom_alt['ids']) cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] + cell_vars_inf_4 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_alt_sites)] + cell_vars_uninf_4 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_alt_sites)] # DP + OTH - AD ad_hom_alt = cell_vars4['AD'].sum() dp_hom_alt = cell_vars4['DP'].sum() oth_hom_alt = cell_vars4['OTH'].sum() discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt + ad_hom_alt_inf = cell_vars_inf_4['AD'].sum() + dp_hom_alt_inf = cell_vars_inf_4['DP'].sum() + oth_hom_alt_inf = cell_vars_inf_4['OTH'].sum() + discordant_hom_alt_informative = (dp_hom_alt_inf + oth_hom_alt_inf) - ad_hom_alt_inf + ad_hom_alt_uninf = cell_vars_uninf_4['AD'].sum() + dp_hom_alt_uninf = cell_vars_uninf_4['DP'].sum() + oth_hom_alt_uninf = cell_vars_uninf_4['OTH'].sum() + discordant_hom_alt_uninformative = (dp_hom_alt_uninf + oth_hom_alt_uninf) - ad_hom_alt_uninf discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt + discordant_reads_informative = discordant_hom_ref_informative + discordant_het_informative + discordant_hom_alt_informative + discordant_reads_uninformative = discordant_hom_ref_uninformative + discordant_het_uninformative + discordant_hom_alt_uninformative - return total_sites, 
total_reads, discordant_reads + return total_sites, informative_sites, uninformative_sites, total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes): @@ -316,7 +159,12 @@ def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes): ''' true_discordant = 0 relaxed_concordant = 0 + relaxed_concordant_informative = 0 + relaxed_concordant_uninformative = 0 + true_discordant_informative = 0 + true_discordant_uninformative = 0 discordant_vars = [] + concordant_vars = [] for i in range(0, len(snp_gtypes)): discordant = False @@ -348,20 +196,34 @@ def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes): if discordant == True: true_discordant+=1 discordant_vars.append(cellsnp_var) + if snp_var in self.uninformative_sites: + true_discordant_uninformative+=1 + elif snp_var in self.informative_sites: + true_discordant_informative+=1 else: relaxed_concordant+=1 + concordant_vars.append(cellsnp_var) + if snp_var in self.uninformative_sites: + relaxed_concordant_uninformative+=1 + elif snp_var in self.informative_sites: + relaxed_concordant_informative+=1 + + return true_discordant, relaxed_concordant, relaxed_concordant_informative, relaxed_concordant_uninformative, true_discordant_informative, true_discordant_uninformative,discordant_vars - return true_discordant, relaxed_concordant, discordant_vars - - def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): + + def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): + # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. + # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. 
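The read counting added to read_condordance follows one rule per expected genotype: for an expected 0/0 every ALT (AD) or other-base (OTH) read is discordant, for a het only OTH reads are, and for an expected 1/1 every read that is not ALT (DP + OTH - AD) is. A worked sketch of that arithmetic on an invented three-site table:

import pandas as pd

def discordant_reads_per_site(expected_gt, dp, ad, oth):
    # expected_gt is the donor genotype at the site: '0/0', '1/0' (or '0/1'), '1/1'
    if expected_gt == '0/0':
        return ad + oth            # any ALT or other-base read contradicts hom-ref
    if expected_gt in ('0/1', '1/0'):
        return oth                 # both REF and ALT reads are compatible with a het
    if expected_gt == '1/1':
        return (dp + oth) - ad     # any read that is not ALT contradicts hom-alt
    return 0

# illustrative counts only
sites = pd.DataFrame({
    'expected': ['0/0', '1/0', '1/1'],
    'DP':       [10,     8,     12],
    'AD':       [1,      4,     11],
    'OTH':      [0,      1,      0],
})
sites['discordant'] = [
    discordant_reads_per_site(e, d, a, o)
    for e, d, a, o in zip(sites['expected'], sites['DP'], sites['AD'], sites['OTH'])
]
print(sites['discordant'].sum())   # 1 + 1 + 1 = 3 discordant reads in total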
+ # Author: M.Ozols + cell_vars_norm = self.norm_genotypes(cell_vars) + if len(cell_vars_norm) > 0: - Total_Overlappin_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) - expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlappin_sites)] - cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlappin_sites)] + Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) + expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) - #find sites that may be discordant - this will include hets which are not truly discordant Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') @@ -369,12 +231,12 @@ def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] #disc_sites = ';'.join(disc2['expected_retrieved']) - true_discordant_count, relaxed_concordant_count, discordant_vars = self.get_strict_discordance(disc2['0_y'], disc2['0_x']) + true_discordant_count, relaxed_concordant_count, discordant_vars, concordant_vars = self.get_strict_discordance(disc2['0_y'], disc2['0_x']) total_concordant_sites = len(Concordant_Sites) + relaxed_concordant_count #find discordant reads total_sites, total_reads, discordant_reads = self.read_condordance(expected_vars2, cell_vars2) - return total_sites, true_discordant_count, total_concordant_sites, total_reads, discordant_reads, discordant_vars + return total_sites, true_discordant_count, total_concordant_sites, total_reads, discordant_reads, discordant_vars, concordant_vars def set_results(self,to_set,id): @@ -382,8 +244,8 @@ def set_results(self,to_set,id): with open(f'tmp_{id}.pkl', 'wb') as f: pickle.dump(to_set, f) self.record_dict[id]=f'tmp_{id}.pkl' - - def append_results_cell_concordances(self,result): + + def append_results_cell_concordances(self,result,cell_concordance_table): #[cell1, donor_gt_match, donor_gt_match_cohort, total_sites, true_discordant_count, total_concordant_sites, total_reads, # discordant_reads, discordant_vars,discordant_vars_in_pool_str, count] count=result[10] @@ -398,28 +260,38 @@ def append_results_cell_concordances(self,result): except: read_discordance = 0 - print(count) - self.cell_concordance_table[f'{result[0]} --- {result[1]}'] = {'GT 1':result[0], - 'GT 2':result[1], - 'Cohort': result[2], - 'Nr_Concordant':result[5], - 'Nr_Discordant':result[4], - 'Percent_Discordant':percent_discordant, - 'Total_sites': result[3], - 'Total_reads': result[6], - 'Discordant_reads': result[7], - 'Discordant_reads_by_n_sites': read_discordance, - 'Discordant_sites_in_pool': result[9], - 'Discordant_Site_Identities':(';').join(result[8]) + # print(count) + same_as_asigned_donor = result[12]==result[1] + cell_concordance_table[f'{result[0]} --- {result[1]}'] = { 'GT 1':result[0], + 'GT 2':result[1], + 'Cohort': result[2], + 'Nr_Concordant':result[5], + 'Nr_Discordant':result[4], + 'Percent_Discordant':percent_discordant, + 'Total_sites': result[3], + 'Total_reads': result[6], + 'Discordant_reads': result[7], + 'Discordant_reads_by_n_sites': read_discordance, + 'Discordant_sites_in_pool': result[9], + 
'Discordant_Site_Identities':(';').join(result[8]), + 'Lowest_Disconcordance_value_in_all_donors':result[11], + 'Donor_With_Lowest_DisConcordance':result[12], + 'Concordant_Site_Identities':result[13], + 'same_as_asigned_donor':same_as_asigned_donor, + 'Donor_With_Highest_Concordance':result[14], + 'Highest_Concordance_value_in_all_donors':result[15], + 'Total_sites_other_donor':result[16], + 'Total_reads_other_donor':result[17] } - if (count % 200 == 0): - print(f'recording and resetting memory {count}') - # self.record_dict[count]=self.exclusive_donor_variants - self.set_results(self.cell_concordance_table,count) - self.reset() - _="" - + # if (count % 200 == 0): + # print(f'recording and resetting memory {count}') + # # self.record_dict[count]=self.exclusive_donor_variants + # self.set_results(self.cell_concordance_table,count) + # self.reset() + # _="" + return cell_concordance_table + def combine_written_files(self):#this one is for concordance class to_export = self.cell_concordance_table for val1 in self.record_dict.values(): @@ -432,6 +304,27 @@ def combine_written_files(self):#this one is for concordance class os.remove(val1) return to_export + def analyse_donor(self,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm): + donor_concordance_table = {} + for cell1 in Cells_to_keep_pre: + count+=1 + # if count>10: + # break + cell_vars = exclusive_cell_variants[cell1] + # cell_vars_dp = exclusive_cell_variants_dp[cell1] + + # self.cell_concordance_table[f'{cell1} --- {donor_gt_match}']={} + # pool.apply_async(self.concordance_dable_production, args=([expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count]),callback=self.append_results_cell_concordances) + result1 = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data) + # if (result1==None): + # _='test' + donor_concordance_table = self.append_results_cell_concordances(result1,donor_concordance_table) + # print('Done') + return donor_concordance_table + + def combine_concordances(self,result): + # print('res') + self.cell_concordance_table = self.cell_concordance_table | result def conc_table(self): donor_assignments_table=self.donor_assignments_table @@ -463,6 +356,15 @@ def conc_table(self): cohort = 'ELGH' donor_cohorts[don_id] = cohort + all_donor_data={} + # here we calvculate all the expected donor datasets + for row1 in exclusive_don_variants.keys(): + # donor_in_question = row1['donor_query'] + donor_gt_match = row1 + expected_vars_of_other_donor = self.exclusive_don_variants[donor_gt_match] + expected_vars_norm_of_other_donor = self.norm_genotypes(expected_vars_of_other_donor) + all_donor_data[donor_gt_match]=expected_vars_norm_of_other_donor + for i,row1 in donor_assignments_table.iterrows(): donor_in_question = row1['donor_query'] donor_gt_match = row1['donor_gt'] @@ -480,36 +382,30 @@ def conc_table(self): dds = self.donor_distinct_sites[donor_gt_match] except: continue - - for cell1 in Cells_to_keep_pre: - count+=1 - # if count>800: - # break - cell_vars = exclusive_cell_variants[cell1] - # cell_vars_dp = exclusive_cell_variants_dp[cell1] - - self.cell_concordance_table[f'{cell1} --- {donor_gt_match}']={} - # pool.apply_async(self.concordance_dable_production, args=([expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count]),callback=self.append_results_cell_concordances) - result1 = 
self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count) - self.append_results_cell_concordances(result1) + if cpus==1: + result_conc = self.analyse_donor(Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm) + self.combine_concordances(result_conc) + else: + pool.apply_async(self.analyse_donor, args=([Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm]),callback=self.combine_concordances) pool.close() pool.join() output = self.combine_written_files() return output + - - def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match, donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count): + def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match, donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data): #Nr_donor_distinct_sites = len(dds) - total_sites, true_discordant_count, total_concordant_sites, total_reads, discordant_reads, discordant_vars = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) - #Nr_Concordant = len(Concordant_Sites) - #Nr_Relaxed_concordant = Nr_Concordant + relaxed_concordant_count - #Nr_Discordant = len(Discordant_sites) - #Nr_Total_Overlapping_sites = len(Total_Overlappin_sites) - #Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Concordant_Sites))) - #Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) + total_sites, true_discordant_count, total_concordant_sites, total_reads, discordant_reads, discordant_vars, concordant_vars = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) + Nr_Concordant = len(Concordant_Sites) + Nr_Relaxed_concordant = Nr_Concordant + relaxed_concordant_count + Nr_Discordant = len(Discordant_sites) + Nr_Total_Overlapping_sites = len(Total_Overlapping_sites) + Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Concordant_Sites))) + Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) #find if the discordant vars are in any of the other donors discordant_vars_in_pool = [] + donor_table_of_concordances = [] for donor in vars_per_donor_gt: if not donor == donor_gt_match: try: @@ -521,17 +417,273 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g common_var_count = str(len(common_vars)) donor_cohort_common = donor + ":" + donor_cohort + ":" + common_var_count discordant_vars_in_pool.append(donor_cohort_common) + + # Here we want to calculate the number of discordant sites in other donors and see if in terms of concordance the same donor is picked as per GT assignment. + # We do this to investigate the potential of a cell coming from this other donor. 
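That comparison amounts to scoring the cell against every other donor's expected genotypes and keeping the donors at the extremes, ties included. A compact sketch of the selection step over an invented score table, mirroring the ';'.join() reporting used in this function:

import pandas as pd

# per-donor scores for one cell (invented values); in the pipeline these come from
# running retrieve_concordant_discordant_sites against each other donor's genotypes
scores = pd.DataFrame([
    {'donor': 'donor_A', 'concordant_pct': 97.5, 'discordant_pct': 2.5},
    {'donor': 'donor_B', 'concordant_pct': 64.0, 'discordant_pct': 36.0},
    {'donor': 'donor_C', 'concordant_pct': 97.5, 'discordant_pct': 2.5},
])

best       = scores[scores['concordant_pct'] == scores['concordant_pct'].max()]
least_disc = scores[scores['discordant_pct'] == scores['discordant_pct'].min()]

# ties are kept and reported together
print(';'.join(best['donor']))         # donor_A;donor_C
print(';'.join(least_disc['donor']))   # donor_A;donor_C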
+ + expected_vars_norm_of_other_donor = all_donor_data[donor] + total_sites_otherDonor, true_discordant_count_otherDonor, total_concordant_sites_otherDonor, total_reads_otherDonor, discordant_reads_otherDonor, discordant_vars_otherDonor, concordant_vars_otherDonor = self.retrieve_concordant_discordant_sites(expected_vars_norm_of_other_donor,cell_vars) + concordant_percent_in_other_donor= total_concordant_sites_otherDonor/total_sites_otherDonor*100 + discordant_percent_in_other_donor= true_discordant_count_otherDonor/total_sites_otherDonor*100 + donor_table_of_concordances.append({'donor':donor,'concordant_percent_in_other_donor':concordant_percent_in_other_donor,'discordant_percent_in_other_donor':discordant_percent_in_other_donor,'total_sites_otherDonor':total_sites_otherDonor,'total_reads_otherDonor':total_reads_otherDonor}) + discordant_vars_in_pool_str = (";").join(discordant_vars_in_pool) - - return [cell1, donor_gt_match, donor_gt_match_cohort, total_sites, true_discordant_count, total_concordant_sites, total_reads, discordant_reads, discordant_vars,discordant_vars_in_pool_str, count] + concordant_vars_in_pool_str = (";").join(concordant_vars) + DF = pd.DataFrame(donor_table_of_concordances) + Donor_With_Lowest_DisConcordance = ';'.join(DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['donor'].values) + Lowest_Disconcordance_value_in_all_donors= DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['discordant_percent_in_other_donor'].values[0] + + Donor_With_Highest_Concordance = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['donor'].values) + Highest_Concordance_value_in_all_donors= DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['concordant_percent_in_other_donor'].values[0] + Total_sites_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_sites_otherDonor'].astype(str).values) + Total_reads_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_reads_otherDonor'].astype(str).values) + + return [cell1, donor_gt_match, donor_gt_match_cohort, total_sites, true_discordant_count, total_concordant_sites, total_reads, discordant_reads, discordant_vars,discordant_vars_in_pool_str, count,Lowest_Disconcordance_value_in_all_donors,Donor_With_Lowest_DisConcordance,concordant_vars_in_pool_str,Donor_With_Highest_Concordance,Highest_Concordance_value_in_all_donors,Total_sites_other_donor,Total_reads_other_donor] #return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,Nr_Relaxed_concordant, Nr_strict_discordant, relaxed_concordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites, # Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,discordant_sites, total_sites, total_reads, discordant_reads] -def find(lst, a): - return [i for i, x in enumerate(lst) if x==a ] + +class VCF_Loader: + def __init__(self, vcf_file, biallelic_only=True, + sparse=False, format_list=['GT']): + self.vcf_file = vcf_file + self.load_sample = True + self.biallelic_only = biallelic_only + self.sparse = sparse + self.record_dict={} + self.reset() + self.format_list = format_list + self.exclusive_donor_variants = {} + self.curently_pushing =[] #this is a lock value to check if rhe curent field is updated so to avaid the race for update + self.last_count=-1 
+ self.reset_c() + def reset_c(self): + self.record_times=0 + + def reset(self): + self.exclusive_donor_variants ={} + + def myfunc(self): + print(f"Hello my name is {self.biallelic_only}" ) + + def load_sample_mp(self,line,obs_ids,count,format_list): + ''' + takes VCF lines and extracts all format fields for those where GT !='.' + ''' + list_val = line.rstrip().split("\t") #[:5] #:8 + idx = find(list_val[8].split(':'),'GT')[0]#find index of GT field as GT will tell us what variants are called + if len(list_val[3]) > 1 or len(list_val[4]) > 1: + # CURRENTLY DEALS ONLY WITH BIALELIC + print(f'{idx} var not bialelic') + if remove_ag: + if list_val[3] == 'A' and list_val[4] == 'G':#remove A>G + pass + elif list_val[3] == 'T' and list_val[4] == 'C':#also remove T>C + pass + else: + list_val2 = list_val[9:] + obs = pd.DataFrame(obs_ids) + lv = pd.DataFrame(list_val2) + lv_proc =lv[0].str.split(':').str[idx] + gt_exists = lv_proc[lv_proc != '.'] + idx2 = gt_exists.index + obs_with_gt = obs.loc[idx2.values] + obs_with_gt = list(obs_with_gt[0].values) + list_val_with_gt = lv.loc[idx2.values] + list_val_with_gt = list(list_val_with_gt[0].values) + random.seed(count) + c = list(zip(obs_with_gt, list_val_with_gt)) + random.shuffle(c) + obs_with_gt, list_val_with_gt = zip(*c) + # self.append_results([obs_with_gt,list_val_with_gt,idx,list_val,count]) + + return [obs_with_gt,list_val_with_gt,idx,list_val,count,format_list]#add format_list to the return value as we need this for the next step + + + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. + with open(f'tmp_{id}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}.pkl' + + + def append_results(self,result): + # exclusive_donor_variants + obs_with_gt= result[0] + list_val_with_gt= result[1] + idx = result[2] + list_val = result[3] + count = result[4] + format_list = result[5]#list of required format fields + #get indexes of required format fields (apart from GT which has already been taken care of) + additional_field_idxs = [] + for fmt in format_list: + if not fmt == 'GT': + idx_addn = find(list_val[8].split(':'), fmt)[0] + additional_field_idxs.append(idx_addn) + # print(additional_field_idxs) + # exit(0) + + count11=0 + # r = random.random() + # Issue is that this slows down after number of entries is recorded. So recoding takes longer and longer. + # every 500 itterations we push the data to a dictionary, later we combine these together. 
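# The comment above describes the flush pattern that follows: partial results
# are pickled to temporary files every few hundred records and unioned back
# together at the end. A minimal, self-contained sketch of that pattern
# (function names and file names here are illustrative, not the real ones):
import os
import pickle

record_dict = {}

def set_results_sketch(partial, batch_id):
    path = f'tmp_{batch_id}.pkl'
    with open(path, 'wb') as f:
        pickle.dump(partial, f)
    record_dict[batch_id] = path

def combine_written_files_sketch(current):
    merged = dict(current)
    for path in record_dict.values():
        with open(path, 'rb') as f:
            loaded = pickle.load(f)
        for cell, variants in loaded.items():
            merged[cell] = merged.get(cell, set()) | variants
        os.remove(path)  # temp file is no longer needed once merged
    return merged

set_results_sketch({'AAACCTG-1': {'1_100_A_C_0/1'}}, 200)
print(combine_written_files_sketch({'AAACCTG-1': {'1_200_G_T_1/1'}}))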
+ if (count % 200 == 0): + print(f'recording and resetting memory {count}') + # self.record_dict[count]=self.exclusive_donor_variants + self.set_results(self.exclusive_donor_variants,count) + self.reset() + self.reset_c() + + for ob_id in obs_with_gt: + donor_loc_in_list = count11 + alleles = list_val_with_gt[donor_loc_in_list].split(':')[idx] + #append any additional format fields to alleles + if len(additional_field_idxs) > 0: + for idx_addnl in additional_field_idxs: + fmt_val = list_val_with_gt[donor_loc_in_list].split(':')[idx_addnl] + alleles = alleles + '_' + fmt_val + + if not alleles.startswith('.'): + ids = "_".join([list_val[x] for x in [0, 1, 3, 4]]) + donor_var = f"{ids}_{alleles}" + while ob_id in self.curently_pushing: + time.sleep(r*0.01) + self.curently_pushing.append(ob_id) + try: + self.exclusive_donor_variants[ob_id].add(donor_var) + self.record_times=self.record_times+1 + except: + self.exclusive_donor_variants[ob_id]=set() + self.exclusive_donor_variants[ob_id].add(donor_var) + self.record_times=self.record_times+1 + self.curently_pushing.remove(ob_id) + # self.exclusive_donor_variants['CTGAAACGTAAGTTCC-1'] + count11+=1 + + def combine_written_files(self):#this is for VCF loader class + to_export = self.exclusive_donor_variants + for val1 in self.record_dict.values(): + # here remove the int files. + print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + for k1 in loaded_dict.keys(): + try: + to_export[k1]=to_export[k1].union(loaded_dict[k1]) + except: + to_export[k1]=set() + to_export[k1]=to_export[k1].union(loaded_dict[k1]) + os.remove(val1) + return to_export + + + def load_VCF_batch_paralel(self): + """ + Load whole VCF file by utilising multiple cores to speed up loading of large cell files + ------------------- + Initially designed to load VCF from cellSNP output, requiring + 1) all variants have the same format list; + 2) a line starting with "#CHROM", with sample ids. + If these two requirements are satisfied, this function also supports general + VCF files, e.g., genotype for multiple samples. + + Note, it may take a large memory, please filter the VCF with bcftools first. 
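        Example (minimal usage sketch; the file name is a placeholder, the
        format fields mirror how the loader is called later in this script):

            loader = VCF_Loader('cellSNP.cells.vcf.gz', biallelic_only=True,
                                sparse=False, format_list=['GT', 'DP', 'AD', 'OTH'])
            cell_variants = loader.load_VCF_batch_paralel()
            # cell_variants: dict of cell barcode -> set of
            # 'chrom_pos_ref_alt_GT_DP_AD_OTH' strings, one entry per covered site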
+ """ + + vcf_file = self.vcf_file + biallelic_only = self.biallelic_only + load_sample= self.load_sample + sparse = self.sparse + format_list= self.format_list + pool = mp.Pool(cpus) + + + import time + if vcf_file[-3:] == ".gz" or vcf_file[-4:] == ".bgz": + infile = gzip.open(vcf_file, "rb") + is_gzip = True + else: + infile = open(vcf_file, "r") + is_gzip = False + + FixedINFO = {} + contig_lines = [] + comment_lines = [] + var_ids, obs_ids, obs_dat = [], [], [] + count=0 #57077 + for line in infile: + count+=1 + # if count>10000: + # break + if is_gzip: + line = line.decode('utf-8') + if line.startswith("#"): + if line.startswith("##contig="): + contig_lines.append(line.rstrip()) + if line.startswith("#CHROM"): + if load_sample: + obs_ids = line.rstrip().split("\t")[9:] + for ob_id in obs_ids: + self.exclusive_donor_variants[ob_id]=set() + key_ids = line[1:].rstrip().split("\t")[:8] + for _key in key_ids: + FixedINFO[_key] = [] + else: + comment_lines.append(line.rstrip()) + else: + pool.apply_async(self.load_sample_mp, args=([line,obs_ids,count,format_list]),callback=self.append_results) + del line + self.last_count=count + pool.close() + pool.join() + + output = self.combine_written_files() + return output + +"""Run CLI.""" + +def get_options(): + ''' + Get options from the command line + ''' + parser = argparse.ArgumentParser() + parser.add_argument('--version', action='version', version='%(prog)s {version}'.format(version=__version__)) + parser.add_argument('--cpus', action='store', required=True, type=int) + parser.add_argument('--cell_vcf', action='store', required=True) + parser.add_argument('--cell_assignments', action='store', required=True) + parser.add_argument('--donor_assignments', action='store', required=True) + parser.add_argument('--gt_match_vcf', action='store', required=True) + parser.add_argument('--expected_vcf', action='store', required=True) + parser.add_argument('--informative_sites', action='store', required=True) + parser.add_argument('--uninformative_sites', action='store', required=True) + parser.add_argument('--outfile', action='store', required=True) + parser.add_argument('--debug', action='store_true') + args = parser.parse_args() + + return args + + +def get_sites_from_tsv(sites_file): + """ + get sites frm a tsv file where cols are chrom, pos, id, ref, alt + assumes no multiallelics + """ + sites = set() + with open(sites_file, 'r') as f: + lines = f.readlines() + for l in lines: + linedata = l.split('\t') + var = ('_').join([linedata[0], linedata[1], linedata[3], linedata[4]]) + sites.add(var) + return sites + + +def find(lst, a): + return [i for i, x in enumerate(lst) if x==a ] def norm_genotypes(expected_vars): expected_vars = pd.DataFrame(expected_vars) split_str=expected_vars[0].str.split("_") @@ -575,34 +727,23 @@ def donor_exclusive_sites(exclusive_don_variants2): return donor_distinct_sites -def get_options(): - ''' - Get options from the command line - ''' - parser = argparse.ArgumentParser() - parser.add_argument('--cpus', action='store', required=True, type=int) - parser.add_argument('--cell_vcf', action='store', required=True) - parser.add_argument('--cell_assignments', action='store', required=True) - parser.add_argument('--donor_assignments', action='store', required=True) - parser.add_argument('--gt_match_vcf', action='store', required=True) - parser.add_argument('--expected_vcf', action='store', required=True) - parser.add_argument('--outfile', action='store', required=True) - parser.add_argument('--debug', action='store_true') - args = 
parser.parse_args() - - return args if __name__ == "__main__": options = get_options() cpus = options.cpus + outfile = options.outfile cell_vcf=options.cell_vcf donor_assignments=options.donor_assignments gt_match_vcf=options.gt_match_vcf expected_vcf=options.expected_vcf cell_assignments=options.cell_assignments - outfile = options.outfile + informative_sites_file = options.informative_sites + uninformative_sites_file = options.uninformative_sites + + informative_sites = get_sites_from_tsv(informative_sites_file) + uninformative_sites = get_sites_from_tsv(uninformative_sites_file) exclusive_donor_variants = {} #This is where results are populated when mp process i used. curently_pushing =[] #this is a lock value to check if rhe curent field is updated so to avaid the race for update @@ -637,16 +778,12 @@ def get_options(): pickle.dump(GT_Matched_variants, f) print('---Loading cell VCF----') - tic = time.perf_counter() loader1 = VCF_Loader(cell_vcf, biallelic_only=True, sparse=False, format_list=['GT', 'DP', 'AD', 'OTH']) exclusive_cell_variants = loader1.load_VCF_batch_paralel() del loader1 - toc = time.perf_counter() - with open(f'tmp_exclusive_cell_variants.pkl', 'wb') as f: pickle.dump(exclusive_cell_variants, f) - print(f"Loading took {toc - tic:0.4f} seconds") print('---Loading expected VCF----') loader3 = VCF_Loader(expected_vcf, biallelic_only=True, @@ -679,13 +816,15 @@ def get_options(): donor_distinct_sites = donor_exclusive_sites(exclusive_don_variants) with open(f'tmp_donor_distinct_sites.pkl', 'wb') as f: pickle.dump(donor_distinct_sites, f) - - print('---donor_distinct_sites calculated----') - - conc1 = Concordances(donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites) - cell_concordance_table = conc1.conc_table() - + cell_concordance_table = Concordances(donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites, informative_sites, uninformative_sites).conc_table() result = pd.DataFrame(cell_concordance_table).T + try: + site_identities = result[['Concordant_Site_Identities','Discordant_Site_Identities']] + result.drop(columns=['Concordant_Site_Identities'],inplace=True) + site_identities.to_csv(f"site_identities_{outfile}",sep='\t') + except: + _='sample_hasnt_matched_any_gt --- most likely too little cells assigned' result.to_csv(outfile,sep='\t') + print('Processing Done') \ No newline at end of file From 4377f68236a15f2e823e706cf8f628b6bb3d9cd1 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Tue, 14 Nov 2023 18:57:00 +0000 Subject: [PATCH 3/7] all done, lets prduce the metrics --- bin/concordance_calculations.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/bin/concordance_calculations.py b/bin/concordance_calculations.py index 8d0c41f2..c1b0b309 100644 --- a/bin/concordance_calculations.py +++ b/bin/concordance_calculations.py @@ -12,7 +12,7 @@ # list of donors in the pool, how many of the discordant sites are found in the donor, cohort each belongs to # list of discordant sites -__date__ = '2023-14-11' +__date__ = '2023-07-24' __version__ = '0.0.1' import argparse import sys @@ -532,17 +532,17 @@ def set_results(self,to_set,id): pickle.dump(to_set, f) self.record_dict[id]=f'tmp_{id}.pkl' - def analyse_donor(self,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm): + def 
analyse_donor(self,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm,donor_assignments_table): donor_concordance_table = {} other_donor_concordance_table = [] for cell1 in Cells_to_keep_pre: count+=1 cell_vars = exclusive_cell_variants[cell1] - result1, other_donor_concordances = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data) + result1, other_donor_concordances = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data,donor_assignments_table) cell_concordance_table,other_donor_concordance_table = self.append_results_cell_concordances(result1,donor_concordance_table,other_donor_concordances,other_donor_concordance_table) - if count>300: - break + # if count>300: + # break # here we should write these independently to the files if (count % 50 == 0): self.set_results(other_donor_concordance_table,f"{count}--{donor_gt_match}") @@ -600,6 +600,7 @@ def conc_table(self): expected_vars_norm_of_other_donor = self.norm_genotypes(expected_vars_of_other_donor) all_donor_data[donor_gt_match]=expected_vars_norm_of_other_donor + for i,row1 in donor_assignments_table.iterrows(): donor_in_question = row1['donor_query'] donor_gt_match = row1['donor_gt'] @@ -620,10 +621,10 @@ def conc_table(self): except: continue if cpus==1: - result = self.analyse_donor(Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm) + result = self.analyse_donor(Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm,donor_assignments_table) self.combine_concordances(result) else: - pool.apply_async(self.analyse_donor, args=([Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm]),callback=self.combine_concordances) + pool.apply_async(self.analyse_donor, args=([Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm,donor_assignments_table]),callback=self.combine_concordances) pool.close() pool.join() @@ -633,7 +634,7 @@ def conc_table(self): return self.cell_concordance_table - def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match, donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data): + def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match, donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data,donor_assignments_table): Concordant_Sites, \ Discordant_sites, \ @@ -677,7 +678,7 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() total_cordant_sites_that_are_concordant_with_other_donors_in_pool = set() - for donor in vars_per_donor_gt: + for donor in set(donor_assignments_table['donor_gt']): expected_vars_norm_of_other_donor = all_donor_data[donor] @@ -769,9 +770,9 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g 'total_sites_otherDonor':total_sites_otherDonor, \ 
'discordant_reads_otherDonor':discordant_reads_otherDonor, \ 'total_reads_otherDonor':total_reads_otherDonor, \ - 'discordant_read_fraction_in_concordant_sites_otherDonor':discordant_read_fraction_in_concordant_sites_otherDonor, \ - 'discordant_read_fraction_in_discordant_sites_otherDonor':discordant_read_fraction_in_discordant_sites_otherDonor, \ - 'concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor':concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor + # 'discordant_read_fraction_in_concordant_sites_otherDonor':discordant_read_fraction_in_concordant_sites_otherDonor, \ + # 'discordant_read_fraction_in_discordant_sites_otherDonor':discordant_read_fraction_in_discordant_sites_otherDonor, \ + 'concordant_reads_For_discordant_sites_that_are_Concordant_with_other_donor':concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor }) discordant_vars_in_pool_str = (";").join(discordant_vars_in_pool) From aa30bb8b9881c57a0b82b533dee933bf7bd2e771 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Thu, 16 Nov 2023 16:20:28 +0000 Subject: [PATCH 4/7] cross cohort contamination --- bin/concordance_calculations.py | 138 +++++++++++++++++++++++++------- 1 file changed, 111 insertions(+), 27 deletions(-) diff --git a/bin/concordance_calculations.py b/bin/concordance_calculations.py index c1b0b309..1aa49ac8 100644 --- a/bin/concordance_calculations.py +++ b/bin/concordance_calculations.py @@ -308,7 +308,7 @@ def get_discordance(self,expected_vars2,cell_vars2): return Concordant_Sites,Discordant_sites,disc_sites - def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): + def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars,donor_cohort=False): # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. 
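# A toy illustration of the comparison this function performs (heavily
# simplified -- the real implementation also handles relaxed concordance,
# informative sites and read counts): expected and cellSNP genotypes are
# matched on their chrom_pos_ref_alt ids and compared. Data below are invented.
import pandas as pd

expected = pd.DataFrame({'ids':  ['1_100_A_C', '1_200_G_T', '2_50_C_A'],
                         'vars': ['1/0',        '1/1',       '0/0']})
cell     = pd.DataFrame({'ids':  ['1_100_A_C', '1_200_G_T'],
                         'vars': ['1/0',        '0/0']})

shared = expected.merge(cell, on='ids', suffixes=('_expected', '_cell'))
concordant = int((shared['vars_expected'] == shared['vars_cell']).sum())
discordant = len(shared) - concordant
print(len(shared), concordant, discordant)  # 2 1 1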
# Author: M.Ozols @@ -443,12 +443,12 @@ def append_results_cell_concordances(self,result,cell_concordance_table,other_do 'GT 2':result['donor_gt_match'], 'cohort': cohort, - 'Nr_Concordant':result['Nr_Concordant'], - 'Nr_Discordant':result['Nr_Discordant'], + # 'Nr_Concordant':result['Nr_Concordant'], + # 'Nr_Discordant':result['Nr_Discordant'], 'Nr_Relaxed_concordant':result['Nr_Relaxed_concordant'], 'Nr_strict_discordant':result['true_discordant_count'], - 'Percent Concordant':percent_concordant, - 'Percent Discordant':percent_discordant, + # 'Percent Concordant':percent_concordant, + # 'Percent Discordant':percent_discordant, 'Percent_relaxed_concordant': percent_relaxed_concordant, 'Percent_strict_discordant': percent_strict_discordant, 'Nr_concordant_informative': len(result['relaxed_concordant_informative_count']), @@ -468,7 +468,6 @@ def append_results_cell_concordances(self,result,cell_concordance_table,other_do 'Discordant_reads_informtive': result['discordant_reads_informative'], 'Discordant_reads_uninformtive': result['discordant_reads_uninformative'], 'Discordant_reads_by_n_sites': read_discordance, - 'Discordant_sites_in_pool': len(result['Discordant_sites_in_pool']), 'Lowest_Disconcordance_value_in_all_donors':result['Lowest_Disconcordance_value_in_all_donors'], 'Donor_With_Lowest_DisConcordance':result['Donor_With_Lowest_DisConcordance'], @@ -481,7 +480,10 @@ def append_results_cell_concordances(self,result,cell_concordance_table,other_do 'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['total_discordant_sites_that_are_concordant_with_other_donors_in_pool'], 'discordant_read_fraction_in_concordant_site':result['discordant_read_fraction_in_concordant_sites'], 'discordant_read_fraction_in_discordant_sites':result['discordant_read_fraction_in_discordant_sites'], - 'Discordant_Site_Identities':result['discordant_sites'], + 'Whithin_Cohort__total_number_of_potential_contaminent_reads':result['Whithin_Cohort__total_number_of_potential_contaminent_reads'], + 'Out_of_Cohort__total_number_of_potential_contaminent_reads':result['Out_of_Cohort__total_number_of_potential_contaminent_reads'], + 'NrDonors_contributing_to_out_of_cohort':result['NrDonors_contributing_to_out_of_cohort'], + 'NrDonors_contributing_to_Whithin_Cohort':result['NrDonors_contributing_to_Whithin_Cohort'] } return [cell_concordance_table,other_donor_concordance_table] @@ -634,6 +636,21 @@ def conc_table(self): return self.cell_concordance_table + + def read_extraction(self,DonorDiscordant_Sites_that_are_atributed_to_other_donor,expected_vars_norm,cell_vars_norm): + # we need this function wrapper to calculate the concordant, discordant read + # counts for each of the discordant sites that are concordant with another donor. 
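# A small sketch of what the wrapper described above boils down to: the packed
# cellSNP strings are chrom_pos_ref_alt_GT_DP_AD_OTH, so DP/AD/OTH sit at fixed
# positions after splitting on '_', and concordant reads are total minus
# discordant. The two entries below are invented.
import pandas as pd

cell_vars_norm = pd.DataFrame({0: ['1_100_A_C_0/1_12_5_0',
                                   '1_200_G_T_1/1_8_8_1']})
parts = cell_vars_norm[0].str.split('_')
cell_vars_norm['ids'] = parts.str[0] + '_' + parts.str[1] + '_' + parts.str[2] + '_' + parts.str[3]
cell_vars_norm['DP'] = parts.str[5].astype(int)   # total depth at the site
cell_vars_norm['AD'] = parts.str[6].astype(int)   # reads supporting the alt allele
cell_vars_norm['OTH'] = parts.str[7].astype(int)  # reads supporting other alleles
print(cell_vars_norm[['ids', 'DP', 'AD', 'OTH']])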
+ + Total_Overlapping_sites = set(DonorDiscordant_Sites_that_are_atributed_to_other_donor) + expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2['DP'] = cell_vars2[0].str.split("_").str[5].astype(int) + cell_vars2['AD'] = cell_vars2[0].str.split("_").str[6].astype(int) + cell_vars2['OTH'] = cell_vars2[0].str.split("_").str[7].astype(int) + total_reads,_,_,discordant_reads = self.read_concordance_calc(expected_vars2,cell_vars2) + concordant_reads = total_reads - discordant_reads + return total_reads,discordant_reads,concordant_reads + def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match, donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data,donor_assignments_table): Concordant_Sites, \ @@ -676,11 +693,23 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g discordant_vars_in_pool = [] donor_table_of_concordances = [] total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown = {} + informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() total_cordant_sites_that_are_concordant_with_other_donors_in_pool = set() + donor_gt_match_cohort = donor_cohorts[donor_gt_match] + donors_contributing_to_out_of_cohort= [] + donors_contributing_to_Whithin_Cohort=[] + for donor in set(donor_assignments_table['donor_gt']): expected_vars_norm_of_other_donor = all_donor_data[donor] + + try: + donor_cohort = donor_cohorts[donor] + donor_vars = vars_per_donor_gt[donor] + except: + continue Concordant_Sites_otherDonor, \ Discordant_sites_otherDonor, \ @@ -707,7 +736,7 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g discordant_read_fraction_in_concordant_sites_otherDonor, \ discordant_read_fraction_in_discordant_sites_otherDonor, \ discordant_reads_uninformative_fraction_otherDonor, \ - discordant_reads_informative_fraction_otherDonor = self.retrieve_concordant_discordant_sites(expected_vars_norm_of_other_donor,cell_vars) + discordant_reads_informative_fraction_otherDonor = self.retrieve_concordant_discordant_sites(expected_vars_norm_of_other_donor,cell_vars,donor_cohort=donor_cohort) # here we also need to know : # how many reads of the desired donor discordant sites could be yielded @@ -719,28 +748,57 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(discordant_vars).intersection(set(concordant_vars_otherDonor)) Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(true_discordant_informative_count).intersection(set(relaxed_concordant_informative_count_otherDonor)) DonorCordant_Sites_that_are_atributed_to_other_donor = set(concordant_vars).intersection(set(concordant_vars_otherDonor)) - + # We now count the concordant reads that may contribute to particular cell at this cell. # to do this we take the discordant sites that have been deamed to be concordant with the other donor and quantify the reads thta are concordant. 
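# Toy version of the attribution step above: the cell's discordant sites are
# intersected with the sites that are concordant with some other donor, and the
# union of those sites is accumulated across all other donors in the pool.
# The variant ids below are invented.
discordant_vars = {'1_100_A_C', '1_200_G_T', '2_50_C_A'}
concordant_vars_otherDonor = {'1_200_G_T', '3_70_T_G'}

attributed_to_other_donor = discordant_vars & concordant_vars_otherDonor
total_attributed_in_pool = set()
total_attributed_in_pool |= attributed_to_other_donor
print(attributed_to_other_donor, len(total_attributed_in_pool))  # {'1_200_G_T'} 1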
- Total_Overlapping_sites = set(DonorDiscordant_Sites_that_are_atributed_to_other_donor) - expected_vars2 = expected_vars_norm_of_other_donor[expected_vars_norm_of_other_donor['ids'].isin(Total_Overlapping_sites)] - cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] - cell_vars2['DP'] = cell_vars2[0].str.split("_").str[5].astype(int) - cell_vars2['AD'] = cell_vars2[0].str.split("_").str[6].astype(int) - cell_vars2['OTH'] = cell_vars2[0].str.split("_").str[7].astype(int) + # Total_Overlapping_sites = set(DonorDiscordant_Sites_that_are_atributed_to_other_donor) + # expected_vars2 = expected_vars_norm[expected_vars_norm_of_other_donor['ids'].isin(Total_Overlapping_sites)] + # cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] + # cell_vars2['DP'] = cell_vars2[0].str.split("_").str[5].astype(int) + # cell_vars2['AD'] = cell_vars2[0].str.split("_").str[6].astype(int) + # cell_vars2['OTH'] = cell_vars2[0].str.split("_").str[7].astype(int) - total_reads_for_discordant_sites_that_are_concordant_with_other_donor,total_dp_for_discordant_sites_that_are_concordant_with_other_donor,total_oth_for_discordant_sites_that_are_concordant_with_other_donor,discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor = self.read_concordance_calc(expected_vars2,cell_vars2) - concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor = total_reads_for_discordant_sites_that_are_concordant_with_other_donor - discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor + # total_reads_for_discordant_sites_that_are_concordant_with_other_donor,_,_,discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor = self.read_concordance_calc(expected_vars2,cell_vars2) + # concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor = total_reads_for_discordant_sites_that_are_concordant_with_other_donor - discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor - try: - donor_cohort = donor_cohorts[donor] - donor_vars = vars_per_donor_gt[donor] - except: - continue + total_reads_for_discordant_sites_that_are_concordant_with_other_donor,discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor,concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor = self.read_extraction(DonorDiscordant_Sites_that_are_atributed_to_other_donor,expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) + # if discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor>0: + # print('yes1') + if not donor == donor_gt_match: # We want to kow how many of these discordant site - + if donor_gt_match_cohort == donor_cohort: + coh = 'Whithin_Cohort' + if len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)>0: + donors_contributing_to_Whithin_Cohort.append(donor) + else: + coh = 'Out_of_Cohort' + if len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)>0: + donors_contributing_to_out_of_cohort.append(donor) + total_discordant_sites_that_are_concordant_with_other_donors_in_pool = total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(DonorDiscordant_Sites_that_are_atributed_to_other_donor)) + # now we addit for a cohort since the biggest issue comes from cohort cross-contamination + # for each of these sites now we calculate the number of reads that it accounts: + # tree level set: cohort: site: counts + + + for site in DonorDiscordant_Sites_that_are_atributed_to_other_donor: + 
total_reads_for_site,discordant_reads_for_site,concordant_for_site = self.read_extraction([site],expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) + # if discordant_reads_for_site>0: + # print('here') + if concordant_for_site==0: + pass + try: + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) + except: + try: + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) + except: + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh]={} + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) + # to get the total reads that can be atributed to the other donor i have to check if site is already covered in the total_discordant_sites_that_are_concordant_with_other_donors_in_pool. # the ones that havent, i have to add the reads up for them. informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor)) @@ -774,8 +832,30 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g # 'discordant_read_fraction_in_discordant_sites_otherDonor':discordant_read_fraction_in_discordant_sites_otherDonor, \ 'concordant_reads_For_discordant_sites_that_are_Concordant_with_other_donor':concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor }) - - discordant_vars_in_pool_str = (";").join(discordant_vars_in_pool) + + + #here now we want to see overall how many reads potentially come from different cohorts. 
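# Worked example of how the cohort -> site -> read-count breakdown built in the
# loop above is turned into the "potential contaminant reads" totals further
# down: per site the maximum concordant read count over the donors of that
# cohort is kept, and the per-site maxima are summed within each cohort.
# Numbers are invented; the cohort keys keep the spelling used in the code.
breakdown = {
    'Whithin_Cohort': {'1_100_A_C': [3, 5], '1_200_G_T': [2]},
    'Out_of_Cohort':  {'2_50_C_A':  [4]},
}
totals = {cohort: sum(max(counts) for counts in sites.values())
          for cohort, sites in breakdown.items()}
print(totals)  # {'Whithin_Cohort': 7, 'Out_of_Cohort': 4}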
+ cohort_specific_site_quant_string="" + cohort_specific_read_quant_string="" + + Whithin_Cohort__total_number_of_potential_contaminent_reads=0 + try: + for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys(): + Whithin_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'][k1]) + except: + _='Doesnt Exist' + + Out_of_Cohort__total_number_of_potential_contaminent_reads=0 + try: + for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys(): + Out_of_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'][k1]) + except: + _='Doesnt Exist' + + + # total_reads_for_site,discordant_reads_for_site,concordant_for_site = self.read_extraction(set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()),expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) + + # discordant_vars_in_pool_str = (";").join(discordant_vars_in_pool) concordant_vars_in_pool_str = (";").join(concordant_vars) DF = pd.DataFrame(donor_table_of_concordances) @@ -786,7 +866,7 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g Highest_Concordance_value_in_all_donors= DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['concordant_percent_in_other_donor'].values[0] Total_sites_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_sites_otherDonor'].astype(str).values) Total_reads_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_reads_otherDonor'].astype(str).values) - + return [{ 'cell1':cell1, 'donor_gt_match':donor_gt_match, @@ -823,7 +903,11 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g 'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(discordant_vars)}", 'informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(true_discordant_informative_count)}", 'discordant_read_fraction_in_concordant_sites':discordant_read_fraction_in_concordant_sites, \ - 'discordant_read_fraction_in_discordant_sites':discordant_read_fraction_in_discordant_sites + 'discordant_read_fraction_in_discordant_sites':discordant_read_fraction_in_discordant_sites, \ + 'Whithin_Cohort__total_number_of_potential_contaminent_reads':Whithin_Cohort__total_number_of_potential_contaminent_reads, \ + 'Out_of_Cohort__total_number_of_potential_contaminent_reads':Out_of_Cohort__total_number_of_potential_contaminent_reads, \ + 'NrDonors_contributing_to_out_of_cohort':len(set(donors_contributing_to_out_of_cohort)), \ + 'NrDonors_contributing_to_Whithin_Cohort':len(set(donors_contributing_to_Whithin_Cohort)) }, donor_table_of_concordances] From d77379c9035ddfb066c4aaa91908eaf7c7c4ff0b Mon Sep 17 00:00:00 2001 From: maxozo Date: Fri, 17 Nov 2023 19:26:05 +0000 Subject: [PATCH 5/7] added --- bin/concordance_calculations.py | 50 +++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git 
a/bin/concordance_calculations.py b/bin/concordance_calculations.py index 1aa49ac8..c2433e4f 100644 --- a/bin/concordance_calculations.py +++ b/bin/concordance_calculations.py @@ -702,9 +702,10 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g donors_contributing_to_Whithin_Cohort=[] for donor in set(donor_assignments_table['donor_gt']): - - expected_vars_norm_of_other_donor = all_donor_data[donor] - + try: + expected_vars_norm_of_other_donor = all_donor_data[donor] + except: + continue try: donor_cohort = donor_cohorts[donor] donor_vars = vars_per_donor_gt[donor] @@ -767,6 +768,11 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g if not donor == donor_gt_match: # We want to kow how many of these discordant site + if 'U937' in donor: + continue + if 'THP1' in donor: + continue + if donor_gt_match_cohort == donor_cohort: coh = 'Whithin_Cohort' if len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)>0: @@ -839,7 +845,10 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g cohort_specific_read_quant_string="" Whithin_Cohort__total_number_of_potential_contaminent_reads=0 + Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool=0 + Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool=0 try: + Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool = len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys(): Whithin_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'][k1]) except: @@ -847,12 +856,30 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g Out_of_Cohort__total_number_of_potential_contaminent_reads=0 try: + Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool = len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys(): Out_of_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'][k1]) except: _='Doesnt Exist' + try: + Out_of_Cohort__sites = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) + Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) - set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) + except: + Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() + Out_of_Cohort__sites = set() + + Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = total_reads_for_site,_,_ = self.read_extraction(Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool,expected_vars_norm,cell_vars_norm) + + try: + Whithin_Cohort__sites = 
set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) + Whithin_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) - set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) + except: + Whithin_Cohort__sites = set() + Whithin_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() + + Total__discordant_sites_that_are_concordant_with_other_donors_in_pool = Whithin_Cohort__sites.union(Out_of_Cohort__sites) # total_reads_for_site,discordant_reads_for_site,concordant_for_site = self.read_extraction(set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()),expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) # discordant_vars_in_pool_str = (";").join(discordant_vars_in_pool) @@ -907,7 +934,14 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g 'Whithin_Cohort__total_number_of_potential_contaminent_reads':Whithin_Cohort__total_number_of_potential_contaminent_reads, \ 'Out_of_Cohort__total_number_of_potential_contaminent_reads':Out_of_Cohort__total_number_of_potential_contaminent_reads, \ 'NrDonors_contributing_to_out_of_cohort':len(set(donors_contributing_to_out_of_cohort)), \ - 'NrDonors_contributing_to_Whithin_Cohort':len(set(donors_contributing_to_Whithin_Cohort)) + 'NrDonors_contributing_to_Whithin_Cohort':len(set(donors_contributing_to_Whithin_Cohort)), \ + + 'Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool, \ + 'Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool, \ + 'Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool), \ + 'Whithin_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Whithin_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool), \ + 'Total_Reads_for_Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool':Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool, \ + 'Total__discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Total__discordant_sites_that_are_concordant_with_other_donors_in_pool) }, donor_table_of_concordances] @@ -1294,12 +1328,18 @@ def donor_exclusive_sites(exclusive_don_variants2): result = pd.DataFrame(cell_concordance_table).T + try: site_identities = result[['Concordant_Site_Identities','Discordant_Site_Identities']] - result.drop(columns=['Concordant_Site_Identities','Discordant_Site_Identities'],inplace=True) + result.drop(columns=['Concordant_Site_Identities'],inplace=True) site_identities.to_csv(f"site_identities_{outfile}",sep='\t') except: _='sample_hasnt_matched_any_gt --- most likely too little cells assigned' + try: + result.drop(columns=['Discordant_Site_Identities'],inplace=True) + except: + _='sample_hasnt_matched_any_gt --- most likely too little cells assigned' + result.to_csv(outfile,sep='\t') print('Processing Done') \ No newline at end of 
file From 1961a5ae82242666161806022e339d12bb618330 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Tue, 28 Nov 2023 15:48:24 +0000 Subject: [PATCH 6/7] harriets changes --- assets/deploy_scripts/bsub.sh | 2 +- assets/deploy_scripts/bsub__removeWork.sh | 2 +- assets/deploy_scripts/bsub_test.sh | 2 +- assets/deploy_scripts/bsub_test_celltypes.sh | 2 +- assets/deploy_scripts/bsub_test_recluster.sh | 29 + .../input_setups/recluster_profile.nf | 138 +++++ .../nohup_start_nextflow_lsf.sh | 2 +- .../nohup_start_nextflow_lsf__removeWork.sh | 2 +- .../nohup_start_nextflow_lsf_celltypes.sh | 2 +- .../nohup_start_nextflow_lsf_recluster.sh | 27 + .../nohup_start_nextflow_lsf_test.sh | 2 +- bin/0026-plot_filtered_cells.py | 17 +- bin/0028-plot_predicted_sex.py | 5 +- bin/0030-estimate_pca_elbow.py | 5 +- bin/0035-scanpy_normalize_pca.py | 143 +---- ...canpy_cluster_validate_resolution-keras.py | 3 +- bin/pca_anndata.py | 556 ++++++++++++++++++ conf/base.conf | 15 +- main.nf | 13 +- .../nf-core/modules/clustering/functions.nf | 2 +- modules/nf-core/modules/clustering/main.nf | 82 ++- .../modules/estimate_pca_elbow/main.nf | 3 - .../nf-core/modules/normalise_and_pca/main.nf | 95 +-- subworkflows/qc.nf | 84 ++- workflows/yascp.nf | 21 +- 25 files changed, 954 insertions(+), 300 deletions(-) create mode 100755 assets/deploy_scripts/bsub_test_recluster.sh create mode 100644 assets/deploy_scripts/input_setups/recluster_profile.nf create mode 100755 assets/deploy_scripts/nohup_start_nextflow_lsf_recluster.sh create mode 100755 bin/pca_anndata.py diff --git a/assets/deploy_scripts/bsub.sh b/assets/deploy_scripts/bsub.sh index bcbe9189..0d9012b8 100755 --- a/assets/deploy_scripts/bsub.sh +++ b/assets/deploy_scripts/bsub.sh @@ -21,5 +21,5 @@ if ["$varname" = '']; fi sample="$RUN_ID" echo -e "\n Submitting yascp (https://github.com/wtsi-hgi/yascp) with input file $INPUT_FILE" -bsub -R'select[mem>8000] rusage[mem=8000]' -J $sample -n 1 -M 8000 -o $sample.o -e $sample.e -q long bash /software/hgi/pipelines/yascp_versions/yascp_v1.2/assets/deploy_scripts/nohup_start_nextflow_lsf.sh $INPUT_FILE +bsub -R'select[mem>8000] rusage[mem=8000]' -J $sample -n 1 -M 8000 -o $sample.o -e $sample.e -q long bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf.sh $INPUT_FILE echo "Submitted job can be killed with: bkill -J $sample" \ No newline at end of file diff --git a/assets/deploy_scripts/bsub__removeWork.sh b/assets/deploy_scripts/bsub__removeWork.sh index 1f2e5dfa..f1bffef1 100755 --- a/assets/deploy_scripts/bsub__removeWork.sh +++ b/assets/deploy_scripts/bsub__removeWork.sh @@ -5,5 +5,5 @@ INPUT_FILE=$1 export RUN_ID="${PWD##*/}" sample="$RUN_ID.yascp" echo "Cleaning the work directory (https://github.com/wtsi-hgi/yascp) with input file $INPUT_FILE by using '-entry WORK_DIR_REMOVAL --remove_work_dir' " -bsub -R'select[mem>4000] rusage[mem=4000]' -J $sample -n 1 -M 4000 -o $sample.o -e $sample.e -q long bash /software/hgi/pipelines/yascp_versions/yascp_v1.2/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh $INPUT_FILE +bsub -R'select[mem>4000] rusage[mem=4000]' -J $sample -n 1 -M 4000 -o $sample.o -e $sample.e -q long bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh $INPUT_FILE echo "Submitted job can be killed with: bkill -J $sample" \ No newline at end of file diff --git a/assets/deploy_scripts/bsub_test.sh b/assets/deploy_scripts/bsub_test.sh index 8a163fff..52b474dd 100755 --- 
a/assets/deploy_scripts/bsub_test.sh +++ b/assets/deploy_scripts/bsub_test.sh @@ -25,5 +25,5 @@ fi sample="$RUN_ID.yascp" echo -e "\nSubmitting yascp (https://github.com/wtsi-hgi/yascp) in test mode withsample OneK1k dataset" -bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_test -n 1 -M 4000 -o yascp_test.o -e yascp_test.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.2/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh +bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_test -n 1 -M 4000 -o yascp_test.o -e yascp_test.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh echo "Submitted job can be killed with: bkill -J yascp_test" \ No newline at end of file diff --git a/assets/deploy_scripts/bsub_test_celltypes.sh b/assets/deploy_scripts/bsub_test_celltypes.sh index 3c6dc200..7a12a9ac 100755 --- a/assets/deploy_scripts/bsub_test_celltypes.sh +++ b/assets/deploy_scripts/bsub_test_celltypes.sh @@ -25,5 +25,5 @@ fi sample="$RUN_ID.yascp" echo -e "\nSubmitting yascp (https://github.com/wtsi-hgi/yascp) in JUST_CELLTYPES mode with input file $INPUT_FILE" -bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_celltypes -n 1 -M 4000 -o yascp_celltypes.o -e yascp_celltypes.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.2/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh $INPUT_FILE +bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_celltypes -n 1 -M 4000 -o yascp_celltypes.o -e yascp_celltypes.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh $INPUT_FILE echo "Submitted job can be killed with: bkill -J yascp_celltypes" \ No newline at end of file diff --git a/assets/deploy_scripts/bsub_test_recluster.sh b/assets/deploy_scripts/bsub_test_recluster.sh new file mode 100755 index 00000000..7c8d6b97 --- /dev/null +++ b/assets/deploy_scripts/bsub_test_recluster.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +CWD1="$PWD" +parentdir="$(dirname "$CWD1")" +INPUT_FILE=$1 +export RUN_ID="${PWD##*/}" + +# export SINGULARITY_CACHEDIR='/software/hgi/containers/yascp' + +export NXF_OPTS="-Xms5G -Xmx5G" +export SINGULARITY_TMPDIR=$PWD/work/tmp +export TEMP=$PWD/work/tmp +export TMP_DIR=$PWD/work/tmp + +echo press ENTER to NOT fetch containers, otherwise provide writable path: +read varname + +if ["$varname" = '']; + then + export NXF_SINGULARITY_CACHEDIR='/software/hgi/containers/yascp' + export SINGULARITY_DISABLE_CACHE=0 + else + echo Yascp Will fetch the containers and place them in $varname + export NXF_SINGULARITY_CACHEDIR=$varname +fi + +sample="$RUN_ID.yascp" +echo -e "\nSubmitting yascp (https://github.com/wtsi-hgi/yascp) in JUST_RECLUSTER mode with input file $INPUT_FILE" +bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_cluster -n 1 -M 4000 -o yascp_cluster.o -e yascp_cluster.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf_recluster.sh $INPUT_FILE +echo "Submitted job can be killed with: bkill -J yascp_cluster" \ No newline at end of file diff --git a/assets/deploy_scripts/input_setups/recluster_profile.nf b/assets/deploy_scripts/input_setups/recluster_profile.nf new file mode 100644 index 00000000..cd84a1b0 --- /dev/null +++ b/assets/deploy_scripts/input_setups/recluster_profile.nf @@ -0,0 +1,138 @@ +params { + + lisi{ + run_process=true + } + replace_genotype_ids=false + write_h5=true + cluster_validate_resolution_keras = true 
+ // run_celltype_assignment = true + project_name = 'T_Cell_Bio_Response' + filter_outliers = false + extra_sample_metadata ="" + output_dir = outdir= "${launchDir}/recluster_resolutions" + cellex_cluster_markers=true + cluster_markers = false + normalise_andata = false + skip_handover = true + // output_dir = outdir= "${launchDir}/results" + // run_celltype_assignment=true + split_ad_per_bach=true //if not splitting the celltype assignment will be run on full tranche + // input_data_table = "$outdir/handover/Summary_plots/$RUN_ID/Fetch Pipeline/Input/input_table.tsv" + // cellbender_location="${output_dir}/nf-preprocessing/cellbender" //!!!!! if cellbender is run already then can skip this by selecting input = 'existing_cellbender' instead input = 'cellbender' + // existing_cellsnp="${output_dir}/cellsnp" + cellbender_location="/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/harriet/qc/results_11_09_2023/nf-preprocessing/cellbender" //!!!!! if cellbender is run already then can skip this by selecting input = 'existing_cellbender' instead input = 'cellbender' + existing_cellsnp="/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/harriet/qc/results/cellsnp" + + skip_preprocessing = true + // file__anndata_merged = '/lustre/scratch126/humgen/projects/sc-eqtl-ibd/analysis/harriet_analysis/230313_hb58_yascp_analysis/231114_h5ad_files_for_MCC/231120_TCs_only_regressed_counts_HVGs.h5ad' + + harmony{ + run_process= true + } + umap{ + run_process = true + colors_quantitative{ + description = 'Comma separated string of quantitative variables that will be used to color points.' + value = 'n_cells,total_counts,pct_counts_gene_group__mito_transcript,prob_doublet,pct_counts_gene_group__ribo_rna,Azimuth:predicted.celltype.l2.score,Azimuth:mapping.score,log10_ngenes_by_count' + } + colors_categorical{ + description = 'Comma separated string of categorical variables that will be used to color points.' + value = 'cell_passes_qc,cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2,experiment_id,Azimuth:predicted.celltype.l2,Celltypist:Immune_All_Low:predicted_labels,Celltypist:Immune_All_High:predicted_labels,donor_id' + } + } + + mads_categories ='pct_counts_gene_group__mito_transcript,pct_counts_gene_group__mito_protein,pct_counts_gene_group__ribo_protein,pct_counts_gene_group__ribo_rna,total_counts,n_genes_by_counts,log10_ngenes_by_count' + // hard_filters_file = "${projectDir}/../sample_qc.yml" + // hard_filters_drop = false //#This indicates whether we want to drop the cells that fail hard filters of just flag them + + cluster{ + description = """Parameters for clustering. All pairwise combinations of + method and resolution will be performed.""" + number_neighbors{ + description = """Number of neighbors. If <= 0, uses number of unique + experiment_id.""" + value = 15 + } + methods{ + description = 'Clustering method. Valid options [leiden|louvain].' + value = 'leiden' + } + resolutions{ + description = 'Clustering resolution.' + value = [0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0] + } + + variables_boxplot{ + decription = 'Generate boxplots of these variables for each cluster.' + value ='n_cells,total_counts,pct_counts_gene_group__mito_transcript' + } + + known_markers{ + run_process = false + description = """Files with markers that will be used to generate + dotplots. Each marker file should be the full path and have the + following columns: cell_type, hgnc_symbol. The following columns + are optional: p_value_adj. 
Use "" for a single entry in the + file_id and file value to indicate no plots.""" + value = [ + [ file_id: 'SmillieCS_31348891', file: '/lustre/scratch119/humgen/projects/sc-eqtl-ibd/data/marker_gene_db-raw_data/database/celltypes/colon/SmillieCS-31348891/database.tsv' ], + [ file_id: 'ParikhK_30814735', file: '/lustre/scratch119/humgen/projects/sc-eqtl-ibd/data/marker_gene_db-raw_data/database/celltypes/colon/ParikhK-30814735/database.tsv' ], + [ file_id: 'JamesKR_32066951', file: '/lustre/scratch119/humgen/projects/sc-eqtl-ibd/data/marker_gene_db-raw_data/database/celltypes/colon-immune/JamesKR-32066951/database.tsv' ] + ] + } + + + + + } + bbknn{ + run_process = true + } + + celltype_assignment{ + run_celltype_assignment=false + run_azimuth=true + run_keras=false + run_celltypist=true + } + reduced_dims{ + vars_to_regress{ + value = '' + } + } + +} + +process { + + withName: plot_distributions{ + containerOptions = "--containall --cleanenv --workdir /tmp -B /tmp" + } + + withName: cellex_cluster_markers{ + maxForks=7 + memory = 300.GB + } + + withName: GATHER_DATA{ + maxForks=7 + memory = 100.GB + } + withName: LISI{ + maxForks=7 + memory = 300.GB + } + withName: cluster_validate_resolution_keras{ + memory = 300.GB + } + + withName: umap_calculate_and_plot{ + memory = 300.GB + } + + withName: sccaf_assess_clustering{ + memory = 300.GB + } + +} diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf.sh index f1dfcbc0..ee8066ab 100755 --- a/assets/deploy_scripts/nohup_start_nextflow_lsf.sh +++ b/assets/deploy_scripts/nohup_start_nextflow_lsf.sh @@ -17,7 +17,7 @@ parentdir="$(dirname "$CWD1")" export RUN_ID="${PWD##*/}" mkdir $PWD/work || echo 'exists' mkdir $PWD/work/tmp || echo 'exists' -echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.2 -profile sanger -c $INPUT_FILE --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & +echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger -c $INPUT_FILE --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & # get process PID sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR") diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh index f640bbf3..28db82dc 100755 --- a/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh +++ b/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh @@ -21,7 +21,7 @@ export RUN_ID="${PWD##*/}" # export TEMP=$PWD/tmp # export TMP_DIR=$PWD/tmp -echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.2 -profile sanger -c $INPUT_FILE --nf_ci_loc $PWD -entry WORK_DIR_REMOVAL --remove_work_dir -resume > nextflow.nohup.log 2>&1 & +echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger -c $INPUT_FILE --nf_ci_loc $PWD -entry WORK_DIR_REMOVAL --remove_work_dir -resume > nextflow.nohup.log 2>&1 & # get process PID sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR") diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh index 800475d7..295cf5c7 100755 --- a/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh +++ b/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh @@ -17,7 +17,7 @@ parentdir="$(dirname "$CWD1")" export RUN_ID="${PWD##*/}" mkdir $PWD/work || echo 'exists' mkdir $PWD/work/tmp || echo 'exists' -echo $RUN_ID | nextflow 
run /software/hgi/pipelines/yascp_versions/yascp_v1.2 -profile sanger -entry JUST_CELLTYPES -c $INPUT_FILE --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & +echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger -entry JUST_CELLTYPES -c $INPUT_FILE --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & # get process PID sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR") diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf_recluster.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf_recluster.sh new file mode 100755 index 00000000..995e07e5 --- /dev/null +++ b/assets/deploy_scripts/nohup_start_nextflow_lsf_recluster.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +INPUT_FILE=$1 +dt=`date +"%Y_%m_%d_%T"` +cp nextflow.nohup.log ./nextflow.nohup_$dt.log2 || echo 'first time running' +# activate Nextflow conda env + +# clean up previous run files +rm -f *.log +rm -f nextflow.nohup.PID.txt + +# start Nextflow in background: +export NXF_OPTS="-Xms5G -Xmx5G" + +CWD1="$PWD" +parentdir="$(dirname "$CWD1")" +# export RUN_ID="${parentdir##*/}" +export RUN_ID="${PWD##*/}" +mkdir $PWD/work || echo 'exists' +mkdir $PWD/work/tmp || echo 'exists' +echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger -entry JUST_RECLUSTER -c /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/input_setups/recluster_profile.nf -c $INPUT_FILE --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & + +# get process PID +sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR") +echo $PID > nextflow.nohup.PID.txt +echo "Nextflow PID is $PID (saved in ./nextflow.nohup.PID.txt)" +echo kill with \"kill $PID\" +echo "check logs files nextflow.nohup.log and .nextflow.log" diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh index 6a8e1946..cc5fd45d 100755 --- a/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh +++ b/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh @@ -16,7 +16,7 @@ parentdir="$(dirname "$CWD1")" export RUN_ID="${PWD##*/}" mkdir $PWD/work || echo 'exists' mkdir $PWD/work/tmp || echo 'exists' -echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.2 -profile sanger,test --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & +echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger,test --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & # get process PID sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR") diff --git a/bin/0026-plot_filtered_cells.py b/bin/0026-plot_filtered_cells.py index 1752b769..7295070b 100755 --- a/bin/0026-plot_filtered_cells.py +++ b/bin/0026-plot_filtered_cells.py @@ -67,13 +67,16 @@ def main(): # Check if any difference between before and after filters. If not, # return early. df_after_filters = df[df.filter_type.isin(['after_filters'])] - filt = df_after_filters.n_cells_left_in_adata == df_before_filters.loc[ - df_after_filters.experiment_id, - 'n_cells_left_in_adata' - ].values - if all(filt): - print("No difference detected before and after filters. No plots.") - return() + try: + filt = df_after_filters.n_cells_left_in_adata == df_before_filters.loc[ + df_after_filters.experiment_id, + 'n_cells_left_in_adata' + ].values + if all(filt): + print("No difference detected before and after filters. 
No plots.") + return() + except: + return() # Set some plotting parameters plt_height = 16 # 1.5 * df.experiment_id.nunique() diff --git a/bin/0028-plot_predicted_sex.py b/bin/0028-plot_predicted_sex.py index 700e98d3..fad7fab6 100755 --- a/bin/0028-plot_predicted_sex.py +++ b/bin/0028-plot_predicted_sex.py @@ -60,7 +60,10 @@ def main(): # Load the AnnData file adata = sc.read_h5ad(filename=options.h5) - + try: + adata.X=adata.layers['counts'] + except: + _='counts may be already set' # If we have a flag for cells that pass QC then filter down to them if 'cell_passes_qc' in adata.obs: adata = adata[adata.obs['cell_passes_qc'], :] diff --git a/bin/0030-estimate_pca_elbow.py b/bin/0030-estimate_pca_elbow.py index 75c1e490..0a952160 100755 --- a/bin/0030-estimate_pca_elbow.py +++ b/bin/0030-estimate_pca_elbow.py @@ -78,7 +78,10 @@ def main(): # Read in the dataframe adata = sc.read_h5ad(filename=options.h5) - + try: + adata.X=adata.layers['counts'] + except: + _='counts may be already set' kneedle_dict = {} output_dict = {} diff --git a/bin/0035-scanpy_normalize_pca.py b/bin/0035-scanpy_normalize_pca.py index 99747ec5..93c6ba67 100755 --- a/bin/0035-scanpy_normalize_pca.py +++ b/bin/0035-scanpy_normalize_pca.py @@ -372,7 +372,6 @@ def scanpy_normalize_and_pca( sc.pp.filter_genes(adata, min_cells=5) # Only consider genes expressed in more than 0.5% of cells: # sc.pp.filter_genes(adata, min_cells=0.005*len(adata.obs.index)) - # Total-count normalize (library-size correct) the data matrix X to # counts per million, so that counts become comparable among cells. sc.pp.normalize_total( @@ -385,26 +384,8 @@ def scanpy_normalize_and_pca( # Logarithmize the data: X = log(X + 1) where log = natural logorithm. # Numpy has a nice function to undo this np.expm1(adata.X). sc.pp.log1p(adata) - # Delete automatically added uns - UPDATE: bad idea to delete as this slot - # is used in _highly_variable_genes_single_batch. - # del adata.uns['log1p'] - # Add record of this operation. - # adata.layers['log1p_cpm'] = adata.X.copy() - # adata.uns['log1p_cpm'] = {'transformation': 'ln(CPM+1)'} adata.layers['log1p_cp10k'] = adata.X.copy() adata.uns['log1p_cp10k'] = {'transformation': 'ln(CP10k+1)'} - - # Stash the unprocessed data in the raw slot. - # adata.raw.X.data is now ln(CPM+1). - # NOTE: - Layers are not preserved in adata.raw, though obs, var, uns are. - # - If genes are filtered (e.g., - # sc.pp.filter_genes(adata, min_cells=1)), the full dataset will - # remain in the raw slot. - # - We store in the raw slot because later for UMAP and marker gene - # analysis, we can easily tell scanpy to use the raw slot via the - # use_raw = True flag. Raw was specifically designed for this use - # case of ln(CPM+1), - # Can be deleted later: del adata.raw adata.raw = adata # adata_raw = adata.raw.to_adata() @@ -433,32 +414,7 @@ def scanpy_normalize_and_pca( batch_key=variable_feature_batch_key, inplace=True ) - if verbose: - print('{}: {} (all batches); {} ({})'.format( - 'Number of variable features detected', - adata.var['highly_variable_intersection'].sum(), - adata.var['highly_variable'].sum(), - 'after ranking the number of batches where a feature is variable' - )) - # If n_top_genes = None, then one needs to set 'highly_variable'. 
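The bare try/except guards added above in 0028-plot_predicted_sex.py and 0030-estimate_pca_elbow.py restore raw counts into adata.X whenever a 'counts' layer is present. A minimal, more explicit equivalent of that guard (a sketch only; the input file name is a placeholder, not part of the patch):

    import scanpy as sc

    adata = sc.read_h5ad('input.h5ad')  # placeholder path
    # Use the raw counts layer when it exists; otherwise leave .X unchanged.
    if 'counts' in adata.layers:
        adata.X = adata.layers['counts']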
- # Here, highly_variable_intersection is only true for genes variable across - # all batch keys (i.e., 'highly_variable_nbatches' = n_batch_keys): - # adata.var.loc[ - # adata.var["highly_variable_intersection"], - # ["highly_variable_nbatches"] - # ] - # - # If n_top_genes = None, then one also needs needs to set highly_variable'. - # Fix bug in PCA when we have set batch_key. More below: - # https://github.com/theislab/scanpy/issues/1032 - # adata.var['highly_variable'] = adata.var['highly_variable_intersection'] - # - # Alternatively, if one specifies n_top_genes, then genes are ranked by - # 'highly_variable_nbatches' and highly_variable is set to the top n. - # adata.var.loc[ - # adata.var["highly_variable"], - # ["highly_variable_nbatches"] - # ] + if plot: # Plot highly variable genes. @@ -609,106 +565,15 @@ def scanpy_normalize_and_pca( copy=False ) + # Keep a record of the different gene scores if score_genes_df is not None: adata.uns['df_score_genes'] = score_genes_df_updated - # Calculate PCs. - - seed_value = 0 - # 0. Set `PYTHONHASHSEED` environment variable at a fixed value - os.environ['PYTHONHASHSEED'] = str(seed_value) - # 1. Set `python` built-in pseudo-random generator at a fixed value - random.seed(seed_value) - # 2. Set `numpy` pseudo-random generator at a fixed value - np.random.seed(seed_value) - - sc.tl.pca( - adata, - n_comps=min(200, adata.var['highly_variable'].sum()), - zero_center=True, # Set to true for standard PCA - svd_solver='arpack', # arpack reproducible when zero_center = True - use_highly_variable=True, - copy=False, - random_state=np.random.RandomState(0), - chunked=False - ) - # pca( - # adata, - # n_comps=min(200, adata.var['highly_variable'].sum()), - # svd_solver='arpack', # lobpcg not found in current sklearn - # use_highly_variable=True, - # copy=False - # ) - - # Save PCs to a seperate file for Harmony. - pca_df = pd.DataFrame( - adata.obsm['X_pca'], - index=adata.obs_names, - columns=[ - 'PC{}'.format(x) for x in range(1, adata.obsm['X_pca'].shape[1]+1) - ] - ) - pca_df.to_csv( - '{}-pcs.tsv.gz'.format(output_file), - sep='\t', - index=True, - index_label='cell_barcode', - na_rep='', - compression=compression_opts - ) - - # Save the metadata to a seperate file for Harmony. - adata.obs.to_csv( - '{}-metadata.tsv.gz'.format(output_file), - sep='\t', - index=True, - quoting=csv.QUOTE_NONNUMERIC, - index_label='cell_barcode', - na_rep='', - compression=compression_opts - ) - # Save the data. adata.write( - '{}-normalized_pca.h5ad'.format(output_file), + '{}-normalized.h5ad'.format(output_file), compression='gzip' - #compression_opts=anndata_compression_opts - ) - # adata_merged.write_csvs(output_file) - # adata_merged.write_loom(output_file+".loom")) - - # Plot the PC info. - if plot: - # Plot the vanilla PCs. 
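With the PCA call, the PC/metadata table exports and the counts h5ad removed here, 0035-scanpy_normalize_pca.py now stops after writing '{output_file}-normalized.h5ad'; those steps reappear in the new bin/pca_anndata.py introduced further down this patch. A quick sanity check of the intermediate object (a sketch; the file name matches the adata-normalized.h5ad emitted by the NORMALISE_AND_PCA process, and the layer names follow the calls retained above):

    import scanpy as sc

    adata = sc.read_h5ad('adata-normalized.h5ad')
    # The object is expected to carry the raw counts and the ln(CP10k+1) layer.
    print(list(adata.layers))            # e.g. ['counts', 'log1p_cp10k']
    print(adata.uns.get('log1p_cp10k'))  # {'transformation': 'ln(CP10k+1)'}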
- # sc.pl.pca( - # adata, - # color='experiment_id', - # components=['1,2', '3,4'] - # ) - _ = sc.pl.pca_variance_ratio( - adata, - n_pcs=adata.obsm['X_pca'].shape[1], - log=False, - show=False, - save='-{}.pdf'.format(output_file) - ) - _ = sc.pl.pca_variance_ratio( - adata, - n_pcs=adata.obsm['X_pca'].shape[1], - log=True, - show=False, - save='-{}-log.pdf'.format(output_file) - ) - - # Save the filtered count matrix for input to other software like scVI - adata.X = adata.layers['counts'] - del adata.layers['counts'] - del adata.raw - adata.write( - '{}-normalized_pca-counts.h5ad'.format(output_file), - compression='gzip' - #compression_opts=anndata_compression_opts ) return(output_file) @@ -852,7 +717,7 @@ def main(): drop_cell_passes_qc_from_clustering=options.drop_cell_passes_qc_from_clustering # Load the AnnData file adata = sc.read_h5ad(filename=options.h5) - + # adata_comp = sc.read_h5ad(filename='/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/harriet/test_recluster/work/6f/e30114c18a6dc6f620da63e187f348/f9d037b7109a2a7f96cb3ad63b97ff/outlier_filtered_adata.h5ad') # if this is the subclustering analysis, the count matrix should be used # by default, the analysis is "conventional" and thus will be skipped if options.layer != "none": diff --git a/bin/0057-scanpy_cluster_validate_resolution-keras.py b/bin/0057-scanpy_cluster_validate_resolution-keras.py index 0dafeafa..f5579e9a 100755 --- a/bin/0057-scanpy_cluster_validate_resolution-keras.py +++ b/bin/0057-scanpy_cluster_validate_resolution-keras.py @@ -519,7 +519,8 @@ def main(): # Virtual devices must be set before GPUs have been initialized print(e) else: - raise Exception('ERROR: no GPUs detected.') + _ = 'running without gpus' + # raise Exception('ERROR: no GPUs detected.') # Get additional data we are going to append to the output model info dict_add = {} diff --git a/bin/pca_anndata.py b/bin/pca_anndata.py new file mode 100755 index 00000000..591c9ad4 --- /dev/null +++ b/bin/pca_anndata.py @@ -0,0 +1,556 @@ +#!/usr/bin/env python + + +__date__ = '2020-03-13' +__version__ = '0.0.1' + +import argparse +from distutils.version import LooseVersion +import os +os.environ['NUMBA_CACHE_DIR']='/tmp' +os.environ['MPLCONFIGDIR']='/tmp' +import random +import numpy as np +import scipy as sp +# import sklearn.utils +import sklearn.decomposition +import pandas as pd +import scanpy as sc +import csv +import time +from datetime import timedelta + +# Set seed for reproducibility +seed_value = 0 +# 0. Set `PYTHONHASHSEED` environment variable at a fixed value +os.environ['PYTHONHASHSEED'] = str(seed_value) +# 1. Set `python` built-in pseudo-random generator at a fixed value +random.seed(seed_value) +# 2. Set `numpy` pseudo-random generator at a fixed value +np.random.seed(seed_value) + +# Set scanpy settings +# sc verbosity: errors (0), warnings (1), info (2), hints (3) +# sc.settings.verbosity = 3 +# sc.logging.print_versions() +# sc.settings.set_figure_params(dpi=80) + + +def pca( + data, + n_comps=None, + svd_solver='arpack', + use_highly_variable=None, + copy=False +): + """Compute PCA coordinates, loadings and variance decomposition. + + Derived from scanpy 1.5.1. + Principal component analysis [Pedregosa11]_.] + Uses the implementation of *scikit-learn* [Pedregosa11]_. + + Parameters + ---------- + data + The (annotated) data matrix of shape `n_obs` × `n_vars`. + Rows correspond to cells and columns to genes. + n_comps + Number of principal components to compute. 
Defaults to 50, or 1 - + minimum dimension size of selected representation. + svd_solver + SVD solver to use: + `'arpack'` (the default) + for the ARPACK wrapper in SciPy (:func:`~scipy.sparse.linalg.svds`) + `'randomized'` + for the randomized algorithm due to Halko (2009). + `'auto'` + chooses automatically depending on the size of the problem. + `'lobpcg'` + An alternative SciPy solver. + .. versionchanged:: 1.4.5 + Default value changed from `'auto'` to `'arpack'`. + Efficient computation of the principal components of a sparse matrix + currently only works with the `'arpack`' or `'lobpcg'` solvers. + use_highly_variable + Whether to use highly variable genes only, stored in + `.var['highly_variable']`. + By default uses them if they have been determined beforehand. + copy + If an :class:`~anndata.AnnData` is passed, determines whether a copy + is returned. Is ignored otherwise. + Returns + ------- + adata : anndata.AnnData + …otherwise if `copy=True` it returns or else adds fields to `adata`: + `.obsm['X_pca']` + PCA representation of data. + `.varm['PCs']` + The principal components containing the loadings. + `.uns['pca']['variance_ratio']` + Ratio of explained variance. + `.uns['pca']['variance']` + Explained variance, equivalent to the eigenvalues of the + covariance matrix. + """ + adata = data.copy() if copy else data + + if use_highly_variable and 'highly_variable' not in adata.var.keys(): + raise ValueError( + 'Did not find adata.var[\'highly_variable\']. ' + 'Either your data already only consists of highly-variable genes ' + 'or consider running `pp.highly_variable_genes` first.' + ) + if use_highly_variable is None: + if 'highly_variable' in adata.var.keys(): + use_highly_variable = True + else: + use_highly_variable = False + + if use_highly_variable: + adata_comp = ( + adata[:, adata.var['highly_variable']] + ) + else: + adata_comp = adata + + if n_comps is None: + min_dim = min(adata_comp.n_vars, adata_comp.n_obs) + n_comps = min_dim - 1 + + # random_state = sklearn.utils.check_random_state(random_state) + X = adata_comp.X + + # If sparse, make dense. + # Another option: + # output = _pca_with_sparse( + # X, n_comps, solver=svd_solver, random_state=random_state + # ) + if sp.sparse.issparse(X): + X = X.toarray() + + # Sort out the solver + if svd_solver == 'auto': + svd_solver = 'arpack' + if svd_solver not in {'arpack', 'randomized'}: + raise ValueError( + 'svd_solver: {svd_solver} can not be used with sparse input.' + ) + + pca_ = sklearn.decomposition.PCA( + n_components=n_comps, + svd_solver=svd_solver, + random_state=0 + ) + X_pca = pca_.fit_transform(X) + + # Cast to whatever datatype. + # dtype = 'float32' + # dtype + # Numpy data type string to which to convert the result. 
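A minimal usage sketch of the pca() helper being defined here (assumes it is called from within this script or imported from it; the input file name is a placeholder and n_comps must be smaller than the number of cells and genes):

    import scanpy as sc

    adata = sc.read_h5ad('adata-normalized.h5ad')  # placeholder input
    pca(adata, n_comps=50, svd_solver='arpack', use_highly_variable=False, copy=False)
    print(adata.obsm['X_pca'].shape)               # (n_cells, 50)
    print(adata.uns['pca']['variance_ratio'][:5])  # explained-variance ratios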
+ # if X_pca.dtype.descr != np.dtype(dtype).descr: + # X_pca = X_pca.astype(dtype) + + # Update the adata frame (if copy=False, then this is the same input adata + # that the user provided) + adata.obsm['X_pca'] = X_pca + adata.uns['pca'] = {} + adata.uns['pca']['params'] = { + 'zero_center': True, + 'use_highly_variable': use_highly_variable, + } + if use_highly_variable: + adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps)) + adata.varm['PCs'][adata.var['highly_variable']] = pca_.components_.T + else: + adata.varm['PCs'] = pca_.components_.T + adata.uns['pca']['variance'] = pca_.explained_variance_ + adata.uns['pca']['variance_ratio'] = pca_.explained_variance_ratio_ + + return adata if copy else None + + +def score_cells( + adata, + score_genes_df, + score_genes_df_column='ensembl_gene_id', + only_use_variable_genes=False +): + """Scores each cell. + + Parameters + ---------- + adata : AnnData + Input AnnData object. Assume adata.X is norm->log1p->scaled data. + score_genes_df : pd.DataFrame + Dataframe of marker genes. Needs to have score_genes_df_column and + score_id column. If one score_id == 'cell_cycle', then requires a + grouping_id column with 'G2/M' and 'S'. + score_genes_df_column : string + Column in score_genes_df to use for gene ids (e.g., hgnc_symbol, + ensembl_gene_id) + only_use_variable_genes : boolean + Only use variable genes to calculate scores. If True, score_id will + be changed to __hvg_only. Note this flage does not apply + to score_id == 'cell_cycle'. + + + Returns + ------- + adata : AnnData + AnnData object with scores calculated and stored in + adata.obs[]. + score_genes_df : pd.DataFrame + The score_genes_df with the following columns added: + gene_found_in_adata, gene_found_is_highly_variable. It is suggested + that this dataframe is added to the adata.uns slot. + """ + verbose = False # For debugging purposes. + + # Update the score_genes_df with details on the genes and if they were + # found in adata and if they are highly variable. + score_genes_df['gene_found_in_adata'] = np.in1d( + score_genes_df[score_genes_df_column], + adata.var.index + ) + score_genes_df['gene_found_is_highly_variable'] = np.in1d( + score_genes_df[score_genes_df_column], + adata.var.index[adata.var['highly_variable']] + ) + + # Set the gene pool parameter. + gene_pool = None # If None, all genes are randomly sampled for background + if only_use_variable_genes: + gene_pool = adata.var.index[adata.var['highly_variable']] + + # Loop over each score_id in score_genes_df, updating adata. + for score_id, df_group in score_genes_df.groupby('score_id'): + # Downsample to only those genes found in the data. + df_group = df_group.loc[ + df_group['gene_found_in_adata'], : + ] + if df_group.shape[0] == 0: + continue + + # If we are supposed to use only_use_variable_genes, then do so. + if only_use_variable_genes: + if score_id == 'cell_cycle': + continue + score_id = '{}__hvg_only'.format(score_id) + df_group = df_group.loc[ + df_group['gene_found_is_highly_variable'], : + ] + if df_group.shape[0] == 0: + continue + if verbose: + print('Scoring {}'.format(score_id)) + + # Set the number of control genes. + ctrl_size = 50 + if df_group.shape[0] > 50: + ctrl_size = df_group.shape[0] + if gene_pool is not None: + if ctrl_size > len(gene_pool): + raise Exception( + 'Error in gene scoring ctrl_size > len(gene_pool)' + ) + + # If the score_id is cell_cycle, then use the specific cell cycle + # scoring function. 
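The cell-cycle branch below pulls s_genes and g2m_genes out of score_genes_df by grouping_id. For reference, a table satisfying the requirements in the score_cells() docstring above could be built as follows; the Ensembl IDs are placeholders only, and the to_csv call simply shows the tab-delimited form this data is normally supplied in:

    import pandas as pd

    scores = pd.DataFrame({
        'ensembl_gene_id': ['ENSG00000000003', 'ENSG00000000005', 'ENSG00000000419'],
        'score_id':        ['cell_cycle',      'cell_cycle',      'my_signature'],
        'grouping_id':     ['S',               'G2/M',            ''],
    })
    # Tab-delimited file, as expected by the pipeline's score-genes input.
    scores.to_csv('score_genes.tsv', sep='\t', index=False)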
+ if score_id == 'cell_cycle': + # NOTE: Setting ctrl_size` is not possible, as it's set as + # `min(len(s_genes), len(g2m_genes))`. + sc.tl.score_genes_cell_cycle( + adata, + s_genes=df_group.loc[ + df_group['grouping_id'] == 'S', score_genes_df_column + ], + g2m_genes=df_group.loc[ + df_group['grouping_id'] == 'G2/M', score_genes_df_column + ], + copy=False, + gene_pool=gene_pool, # Default is None (aka, use all) + n_bins=25, # Default is 25 + use_raw=False + ) + else: + sc.tl.score_genes( + adata, + df_group[score_genes_df_column], + ctrl_size=ctrl_size, # Default is 50 + gene_pool=gene_pool, # Default is None (aka, use all) + n_bins=25, # Default is 25 + score_name=score_id, + random_state=0, # Default is 0 + copy=False, + use_raw=False + ) + + return adata, score_genes_df + + +def pca_analysis( + adata, + output_file, + variable_feature_batch_key='experiment_id', + n_variable_features=2000, + exclude_hv_gene_df=[], + score_genes_df=None, + verbose=True, + plot=True, + anndata_compression_opts=4 +): + + # Calculate PCs. + seed_value = 0 + # 0. Set `PYTHONHASHSEED` environment variable at a fixed value + os.environ['PYTHONHASHSEED'] = str(seed_value) + # 1. Set `python` built-in pseudo-random generator at a fixed value + random.seed(seed_value) + # 2. Set `numpy` pseudo-random generator at a fixed value + np.random.seed(seed_value) + + sc.tl.pca( + adata, + n_comps=min(200, adata.var['highly_variable'].sum()), + zero_center=True, # Set to true for standard PCA + svd_solver='arpack', # arpack reproducible when zero_center = True + use_highly_variable=True, + copy=False, + random_state=np.random.RandomState(0), + chunked=False + ) + + # Save PCs to a seperate file for Harmony. + pca_df = pd.DataFrame( + adata.obsm['X_pca'], + index=adata.obs_names, + columns=[ + 'PC{}'.format(x) for x in range(1, adata.obsm['X_pca'].shape[1]+1) + ] + ) + + compression_opts = 'gzip' + if LooseVersion(pd.__version__) > '1.0.0': + compression_opts = dict( + method='gzip', + compresslevel=9 + ) + + pca_df.to_csv( + '{}-pcs.tsv.gz'.format(output_file), + sep='\t', + index=True, + index_label='cell_barcode', + na_rep='', + compression=compression_opts + ) + + # Save the metadata to a seperate file for Harmony. + adata.obs.to_csv( + '{}-metadata.tsv.gz'.format(output_file), + sep='\t', + index=True, + quoting=csv.QUOTE_NONNUMERIC, + index_label='cell_barcode', + na_rep='', + compression=compression_opts + ) + + # Save the data. + adata.write( + '{}-normalized_pca.h5ad'.format(output_file), + compression='gzip' + ) + # Plot the PC info. + if plot: + # Plot the vanilla PCs. + sc.pl.pca_variance_ratio( + adata, + n_pcs=adata.obsm['X_pca'].shape[1], + log=False, + show=False, + save='-{}.pdf'.format(output_file) + ) + + sc.pl.pca_variance_ratio( + adata, + n_pcs=adata.obsm['X_pca'].shape[1], + log=True, + show=False, + save='-{}-log.pdf'.format(output_file) + ) + + # Save the filtered count matrix for input to other software like scVI + adata.X = adata.layers['counts'] + del adata.layers['counts'] + del adata.raw + adata.write( + '{}-normalized_pca-counts.h5ad'.format(output_file), + compression='gzip' + #compression_opts=anndata_compression_opts + ) + + +def main(): + """Run CLI.""" + parser = argparse.ArgumentParser( + description=""" + Read anndata object. Normalize, calculate PCs. Save new anndata + object along with csv file of PCs. 
+ """ + ) + + parser.add_argument( + '-v', '--version', + action='version', + version='%(prog)s {version}'.format(version=__version__) + ) + + parser.add_argument( + '-h5', '--h5_anndata', + action='store', + dest='h5', + required=True, + help='H5 AnnData file.' + ) + + parser.add_argument( + '-layer', '--overwrite_x_with_layer', + action='store', + dest='layer', + default='none', + help='Specify a layer of the AnnData file, which should be used for \ + the following normalization and downstream analysis. This should \ + go together with the analysis mode of the pipeline as \ + "conventional" or "subclustering". \ + (default: %(default)s)' + ) + + parser.add_argument( + '-bk', '--batch_key', + action='store', + dest='bk', + default='experiment_id', + help='Batch key for highly-variable feature (e.g., gene) detection.\ + If specified, highly-variable features are selected within each\ + batch separately and merged.\ + (default: %(default)s)' + ) + + parser.add_argument( + '-nvf', '--number_variable_features', + action='store', + dest='nvf', + default=2000, + type=int, + help='After calculating variable features within each batch set via\ + , rank features by number of batches where they are\ + variable and select the top .\ + (default: %(default)s)' + ) + + parser.add_argument( + '-vge', '--variable_genes_exclude', + action='store', + dest='vge', + default='', + help='Tab-delimited file with genes to exclude from the highly\ + variable gene list. Must contain ensembl_gene_id column.\ + (default: None - keep all variable genes)' + ) + + parser.add_argument( + '-vr', '--vars_to_regress', + action='store', + dest='vr', + default='', + help='Comma seperated list of metadata variables to regress prior to\ + calculating PCs. Example: gene_group__mito_transcript,n_count.\ + (default: "" and sc.pp.regress_out is not called)' + ) + + parser.add_argument( + '-sg', '--score_genes', + action='store', + dest='sg', + default='', + help='Tab-delimited file of genes for scores. Needs to have\ + ensembl_gene_id and score_id column. If one\ + score_id == "cell_cycle", then requires a grouping_id column with\ + "G2/M" and "S".' + ) + + parser.add_argument( + '-drop_cell_passes_qc_from_clustering', '--drop_cell_passes_qc_from_clusteringdrop_cell_passes_qc_from_clustering', + action='store', + dest='drop_cell_passes_qc_from_clustering', + default=False, + help='Whether we want to drop cells before clustering based on the cell_passes_qc filter established by outlier filter part of pipeline' + ) + + + parser.add_argument( + '-ncpu', '--number_cpu', + action='store', + dest='ncpu', + default=4, + type=int, + help='Number of CPUs to use.\ + (default: %(default)s)' + ) + + parser.add_argument( + '--anndata_compression_opts', + action='store', + dest='anndata_compression_opts', + default=4, + type=int, + help='Compression level in anndata. A larger value decreases disk \ + space requirements at the cost of compression time. \ + (default: %(default)s)' + ) + + parser.add_argument( + '-of', '--output_file', + action='store', + dest='of', + default='adata-normalize_pca', + help='Directory and basename of output files.\ + (default: %(default)s)' + ) + + options = parser.parse_args() + + # Scanpy settings + sc.settings.figdir = os.getcwd() # figure output directory to match base. 
+ sc.settings.n_jobs = options.ncpu # number CPUs + # sc.settings.max_memory = 500 # in Gb + # sc.set_figure_params(dpi_save = 300) + drop_cell_passes_qc_from_clustering=options.drop_cell_passes_qc_from_clustering + # Load the AnnData file + adata = sc.read_h5ad(filename=options.h5) + try: + del adata.uns + except: + _='still there' + # adata_comp = sc.read_h5ad(filename='/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/harriet/test_recluster/work/6f/e30114c18a6dc6f620da63e187f348/f9d037b7109a2a7f96cb3ad63b97ff/outlier_filtered_adata.h5ad') + # adata_comp = sc.read_h5ad(filename='/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/harriet/test_recluster/work/91/676237d4521fd78b293a8c4e548394/adata-normalized_pca.h5ad') + + start_time = time.time() + + pca_analysis( + adata, + output_file=options.of, + variable_feature_batch_key=options.bk, + n_variable_features=options.nvf, + anndata_compression_opts=options.anndata_compression_opts + ) + execution_summary = "Analysis execution time [{}]:\t{}".format( + "pca.py", + str(timedelta(seconds=time.time()-start_time)) + ) + print(execution_summary) + + +if __name__ == '__main__': + main() diff --git a/conf/base.conf b/conf/base.conf index d31eed54..cc41cee9 100755 --- a/conf/base.conf +++ b/conf/base.conf @@ -36,15 +36,16 @@ params{ mem1= 12000 copy_mode = "rellink" split_bam = false + cluster_markers = true existing_cellsnp="${projectDir}/assets/existing_cellsnp" existing_vireo='' - skip_preprocessing{ - value=false - gt_match_file="" // #We prvide this if we want to exclude a particular samples matched to a ceirtain GT cohortc from the adaptive qc - gt_match_based_adaptive_qc_exclusion_pattern = '' // #We run the adaptive QC on these patterns independently regardless on assigned celltype. - file__anndata_merged = '' - file__cells_filtered = '' - } + normalise_andata = true + skip_preprocessing=false + gt_match_file="" // #We prvide this if we want to exclude a particular samples matched to a ceirtain GT cohortc from the adaptive qc + gt_match_based_adaptive_qc_exclusion_pattern = '' // #We run the adaptive QC on these patterns independently regardless on assigned celltype. + file__anndata_merged = '' + file__cells_filtered = '' + id_in='experiment_id' genotype_phenotype_mapping_file ='' extra_sample_metadata = '' use_phenotype_ids_for_gt_match = true //#if false this will keep the genotype ids, for this to be used have to set a genotype_phenotype_mapping_file to a path to csv where firs column contains genotype ids and second contains phenotype ids to replace these to. diff --git a/main.nf b/main.nf index c6dea7ed..138101bf 100755 --- a/main.nf +++ b/main.nf @@ -13,6 +13,9 @@ include { YASCP } from "$projectDir/workflows/yascp" include { RETRIEVE_RECOURSES;RETRIEVE_RECOURSES_TEST_DATASET } from "$projectDir/subworkflows/local/retrieve_recourses" include {RSYNC_RESULTS_REMOVE_WORK_DIR} from "$projectDir/modules/local/rsync_results_remove_work_dir/main" include {celltype} from "$projectDir/subworkflows/celltype" +include {qc} from "$projectDir/subworkflows/qc" +include {dummy_filtered_channel} from "$projectDir/modules/nf-core/modules/merge_samples/functions" + ////// WORKFLOW: Run main nf-core/yascp analysis pipeline // This is the default entry point, we have others to update ceirtain parts of the results. // Please go to ./workflows/yascp to see the main Yascp workflow. 
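Note the conf/base.conf change above: the nested skip_preprocessing{} scope is flattened into top-level params. A user config that previously set skip_preprocessing.file__anndata_merged would presumably now look roughly like this (a sketch in the pipeline's own params syntax; the path is a placeholder):

    params {
        skip_preprocessing   = true
        file__anndata_merged = '/path/to/merged.h5ad'
        file__cells_filtered = ''
        gt_match_file        = ''
        id_in                = 'experiment_id'
    }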
@@ -46,11 +49,19 @@ workflow { workflow JUST_CELLTYPES{ - file__anndata_merged = Channel.from(params.skip_preprocessing.file__anndata_merged) + file__anndata_merged = Channel.from(params.file__anndata_merged) celltype(file__anndata_merged) } +workflow JUST_RECLUSTER{ + file__anndata_merged = Channel.from(params.file__anndata_merged) + gt_outlier_input = Channel.from("$projectDir/assets/fake_file.fq") + dummy_filtered_channel(file__anndata_merged,params.id_in) + file__cells_filtered = dummy_filtered_channel.out.anndata_metadata + qc(file__anndata_merged,file__cells_filtered,gt_outlier_input) //This runs the Clusterring and qc assessments of the datasets. + +} ////// You do not need to concern about the workflows bellow as these are Cardinal Specific and used for development diff --git a/modules/nf-core/modules/clustering/functions.nf b/modules/nf-core/modules/clustering/functions.nf index 5f6efa32..b54847fb 100755 --- a/modules/nf-core/modules/clustering/functions.nf +++ b/modules/nf-core/modules/clustering/functions.nf @@ -341,7 +341,7 @@ process cluster_validate_resolution_keras { // ------------------------------------------------------------------------ //cache false // cache results from run //maxForks 2 // hard to control memory usage. limit to 3 concurrent - label 'gpu' // use GPU + label 'process_low' // use GPU scratch false // use tmp directory if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://yascp.cog.sanger.ac.uk/public/singularity_images/wtsihgi_nf_scrna_qc_6bb6af5-2021-12-23-3270149cf265.sif" diff --git a/modules/nf-core/modules/clustering/main.nf b/modules/nf-core/modules/clustering/main.nf index ea4e0b2a..2c850608 100755 --- a/modules/nf-core/modules/clustering/main.nf +++ b/modules/nf-core/modules/clustering/main.nf @@ -71,37 +71,34 @@ workflow CLUSTERING { // cluster_validate_resolution__sparsity, // cluster_validate_resolution__train_size_cells // ) - if (params.utilise_gpu){ - if (params.cluster_validate_resolution_keras){ + + if (params.cluster_validate_resolution_keras){ - - cluster_validate_resolution_keras( - cluster.out.outdir, - cluster.out.anndata, - cluster.out.metadata, - cluster.out.pcs, - cluster.out.reduced_dims, - cluster.out.clusters, - cluster_validate_resolution__sparsity, - cluster_validate_resolution__train_size_cells, - cluster.out.outdir__reduced_dims - ) + + cluster_validate_resolution_keras( + cluster.out.outdir, + cluster.out.anndata, + cluster.out.metadata, + cluster.out.pcs, + cluster.out.reduced_dims, + cluster.out.clusters, + cluster_validate_resolution__sparsity, + cluster_validate_resolution__train_size_cells, + cluster.out.outdir__reduced_dims + ) - plot_resolution_validate( - cluster_validate_resolution_keras.out.plot_input.groupTuple() - ) - } - + plot_resolution_validate( + cluster_validate_resolution_keras.out.plot_input.groupTuple() + ) } + + SCCAF(cluster.out.outdir, cluster.out.anndata, cluster.out.clusters, sccaf_minacc) - - - // // Generate UMAPs of the results. 
umap_calculate_and_plot( cluster.out.outdir, @@ -118,28 +115,29 @@ workflow CLUSTERING { ) dummy_output=umap_calculate_and_plot.out.dummy_output // // Find marker genes for clusters - cluster_markers( - cluster.out.outdir, - cluster.out.anndata, - cluster.out.metadata, - cluster.out.pcs, - cluster.out.reduced_dims, - cluster.out.clusters, - cluster_marker__methods - ) - - // // Find marker genes for clusters using CELLEX - cellex_cluster_markers( - cluster.out.outdir, - cluster.out.anndata - ) + if (params.cluster_markers){ + cluster_markers( + cluster.out.outdir, + cluster.out.anndata, + cluster.out.metadata, + cluster.out.pcs, + cluster.out.reduced_dims, + cluster.out.clusters, + cluster_marker__methods + ) - // Prep adata file for cellxgene website - prep_cellxgene( - cluster.out.outdir, - cluster.out.anndata - ) + // // Find marker genes for clusters using CELLEX + cellex_cluster_markers( + cluster.out.outdir, + cluster.out.anndata + ) + // Prep adata file for cellxgene website + prep_cellxgene( + cluster.out.outdir, + cluster.out.anndata + ) + } emit: dummy_output diff --git a/modules/nf-core/modules/estimate_pca_elbow/main.nf b/modules/nf-core/modules/estimate_pca_elbow/main.nf index 05d3d9ac..2251e3af 100755 --- a/modules/nf-core/modules/estimate_pca_elbow/main.nf +++ b/modules/nf-core/modules/estimate_pca_elbow/main.nf @@ -43,10 +43,7 @@ process ESTIMATE_PCA_ELBOW { script: outdir = "${outdir_prev}" - log.info("""outdir = ${outdir}""") - // from the file__anndata job. outfile = "${file__anndata}".minus(".h5ad") - .split("-").drop(1).join("-") outfile = "${outfile}-knee" """ rm -fr plots diff --git a/modules/nf-core/modules/normalise_and_pca/main.nf b/modules/nf-core/modules/normalise_and_pca/main.nf index b0f81031..696128c9 100755 --- a/modules/nf-core/modules/normalise_and_pca/main.nf +++ b/modules/nf-core/modules/normalise_and_pca/main.nf @@ -3,6 +3,59 @@ def random_hex(n) { Long.toUnsignedString(new Random().nextLong(), n).toUpperCase() } +process PCA { + + label 'process_medium' + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://yascp.cog.sanger.ac.uk/public/singularity_images/nf_qc_scrna_v1.img" + // /software/hgi/containers/nf_qc_scrna_v1.img + } else { + container "mercury/nf_qc_scrna:v1" + } + + publishDir path: "${outdir}", + saveAs: {filename -> filename.replaceAll("-", "")}, + mode: "${params.copy_mode}", + overwrite: "true" + + input: + path(file__anndata) + val(outdir) + val(layer) + + output: + val(outdir, emit: outdir) + val("${outdir}", emit: outdir3) + path("adata-normalized_pca.h5ad", emit: anndata) + path("adata-metadata.tsv.gz", emit: metadata) + path("adata-pcs.tsv.gz", emit: pcs) + path( + "adata-normalized_pca-counts.h5ad", + emit: anndata_filtered_counts + ) + val("${param_details}", emit: param_details) + path("plots/*.pdf") + path("plots/*.png") optional true + + script: + + """ + rm -fr plots + pca_anndata.py \ + --h5_anndata ${file__anndata} \ + --overwrite_x_with_layer ${layer} \ + --output_file adata \ + --number_cpu ${task.cpus} \ + --drop_cell_passes_qc_from_clustering ${params.drop_cell_passes_qc_from_clustering} + mkdir plots + + mv *pdf plots/ 2>/dev/null || true + mv *png plots/ 2>/dev/null || true + """ + +} + + process NORMALISE_AND_PCA { // Takes annData object, nomalizes across samples, calculates PCs. 
// NOTE: Once normalization is set, it would be faster to normalize per @@ -25,7 +78,7 @@ process NORMALISE_AND_PCA { overwrite: "true" input: - val(outdir_prev) + path(file__anndata) val(analysis_mode) val(layer) @@ -36,16 +89,8 @@ process NORMALISE_AND_PCA { output: val(outdir, emit: outdir) - val("${outdir}", emit: outdir3) - path("adata-normalized_pca.h5ad", emit: anndata) - path("adata-metadata.tsv.gz", emit: metadata) - path("adata-pcs.tsv.gz", emit: pcs) - - path( - "adata-normalized_pca-counts.h5ad", - emit: anndata_filtered_counts - ) + path("adata-normalized.h5ad", emit: anndata) val("${param_details}", emit: param_details) path("plots/*.pdf") path("plots/*.png") optional true @@ -66,19 +111,14 @@ process NORMALISE_AND_PCA { cmd__vars_to_regress = "--vars_to_regress ${vars_to_regress}" } - // todo - mo11 - these paths are confusing - - outdir = "${outdir_prev}/normalize=total_count.${param_details}" + outdir = "${params.outdir}/clustering/normalize=total_count.${param_details}" // Add details on the genes we are exlcuding from hgv list. file_vge = "${file__genes_exclude_hvg.getSimpleName()}" outdir = "${outdir}.hvg_exclude=${file_vge}" // Add details on the scores we are using. file_score = "${file__genes_score.getSimpleName()}" outdir = "${outdir}.scores=${file_score}" - - // this is where the subfolder 1 is determined - // Customize command for optional files. cmd__genes_exclude_hvg = "" if (file__genes_exclude_hvg.name != "no_file__genes_exclude_hvg") { @@ -89,7 +129,6 @@ process NORMALISE_AND_PCA { cmd__genes_score = "--score_genes ${file__genes_score}" } - """ rm -fr plots 0035-scanpy_normalize_pca.py \ @@ -106,26 +145,4 @@ process NORMALISE_AND_PCA { mv *pdf plots/ 2>/dev/null || true mv *png plots/ 2>/dev/null || true """ - // Old version with bash evaluation of optional commands - // - // echo "normalize_pca: ${process_info}" - // # If there are entries in the variable_genes_exclude file, add it to - // # the call. - // cmd__vg_exclude="--variable_genes_exclude ${file__genes_exclude_hvg}" - // val=\$(cat ${file__genes_exclude_hvg} | wc -l) - // if [ \$val -eq 0 ]; then cmd__vg_exclude=""; fi - // # If there are entries in the score_genes file, add it to the call. 
- // cmd__score_genes="--score_genes ${file__genes_score}" - // val=\$(cat ${file__genes_score} | wc -l) - // if [ \$val -eq 0 ]; then cmd__score_genes=""; fi - // 0035-scanpy_normalize_pca.py \ - // --h5_anndata ${file__anndata} \ - // --output_file adata \ - // --number_cpu ${task.cpus} \ - // ${cmd__vars_to_regress} \ - // \${cmd__vg_exclude} \ - // \${cmd__score_genes} - // mkdir plots - // mv *pdf plots/ 2>/dev/null || true - // mv *png plots/ 2>/dev/null || true } diff --git a/subworkflows/qc.nf b/subworkflows/qc.nf index 9ea2ff74..2d88024f 100755 --- a/subworkflows/qc.nf +++ b/subworkflows/qc.nf @@ -5,7 +5,7 @@ include {OUTLIER_FILTER} from "$projectDir/modules/nf-core/modules/outlier_filte include {PLOT_STATS} from "$projectDir/modules/nf-core/modules/plot_stats/main" include {ESTIMATE_PCA_ELBOW} from "$projectDir/modules/nf-core/modules/estimate_pca_elbow/main" include {SUBSET_PCS} from "$projectDir/modules/nf-core/modules/subset_pcs/main" -include {NORMALISE_AND_PCA} from "$projectDir/modules/nf-core/modules/normalise_and_pca/main" +include {NORMALISE_AND_PCA; PCA} from "$projectDir/modules/nf-core/modules/normalise_and_pca/main" include {HARMONY} from "$projectDir/modules/nf-core/modules/harmony/main" include {BBKNN} from "$projectDir/modules/nf-core/modules/bbknn/main" include {ADD_EXTRA_METADATA_TO_H5AD} from "$projectDir/modules/nf-core/modules/adata_manipulations/main" @@ -19,7 +19,7 @@ workflow qc { take: file__anndata_merged file__cells_filtered - assignments_all_pools + gt_outlier_input main: log.info "--- Running QC metrics --- " // if(params.extra_metadata!=''){ @@ -29,26 +29,14 @@ workflow qc { // }else{ // log.info '''--- No extra metadata to add to h5ad ---''' // } - file__anndata_merged.map{val1 -> tuple('full', val1)}.set{out1} CELL_HARD_FILTERS(file__anndata_merged,params.hard_filters_file,params.hard_filters_drop) if(params.hard_filters_file != "no_file__file_sample_qc"){ file__anndata_merged = CELL_HARD_FILTERS.out.anndata } - // Next we define an input channel to outlier filtering strategy in case if params.skip_preprocessing.gt_match_based_adaptive_qc_exclusion_pattern !='' - // i.e - if we want to exclude a particular cohort that has been matched by gt match from the adaptive qc we feed this in the outlier_filter() - if(params.skip_preprocessing.gt_match_based_adaptive_qc_exclusion_pattern !=''){ - gt_outlier_input = assignments_all_pools - }else{ - gt_outlier_input = Channel.from("$projectDir/assets/fake_file.fq") - } - - file__anndata_merged.subscribe { println "value1: $it" } - file__cells_filtered.subscribe { println "value2: $it" } - gt_outlier_input.subscribe { println "value3: $it" } //FILTERING OUTLIER CELLS - if (params.sample_qc.cell_filters.filter_outliers.run_process) { + if (params.filter_outliers) { log.info """---Running automatic outlier cell filtering.----""" OUTLIER_FILTER( params.outdir, @@ -67,18 +55,28 @@ workflow qc { } + if (params.normalise_andata){ + NORMALISE_AND_PCA( + file__anndata_merged, + params.mode, + params.layer, + params.genes_exclude_hvg, + params.genes_score, + params.reduced_dims.vars_to_regress.value) + andata = NORMALISE_AND_PCA.out.anndata + outdir = NORMALISE_AND_PCA.out.outdir + + }else{ + andata = file__anndata_merged + outdir = "${params.outdir}" + LI4 = Channel.of([1, 'dummy_lisi']) + } - NORMALISE_AND_PCA(params.outdir+'/clustering', - file__anndata_merged, - params.mode, - params.layer, - params.genes_exclude_hvg, - params.genes_score, - params.reduced_dims.vars_to_regress.value) + 
PCA(andata,params.outdir,params.layer) ESTIMATE_PCA_ELBOW( - NORMALISE_AND_PCA.out.outdir, - NORMALISE_AND_PCA.out.anndata, + PCA.out.outdir, + PCA.out.anndata, params.reduced_dims.n_dims.add_n_to_estimate ) @@ -91,21 +89,21 @@ workflow qc { } SUBSET_PCS( - NORMALISE_AND_PCA.out.outdir, - NORMALISE_AND_PCA.out.anndata, - NORMALISE_AND_PCA.out.metadata, - NORMALISE_AND_PCA.out.pcs, - NORMALISE_AND_PCA.out.param_details, + PCA.out.outdir, + PCA.out.anndata, + PCA.out.metadata, + PCA.out.pcs, + PCA.out.param_details, n_pcs ) - file__anndata_merged.subscribe { println "PLOT_STATS input: $it" } + PCA.out.outdir.subscribe { println "outdir input: $it" } PLOT_STATS(file__anndata_merged, file__cells_filtered, SUBSET_PCS.out.outdir, SUBSET_PCS.out.anndata, n_pcs) - file__anndata_merged = NORMALISE_AND_PCA.out.anndata + file__anndata_merged = PCA.out.anndata LI4 = PLOT_STATS.out.LI @@ -120,11 +118,11 @@ workflow qc { // "Correct" PCs using Harmony or BBKNN if (params.harmony.run_process) { HARMONY( - NORMALISE_AND_PCA.out.outdir, - NORMALISE_AND_PCA.out.anndata, - NORMALISE_AND_PCA.out.metadata, - NORMALISE_AND_PCA.out.pcs, - NORMALISE_AND_PCA.out.param_details, + PCA.out.outdir, + PCA.out.anndata, + PCA.out.metadata, + PCA.out.pcs, + PCA.out.param_details, n_pcs, Channel.fromList( params.harmony.variables_and_thetas.value) ) @@ -187,11 +185,11 @@ workflow qc { if (params.bbknn.run_process) { BBKNN( - NORMALISE_AND_PCA.out.outdir, - NORMALISE_AND_PCA.out.anndata, - NORMALISE_AND_PCA.out.metadata, - NORMALISE_AND_PCA.out.pcs, - NORMALISE_AND_PCA.out.param_details, + PCA.out.outdir, + PCA.out.anndata, + PCA.out.metadata, + PCA.out.pcs, + PCA.out.param_details, n_pcs, params.bbknn.batch_variable.value ) @@ -256,8 +254,8 @@ workflow qc { lisi_input_second = lisi_input_first.mix(lisi_input3) LISI( - NORMALISE_AND_PCA.out.outdir, - NORMALISE_AND_PCA.out.metadata, + PCA.out.outdir, + PCA.out.metadata, params.lisi.variables.value, lisi_input_second.collect() ) diff --git a/workflows/yascp.nf b/workflows/yascp.nf index 6e425d2f..f3009565 100755 --- a/workflows/yascp.nf +++ b/workflows/yascp.nf @@ -57,7 +57,7 @@ workflow YASCP { if(!params.just_reports){ // sometimes we just want to rerun report generation as a result of alterations, hence if we set params.just_reports =True pipeline will use the results directory and generate a new reports. - if (!params.skip_preprocessing.value){ + if (!params.skip_preprocessing){ // The input table should contain the folowing columns - experiment_id n_pooled donor_vcf_ids data_path_10x_format // prepearing the inputs from a standard 10x dataset folders. prepare_inputs(input_channel) @@ -125,23 +125,23 @@ workflow YASCP { }else{ // This option skips all the deconvolution and and takes a preprocessed yascp h5ad file to run the downstream clustering and celltype annotation. 
log.info '''----Skipping Preprocessing since we already have prepeared h5ad input file----''' - file__anndata_merged = Channel.from(params.skip_preprocessing.file__anndata_merged) + file__anndata_merged = Channel.from(params.file__anndata_merged) if("${mode}"!='default'){ // Here we have rerun GT matching upstream - done for freeze1 assignments_all_pools = mode }else{ - if (params.skip_preprocessing.file__anndata_merged !=''){ - assignments_all_pools = Channel.from(params.skip_preprocessing.gt_match_file) + if (params.file__anndata_merged !=''){ + assignments_all_pools = Channel.from(params.gt_match_file) }else{ assignments_all_pools = Channel.from("$projectDir/assets/fake_file.fq") } } - if (params.skip_preprocessing.file__cells_filtered ==''){ + if (params.file__cells_filtered ==''){ log.info '''--- No cells filtered input ----''' - dummy_filtered_channel(file__anndata_merged,params.skip_preprocessing.id_in) + dummy_filtered_channel(file__anndata_merged,params.id_in) file__cells_filtered = dummy_filtered_channel.out.anndata_metadata }else{ file__cells_filtered = Channel.from(params.skip_preprocessing.file__cells_filtered) @@ -175,7 +175,14 @@ workflow YASCP { // ################################### if (!params.skip_qc){ - qc(file__anndata_merged,file__cells_filtered,assignments_all_pools) //This runs the Clusterring and qc assessments of the datasets. + + if(params.skip_preprocessing.gt_match_based_adaptive_qc_exclusion_pattern !=''){ + gt_outlier_input = assignments_all_pools + }else{ + gt_outlier_input = Channel.from("$projectDir/assets/fake_file.fq") + } + + qc(file__anndata_merged,file__cells_filtered,gt_outlier_input) //This runs the Clusterring and qc assessments of the datasets. process_finish_check_channel = qc.out.LI file__anndata_merged = qc.out.file__anndata_merged }else{ From edca4bf42bfd3dd667cc5b849bbb9eefccaa0712 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Thu, 30 Nov 2023 08:50:38 +0000 Subject: [PATCH 7/7] tested --- workflows/yascp.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/yascp.nf b/workflows/yascp.nf index f3009565..1211ba2a 100755 --- a/workflows/yascp.nf +++ b/workflows/yascp.nf @@ -176,7 +176,7 @@ workflow YASCP { if (!params.skip_qc){ - if(params.skip_preprocessing.gt_match_based_adaptive_qc_exclusion_pattern !=''){ + if(params.gt_match_based_adaptive_qc_exclusion_pattern !=''){ gt_outlier_input = assignments_all_pools }else{ gt_outlier_input = Channel.from("$projectDir/assets/fake_file.fq")