From 14f817a4ec0441f62545b924ae57eced2b643b4b Mon Sep 17 00:00:00 2001 From: Ruth Eberhardt Date: Tue, 17 Oct 2023 15:18:12 +0100 Subject: [PATCH 1/7] bug fix and add subsample of informative sites --- ...ance_calculations_subsample_informative.py | 910 ++++++++++++++++++ 1 file changed, 910 insertions(+) create mode 100755 bin/concordance_calculations_subsample_informative.py diff --git a/bin/concordance_calculations_subsample_informative.py b/bin/concordance_calculations_subsample_informative.py new file mode 100755 index 00000000..4aae389a --- /dev/null +++ b/bin/concordance_calculations_subsample_informative.py @@ -0,0 +1,910 @@ +#!/usr/bin/env python3 + +__date__ = '2023-05-10' +__version__ = '0.0.1' +import argparse +import sys +import importlib.util +import random +import pickle +import pandas as pd +import gzip +import numpy as np +import time +import multiprocessing as mp +from multiprocessing import Lock +import logging +import os + + +class Concordances: + def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites,informative_sites, uninformative_sites): + self.reset() + self.donor_assignments_table=donor_assignments_table + self.cell_assignments_table=cell_assignments_table + self.exclusive_don_variants=exclusive_don_variants + self.exclusive_cell_variants=exclusive_cell_variants + self.donor_distinct_sites=donor_distinct_sites + self.informative_sites = informative_sites + self.uninformative_sites = uninformative_sites + self.record_dict={} + + def norm_genotypes(self,expected_vars): + expected_vars = pd.DataFrame(expected_vars) + if len(expected_vars) > 0: + split_str=expected_vars[0].str.split("_") + expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] + expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] + expected_vars['vars'] = split_str.str[4] + expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) + expected_vars = expected_vars[expected_vars['vars']!='./.'] + expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' + expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] + return expected_vars + + def reset(self): + self.cell_concordance_table ={} + + # def get_sites_from_tsv(self, sites_file): + # """ + # get sites frm a tsv file where cols are chrom, pos, id, ref, alt + # assumes no multiallelics + # """ + # sites = set() + # with open(sites_file, 'r') as f: + # lines = f.readlines() + # for l in lines: + # linedata = l.split('\t') + # var = ('_').join([linedata[0], linedata[1], linedata[3], linedata[4]]) + # sites.add(var) + # return sites + + + def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes): + ''' + take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant + sites and relaxed concordant sites + 1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype + 2) if you have a 0/0 you can not get a 1/1 or 0/1 + 3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 
1/1 + So - each obversed cellsnp allele must be in the array SNP gtype + ''' + true_discordant = 0 + relaxed_concordant = 0 + relaxed_concordant_informative = 0 + relaxed_concordant_uninformative = 0 + true_discordant_informative = 0 + true_discordant_uninformative = 0 + subset_informative_concordant = 0 + subset_informative_discordant = 0 + + #print(self.uninformative_sites) + #print(self.informative_sites) + + #create sets of the ids (chrom, pos, ref, alt) in each set of genotypes. Filter to the ids present in both + #then filter to informative and uninformative. If uninformative >0 then create a subset of informative + # with the same number of vars (at random) + split_snp_gts=snp_gtypes.str.split("_") + snp_gtypes_ids = set(split_snp_gts.str[0]+'_'+split_snp_gts.str[1]+'_'+split_snp_gts.str[2]+'_'+split_snp_gts.str[3]) + + split_cellsnp_gts=cellsnp_gtypes.str.split("_") + cellsnp_gtypes_ids = set(split_cellsnp_gts.str[0]+'_'+split_cellsnp_gts.str[1]+'_'+split_cellsnp_gts.str[2]+'_'+split_cellsnp_gts.str[3]) + + shared_gts = snp_gtypes_ids.intersection(cellsnp_gtypes_ids) + + shared_informative = shared_gts.intersection(self.informative_sites) + shared_uninformative = shared_gts.intersection(self.uninformative_sites) + # print("shared informative " + str(len(shared_informative))) + # print("shared uninformative " + str(len(shared_uninformative))) + + #store the numbers of informative and uninformative sites shared between cellSNP and gt data as these + #are the sites used for concordance + self.informative_covered = len(shared_informative) + self.uninformative_covered = len(shared_uninformative) + + if len(shared_uninformative) > 0: + #print(len(shared_uninformative)) + # print(len(shared_informative)) + if len(shared_uninformative) <= len(shared_informative): + informative_subset = set(random.sample(shared_informative, len(shared_uninformative))) + else: + informative_subset = set()#if there are more shared uninformative than shared informative we will not subset + # print(informative_subset) + # exit(0) + else: + informative_subset = set() + + # print(informative_subset) + self.informative_subset = informative_subset + + snp_gtypes_set = set(snp_gtypes) + snp_gtypes_set = sorted(snp_gtypes_set) + + cellsnp_gtypes_set = set(cellsnp_gtypes) + cellsnp_gtypes_set = sorted(cellsnp_gtypes_set) + + #for i in range(0, len(snp_gtypes)): + for i in range(0, len(snp_gtypes_set)): + discordant = False + # snp_data = snp_gtypes[i].split('_') + # cellsnp_data = cellsnp_gtypes[i].split('_') + snp_data = snp_gtypes_set[i].split('_') + cellsnp_data = cellsnp_gtypes_set[i].split('_') + + # the below will no longer work due to differing length of input strings + # snp_alleles = [snp_gtypes[i][-3], snp_gtypes[i][-1]] + # cellsnp_alleles = [cellsnp_gtypes[i][-3], cellsnp_gtypes[i][-1]] + + + snp_alleles = [snp_data[4][0], snp_data[4][2]] + cellsnp_alleles = [cellsnp_data[4][0], cellsnp_data[4][2]] + + snp_alleles_set = set(snp_alleles) + cellsnp_alleles_set = set(cellsnp_alleles) + + snp_var = ('_').join(snp_data[0:4]) + cellsnp_var = ('_').join(cellsnp_data[0:4]) + + if not cellsnp_var == snp_var: + print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i]) + exit(1) + else: + for allele in cellsnp_alleles_set: + if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant + discordant = True + + if discordant == True: + true_discordant+=1 + if snp_var in self.uninformative_sites: + true_discordant_uninformative+=1 + 
elif snp_var in self.informative_sites: + true_discordant_informative+=1 + else: + relaxed_concordant+=1 + if snp_var in self.uninformative_sites: + relaxed_concordant_uninformative+=1 + elif snp_var in self.informative_sites: + relaxed_concordant_informative+=1 + + + if len(shared_uninformative) > 0: + if snp_var in informative_subset: + if discordant == True: + subset_informative_discordant+=1 + else: + subset_informative_concordant+=1 + + # print("conc inf " + str(relaxed_concordant_informative)) + # print("disc inf " + str(true_discordant_informative)) + + return true_discordant, relaxed_concordant, relaxed_concordant_informative, relaxed_concordant_uninformative, true_discordant_informative, true_discordant_uninformative, subset_informative_concordant, subset_informative_discordant + + + def read_condordance(self, expected_vars, cell_vars): + ''' + get read level concordance using DP, AD and OTH format fields + ##FORMAT= + ##FORMAT= + ##FORMAT= + ''' + # print(len(expected_vars)) + # print(len(cell_vars)) + + if not len(expected_vars) == len(cell_vars): + print("length mismatch between expected vars and cell vars") + exit(1) + + total_sites = len(expected_vars) + #add cols for DP, AD< OTH + cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) + cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) + cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) + #split to informative and uninformative sites + mask_i = cell_vars['ids'].isin(self.informative_sites) + cell_vars_informative = cell_vars[mask_i] + mask_u = cell_vars['ids'].isin(self.uninformative_sites) + cell_vars_uninformative = cell_vars[mask_u] + informative_sites = len(cell_vars_informative) + uninformative_sites = len(cell_vars_uninformative) + mask_s = cell_vars['ids'].isin(self.informative_subset) + cell_vars_informative_subset = cell_vars[mask_s] + informative_subset_sites = len(cell_vars_informative_subset) + # print("Informative sites " + str(len(self.informative_sites))) + # print("uninformative sites " + str(len(self.uninformative_sites))) + # print("informative sites in cell vars " + str(len(cell_vars_informative))) + # print("uninformative sites in cell vars " + str(len(cell_vars_uninformative))) + # print("Informative subset " + str(informative_subset_sites)) + # print(cell_vars_informative_subset) + # exit(0) + + total_dp = cell_vars['DP'].sum() + total_oth = cell_vars['OTH'].sum() + total_reads = total_dp + total_oth + total_dp_inf = cell_vars_informative['DP'].sum() + total_oth_inf = cell_vars_informative['OTH'].sum() + total_reads_informative = total_dp_inf + total_oth_inf + total_dp_uninf = cell_vars_uninformative['DP'].sum() + total_oth_uninf = cell_vars_uninformative['OTH'].sum() + total_reads_uninformative = total_dp_uninf + total_oth_uninf + total_dp_inf_subset = cell_vars_informative_subset['DP'].sum() + total_oth_inf_subset = cell_vars_informative_subset['OTH'].sum() + total_reads_informative_subset = total_dp_inf_subset + total_oth_inf_subset + + # expected genotype 0/0 + expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] + hom_ref_sites = set(expected_hom_ref['ids']) + cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] + cell_vars_inf_2 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_ref_sites)] + cell_vars_uninf_2 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_ref_sites)] + cell_vars_inf_subset_2 = cell_vars_informative_subset[cell_vars_informative_subset['ids'].isin(hom_ref_sites)] + ad_hom_ref = 
cell_vars2['AD'].sum() + oth_hom_ref = cell_vars2['OTH'].sum() + discordant_hom_ref = ad_hom_ref + oth_hom_ref + ad_hom_ref_inf = cell_vars_inf_2['AD'].sum() + oth_hom_ref_inf = cell_vars_inf_2['OTH'].sum() + discordant_hom_ref_informative = ad_hom_ref_inf + oth_hom_ref_inf + ad_hom_ref_uninf = cell_vars_uninf_2['AD'].sum() + oth_hom_ref_uninf = cell_vars_uninf_2['OTH'].sum() + discordant_hom_ref_uninformative = ad_hom_ref_uninf + oth_hom_ref_uninf + ad_hom_ref_inf_subset = cell_vars_inf_subset_2['AD'].sum() + oth_hom_ref_inf_subset = cell_vars_inf_subset_2['OTH'].sum() + discordant_hom_ref_informative_subset = ad_hom_ref_inf_subset + oth_hom_ref_inf_subset + + # expected genotype 0/1 or 1/0 + hets = ['0/1', '1/0'] + expected_het = expected_vars[expected_vars['vars'].isin(hets)] + het_sites = set(expected_het['ids']) + cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] + cell_vars_inf_3 = cell_vars_informative[cell_vars_informative['ids'].isin(het_sites)] + cell_vars_uninf_3 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(het_sites)] + cell_vars_inf_subset_3 = cell_vars_informative_subset[cell_vars_informative_subset['ids'].isin(het_sites)] + discordant_het = cell_vars3['OTH'].sum() + discordant_het_informative = cell_vars_inf_3['OTH'].sum() + discordant_het_uninformative = cell_vars_uninf_3['OTH'].sum() + discordant_het_informative_subset = cell_vars_inf_subset_3['OTH'].sum() + + # expected genotype 1/1 + expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] + hom_alt_sites = set(expected_hom_alt['ids']) + cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] + cell_vars_inf_4 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_alt_sites)] + cell_vars_uninf_4 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_alt_sites)] + cell_vars_inf_subset_4 = cell_vars_informative_subset[cell_vars_informative_subset['ids'].isin(hom_alt_sites)] + # DP + OTH - AD + ad_hom_alt = cell_vars4['AD'].sum() + dp_hom_alt = cell_vars4['DP'].sum() + oth_hom_alt = cell_vars4['OTH'].sum() + discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt + ad_hom_alt_inf = cell_vars_inf_4['AD'].sum() + dp_hom_alt_inf = cell_vars_inf_4['DP'].sum() + oth_hom_alt_inf = cell_vars_inf_4['OTH'].sum() + discordant_hom_alt_informative = (dp_hom_alt_inf + oth_hom_alt_inf) - ad_hom_alt_inf + ad_hom_alt_uninf = cell_vars_uninf_4['AD'].sum() + dp_hom_alt_uninf = cell_vars_uninf_4['DP'].sum() + oth_hom_alt_uninf = cell_vars_uninf_4['OTH'].sum() + discordant_hom_alt_uninformative = (dp_hom_alt_uninf + oth_hom_alt_uninf) - ad_hom_alt_uninf + ad_hom_alt_inf_subset = cell_vars_inf_subset_4['AD'].sum() + dp_hom_alt_inf_subset = cell_vars_inf_subset_4['DP'].sum() + oth_hom_alt_inf_subset = cell_vars_inf_subset_4['OTH'].sum() + discordant_hom_alt_informative_subset = (dp_hom_alt_inf_subset + oth_hom_alt_inf_subset) - ad_hom_alt_inf_subset + + discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt + discordant_reads_informative = discordant_hom_ref_informative + discordant_het_informative + discordant_hom_alt_informative + discordant_reads_uninformative = discordant_hom_ref_uninformative + discordant_het_uninformative + discordant_hom_alt_uninformative + discordant_reads_informative_subset = discordant_hom_ref_informative_subset + discordant_het_informative_subset + discordant_hom_alt_informative_subset + + return total_sites, self.informative_covered, self.uninformative_covered, total_reads, discordant_reads, total_reads_informative, 
discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative, informative_subset_sites, total_reads_informative_subset, discordant_reads_informative_subset + + + + def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): + # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. + # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. + # Author: M.Ozols + + cell_vars_norm = self.norm_genotypes(cell_vars) + + if len(cell_vars_norm) > 0: + Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) + expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] + # print(cell_vars_norm) + # print(expected_vars2) + # print(cell_vars2) + # exit(0) + Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) + Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) + disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) + df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') + disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') + # print(len(disc2)) + # exit(0) + disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] + disc_sites = ';'.join(disc2['expected_retrieved']) + #find truly discordant sites + #true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count = self.get_strict_discordance(disc2['0_y'], disc2['0_x']) + true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count = self.get_strict_discordance(expected_vars2[0], cell_vars2[0]) + #find discordant reads + total_sites, informative_sites, uninformative_sites, total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative, informative_subset_sites, total_reads_informative_subset, discordant_reads_informative_subset = self.read_condordance(expected_vars2, cell_vars2) + else: + Total_Overlapping_sites = set() + Concordant_Sites = set() + Discordant_sites = set() + disc_sites = '' + true_discordant_count = 0 + relaxed_concordant_count = 0 + total_sites = 0 + discordant_reads = 0 + + informative_subset_sites = 0 + subset_informative_sites_concordant_count = 0 + subset_informative_sites_discordant_count = 0 + total_reads_informative_subset = 0 + discordant_reads_informative_subset = 0 + relaxed_concordant_informative_count = 0 + relaxed_concordant_uninformative_count = 0 + true_discordant_informative_count = 0 + true_discordant_uninformative_count = 0 + total_reads = 0 + total_reads_informative = 0 + total_reads_uninformative = 0 + discordant_reads = 0 + discordant_reads_informative = 0 + discordant_reads_uninformative = 0 + informative_sites = 0 + 
uninformative_sites = 0 + + #print(total_sites, informative_sites, uninformative_sites, relaxed_concordant_informative_count, true_discordant_informative_count, self.informative_covered, self.uninformative_covered) + #exit(0) + + return Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites,cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, total_sites, informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative, informative_subset_sites, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset + + + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. + with open(f'tmp_{id}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}.pkl' + + def append_results_cell_concordances(self,result): + count=result[13] + try: + percent_concordant = result[2]/(result[3]+result[2])*100 + except: + percent_concordant = 0 + + try: + percent_discordant = result[3]/(result[3]+result[2])*100 + except: + percent_discordant = 0 + + try: + percent_relaxed_concordant = result[4]/(result[4]+result[5])*100 + except: + percent_relaxed_concordant = 0 + + try: + percent_strict_discordant = result[5]/(result[4]+result[5])*100 + except: + percent_strict_discordant = 0 + + try: + read_discordance = result[21]/result[15] + except: + read_discordance = 0 + + donor = result[1] + cohort = 'UNKNOWN' + donor_split = donor.split("_") + if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): + cohort = 'UKB' + elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): + cohort = 'ELGH' + + print(count) + self.cell_concordance_table[f'{result[0]} --- {result[1]}'] = {'GT 1':result[0], + 'GT 2':result[1], + 'cohort': cohort, + 'Nr_Concordant':result[2], + 'Nr_Discordant':result[3], + 'Nr_Relaxed_concordant':result[4], + 'Nr_strict_discordant':result[5], + 'Percent Concordant':percent_concordant, + 'Percent Discordant':percent_discordant, + 'Percent_relaxed_concordant': percent_relaxed_concordant, + 'Percent_strict_discordant': percent_strict_discordant, + 'Nr_concordant_informative': result[6], + 'Nr_concordant_uninformative': result[7], + 'Nr_discordant_informative': result[8], + 'Nr_discordant_uninformative': result[9], + 'NrTotal_Overlapping_sites_between_two_genotypes':result[10], + 'Nr_donor_distinct_sites_within_pool_individuals':result[12], + 'Number_of_sites_that_are_donor_concordant_and_exclusive':result[11], + 'Discordant_Site_Identities':result[14], + 'Total_sites': result[15], + 'Total_informative_sites': result[16], + 'Total_uninformative_sites': result[17], + 'Total_reads': result[18], + 'Total_reads_informative': result[19], + 'Total_reads_uninformative': result[20], + 'Discordant_reads': result[21], + 'Discordant_reads_informtive': result[22], + 'Discordant_reads_uninformtive': result[23], + 'Discordant_reads_by_n_sites': read_discordance, + 'informative_subset_sites': result[24], + 'subset_informative_sites_concordant_count': result[25], + 'subset_informative_sites_discordant_count': result[26], + 'total_reads_informative_subset': result[27], + 'discordant_reads_informative_subset': result[28] + } + #informative_subset_sites, 
subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset] + if (count % 200 == 0): + print(f'recording and resetting memory {count}') + # self.record_dict[count]=self.exclusive_donor_variants + self.set_results(self.cell_concordance_table,count) + self.reset() + _="" + + def combine_written_files(self):#this one is for concordance class + to_export = self.cell_concordance_table + for val1 in self.record_dict.values(): + # here remove the int files. + print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + for k1 in loaded_dict.keys(): + to_export[k1]=loaded_dict[k1] + os.remove(val1) + return to_export + + + def conc_table(self): + donor_assignments_table=self.donor_assignments_table + cell_assignments_table=self.cell_assignments_table + exclusive_don_variants=self.exclusive_don_variants + exclusive_cell_variants= self.exclusive_cell_variants + + pool = mp.Pool(cpus) + count = 0 + for i,row1 in donor_assignments_table.iterrows(): + donor_in_question = row1['donor_query'] + donor_gt_match = row1['donor_gt'] + if (donor_gt_match=='NONE'): + continue + Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell'])) + try: + expected_vars = exclusive_don_variants[donor_gt_match] + except: + _='here we have specifically excluded the donor that has been assigned as it is not expected genotype, because of this we can not calculate the concordances' + continue + expected_vars_norm = self.norm_genotypes(expected_vars) + try: + # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor. + dds = self.donor_distinct_sites[donor_gt_match] + except: + continue + + for cell1 in Cells_to_keep_pre: + count+=1 + # if count>800: + # break + cell_vars = exclusive_cell_variants[cell1] + # cell_vars_dp = exclusive_cell_variants_dp[cell1] + + self.cell_concordance_table[f'{cell1} --- {donor_gt_match}']={} + # pool.apply_async(self.concordance_dable_production, args=([expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count]),callback=self.append_results_cell_concordances) + result1 = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count) + self.append_results_cell_concordances(result1) + + pool.close() + pool.join() + output = self.combine_written_files() + return output + + def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count): + Nr_donor_distinct_sites = len(dds) + Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites, cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, total_sites, informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative, informative_subset_sites, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) + Nr_Concordant = len(Concordant_Sites) + #Nr_Relaxed_concordant = Nr_Concordant + relaxed_concordant_count + Nr_Discordant = len(Discordant_sites) + 
Nr_Total_Overlapping_sites = len(Total_Overlapping_sites) + Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Concordant_Sites))) + #Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) + + return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,relaxed_concordant_count, true_discordant_count, relaxed_concordant_informative_count, + relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites, + Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,disc_sites, total_sites, informative_sites, + uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative, + informative_subset_sites, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset] + + +class VCF_Loader: + + def __init__(self, vcf_file, biallelic_only=True, + sparse=False, format_list=['GT']): + self.vcf_file = vcf_file + self.load_sample = True + self.biallelic_only = biallelic_only + self.sparse = sparse + self.record_dict={} + self.reset() + self.format_list = format_list + self.exclusive_donor_variants = {} + self.curently_pushing =[] #this is a lock value to check if rhe curent field is updated so to avaid the race for update + self.last_count=-1 + self.reset_c() + + def reset_c(self): + self.record_times=0 + + def reset(self): + self.exclusive_donor_variants ={} + + def myfunc(self): + print(f"Hello my name is {self.biallelic_only}" ) + + def load_sample_mp(self,line,obs_ids,count,format_list): + ''' + takes VCF lines and extracts all format fields for those where GT !='.' + ''' + list_val = line.rstrip().split("\t") #[:5] #:8 + idx = find(list_val[8].split(':'),'GT')[0]#find index of GT field as GT will tell us what variants are called + if len(list_val[3]) > 1 or len(list_val[4]) > 1: + # CURRENTLY DEALS ONLY WITH BIALELIC + print(f'{idx} var not bialelic') + elif list_val[3] == 'A' and list_val[4] == 'G':#remove A>G + pass + elif list_val[3] == 'T' and list_val[4] == 'C':#also remove T>C + + pass + else: + list_val2 = list_val[9:] + obs = pd.DataFrame(obs_ids) + lv = pd.DataFrame(list_val2) + lv_proc =lv[0].str.split(':').str[idx] + gt_exists = lv_proc[lv_proc != '.'] + idx2 = gt_exists.index + obs_with_gt = obs.loc[idx2.values] + obs_with_gt = list(obs_with_gt[0].values) + list_val_with_gt = lv.loc[idx2.values] + list_val_with_gt = list(list_val_with_gt[0].values) + random.seed(count) + c = list(zip(obs_with_gt, list_val_with_gt)) + random.shuffle(c) + obs_with_gt, list_val_with_gt = zip(*c) + # self.append_results([obs_with_gt,list_val_with_gt,idx,list_val,count]) + + return [obs_with_gt,list_val_with_gt,idx,list_val,count,format_list]#add format_list to the return value as we need this for the next step + + + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. 
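+        # Descriptive note: partial results are pickled to a temporary file named
+        # tmp_{id}.pkl and the path is remembered in self.record_dict; the per-chunk
+        # files are later merged back into one dictionary by combine_written_files().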
+        with open(f'tmp_{id}.pkl', 'wb') as f:
+            pickle.dump(to_set, f)
+        self.record_dict[id]=f'tmp_{id}.pkl'
+
+
+    def append_results(self,result):
+        # exclusive_donor_variants
+        obs_with_gt= result[0]
+        list_val_with_gt= result[1]
+        idx = result[2]
+        list_val = result[3]
+        count = result[4]
+        format_list = result[5]  # list of required format fields
+        # get indexes of required format fields (apart from GT, which has already been taken care of)
+        additional_field_idxs = []
+        for fmt in format_list:
+            if not fmt == 'GT':
+                idx_addn = find(list_val[8].split(':'), fmt)[0]
+                additional_field_idxs.append(idx_addn)
+        # print(additional_field_idxs)
+        # exit(0)
+
+        count11=0
+        r = random.random()
+        # The issue is that this slows down as the number of recorded entries grows, so recording takes longer and longer.
+        # Every 200 iterations we push the data to a dictionary; later we combine these together.
+        if (count % 200 == 0):
+            print(f'recording and resetting memory {count}')
+            # self.record_dict[count]=self.exclusive_donor_variants
+            self.set_results(self.exclusive_donor_variants,count)
+            self.reset()
+            self.reset_c()
+
+        for ob_id in obs_with_gt:
+            donor_loc_in_list = count11
+            alleles = list_val_with_gt[donor_loc_in_list].split(':')[idx]
+            # append any additional format fields to alleles
+            if len(additional_field_idxs) > 0:
+                for idx_addnl in additional_field_idxs:
+                    fmt_val = list_val_with_gt[donor_loc_in_list].split(':')[idx_addnl]
+                    alleles = alleles + '_' + fmt_val
+
+            if not alleles.startswith('.'):
+                ids = "_".join([list_val[x] for x in [0, 1, 3, 4]])
+                donor_var = f"{ids}_{alleles}"
+                while ob_id in self.curently_pushing:
+                    time.sleep(r*0.01)
+                self.curently_pushing.append(ob_id)
+                try:
+                    self.exclusive_donor_variants[ob_id].add(donor_var)
+                    self.record_times=self.record_times+1
+                except:
+                    self.exclusive_donor_variants[ob_id]=set()
+                    self.exclusive_donor_variants[ob_id].add(donor_var)
+                    self.record_times=self.record_times+1
+                self.curently_pushing.remove(ob_id)
+            # self.exclusive_donor_variants['CTGAAACGTAAGTTCC-1']
+            count11+=1
+
+    def combine_written_files(self):  # this one is for the VCF loader class
+        to_export = self.exclusive_donor_variants
+        for val1 in self.record_dict.values():
+            # here remove the intermediate files.
+            print(f"merging temp file: {val1}")
+            with open(val1, 'rb') as f:
+                loaded_dict = pickle.load(f)
+                for k1 in loaded_dict.keys():
+                    try:
+                        to_export[k1]=to_export[k1].union(loaded_dict[k1])
+                    except:
+                        to_export[k1]=set()
+                        to_export[k1]=to_export[k1].union(loaded_dict[k1])
+            os.remove(val1)
+        return to_export
+
+
+    def load_VCF_batch_paralel(self):
+        """
+        Load a whole VCF file, utilising multiple cores to speed up loading of large cell files
+        -------------------
+        Initially designed to load VCF from cellSNP output, requiring
+        1) all variants have the same format list;
+        2) a line starting with "#CHROM", with sample ids.
+        If these two requirements are satisfied, this function also supports general
+        VCF files, e.g., genotype for multiple samples.
+
+        Note, it may take a lot of memory, please filter the VCF with bcftools first.
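+
+        A minimal usage sketch (the file name below is a placeholder, and the
+        module-level `cpus` variable must already be set, as done in __main__):
+            loader = VCF_Loader('cellSNP.cells.vcf.gz', biallelic_only=True,
+                                sparse=False, format_list=['GT', 'DP', 'AD', 'OTH'])
+            cell_variants = loader.load_VCF_batch_paralel()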
+ """ + + vcf_file = self.vcf_file + biallelic_only = self.biallelic_only + load_sample= self.load_sample + sparse = self.sparse + format_list= self.format_list + pool = mp.Pool(cpus) + + + import time + if vcf_file[-3:] == ".gz" or vcf_file[-4:] == ".bgz": + infile = gzip.open(vcf_file, "rb") + is_gzip = True + else: + infile = open(vcf_file, "r") + is_gzip = False + + FixedINFO = {} + contig_lines = [] + comment_lines = [] + var_ids, obs_ids, obs_dat = [], [], [] + count=0 #57077 + for line in infile: + count+=1 + # if count>10000: + # break + if is_gzip: + line = line.decode('utf-8') + if line.startswith("#"): + if line.startswith("##contig="): + contig_lines.append(line.rstrip()) + if line.startswith("#CHROM"): + if load_sample: + obs_ids = line.rstrip().split("\t")[9:] + for ob_id in obs_ids: + self.exclusive_donor_variants[ob_id]=set() + key_ids = line[1:].rstrip().split("\t")[:8] + for _key in key_ids: + FixedINFO[_key] = [] + else: + comment_lines.append(line.rstrip()) + else: + pool.apply_async(self.load_sample_mp, args=([line,obs_ids,count,format_list]),callback=self.append_results) + del line + self.last_count=count + pool.close() + pool.join() + + output = self.combine_written_files() + return output + + +"""Run CLI.""" + +def get_options(): + ''' + Get options from the command line + ''' + parser = argparse.ArgumentParser() + parser.add_argument('--version', action='version', version='%(prog)s {version}'.format(version=__version__)) + parser.add_argument('--cpus', action='store', required=True, type=int) + parser.add_argument('--cell_vcf', action='store', required=True) + parser.add_argument('--cell_assignments', action='store', required=True) + parser.add_argument('--donor_assignments', action='store', required=True) + parser.add_argument('--gt_match_vcf', action='store', required=True) + parser.add_argument('--expected_vcf', action='store', required=True) + parser.add_argument('--informative_sites', action='store', required=True) + parser.add_argument('--uninformative_sites', action='store', required=True) + parser.add_argument('--outfile', action='store', required=True) + args = parser.parse_args() + + return args + + +def get_sites_from_tsv(sites_file): + """ + get sites frm a tsv file where cols are chrom, pos, id, ref, alt + assumes no multiallelics + """ + sites = set() + with open(sites_file, 'r') as f: + lines = f.readlines() + for l in lines: + linedata = l.split('\t') + var = ('_').join([linedata[0], linedata[1], linedata[3], linedata[4]]) + sites.add(var) + return sites + + +def find(lst, a): + return [i for i, x in enumerate(lst) if x==a ] + + +def norm_genotypes(expected_vars): + expected_vars = pd.DataFrame(expected_vars) + split_str=expected_vars[0].str.split("_") + expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] + expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] + expected_vars['vars'] = split_str.str[4] + expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) + expected_vars = expected_vars[expected_vars['vars']!='./.'] + expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' + expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] + return expected_vars + + +def donor_exclusive_sites(exclusive_don_variants2): + # Here we generate a function for determining the sites that are donor exclusive + donor_distinct_sites = {} + for col1 in exclusive_don_variants2.keys(): + comparisons =[] + to_compare = [] + for col2 in exclusive_don_variants2.keys(): + 
if col1==col2: + # we set this as the unique entry + # print('1') + to_compare = set(exclusive_don_variants2[col2]) + else: + # We combine all the variants in one list + comparisons+=list(exclusive_don_variants2[col2]) + # print('2') + # print('comparison') + comparisons_all = set(comparisons) + comparisons_all_norm = norm_genotypes(comparisons_all) + comparisons_all=set(comparisons_all_norm['combo']) + + to_compare = set(to_compare) + to_compare_norm = norm_genotypes(to_compare) + to_compare=set(to_compare_norm['combo']) + # Make sure we account for hap types - phased/unphased + distinct_donor_sites = to_compare - comparisons_all + donor_distinct_sites[col1]=distinct_donor_sites + # Perform the distinct set function. + return donor_distinct_sites + +debug=False + +if __name__ == "__main__": + + options = get_options() + cpus = options.cpus + outfile = options.outfile + cell_vcf=options.cell_vcf + donor_assignments=options.donor_assignments + gt_match_vcf=options.gt_match_vcf + expected_vcf=options.expected_vcf + cell_assignments=options.cell_assignments + informative_sites_file = options.informative_sites + uninformative_sites_file = options.uninformative_sites + + informative_sites = get_sites_from_tsv(informative_sites_file) + uninformative_sites = get_sites_from_tsv(uninformative_sites_file) + + exclusive_donor_variants = {} #This is where results are populated when mp process i used. + curently_pushing =[] #this is a lock value to check if rhe curent field is updated so to avaid the race for update + All_Results={} + cell_concordance_table = {} + + donor_assignments_table = pd.read_csv(donor_assignments) + cell_assignments_table = pd.read_csv(cell_assignments,sep='\t') + + if debug: + with open('tmp_GT_Expected_variants.pkl', 'rb') as f: + GT_Expected_variants = pickle.load(f) + with open('tmp_GT_Matched_variants.pkl', 'rb') as f: + GT_Matched_variants = pickle.load(f) + with open('tmp_exclusive_cell_variants.pkl', 'rb') as f: + exclusive_cell_variants = pickle.load(f) + with open('tmp_donor_distinct_sites.pkl', 'rb') as f: + donor_distinct_sites = pickle.load(f) + with open('tmp_exclusive_don_variants.pkl', 'rb') as f: + exclusive_don_variants = pickle.load(f) + else: + print('---Loading genotype VCF----') + if (os.path.exists(gt_match_vcf)): + loader2 = VCF_Loader(gt_match_vcf, biallelic_only=True, + sparse=False, format_list=['GT']) + GT_Matched_variants = loader2.load_VCF_batch_paralel() + del loader2 + else: + GT_Matched_variants = {} + + with open(f'tmp_GT_Matched_variants.pkl', 'wb') as f: + pickle.dump(GT_Matched_variants, f) + + print('---Loading cell VCF----') + tic = time.perf_counter() + loader1 = VCF_Loader(cell_vcf, biallelic_only=True, + sparse=False, format_list=['GT', 'DP', 'AD', 'OTH']) + exclusive_cell_variants = loader1.load_VCF_batch_paralel() + del loader1 + toc = time.perf_counter() + + with open(f'tmp_exclusive_cell_variants.pkl', 'wb') as f: + pickle.dump(exclusive_cell_variants, f) + print(f"Loading took {toc - tic:0.4f} seconds") + + print('---Loading expected VCF----') + loader3 = VCF_Loader(expected_vcf, biallelic_only=True, + sparse=False, format_list=['GT']) + GT_Expected_variants = loader3.load_VCF_batch_paralel() + del loader3 + + with open(f'tmp_GT_Expected_variants.pkl', 'wb') as f: + pickle.dump(GT_Expected_variants, f) + + print('---Variant files loaded----') + + exclusive_don_variants = GT_Expected_variants.keys() + content = [x for x in exclusive_don_variants if not x.startswith('donor')] + GT_Expected_variants = {key: GT_Expected_variants[key] 
for key in content} + + exclusive_don_variants = GT_Matched_variants.keys() + content = [x for x in exclusive_don_variants if not x.startswith('donor')] + GT_Matched_variants = {key: GT_Matched_variants[key] for key in content} + + exclusive_don_variants = GT_Expected_variants + for key in GT_Matched_variants.keys(): + if key in exclusive_don_variants.keys(): + _='' + else: + exclusive_don_variants[key]=GT_Matched_variants[key] + + with open(f'tmp_exclusive_don_variants.pkl', 'wb') as f: + pickle.dump(exclusive_don_variants, f) + donor_distinct_sites = donor_exclusive_sites(exclusive_don_variants) + with open(f'tmp_donor_distinct_sites.pkl', 'wb') as f: + pickle.dump(donor_distinct_sites, f) + + print('---donor_distinct_sites calculated----') + + conc1 = Concordances(donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites, informative_sites, uninformative_sites) + cell_concordance_table = conc1.conc_table() + + # cell_concordance_table = conc_table(donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants) + result = pd.DataFrame(cell_concordance_table).T + + if len(result)>0: + result.to_csv(outfile,sep='\t') + print('Processing Done') + From 3c0ea91a5cf7ce9c73c5627b032ca808ab001d12 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Tue, 14 Nov 2023 17:53:43 +0000 Subject: [PATCH 2/7] combined concordance calculations in one base file --- bin/concordance_calculations.py | 1220 +++++++++++++++++ ...calculations_donor_exclusive_read_level.py | 598 ++++---- ...ations_donor_exclusive_read_level_noA2G.py | 735 +++++----- ...ance_calculations_subsample_informative.py | 947 ++++++------- ...t_sites_in_other_donors_find_best_donor.py | 39 +- ..._discordant_sites_in_other_donors_noA2G.py | 715 ++++++---- 6 files changed, 2799 insertions(+), 1455 deletions(-) create mode 100644 bin/concordance_calculations.py diff --git a/bin/concordance_calculations.py b/bin/concordance_calculations.py new file mode 100644 index 00000000..8d0c41f2 --- /dev/null +++ b/bin/concordance_calculations.py @@ -0,0 +1,1220 @@ +#!/usr/bin/env python3 + +#take cellSNP VCF and genotype VCF for the donors in a pool +# for each cell in the cellSNP VCF identify discordant sites (using the relaxed concordance) +# look for these sites in genotypes of all members of the pool +# output: +# cell id +# assigned donor +# cohort of assigned donor +# number of discordant sites +# total AD over discordant sites +# list of donors in the pool, how many of the discordant sites are found in the donor, cohort each belongs to +# list of discordant sites + +__date__ = '2023-14-11' +__version__ = '0.0.1' +import argparse +import sys +import importlib.util +import pickle +import pandas as pd +import gzip +import random +import numpy as np +import time +import multiprocessing as mp +from multiprocessing import Lock +import logging +import os +import gzip +import time +pd.options.mode.chained_assignment = None + +class Concordances: + + def reset(self): + self.cell_concordance_table ={} + + def reset2(self): + self.other_donor_comp =[] + + def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites,informative_sites, uninformative_sites): + self.reset() + self.reset2() + self.donor_assignments_table=donor_assignments_table + self.cell_assignments_table=cell_assignments_table + self.exclusive_don_variants=exclusive_don_variants + self.exclusive_cell_variants=exclusive_cell_variants + 
self.donor_distinct_sites=donor_distinct_sites + self.informative_sites = informative_sites + self.uninformative_sites = uninformative_sites + self.record_dict={} + + def norm_genotypes(self,expected_vars): + expected_vars = pd.DataFrame(expected_vars) + if len(expected_vars) > 0: + split_str=expected_vars[0].str.split("_") + expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] + expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] + expected_vars['vars'] = split_str.str[4] + expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) + expected_vars = expected_vars[expected_vars['vars']!='./.'] + expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' + expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] + return expected_vars + + + + def get_strict_discordance(self, expected_vars, cell_vars): + ''' + take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant + sites and relaxed concordant sites and list of discordant sites1 + 1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype + 2) if you have a 0/0 you can not get a 1/1 or 0/1 + 3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 1/1 + So - each obversed cellsnp allele must be in the array SNP gtype + ''' + snp_gtypes = expected_vars[0] + cellsnp_gtypes = cell_vars[0] + true_discordant = 0 + relaxed_concordant = 0 + relaxed_concordant_informative = 0 + relaxed_concordant_informative_ids = [] + relaxed_concordant_uninformative_ids = [] + true_discordant_uninformative_ids =[] + true_discordant_informative_ids=[] + relaxed_concordant_uninformative = 0 + true_discordant_informative = 0 + true_discordant_uninformative = 0 + discordant_vars = [] + concordant_vars = [] + subset_informative_concordant = 0 + subset_informative_discordant = 0 + + #print(self.uninformative_sites) + #print(self.informative_sites) + + #create sets of the ids (chrom, pos, ref, alt) in each set of genotypes. Filter to the ids present in both + #then filter to informative and uninformative. 
If uninformative >0 then create a subset of informative + # with the same number of vars (at random) + split_snp_gts=snp_gtypes.str.split("_") + snp_gtypes_ids = set(split_snp_gts.str[0]+'_'+split_snp_gts.str[1]+'_'+split_snp_gts.str[2]+'_'+split_snp_gts.str[3]) + + split_cellsnp_gts=cellsnp_gtypes.str.split("_") + cellsnp_gtypes_ids = set(split_cellsnp_gts.str[0]+'_'+split_cellsnp_gts.str[1]+'_'+split_cellsnp_gts.str[2]+'_'+split_cellsnp_gts.str[3]) + + shared_gts = snp_gtypes_ids.intersection(cellsnp_gtypes_ids) + + shared_informative = shared_gts.intersection(self.informative_sites) + shared_uninformative = shared_gts.intersection(self.uninformative_sites) + # print("shared informative " + str(len(shared_informative))) + # print("shared uninformative " + str(len(shared_uninformative))) + + #store the numbers of informative and uninformative sites shared between cellSNP and gt data as these + #are the sites used for concordance + self.informative_covered = len(shared_informative) + self.uninformative_covered = len(shared_uninformative) + + if len(shared_uninformative) > 0: + #print(len(shared_uninformative)) + # print(len(shared_informative)) + if len(shared_uninformative) <= len(shared_informative): + informative_subset = set(random.sample(shared_informative, len(shared_uninformative))) + else: + informative_subset = set()#if there are more shared uninformative than shared informative we will not subset + # print(informative_subset) + # exit(0) + else: + informative_subset = set() + + # print(informative_subset) + self.informative_subset = informative_subset + + snp_gtypes_set = set(snp_gtypes) + snp_gtypes_set = sorted(snp_gtypes_set) + + cellsnp_gtypes_set = set(cellsnp_gtypes) + cellsnp_gtypes_set = sorted(cellsnp_gtypes_set) + + #for i in range(0, len(snp_gtypes)): + for i in range(0, len(snp_gtypes_set)): + discordant = False + # snp_data = snp_gtypes[i].split('_') + # cellsnp_data = cellsnp_gtypes[i].split('_') + snp_data = snp_gtypes_set[i].split('_') + cellsnp_data = cellsnp_gtypes_set[i].split('_') + + # the below will no longer work due to differing length of input strings + # snp_alleles = [snp_gtypes[i][-3], snp_gtypes[i][-1]] + # cellsnp_alleles = [cellsnp_gtypes[i][-3], cellsnp_gtypes[i][-1]] + + snp_alleles = [snp_data[4][0], snp_data[4][2]] + cellsnp_alleles = [cellsnp_data[4][0], cellsnp_data[4][2]] + + snp_alleles_set = set(snp_alleles) + cellsnp_alleles_set = set(cellsnp_alleles) + + snp_var = ('_').join(snp_data[0:4]) + cellsnp_var = ('_').join(cellsnp_data[0:4]) + + if not cellsnp_var == snp_var: + print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i]) + exit(1) + else: + for allele in cellsnp_alleles_set: + if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant + discordant = True + + if discordant == True: + true_discordant+=1 + discordant_vars.append(cellsnp_var) + if snp_var in self.uninformative_sites: + true_discordant_uninformative+=1 + true_discordant_uninformative_ids.append(snp_var) + elif snp_var in self.informative_sites: + true_discordant_informative+=1 + true_discordant_informative_ids.append(snp_var) + else: + relaxed_concordant+=1 + concordant_vars.append(cellsnp_var) + if snp_var in self.uninformative_sites: + relaxed_concordant_uninformative+=1 + relaxed_concordant_uninformative_ids.append(snp_var) + elif snp_var in self.informative_sites: + relaxed_concordant_informative+=1 + relaxed_concordant_informative_ids.append(snp_var) + if 
len(shared_uninformative) > 0: + if snp_var in informative_subset: + if discordant == True: + subset_informative_discordant+=1 + else: + subset_informative_concordant+=1 + # true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count + cell_vars2 = cell_vars.set_index('ids') + disc = pd.DataFrame(set(cell_vars2.loc[discordant_vars]['combo']),columns=['combo_x']) + df_cd = pd.merge(cell_vars, expected_vars, how='inner', on = 'pos') + disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') + disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] + disc_sites_string = ';'.join(disc2['expected_retrieved']) + return true_discordant, relaxed_concordant, relaxed_concordant_informative_ids, relaxed_concordant_uninformative_ids, true_discordant_informative_ids, true_discordant_uninformative_ids, discordant_vars, concordant_vars, disc_sites_string + + def read_concordance_calc(self,expected_vars,cell_vars): + + # This is a wrapper to add up the discordant reads in the cellsnp file. + + # expected genotype 0/0 + expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] + hom_ref_sites = set(expected_hom_ref['ids']) + cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] + ad_hom_ref = cell_vars2['AD'].sum() + oth_hom_ref = cell_vars2['OTH'].sum() + discordant_hom_ref = ad_hom_ref + oth_hom_ref + + # expected genotype 0/1 or 1/0 + hets = ['0/1', '1/0'] + expected_het = expected_vars[expected_vars['vars'].isin(hets)] + het_sites = set(expected_het['ids']) + cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] + discordant_het = cell_vars3['OTH'].sum() + + # expected genotype 1/1 + expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] + hom_alt_sites = set(expected_hom_alt['ids']) + cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] + # DP + OTH - AD + ad_hom_alt = cell_vars4['AD'].sum() + dp_hom_alt = cell_vars4['DP'].sum() + oth_hom_alt = cell_vars4['OTH'].sum() + discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt + + # Total analysis + discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt + total_dp = cell_vars['DP'].sum() + total_oth = cell_vars['OTH'].sum() + total_reads = total_dp + total_oth + + return total_reads,total_dp,total_oth,discordant_reads + + def read_condordance(self, expected_vars, cell_vars,discordant_vars, concordant_vars): + ''' + get read level concordance using DP, AD and OTH format fields + ##FORMAT= + ##FORMAT= + ##FORMAT= + ''' + if not len(expected_vars) == len(cell_vars): + print("length mismatch between expected vars and cell vars") + exit(1) + + total_sites = len(expected_vars) + #add cols for DP, AD< OTH + cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) + cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) + cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) + + + + # Total + total_reads,total_dp,total_oth,discordant_reads = self.read_concordance_calc(expected_vars,cell_vars) + + # uninformative + cell_vars_uninformative = cell_vars[cell_vars['ids'].isin(self.uninformative_sites)] + total_reads_uninformative,total_dp_uninformative,total_oth_uninformative,discordant_reads_uninformative = self.read_concordance_calc(expected_vars,cell_vars_uninformative) + + # informative + cell_vars_informative = 
cell_vars[cell_vars['ids'].isin(self.informative_sites)] + total_reads_informative,total_dp_informative,total_oth_informative,discordant_reads_informative = self.read_concordance_calc(expected_vars,cell_vars_informative) + + # Split into concordant and discordant sites + # concordant + concordant_sites = cell_vars[cell_vars['ids'].isin(set(concordant_vars))] + total_reads_for_concordant_sites,total_dp_for_concordant_sites,total_oth_for_concordant_sites,discordant_reads_for_concordant_sites = self.read_concordance_calc(expected_vars,concordant_sites) + + # discordant + discordant_sites = cell_vars[cell_vars['ids'].isin(set(discordant_vars))] + total_reads_for_discconcordant_sites,total_dp_for_discconcordant_sites,total_oth_for_discconcordant_sites,discordant_reads_for_discconcordant_sites = self.read_concordance_calc(expected_vars,discordant_sites) + + # Subset analysis + cell_vars_informative_subset = cell_vars[cell_vars['ids'].isin(self.informative_subset)] + total_reads_informative_subset,total_dp_informative_subset,total_oth_informative_subset,discordant_reads_informative_subset = self.read_concordance_calc(expected_vars,cell_vars_informative_subset) + + return total_sites, \ + self.informative_covered, \ + self.uninformative_covered, \ + total_reads, \ + discordant_reads, \ + total_reads_informative, \ + discordant_reads_informative, \ + total_reads_uninformative, \ + discordant_reads_uninformative, \ + total_reads_informative_subset, \ + discordant_reads_informative_subset, \ + total_reads_for_concordant_sites, \ + discordant_reads_for_concordant_sites, \ + total_reads_for_discconcordant_sites, \ + discordant_reads_for_discconcordant_sites + + + + def get_discordance(self,expected_vars2,cell_vars2): + Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) + Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) + disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) + df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') + disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') + disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] + disc_sites = ';'.join(disc2['expected_retrieved']) + return Concordant_Sites,Discordant_sites,disc_sites + + + def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): + # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. + # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. 
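+        # For reference, the read-level rule applied in read_concordance_calc() above, illustrated
+        # with made-up counts: at an expected 0/0 site, discordant reads = AD + OTH (e.g. AD=2, OTH=1 -> 3);
+        # at an expected het site, discordant reads = OTH only; at an expected 1/1 site,
+        # discordant reads = (DP + OTH) - AD (e.g. DP=10, AD=8, OTH=1 -> 3).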
+ # Author: M.Ozols + + cell_vars_norm = self.norm_genotypes(cell_vars) + + if len(cell_vars_norm) > 0: + Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) + expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] + + # Find exact discordant sites + Concordant_Sites, Discordant_sites, _ = self.get_discordance(expected_vars2, cell_vars2) + #find truly discordant sites + true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, discordant_vars, concordant_vars, disc_sites_string = self.get_strict_discordance(expected_vars2, cell_vars2) + #find discordant reads + total_sites, \ + informative_sites, \ + uninformative_sites, \ + total_reads, \ + discordant_reads, \ + total_reads_informative, \ + discordant_reads_informative, \ + total_reads_uninformative, \ + discordant_reads_uninformative, \ + total_reads_informative_subset, \ + discordant_reads_informative_subset, \ + total_reads_for_concordant_sites, \ + discordant_reads_for_concordant_sites, \ + total_reads_for_discconcordant_sites, \ + discordant_reads_for_discconcordant_sites = self.read_condordance(expected_vars2, cell_vars2, discordant_vars, concordant_vars) + + discordant_read_fraction_in_concordant_sites = f"{discordant_reads_for_concordant_sites}/{total_reads_for_concordant_sites}" + discordant_read_fraction_in_discordant_sites = f"{discordant_reads_for_discconcordant_sites}/{total_reads_for_discconcordant_sites}" + discordant_reads_uninformative_fraction = f"{discordant_reads_uninformative}/{total_reads_uninformative}" + discordant_reads_informative_fraction = f"{discordant_reads_informative}/{total_reads_informative}" + + # sanity checks + if total_reads != total_reads_for_concordant_sites+total_reads_for_discconcordant_sites: + print("Error: total reads dont add up ") + exit(1) + if discordant_reads != discordant_reads_for_concordant_sites+discordant_reads_for_discconcordant_sites: + print("Error: discordant reads dont add up ") + exit(1) + + + else: + Total_Overlapping_sites = set() + Concordant_Sites = set() + Discordant_sites = set() + disc_sites = '' + true_discordant_count = 0 + relaxed_concordant_count = 0 + total_sites = 0 + + discordant_reads = 0 + + return Concordant_Sites, \ + Discordant_sites, \ + Total_Overlapping_sites, \ + disc_sites_string, \ + cell_vars_norm, \ + true_discordant_count, \ + relaxed_concordant_count, \ + relaxed_concordant_informative_count, \ + relaxed_concordant_uninformative_count, \ + true_discordant_informative_count, \ + true_discordant_uninformative_count, \ + total_sites, \ + informative_sites, \ + uninformative_sites, \ + total_reads, \ + total_reads_informative, \ + total_reads_uninformative, \ + discordant_reads, \ + discordant_reads_informative, \ + discordant_reads_uninformative, \ + discordant_vars, \ + concordant_vars, \ + discordant_read_fraction_in_concordant_sites, \ + discordant_read_fraction_in_discordant_sites, discordant_reads_uninformative_fraction, discordant_reads_informative_fraction + + + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. 
+ with open(f'tmp_{id}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}.pkl' + return + + # def append_results_cell_concordances(self,result): + def append_results_cell_concordances(self,result,cell_concordance_table,other_donor_concordances,other_donor_concordance_table): + other_donor_concordance_table = other_donor_concordance_table + other_donor_concordances + count=result['count'] + try: + percent_concordant = result['Nr_Concordant']/(result['Nr_Discordant']+result['Nr_Concordant'])*100 + except: + percent_concordant = 0 + + try: + percent_discordant = result['Nr_Discordant']/(result['Nr_Discordant']+result['Nr_Concordant'])*100 + except: + percent_discordant = 0 + + try: + percent_relaxed_concordant = result['Nr_Relaxed_concordant']/(result['Nr_Relaxed_concordant']+result['true_discordant_count'])*100 + except: + percent_relaxed_concordant = 0 + + try: + percent_strict_discordant = result['true_discordant_count']/(result['Nr_Relaxed_concordant']+result['true_discordant_count'])*100 + except: + percent_strict_discordant = 0 + + try: + read_discordance = result['discordant_reads']/result['total_sites'] + except: + read_discordance = 0 + + cohort = 'UNKNOWN' + donor_split = result['donor_gt_match'].split("_") + if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): + cohort = 'UKB' + elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): + cohort = 'ELGH' + + same_as_asigned_donor = result['donor_gt_match'] in result['Donor_With_Highest_Concordance'] + if not same_as_asigned_donor: + same_as_asigned_donor = result['donor_gt_match'] in result['Donor_With_Lowest_DisConcordance'] + + cell_concordance_table[f"{result['cell1']} --- {result['donor_gt_match']}"] = {'GT 1':result['cell1'], + 'GT 2':result['donor_gt_match'], + 'cohort': cohort, + + 'Nr_Concordant':result['Nr_Concordant'], + 'Nr_Discordant':result['Nr_Discordant'], + 'Nr_Relaxed_concordant':result['Nr_Relaxed_concordant'], + 'Nr_strict_discordant':result['true_discordant_count'], + 'Percent Concordant':percent_concordant, + 'Percent Discordant':percent_discordant, + 'Percent_relaxed_concordant': percent_relaxed_concordant, + 'Percent_strict_discordant': percent_strict_discordant, + 'Nr_concordant_informative': len(result['relaxed_concordant_informative_count']), + 'Nr_concordant_uninformative': len(result['relaxed_concordant_uninformative_count']), + 'Nr_discordant_informative': len(result['true_discordant_informative_count']), + 'Nr_discordant_uninformative': len(result['true_discordant_uninformative_count']), + 'NrTotal_Overlapping_sites_between_two_genotypes':result['Nr_Total_Overlapping_sites'], + 'Nr_donor_distinct_sites_within_pool_individuals':result['Nr_donor_distinct_sites'], + 'Number_of_sites_that_are_donor_concordant_and_exclusive':result['Number_of_sites_that_are_donor_concordant_and_exclusive'], + 'Total_sites': result['total_sites'], + 'Total_informative_sites': result['informative_sites'], + 'Total_uninformative_sites': result['uninformative_sites'], + 'Total_reads': result['total_reads'], + 'Total_reads_informative': result['total_reads_informative'], + 'Total_reads_uninformative': result['total_reads_uninformative'], + 'Discordant_reads': result['discordant_reads'], + 'Discordant_reads_informtive': result['discordant_reads_informative'], + 'Discordant_reads_uninformtive': result['discordant_reads_uninformative'], + 'Discordant_reads_by_n_sites': read_discordance, + + 'Discordant_sites_in_pool': len(result['Discordant_sites_in_pool']), + 
'Lowest_Disconcordance_value_in_all_donors':result['Lowest_Disconcordance_value_in_all_donors'], + 'Donor_With_Lowest_DisConcordance':result['Donor_With_Lowest_DisConcordance'], + 'Concordant_Site_Identities':result['Concordant_Site_Identities'], + 'Donor_With_Highest_Concordance':result['Donor_With_Highest_Concordance'], + 'Highest_Concordance_value_in_all_donors':result['Highest_Concordance_value_in_all_donors'], + 'same_as_asigned_donor':same_as_asigned_donor, + 'Total_sites_other_donor (if same_as_asigned_donor=False)':result['Total_sites_other_donor'], + 'Total_reads_other_donor (if same_as_asigned_donor=False)':result['Total_reads_other_donor'], + 'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['total_discordant_sites_that_are_concordant_with_other_donors_in_pool'], + 'discordant_read_fraction_in_concordant_site':result['discordant_read_fraction_in_concordant_sites'], + 'discordant_read_fraction_in_discordant_sites':result['discordant_read_fraction_in_discordant_sites'], + 'Discordant_Site_Identities':result['discordant_sites'], + } + + return [cell_concordance_table,other_donor_concordance_table] + + # def combine_written_files(self):#this one is for concordance class + # to_export = self.cell_concordance_table + # for val1 in self.record_dict.values(): + # # here remove the int files. + # print(f"merging temp file: {val1}") + # with open(val1, 'rb') as f: + # loaded_dict = pickle.load(f) + # for k1 in loaded_dict.keys(): + # to_export[k1]=loaded_dict[k1] + # os.remove(val1) + # return to_export + + + def combine_written_lists(self,exclusive_donor_variants,record_dict):#this is for VCF loader class + to_export = exclusive_donor_variants + for val1 in record_dict.values(): + # here remove the int files. + print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + self.other_donor_comp = self.other_donor_comp+ loaded_dict + os.remove(val1) + return self.other_donor_comp + + def combine_written_files(self,exclusive_donor_variants,record_dict):#this is for VCF loader class + to_export = exclusive_donor_variants + for val1 in record_dict.values(): + # here remove the int files. + print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + for k1 in loaded_dict.keys(): + try: + to_export[k1]=to_export[k1].union(loaded_dict[k1]) + except: + to_export[k1]=set() + to_export[k1]=to_export[k1].union(loaded_dict[k1]) + os.remove(val1) + return to_export + + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. 
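+        # NOTE: this re-definition shadows the earlier set_results() above (Python keeps the
+        # last definition in a class body); both versions write one tmp_<id>.pkl chunk and
+        # record its path in self.record_dict for the later merge step.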
+ with open(f'tmp_{id}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}.pkl' + + def analyse_donor(self,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm): + donor_concordance_table = {} + other_donor_concordance_table = [] + for cell1 in Cells_to_keep_pre: + count+=1 + + cell_vars = exclusive_cell_variants[cell1] + result1, other_donor_concordances = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data) + cell_concordance_table,other_donor_concordance_table = self.append_results_cell_concordances(result1,donor_concordance_table,other_donor_concordances,other_donor_concordance_table) + if count>300: + break + # here we should write these independently to the files + if (count % 50 == 0): + self.set_results(other_donor_concordance_table,f"{count}--{donor_gt_match}") + other_donor_concordance_table = [] + + + self.set_results(other_donor_concordance_table,f"{count}--{donor_gt_match}") + output2 = self.combine_written_lists(self.other_donor_comp,self.record_dict) + pd.DataFrame(output2).sort_values(by=['cell']).to_csv(f'{donor_gt_match}--each_cells_comparison_with_other_donor.tsv',sep='\t',index=False) + del output2 + return donor_concordance_table + + def combine_concordances(self,result): + + self.cell_concordance_table = {**self.cell_concordance_table, **result} + + + def conc_table(self): + donor_assignments_table=self.donor_assignments_table + cell_assignments_table=self.cell_assignments_table + exclusive_don_variants=self.exclusive_don_variants + exclusive_cell_variants= self.exclusive_cell_variants + donor_list = exclusive_don_variants.keys() + pool = mp.Pool(cpus) + count = 0 + + + #create a list of variants that are on each donor genotype file + vars_per_donor_gt = {} + for don_id in donor_list: + donor_gt_vars = list(exclusive_don_variants[don_id]) + donor_gt_vars = pd.DataFrame(donor_gt_vars) + donor_gt_vars = self.norm_genotypes(donor_gt_vars) + donor_gt_vars = donor_gt_vars[donor_gt_vars['vars'] != '0/0'] + donor_gt_varids = list(donor_gt_vars['ids']) + vars_per_donor_gt[don_id] = donor_gt_varids + + #work out what cohort each donor belongs to + donor_cohorts = {} + for don_id in donor_list: + cohort = 'UNKNOWN' + donor_split = don_id.split("_") + if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): + cohort = 'UKB' + elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): + cohort = 'ELGH' + donor_cohorts[don_id] = cohort + + all_donor_data={} + # here we calvculate all the expected donor datasets + for row1 in exclusive_don_variants.keys(): + # donor_in_question = row1['donor_query'] + donor_gt_match = row1 + expected_vars_of_other_donor = self.exclusive_don_variants[donor_gt_match] + expected_vars_norm_of_other_donor = self.norm_genotypes(expected_vars_of_other_donor) + all_donor_data[donor_gt_match]=expected_vars_norm_of_other_donor + + for i,row1 in donor_assignments_table.iterrows(): + donor_in_question = row1['donor_query'] + donor_gt_match = row1['donor_gt'] + # if i>4: + # continue + if (donor_gt_match=='NONE'): + continue + try: + donor_gt_match_cohort = donor_cohorts[donor_gt_match] + except: + continue + Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell'])) + expected_vars = exclusive_don_variants[donor_gt_match] + expected_vars_norm = self.norm_genotypes(expected_vars) + 
try: + # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor. + dds = self.donor_distinct_sites[donor_gt_match] + except: + continue + if cpus==1: + result = self.analyse_donor(Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm) + self.combine_concordances(result) + else: + pool.apply_async(self.analyse_donor, args=([Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm]),callback=self.combine_concordances) + + pool.close() + pool.join() + + # output = self.combine_written_files(self.cell_concordance_table,self.record_dict) + + return self.cell_concordance_table + + + def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match, donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data): + + Concordant_Sites, \ + Discordant_sites, \ + Total_Overlapping_sites, \ + discordant_sites, \ + cell_vars_norm, \ + true_discordant_count, \ + relaxed_concordant_count, \ + relaxed_concordant_informative_count, \ + relaxed_concordant_uninformative_count, \ + true_discordant_informative_count, \ + true_discordant_uninformative_count, \ + total_sites, \ + informative_sites, \ + uninformative_sites, \ + total_reads, \ + total_reads_informative, \ + total_reads_uninformative, \ + discordant_reads, \ + discordant_reads_informative, \ + discordant_reads_uninformative, \ + discordant_vars, \ + concordant_vars, \ + discordant_read_fraction_in_concordant_sites, \ + discordant_read_fraction_in_discordant_sites, \ + discordant_reads_uninformative_fraction, \ + discordant_reads_informative_fraction = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) + + total_concordant_sites = len(Concordant_Sites) + relaxed_concordant_count + dds = self.donor_distinct_sites[donor_gt_match] + Nr_donor_distinct_sites = len(dds) + Nr_Concordant = len(Concordant_Sites) + Nr_Relaxed_concordant = Nr_Concordant + relaxed_concordant_count + Nr_Discordant = len(Discordant_sites) + Nr_Total_Overlapping_sites = len(Total_Overlapping_sites) + Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Discordant_sites))) + Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) + #Quantify donor variation in other donors + discordant_vars_in_pool = [] + donor_table_of_concordances = [] + total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() + informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() + total_cordant_sites_that_are_concordant_with_other_donors_in_pool = set() + for donor in vars_per_donor_gt: + + expected_vars_norm_of_other_donor = all_donor_data[donor] + + Concordant_Sites_otherDonor, \ + Discordant_sites_otherDonor, \ + Total_Overlapping_sites_otherDonor, \ + discordant_sites_otherDonor, \ + cell_vars_norm_otherDonor, \ + true_discordant_count_otherDonor, \ + relaxed_concordant_count_otherDonor, \ + relaxed_concordant_informative_count_otherDonor, \ + relaxed_concordant_uninformative_count_otherDonor, \ + true_discordant_informative_count_otherDonor, \ + true_discordant_uninformative_count_otherDonor, \ + total_sites_otherDonor, \ + informative_sites_otherDonor, \ + uninformative_sites_otherDonor, \ + total_reads_otherDonor, \ + total_reads_informative_otherDonor, \ + 
total_reads_uninformative_otherDonor, \ + discordant_reads_otherDonor, \ + discordant_reads_informative_otherDonor, \ + discordant_reads_uninformative_otherDonor, \ + discordant_vars_otherDonor, \ + concordant_vars_otherDonor, \ + discordant_read_fraction_in_concordant_sites_otherDonor, \ + discordant_read_fraction_in_discordant_sites_otherDonor, \ + discordant_reads_uninformative_fraction_otherDonor, \ + discordant_reads_informative_fraction_otherDonor = self.retrieve_concordant_discordant_sites(expected_vars_norm_of_other_donor,cell_vars) + + # here we also need to know : + # how many reads of the desired donor discordant sites could be yielded + + + total_concordant_sites_otherDonor = relaxed_concordant_count_otherDonor + concordant_percent_in_other_donor= total_concordant_sites_otherDonor/total_sites_otherDonor*100 + discordant_percent_in_other_donor= true_discordant_count_otherDonor/total_sites_otherDonor*100 + DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(discordant_vars).intersection(set(concordant_vars_otherDonor)) + Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(true_discordant_informative_count).intersection(set(relaxed_concordant_informative_count_otherDonor)) + DonorCordant_Sites_that_are_atributed_to_other_donor = set(concordant_vars).intersection(set(concordant_vars_otherDonor)) + + # We now count the concordant reads that may contribute to particular cell at this cell. + # to do this we take the discordant sites that have been deamed to be concordant with the other donor and quantify the reads thta are concordant. + Total_Overlapping_sites = set(DonorDiscordant_Sites_that_are_atributed_to_other_donor) + expected_vars2 = expected_vars_norm_of_other_donor[expected_vars_norm_of_other_donor['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2['DP'] = cell_vars2[0].str.split("_").str[5].astype(int) + cell_vars2['AD'] = cell_vars2[0].str.split("_").str[6].astype(int) + cell_vars2['OTH'] = cell_vars2[0].str.split("_").str[7].astype(int) + + total_reads_for_discordant_sites_that_are_concordant_with_other_donor,total_dp_for_discordant_sites_that_are_concordant_with_other_donor,total_oth_for_discordant_sites_that_are_concordant_with_other_donor,discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor = self.read_concordance_calc(expected_vars2,cell_vars2) + concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor = total_reads_for_discordant_sites_that_are_concordant_with_other_donor - discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor + + try: + donor_cohort = donor_cohorts[donor] + donor_vars = vars_per_donor_gt[donor] + except: + continue + if not donor == donor_gt_match: + # We want to kow how many of these discordant site + + total_discordant_sites_that_are_concordant_with_other_donors_in_pool = total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(DonorDiscordant_Sites_that_are_atributed_to_other_donor)) + # to get the total reads that can be atributed to the other donor i have to check if site is already covered in the total_discordant_sites_that_are_concordant_with_other_donors_in_pool. + # the ones that havent, i have to add the reads up for them. 
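+                # Accumulating with set.union() counts each site only once, even when it is
+                # concordant with several other donors, e.g. (illustrative only):
+                #     {'1_12345_A_G'} | {'1_12345_A_G', '2_6789_C_T'} -> {'1_12345_A_G', '2_6789_C_T'}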
+ informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor)) + + total_cordant_sites_that_are_concordant_with_other_donors_in_pool = total_cordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(DonorCordant_Sites_that_are_atributed_to_other_donor)) + + + common_vars = list(set(discordant_vars) & set(donor_vars)) + common_var_count = str(len(common_vars)) + donor_cohort_common = donor + ":" + donor_cohort + ":" + common_var_count + discordant_vars_in_pool.append(donor_cohort_common) + + # Here we want to calculate the number of discordant sites in other donors and see if in terms of concordance the same donor is picked as per GT assignment. + # We do this to investigate the potential of a cell coming from this other donor. + + donor_table_of_concordances.append({'donor':donor, 'cell':cell1, 'donor_cohort':donor_cohort, \ + 'gt matched donor':donor == donor_gt_match, \ + 'DonorCordant_Sites_that_are_atributed_to_other_donor':len(DonorCordant_Sites_that_are_atributed_to_other_donor), \ + 'DonorCordant_Sites_that_are_atributed_to_other_donor/total':f"{len(DonorCordant_Sites_that_are_atributed_to_other_donor)}/{len(concordant_vars)}", \ + 'DonorDiscordant_Sites_that_are_atributed_to_other_donor':len(DonorDiscordant_Sites_that_are_atributed_to_other_donor), \ + 'DonorDiscordant_Sites_that_are_atributed_to_other_donor/total':f"{len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)}/{len(discordant_vars)}", \ + 'concordant_percent_in_other_donor':concordant_percent_in_other_donor, \ + 'discordant_percent_in_other_donor':discordant_percent_in_other_donor, \ + 'discordant_reads_otherDonor':discordant_reads_otherDonor, \ + 'discordant_sites_otherDonor':len(discordant_vars_otherDonor), \ + 'concordant_sites_otherDonor':len(concordant_vars_otherDonor), \ + 'total_sites_otherDonor':total_sites_otherDonor, \ + 'discordant_reads_otherDonor':discordant_reads_otherDonor, \ + 'total_reads_otherDonor':total_reads_otherDonor, \ + 'discordant_read_fraction_in_concordant_sites_otherDonor':discordant_read_fraction_in_concordant_sites_otherDonor, \ + 'discordant_read_fraction_in_discordant_sites_otherDonor':discordant_read_fraction_in_discordant_sites_otherDonor, \ + 'concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor':concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor + }) + + discordant_vars_in_pool_str = (";").join(discordant_vars_in_pool) + concordant_vars_in_pool_str = (";").join(concordant_vars) + DF = pd.DataFrame(donor_table_of_concordances) + + Donor_With_Lowest_DisConcordance = ';'.join(DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['donor'].values) + Lowest_Disconcordance_value_in_all_donors= DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['discordant_percent_in_other_donor'].values[0] + + Donor_With_Highest_Concordance = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['donor'].values) + Highest_Concordance_value_in_all_donors= DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['concordant_percent_in_other_donor'].values[0] + Total_sites_other_donor = 
';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_sites_otherDonor'].astype(str).values) + Total_reads_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_reads_otherDonor'].astype(str).values) + + return [{ + 'cell1':cell1, + 'donor_gt_match':donor_gt_match, + 'Nr_Concordant':Nr_Concordant, + 'Nr_Discordant':Nr_Discordant, + 'Nr_Relaxed_concordant':Nr_Relaxed_concordant, + 'true_discordant_count':true_discordant_count, + 'relaxed_concordant_informative_count':relaxed_concordant_informative_count, + 'relaxed_concordant_uninformative_count':relaxed_concordant_uninformative_count, + 'true_discordant_informative_count':true_discordant_informative_count, + 'true_discordant_uninformative_count':true_discordant_uninformative_count, + 'Nr_Total_Overlapping_sites':Nr_Total_Overlapping_sites, + 'Number_of_sites_that_are_donor_concordant_and_exclusive':Number_of_sites_that_are_donor_concordant_and_exclusive, + 'Nr_donor_distinct_sites':Nr_donor_distinct_sites, + 'count':count, + 'discordant_sites':discordant_sites, + 'total_sites':total_sites, + 'informative_sites':informative_sites, + 'uninformative_sites':uninformative_sites, + 'total_reads':total_reads, + 'total_reads_informative':total_reads_informative, + 'total_reads_uninformative':total_reads_uninformative, + 'discordant_reads':discordant_reads, + 'discordant_reads_informative':discordant_reads_informative, + 'discordant_reads_uninformative':discordant_reads_uninformative, + 'Discordant_sites_in_pool': discordant_vars, + 'Lowest_Disconcordance_value_in_all_donors':Lowest_Disconcordance_value_in_all_donors, + 'Donor_With_Lowest_DisConcordance':Donor_With_Lowest_DisConcordance, + 'Concordant_Site_Identities':concordant_vars_in_pool_str, + 'Donor_With_Highest_Concordance':Donor_With_Highest_Concordance, + 'Highest_Concordance_value_in_all_donors':Highest_Concordance_value_in_all_donors, + 'Total_sites_other_donor':Total_sites_other_donor, + 'Total_reads_other_donor':Total_reads_other_donor, + 'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(discordant_vars)}", + 'informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(true_discordant_informative_count)}", + 'discordant_read_fraction_in_concordant_sites':discordant_read_fraction_in_concordant_sites, \ + 'discordant_read_fraction_in_discordant_sites':discordant_read_fraction_in_discordant_sites + }, donor_table_of_concordances] + + +class VCF_Loader: + + def __init__(self, vcf_file, biallelic_only=True, + sparse=False, format_list=['GT']): + self.vcf_file = vcf_file + self.load_sample = True + self.biallelic_only = biallelic_only + self.sparse = sparse + self.record_dict={} + self.reset() + self.format_list = format_list + self.exclusive_donor_variants = {} + self.curently_pushing =[] #this is a lock value to check if rhe curent field is updated so to avaid the race for update + self.last_count=-1 + self.reset_c() + + def reset_c(self): + self.record_times=0 + + def reset(self): + self.exclusive_donor_variants ={} + + def myfunc(self): + print(f"Hello my name is {self.biallelic_only}" ) + + def load_sample_mp(self,line,obs_ids,count,format_list): + ''' + takes VCF lines and extracts all format fields for those where GT !='.' 
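+        Returns [obs_with_gt, list_val_with_gt, idx, list_val, count, format_list]:
+        obs_with_gt are the sample ids whose GT field is not '.', list_val_with_gt the
+        matching sample columns, shuffled together (seeded on the line count) so the
+        pairing is preserved. append_results() later encodes each call as an
+        underscore-joined string, e.g. (illustrative) '1_12345_A_G_0/1_12_5_0' for
+        chrom_pos_ref_alt_GT[_DP_AD_OTH], which norm_genotypes() splits back apart.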
+ ''' + list_val = line.rstrip().split("\t") #[:5] #:8 + idx = find(list_val[8].split(':'),'GT')[0]#find index of GT field as GT will tell us what variants are called + if remove_AG: + if list_val[3] == 'A' and list_val[4] == 'G':#remove A>G + pass + elif list_val[3] == 'T' and list_val[4] == 'C':#also remove T>C + pass + + if len(list_val[3]) > 1 or len(list_val[4]) > 1: + # CURRENTLY DEALS ONLY WITH BIALELIC + print(f'{idx} var not bialelic') + pass + else: + list_val2 = list_val[9:] + obs = pd.DataFrame(obs_ids) + lv = pd.DataFrame(list_val2) + lv_proc =lv[0].str.split(':').str[idx] + gt_exists = lv_proc[lv_proc != '.'] + idx2 = gt_exists.index + obs_with_gt = obs.loc[idx2.values] + obs_with_gt = list(obs_with_gt[0].values) + list_val_with_gt = lv.loc[idx2.values] + list_val_with_gt = list(list_val_with_gt[0].values) + random.seed(count) + c = list(zip(obs_with_gt, list_val_with_gt)) + random.shuffle(c) + obs_with_gt, list_val_with_gt = zip(*c) + # self.append_results([obs_with_gt,list_val_with_gt,idx,list_val,count]) + + return [obs_with_gt,list_val_with_gt,idx,list_val,count,format_list]#add format_list to the return value as we need this for the next step + + + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. + with open(f'tmp_{id}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}.pkl' + + + + def append_results(self,result): + # exclusive_donor_variants + obs_with_gt= result[0] + list_val_with_gt= result[1] + idx = result[2] + list_val = result[3] + count = result[4] + format_list = result[5]#list of required format fields + #get indexes of required format fields (apart from GT which has already been taken care of) + additional_field_idxs = [] + for fmt in format_list: + if not fmt == 'GT': + idx_addn = find(list_val[8].split(':'), fmt)[0] + additional_field_idxs.append(idx_addn) + # print(additional_field_idxs) + # exit(0) + + count11=0 + # r = random.random() + # Issue is that this slows down after number of entries is recorded. So recoding takes longer and longer. + # every 500 itterations we push the data to a dictionary, later we combine these together. + if (count % 200 == 0): + print(f'recording and resetting memory {count}') + # self.record_dict[count]=self.exclusive_donor_variants + self.set_results(self.exclusive_donor_variants,count) + self.reset() + self.reset_c() + + for ob_id in obs_with_gt: + donor_loc_in_list = count11 + alleles = list_val_with_gt[donor_loc_in_list].split(':')[idx] + #append any additional format fields to alleles + if len(additional_field_idxs) > 0: + for idx_addnl in additional_field_idxs: + fmt_val = list_val_with_gt[donor_loc_in_list].split(':')[idx_addnl] + alleles = alleles + '_' + fmt_val + + if not alleles.startswith('.'): + ids = "_".join([list_val[x] for x in [0, 1, 3, 4]]) + donor_var = f"{ids}_{alleles}" + while ob_id in self.curently_pushing: + time.sleep(r*0.01) + self.curently_pushing.append(ob_id) + try: + self.exclusive_donor_variants[ob_id].add(donor_var) + self.record_times=self.record_times+1 + except: + self.exclusive_donor_variants[ob_id]=set() + self.exclusive_donor_variants[ob_id].add(donor_var) + self.record_times=self.record_times+1 + self.curently_pushing.remove(ob_id) + # self.exclusive_donor_variants['CTGAAACGTAAGTTCC-1'] + count11+=1 + + def combine_written_files(self,exclusive_donor_variants,record_dict):#this is for VCF loader class + to_export = exclusive_donor_variants + for val1 in record_dict.values(): + # here remove the int files. 
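+            # Each temporary pickle holds a {sample_id: set(variant_strings)} dict for the
+            # ~200 VCF lines processed before the last flush; union-ing per key rebuilds the
+            # complete per-sample variant sets, and the try/except below covers sample ids
+            # first seen in a later chunk.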
+ print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + for k1 in loaded_dict.keys(): + try: + to_export[k1]=to_export[k1].union(loaded_dict[k1]) + except: + to_export[k1]=set() + to_export[k1]=to_export[k1].union(loaded_dict[k1]) + os.remove(val1) + return to_export + + + def load_VCF_batch_paralel(self): + """ + Load whole VCF file by utilising multiple cores to speed up loading of large cell files + ------------------- + Initially designed to load VCF from cellSNP output, requiring + 1) all variants have the same format list; + 2) a line starting with "#CHROM", with sample ids. + If these two requirements are satisfied, this function also supports general + VCF files, e.g., genotype for multiple samples. + + Note, it may take a large memory, please filter the VCF with bcftools first. + """ + + vcf_file = self.vcf_file + biallelic_only = self.biallelic_only + load_sample= self.load_sample + sparse = self.sparse + format_list= self.format_list + pool = mp.Pool(cpus) + + + import time + if vcf_file[-3:] == ".gz" or vcf_file[-4:] == ".bgz": + infile = gzip.open(vcf_file, "rb") + is_gzip = True + else: + infile = open(vcf_file, "r") + is_gzip = False + + FixedINFO = {} + contig_lines = [] + comment_lines = [] + var_ids, obs_ids, obs_dat = [], [], [] + count=0 #57077 + for line in infile: + count+=1 + # if count>10000: + # break + if is_gzip: + line = line.decode('utf-8') + if line.startswith("#"): + if line.startswith("##contig="): + contig_lines.append(line.rstrip()) + if line.startswith("#CHROM"): + if load_sample: + obs_ids = line.rstrip().split("\t")[9:] + for ob_id in obs_ids: + self.exclusive_donor_variants[ob_id]=set() + key_ids = line[1:].rstrip().split("\t")[:8] + for _key in key_ids: + FixedINFO[_key] = [] + else: + comment_lines.append(line.rstrip()) + else: + pool.apply_async(self.load_sample_mp, args=([line,obs_ids,count,format_list]),callback=self.append_results) + del line + self.last_count=count + pool.close() + pool.join() + + output = self.combine_written_files(self.exclusive_donor_variants,self.record_dict) + + return output + +"""Run CLI.""" + +def get_options(): + ''' + Get options from the command line + ''' + parser = argparse.ArgumentParser() + parser.add_argument('--version', action='version', version='%(prog)s {version}'.format(version=__version__)) + parser.add_argument('--cpus', action='store', required=True, type=int) + parser.add_argument('--cell_vcf', action='store', required=True) + parser.add_argument('--cell_assignments', action='store', required=True) + parser.add_argument('--donor_assignments', action='store', required=True) + parser.add_argument('--gt_match_vcf', action='store', required=True) + parser.add_argument('--expected_vcf', action='store', required=True) + parser.add_argument('--informative_sites', action='store', required=True) + parser.add_argument('--uninformative_sites', action='store', required=True) + parser.add_argument('--outfile', action='store', required=True) + parser.add_argument('--debug', action='store_true') + parser.add_argument('--remove_AG', action='store_true') + args = parser.parse_args() + + return args + + +def get_sites_from_tsv(sites_file): + """ + get sites frm a tsv file where cols are chrom, pos, id, ref, alt + assumes no multiallelics + """ + sites = set() + with open(sites_file, 'r') as f: + lines = f.readlines() + for l in lines: + linedata = l.split('\t') + var = ('_').join([linedata[0], linedata[1], linedata[3], linedata[4]]) + sites.add(var) + return sites + + +def 
find(lst, a): + return [i for i, x in enumerate(lst) if x==a ] +def norm_genotypes(expected_vars): + expected_vars = pd.DataFrame(expected_vars) + split_str=expected_vars[0].str.split("_") + expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] + expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] + expected_vars['vars'] = split_str.str[4] + expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) + expected_vars = expected_vars[expected_vars['vars']!='./.'] + expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' + expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] + return expected_vars + + +def donor_exclusive_sites(exclusive_don_variants2): + # Here we generate a function for determining the sites that are donor exclusive + donor_distinct_sites = {} + for col1 in exclusive_don_variants2.keys(): + comparisons =[] + to_compare = [] + for col2 in exclusive_don_variants2.keys(): + if col1==col2: + # we set this as the unique entry + # print('1') + to_compare = set(exclusive_don_variants2[col2]) + else: + # We combine all the variants in one list + comparisons+=list(exclusive_don_variants2[col2]) + # print('2') + # print('comparison') + comparisons_all = set(comparisons) + comparisons_all_norm = norm_genotypes(comparisons_all) + comparisons_all=set(comparisons_all_norm['combo']) + + to_compare = set(to_compare) + to_compare_norm = norm_genotypes(to_compare) + to_compare=set(to_compare_norm['combo']) + # Make sure we account for hap types - phased/unphased + distinct_donor_sites = to_compare - comparisons_all + donor_distinct_sites[col1]=distinct_donor_sites + # Perform the distinct set function. + return donor_distinct_sites + + + + +if __name__ == "__main__": + + options = get_options() + cpus = options.cpus + outfile = options.outfile + cell_vcf=options.cell_vcf + donor_assignments=options.donor_assignments + gt_match_vcf=options.gt_match_vcf + expected_vcf=options.expected_vcf + cell_assignments=options.cell_assignments + informative_sites_file = options.informative_sites + uninformative_sites_file = options.uninformative_sites + + informative_sites = get_sites_from_tsv(informative_sites_file) + uninformative_sites = get_sites_from_tsv(uninformative_sites_file) + + exclusive_donor_variants = {} #This is where results are populated when mp process i used. 
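+    # Main flow from here: load the three VCFs (or the cached tmp_*.pkl files when --debug is
+    # given), merge the expected and GT-matched donor variants, derive donor-distinct sites,
+    # then run Concordances(...).conc_table() and write the per-cell table to --outfile.
+    # Typical invocation (illustrative only; file names are placeholders):
+    #
+    #   concordance_calculations_subsample_informative.py --cpus 4 \
+    #       --cell_vcf cellSNP.cells.vcf.gz --cell_assignments cell_assignments.tsv \
+    #       --donor_assignments donor_assignments.csv --gt_match_vcf gt_match.vcf.gz \
+    #       --expected_vcf expected.vcf.gz --informative_sites informative_sites.tsv \
+    #       --uninformative_sites uninformative_sites.tsv --outfile concordances.tsv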
+ curently_pushing =[] #this is a lock value to check if rhe curent field is updated so to avaid the race for update + All_Results={} + cell_concordance_table = {} + + donor_assignments_table = pd.read_csv(donor_assignments) + cell_assignments_table = pd.read_csv(cell_assignments,sep='\t') + remove_AG = options.remove_AG + + if options.debug: + with open('tmp_GT_Expected_variants.pkl', 'rb') as f: + GT_Expected_variants = pickle.load(f) + with open('tmp_GT_Matched_variants.pkl', 'rb') as f: + GT_Matched_variants = pickle.load(f) + with open('tmp_exclusive_cell_variants.pkl', 'rb') as f: + exclusive_cell_variants = pickle.load(f) + with open('tmp_donor_distinct_sites.pkl', 'rb') as f: + donor_distinct_sites = pickle.load(f) + with open('tmp_exclusive_don_variants.pkl', 'rb') as f: + exclusive_don_variants = pickle.load(f) + else: + print('---Loading genotype VCF----') + if (os.path.exists(gt_match_vcf)): + loader2 = VCF_Loader(gt_match_vcf, biallelic_only=True, + sparse=False, format_list=['GT']) + GT_Matched_variants = loader2.load_VCF_batch_paralel() + del loader2 + else: + GT_Matched_variants = {} + + with open(f'tmp_GT_Matched_variants.pkl', 'wb') as f: + pickle.dump(GT_Matched_variants, f) + + print('---Loading cell VCF----') + loader1 = VCF_Loader(cell_vcf, biallelic_only=True, + sparse=False, format_list=['GT', 'DP', 'AD', 'OTH']) + exclusive_cell_variants = loader1.load_VCF_batch_paralel() + del loader1 + with open(f'tmp_exclusive_cell_variants.pkl', 'wb') as f: + pickle.dump(exclusive_cell_variants, f) + + print('---Loading expected VCF----') + loader3 = VCF_Loader(expected_vcf, biallelic_only=True, + sparse=False, format_list=['GT']) + GT_Expected_variants = loader3.load_VCF_batch_paralel() + del loader3 + + with open(f'tmp_GT_Expected_variants.pkl', 'wb') as f: + pickle.dump(GT_Expected_variants, f) + + print('---Variant files loaded----') + + exclusive_don_variants = GT_Expected_variants.keys() + content = [x for x in exclusive_don_variants if not x.startswith('donor')] + GT_Expected_variants = {key: GT_Expected_variants[key] for key in content} + + exclusive_don_variants = GT_Matched_variants.keys() + content = [x for x in exclusive_don_variants if not x.startswith('donor')] + GT_Matched_variants = {key: GT_Matched_variants[key] for key in content} + + exclusive_don_variants = GT_Expected_variants + for key in GT_Matched_variants.keys(): + if key in exclusive_don_variants.keys(): + _='' + else: + exclusive_don_variants[key]=GT_Matched_variants[key] + + with open(f'tmp_exclusive_don_variants.pkl', 'wb') as f: + pickle.dump(exclusive_don_variants, f) + donor_distinct_sites = donor_exclusive_sites(exclusive_don_variants) + with open(f'tmp_donor_distinct_sites.pkl', 'wb') as f: + pickle.dump(donor_distinct_sites, f) + + cell_concordance_table = Concordances(donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites, informative_sites, uninformative_sites).conc_table() + + + result = pd.DataFrame(cell_concordance_table).T + try: + site_identities = result[['Concordant_Site_Identities','Discordant_Site_Identities']] + result.drop(columns=['Concordant_Site_Identities','Discordant_Site_Identities'],inplace=True) + site_identities.to_csv(f"site_identities_{outfile}",sep='\t') + except: + _='sample_hasnt_matched_any_gt --- most likely too little cells assigned' + result.to_csv(outfile,sep='\t') + + print('Processing Done') \ No newline at end of file diff --git a/bin/concordance_calculations_donor_exclusive_read_level.py 
b/bin/concordance_calculations_donor_exclusive_read_level.py index 3b336c3c..5497e7e3 100755 --- a/bin/concordance_calculations_donor_exclusive_read_level.py +++ b/bin/concordance_calculations_donor_exclusive_read_level.py @@ -18,323 +18,317 @@ class Concordances: - def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites,informative_sites, uninformative_sites): - self.reset() - self.donor_assignments_table=donor_assignments_table - self.cell_assignments_table=cell_assignments_table - self.exclusive_don_variants=exclusive_don_variants - self.exclusive_cell_variants=exclusive_cell_variants - self.donor_distinct_sites=donor_distinct_sites - self.informative_sites = informative_sites - self.uninformative_sites = uninformative_sites - self.record_dict={} - - def norm_genotypes(self,expected_vars): - expected_vars = pd.DataFrame(expected_vars) - if len(expected_vars) > 0: - split_str=expected_vars[0].str.split("_") - expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] - expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] - expected_vars['vars'] = split_str.str[4] - expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) - expected_vars = expected_vars[expected_vars['vars']!='./.'] - expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' - expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] - return expected_vars - - def reset(self): - self.cell_concordance_table ={} - - # def get_sites_from_tsv(self, sites_file): - # """ - # get sites frm a tsv file where cols are chrom, pos, id, ref, alt - # assumes no multiallelics - # """ - # sites = set() - # with open(sites_file, 'r') as f: - # lines = f.readlines() - # for l in lines: - # linedata = l.split('\t') - # var = ('_').join([linedata[0], linedata[1], linedata[3], linedata[4]]) - # sites.add(var) - # return sites - - - def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes): - ''' - take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant - sites and relaxed concordant sites - 1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype - 2) if you have a 0/0 you can not get a 1/1 or 0/1 - 3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 
1/1 - So - each obversed cellsnp allele must be in the array SNP gtype - ''' - true_discordant = 0 - relaxed_concordant = 0 - relaxed_concordant_informative = 0 - true_discordant_uninformative = 0 - - for i in range(0, len(snp_gtypes)): - discordant = False - snp_data = snp_gtypes[i].split('_') - cellsnp_data = cellsnp_gtypes[i].split('_') - - # the below will no longer work due to differing length of input strings - # snp_alleles = [snp_gtypes[i][-3], snp_gtypes[i][-1]] - # cellsnp_alleles = [cellsnp_gtypes[i][-3], cellsnp_gtypes[i][-1]] - - - snp_alleles = [snp_data[4][0], snp_data[4][2]] - cellsnp_alleles = [cellsnp_data[4][0], cellsnp_data[4][2]] - - snp_alleles_set = set(snp_alleles) - cellsnp_alleles_set = set(cellsnp_alleles) - - snp_var = ('_').join(snp_data[0:4]) - cellsnp_var = ('_').join(cellsnp_data[0:4]) - - if not cellsnp_var == snp_var: - print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i]) - exit(1) - else: - for allele in cellsnp_alleles_set: - if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant - discordant = True - - if discordant == True: - true_discordant+=1 - if snp_var in self.uninformative_sites: - true_discordant_uninformative+=1 - else: - relaxed_concordant+=1 - if snp_var in self.informative_sites: - relaxed_concordant_informative+=1 - - return true_discordant, relaxed_concordant, relaxed_concordant_informative, true_discordant_uninformative - - - def read_condordance(self, expected_vars, cell_vars): - ''' - get read level concordance using DP, AD and OTH format fields - ##FORMAT= - ##FORMAT= - ##FORMAT= - ''' - if not len(expected_vars) == len(cell_vars): - print("length mismatch between expected vars and cell vars") - exit(1) + def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites,informative_sites, uninformative_sites): + self.reset() + self.donor_assignments_table=donor_assignments_table + self.cell_assignments_table=cell_assignments_table + self.exclusive_don_variants=exclusive_don_variants + self.exclusive_cell_variants=exclusive_cell_variants + self.donor_distinct_sites=donor_distinct_sites + self.informative_sites = informative_sites + self.uninformative_sites = uninformative_sites + self.record_dict={} + + def norm_genotypes(self,expected_vars): + expected_vars = pd.DataFrame(expected_vars) + if len(expected_vars) > 0: + split_str=expected_vars[0].str.split("_") + expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] + expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] + expected_vars['vars'] = split_str.str[4] + expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) + expected_vars = expected_vars[expected_vars['vars']!='./.'] + expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' + expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] + return expected_vars + + def reset(self): + self.cell_concordance_table ={} + + # def get_sites_from_tsv(self, sites_file): + # """ + # get sites frm a tsv file where cols are chrom, pos, id, ref, alt + # assumes no multiallelics + # """ + # sites = set() + # with open(sites_file, 'r') as f: + # lines = f.readlines() + # for l in lines: + # linedata = l.split('\t') + # var = ('_').join([linedata[0], linedata[1], linedata[3], linedata[4]]) + # sites.add(var) + # return sites + + + def get_strict_discordance(self, snp_gtypes, 
cellsnp_gtypes): + ''' + take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant + sites and relaxed concordant sites + 1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype + 2) if you have a 0/0 you can not get a 1/1 or 0/1 + 3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 1/1 + So - each obversed cellsnp allele must be in the array SNP gtype + ''' + true_discordant = 0 + relaxed_concordant = 0 + relaxed_concordant_informative = 0 + true_discordant_uninformative = 0 + + for i in range(0, len(snp_gtypes)): + discordant = False + snp_data = snp_gtypes[i].split('_') + cellsnp_data = cellsnp_gtypes[i].split('_') + + # the below will no longer work due to differing length of input strings + # snp_alleles = [snp_gtypes[i][-3], snp_gtypes[i][-1]] + # cellsnp_alleles = [cellsnp_gtypes[i][-3], cellsnp_gtypes[i][-1]] - total_sites = len(expected_vars) - #add cols for DP, AD< OTH - cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) - cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) - cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) - total_dp = cell_vars['DP'].sum() - total_oth = cell_vars['OTH'].sum() - total_reads = total_dp + total_oth - - # expected genotype 0/0 - expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] - hom_ref_sites = set(expected_hom_ref['ids']) - cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] - ad_hom_ref = cell_vars2['AD'].sum() - oth_hom_ref = cell_vars2['OTH'].sum() - discordant_hom_ref = ad_hom_ref + oth_hom_ref - - # expected genotype 0/1 or 1/0 - hets = ['0/1', '1/0'] - expected_het = expected_vars[expected_vars['vars'].isin(hets)] - het_sites = set(expected_het['ids']) - cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] - discordant_het = cell_vars3['OTH'].sum() - - # expected genotype 1/1 - expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] - hom_alt_sites = set(expected_hom_alt['ids']) - cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] - # DP + OTH - AD - ad_hom_alt = cell_vars4['AD'].sum() - dp_hom_alt = cell_vars4['DP'].sum() - oth_hom_alt = cell_vars4['OTH'].sum() - discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt - - discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt - - return total_sites, total_reads, discordant_reads - - def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): - # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. - # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. 
- # Author: M.Ozols + snp_alleles = [snp_data[4][0], snp_data[4][2]] + cellsnp_alleles = [cellsnp_data[4][0], cellsnp_data[4][2]] + + snp_alleles_set = set(snp_alleles) + cellsnp_alleles_set = set(cellsnp_alleles) - cell_vars_norm = self.norm_genotypes(cell_vars) - - if len(cell_vars_norm) > 0: - Total_Overlappin_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) - expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlappin_sites)] - cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlappin_sites)] - # print(cell_vars_norm) - # print(expected_vars2) - # print(cell_vars2) - # exit(0) - Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) - Discodrant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) - disc = pd.DataFrame(Discodrant_sites,columns=['combo_x']) - df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') - disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') - disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] - disc_sites = ';'.join(disc2['expected_retrieved']) - #find truly discordant sites - true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, true_discordant_uninformative_count = self.get_strict_discordance(disc2['0_y'], disc2['0_x']) - #find discordant reads - total_sites, total_reads, discordant_reads = self.read_condordance(expected_vars2, cell_vars2) + snp_var = ('_').join(snp_data[0:4]) + cellsnp_var = ('_').join(cellsnp_data[0:4]) + + if not cellsnp_var == snp_var: + print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i]) + exit(1) else: - Total_Overlappin_sites = set() - Concordant_Sites = set() - Discodrant_sites = set() - disc_sites = '' - true_discordant_count = 0 - relaxed_concordant_count = 0 - total_sites = 0 - total_reads = 0 - discordant_reads = 0 - - return Concordant_Sites, Discodrant_sites, Total_Overlappin_sites, disc_sites,cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, true_discordant_uninformative_count, total_sites, total_reads, discordant_reads - + for allele in cellsnp_alleles_set: + if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant + discordant = True + + if discordant == True: + true_discordant+=1 + if snp_var in self.uninformative_sites: + true_discordant_uninformative+=1 + else: + relaxed_concordant+=1 + if snp_var in self.informative_sites: + relaxed_concordant_informative+=1 + + return true_discordant, relaxed_concordant, relaxed_concordant_informative, true_discordant_uninformative + + + def read_condordance(self, expected_vars, cell_vars): + ''' + get read level concordance using DP, AD and OTH format fields + ##FORMAT= + ##FORMAT= + ##FORMAT= + ''' + if not len(expected_vars) == len(cell_vars): + print("length mismatch between expected vars and cell vars") + exit(1) + + total_sites = len(expected_vars) + #add cols for DP, AD< OTH + cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) + cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) + cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) + total_dp = cell_vars['DP'].sum() + total_oth = cell_vars['OTH'].sum() + total_reads = total_dp + total_oth + + # expected genotype 0/0 + expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] + hom_ref_sites = set(expected_hom_ref['ids']) + cell_vars2 = 
cell_vars[cell_vars['ids'].isin(hom_ref_sites)] + ad_hom_ref = cell_vars2['AD'].sum() + oth_hom_ref = cell_vars2['OTH'].sum() + discordant_hom_ref = ad_hom_ref + oth_hom_ref + + # expected genotype 0/1 or 1/0 + hets = ['0/1', '1/0'] + expected_het = expected_vars[expected_vars['vars'].isin(hets)] + het_sites = set(expected_het['ids']) + cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] + discordant_het = cell_vars3['OTH'].sum() + + # expected genotype 1/1 + expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] + hom_alt_sites = set(expected_hom_alt['ids']) + cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] + # DP + OTH - AD + ad_hom_alt = cell_vars4['AD'].sum() + dp_hom_alt = cell_vars4['DP'].sum() + oth_hom_alt = cell_vars4['OTH'].sum() + discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt + + discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt + + return total_sites, total_reads, discordant_reads + - def set_results(self,to_set,id): - # Recod to disk to save the loading mmeory time. - with open(f'tmp_{id}.pkl', 'wb') as f: - pickle.dump(to_set, f) - self.record_dict[id]=f'tmp_{id}.pkl' + def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): + # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. + # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. + # Author: M.Ozols - def append_results_cell_concordances(self,result): - count=result[11] - try: - percent_concordant = result[2]/(result[3]+result[2])*100 - except: - percent_concordant = 0 - - try: - percent_discordant = result[3]/(result[3]+result[2])*100 - except: - percent_discordant = 0 + cell_vars_norm = self.norm_genotypes(cell_vars) + + if len(cell_vars_norm) > 0: + Total_Overlappin_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) + expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlappin_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlappin_sites)] + + Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) + Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) + disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) + df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') + disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') + disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] + disc_sites = ';'.join(disc2['expected_retrieved']) + #find truly discordant sites + true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, true_discordant_uninformative_count = self.get_strict_discordance(disc2['0_y'], disc2['0_x']) + #find discordant reads + total_sites, total_reads, discordant_reads = self.read_condordance(expected_vars2, cell_vars2) + else: + Total_Overlappin_sites = set() + Concordant_Sites = set() + Discordant_sites = set() + disc_sites = '' + true_discordant_count = 0 + relaxed_concordant_count = 0 + total_sites = 0 + total_reads = 0 + discordant_reads = 0 + + return Concordant_Sites, Discordant_sites, Total_Overlappin_sites, disc_sites,cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, 
true_discordant_uninformative_count, total_sites, total_reads, discordant_reads + + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. + with open(f'tmp_{id}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}.pkl' + + def append_results_cell_concordances(self,result): + count=result[11] + try: + percent_concordant = result[2]/(result[3]+result[2])*100 + except: + percent_concordant = 0 + + try: + percent_discordant = result[3]/(result[3]+result[2])*100 + except: + percent_discordant = 0 + + try: + percent_relaxed_concordant = result[4]/(result[4]+result[5])*100 + except: + percent_relaxed_concordant = 0 + + try: + percent_strict_discordant = result[5]/(result[4]+result[5])*100 + except: + percent_strict_discordant = 0 + + try: + read_discordance = result[15]/result[13] + except: + read_discordance = 0 + + print(count) + self.cell_concordance_table[f'{result[0]} --- {result[1]}'] = {'GT 1':result[0], + 'GT 2':result[1], + 'Nr_Concordant':result[2], + 'Nr_Discordant':result[3], + 'Nr_Relaxed_concordant':result[4], + 'Nr_strict_discordant':result[5], + 'Percent Concordant':percent_concordant, + 'Percent Discordant':percent_discordant, + 'Percent_relaxed_concordant': percent_relaxed_concordant, + 'Percent_strict_discordant': percent_strict_discordant, + 'Nr_concordant_informative': result[6], + 'Nr_discordant_uninformative': result[7], + 'NrTotal_Overlapping_sites_between_two_genotypes':result[8], + 'Nr_donor_distinct_sites_within_pool_individuals':result[10], + 'Number_of_sites_that_are_donor_concordant_and_exclusive':result[9], + 'Discordant_Site_Identities':result[12], + 'Total_sites': result[13], + 'Total_reads': result[14], + 'Discordant_reads': result[15], + 'Discordant_reads_by_n_sites': read_discordance + } + + if (count % 200 == 0): + print(f'recording and resetting memory {count}') + # self.record_dict[count]=self.exclusive_donor_variants + self.set_results(self.cell_concordance_table,count) + self.reset() + _="" + + def combine_written_files(self):#this one is for concordance class + to_export = self.cell_concordance_table + for val1 in self.record_dict.values(): + # here remove the int files. 
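+            # Merge each per-count chunk back into the in-memory concordance table (later
+            # keys simply overwrite earlier ones), then delete the temporary pickle.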
+ print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + for k1 in loaded_dict.keys(): + to_export[k1]=loaded_dict[k1] + os.remove(val1) + return to_export + + + def conc_table(self): + donor_assignments_table=self.donor_assignments_table + cell_assignments_table=self.cell_assignments_table + exclusive_don_variants=self.exclusive_don_variants + exclusive_cell_variants= self.exclusive_cell_variants + + pool = mp.Pool(cpus) + count = 0 + for i,row1 in donor_assignments_table.iterrows(): + donor_in_question = row1['donor_query'] + donor_gt_match = row1['donor_gt'] + if (donor_gt_match=='NONE'): + continue + Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell'])) try: - percent_relaxed_concordant = result[4]/(result[4]+result[5])*100 + expected_vars = exclusive_don_variants[donor_gt_match] except: - percent_relaxed_concordant = 0 + _='here we have specifically excluded the donor that has been assigned as it is not expected genotype, because of this we can not calculate the concordances' + continue + expected_vars_norm = self.norm_genotypes(expected_vars) try: - percent_strict_discordant = result[5]/(result[4]+result[5])*100 - except: - percent_strict_discordant = 0 - - try: - read_discordance = result[15]/result[13] + # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor. + dds = self.donor_distinct_sites[donor_gt_match] except: - read_discordance = 0 - - print(count) - self.cell_concordance_table[f'{result[0]} --- {result[1]}'] = {'GT 1':result[0], - 'GT 2':result[1], - 'Nr_Concordant':result[2], - 'Nr_Discordant':result[3], - 'Nr_Relaxed_concordant':result[4], - 'Nr_strict_discordant':result[5], - 'Percent Concordant':percent_concordant, - 'Percent Discordant':percent_discordant, - 'Percent_relaxed_concordant': percent_relaxed_concordant, - 'Percent_strict_discordant': percent_strict_discordant, - 'Nr_concordant_informative': result[6], - 'Nr_discordant_uninformative': result[7], - 'NrTotal_Overlapping_sites_between_two_genotypes':result[8], - 'Nr_donor_distinct_sites_within_pool_individuals':result[10], - 'Number_of_sites_that_are_donor_concordant_and_exclusive':result[9], - 'Discordant_Site_Identities':result[12], - 'Total_sites': result[13], - 'Total_reads': result[14], - 'Discordant_reads': result[15], - 'Discordant_reads_by_n_sites': read_discordance - } + continue - if (count % 200 == 0): - print(f'recording and resetting memory {count}') - # self.record_dict[count]=self.exclusive_donor_variants - self.set_results(self.cell_concordance_table,count) - self.reset() - _="" - - def combine_written_files(self):#this one is for concordance class - to_export = self.cell_concordance_table - for val1 in self.record_dict.values(): - # here remove the int files. 
- print(f"merging temp file: {val1}") - with open(val1, 'rb') as f: - loaded_dict = pickle.load(f) - for k1 in loaded_dict.keys(): - to_export[k1]=loaded_dict[k1] - os.remove(val1) - return to_export - - - def conc_table(self): - donor_assignments_table=self.donor_assignments_table - cell_assignments_table=self.cell_assignments_table - exclusive_don_variants=self.exclusive_don_variants - exclusive_cell_variants= self.exclusive_cell_variants - - pool = mp.Pool(cpus) - count = 0 - for i,row1 in donor_assignments_table.iterrows(): - donor_in_question = row1['donor_query'] - donor_gt_match = row1['donor_gt'] - if (donor_gt_match=='NONE'): - continue - Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell'])) - try: - expected_vars = exclusive_don_variants[donor_gt_match] - except: - _='here we have specifically excluded the donor that has been assigned as it is not expected genotype, because of this we can not calculate the concordances' - continue + for cell1 in Cells_to_keep_pre: + count+=1 + cell_vars = exclusive_cell_variants[cell1] + self.cell_concordance_table[f'{cell1} --- {donor_gt_match}']={} + # pool.apply_async(self.concordance_dable_production, args=([expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count]),callback=self.append_results_cell_concordances) + result1 = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count) + self.append_results_cell_concordances(result1) - expected_vars_norm = self.norm_genotypes(expected_vars) - try: - # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor. - dds = self.donor_distinct_sites[donor_gt_match] - except: - continue - - for cell1 in Cells_to_keep_pre: - count+=1 - # if count>800: - # break - cell_vars = exclusive_cell_variants[cell1] - # cell_vars_dp = exclusive_cell_variants_dp[cell1] - - self.cell_concordance_table[f'{cell1} --- {donor_gt_match}']={} - # pool.apply_async(self.concordance_dable_production, args=([expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count]),callback=self.append_results_cell_concordances) - result1 = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count) - self.append_results_cell_concordances(result1) - - pool.close() - pool.join() - output = self.combine_written_files() - return output + pool.close() + pool.join() + output = self.combine_written_files() + return output + + def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count): + Nr_donor_distinct_sites = len(dds) + Concordant_Sites, Discordant_sites, Total_Overlappin_sites,discordant_sites,cell_vars_norm, Nr_strict_discordant, relaxed_concordant_count, relaxed_concordant_informative_count, true_discordant_uninformative_count, total_sites, total_reads, discordant_reads = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) - def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count): - Nr_donor_distinct_sites = len(dds) - Concordant_Sites, Discodrant_sites, Total_Overlappin_sites,discordant_sites,cell_vars_norm, Nr_strict_discordant, relaxed_concordant_count, relaxed_concordant_informative_count, true_discordant_uninformative_count, total_sites, total_reads, discordant_reads = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) - Nr_Concordant = len(Concordant_Sites) - Nr_Relaxed_concordant = Nr_Concordant + 
relaxed_concordant_count - Nr_Discordant = len(Discodrant_sites) - Nr_Total_Overlapping_sites = len(Total_Overlappin_sites) - Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Concordant_Sites))) - Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) - - return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,Nr_Relaxed_concordant, Nr_strict_discordant, relaxed_concordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites, - Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,discordant_sites, total_sites, total_reads, discordant_reads] + Nr_Concordant = len(Concordant_Sites) + Nr_Relaxed_concordant = Nr_Concordant + relaxed_concordant_count + Nr_Discordant = len(Discordant_sites) + Nr_Total_Overlapping_sites = len(Total_Overlappin_sites) + Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Concordant_Sites))) + Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) + + return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,Nr_Relaxed_concordant, Nr_strict_discordant, relaxed_concordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites, + Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,discordant_sites, total_sites, total_reads, discordant_reads] class VCF_Loader: diff --git a/bin/concordance_calculations_donor_exclusive_read_level_noA2G.py b/bin/concordance_calculations_donor_exclusive_read_level_noA2G.py index 6a066d85..f859605a 100755 --- a/bin/concordance_calculations_donor_exclusive_read_level_noA2G.py +++ b/bin/concordance_calculations_donor_exclusive_read_level_noA2G.py @@ -18,387 +18,383 @@ class Concordances: - def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites,informative_sites, uninformative_sites): - self.reset() - self.donor_assignments_table=donor_assignments_table - self.cell_assignments_table=cell_assignments_table - self.exclusive_don_variants=exclusive_don_variants - self.exclusive_cell_variants=exclusive_cell_variants - self.donor_distinct_sites=donor_distinct_sites - self.informative_sites = informative_sites - self.uninformative_sites = uninformative_sites - self.record_dict={} - - def norm_genotypes(self,expected_vars): - expected_vars = pd.DataFrame(expected_vars) - if len(expected_vars) > 0: - split_str=expected_vars[0].str.split("_") - expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] - expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] - expected_vars['vars'] = split_str.str[4] - expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) - expected_vars = expected_vars[expected_vars['vars']!='./.'] - expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' - expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] - return expected_vars - - def reset(self): - self.cell_concordance_table ={} - - # def get_sites_from_tsv(self, sites_file): - # """ - # get sites frm a tsv file where cols are chrom, pos, id, ref, alt - # assumes no multiallelics - # """ - # sites = set() - # with open(sites_file, 'r') as f: - # lines = f.readlines() - # for l in lines: - # linedata = l.split('\t') - # var = ('_').join([linedata[0], linedata[1], linedata[3], linedata[4]]) - # 
sites.add(var) - # return sites - - - def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes): - ''' - take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant - sites and relaxed concordant sites - 1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype - 2) if you have a 0/0 you can not get a 1/1 or 0/1 - 3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 1/1 - So - each obversed cellsnp allele must be in the array SNP gtype - ''' - true_discordant = 0 - relaxed_concordant = 0 - relaxed_concordant_informative = 0 - relaxed_concordant_uninformative = 0 - true_discordant_informative = 0 - true_discordant_uninformative = 0 - - for i in range(0, len(snp_gtypes)): - discordant = False - snp_data = snp_gtypes[i].split('_') - cellsnp_data = cellsnp_gtypes[i].split('_') - - # the below will no longer work due to differing length of input strings - # snp_alleles = [snp_gtypes[i][-3], snp_gtypes[i][-1]] - # cellsnp_alleles = [cellsnp_gtypes[i][-3], cellsnp_gtypes[i][-1]] - - - snp_alleles = [snp_data[4][0], snp_data[4][2]] - cellsnp_alleles = [cellsnp_data[4][0], cellsnp_data[4][2]] - - snp_alleles_set = set(snp_alleles) - cellsnp_alleles_set = set(cellsnp_alleles) - - snp_var = ('_').join(snp_data[0:4]) - cellsnp_var = ('_').join(cellsnp_data[0:4]) - - if not cellsnp_var == snp_var: - print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i]) - exit(1) - else: - for allele in cellsnp_alleles_set: - if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant - discordant = True - - if discordant == True: - true_discordant+=1 - if snp_var in self.uninformative_sites: - true_discordant_uninformative+=1 - elif snp_var in self.informative_sites: - true_discordant_informative+=1 - else: - relaxed_concordant+=1 - if snp_var in self.uninformative_sites: - relaxed_concordant_uninformative+=1 - elif snp_var in self.informative_sites: - relaxed_concordant_informative+=1 - - return true_discordant, relaxed_concordant, relaxed_concordant_informative, relaxed_concordant_uninformative, true_discordant_informative, true_discordant_uninformative - - - def read_condordance(self, expected_vars, cell_vars): - ''' - get read level concordance using DP, AD and OTH format fields - ##FORMAT= - ##FORMAT= - ##FORMAT= - ''' - if not len(expected_vars) == len(cell_vars): - print("length mismatch between expected vars and cell vars") - exit(1) + def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites,informative_sites, uninformative_sites): + self.reset() + self.donor_assignments_table=donor_assignments_table + self.cell_assignments_table=cell_assignments_table + self.exclusive_don_variants=exclusive_don_variants + self.exclusive_cell_variants=exclusive_cell_variants + self.donor_distinct_sites=donor_distinct_sites + self.informative_sites = informative_sites + self.uninformative_sites = uninformative_sites + self.record_dict={} - total_sites = len(expected_vars) - #add cols for DP, AD< OTH - cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) - cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) - cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) - #split to informative and uninformative sites - mask_i = cell_vars['ids'].isin(self.informative_sites) - cell_vars_informative = cell_vars[mask_i] - mask_u = 
cell_vars['ids'].isin(self.uninformative_sites) - cell_vars_uninformative = cell_vars[mask_u] - informative_sites = len(cell_vars_informative) - uninformative_sites = len(cell_vars_uninformative) - - total_dp = cell_vars['DP'].sum() - total_oth = cell_vars['OTH'].sum() - total_reads = total_dp + total_oth - total_dp_inf = cell_vars_informative['DP'].sum() - total_oth_inf = cell_vars_informative['OTH'].sum() - total_reads_informative = total_dp_inf + total_oth_inf - total_dp_uninf = cell_vars_uninformative['DP'].sum() - total_oth_uninf = cell_vars_uninformative['OTH'].sum() - total_reads_uninformative = total_dp_uninf + total_oth_uninf - - # expected genotype 0/0 - expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] - hom_ref_sites = set(expected_hom_ref['ids']) - cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] - cell_vars_inf_2 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_ref_sites)] - cell_vars_uninf_2 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_ref_sites)] - ad_hom_ref = cell_vars2['AD'].sum() - oth_hom_ref = cell_vars2['OTH'].sum() - discordant_hom_ref = ad_hom_ref + oth_hom_ref - ad_hom_ref_inf = cell_vars_inf_2['AD'].sum() - oth_hom_ref_inf = cell_vars_inf_2['OTH'].sum() - discordant_hom_ref_informative = ad_hom_ref_inf + oth_hom_ref_inf - ad_hom_ref_uninf = cell_vars_uninf_2['AD'].sum() - oth_hom_ref_uninf = cell_vars_uninf_2['OTH'].sum() - discordant_hom_ref_uninformative = ad_hom_ref_uninf + oth_hom_ref_uninf - - # expected genotype 0/1 or 1/0 - hets = ['0/1', '1/0'] - expected_het = expected_vars[expected_vars['vars'].isin(hets)] - het_sites = set(expected_het['ids']) - cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] - cell_vars_inf_3 = cell_vars_informative[cell_vars_informative['ids'].isin(het_sites)] - cell_vars_uninf_3 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(het_sites)] - discordant_het = cell_vars3['OTH'].sum() - discordant_het_informative = cell_vars_inf_3['OTH'].sum() - discordant_het_uninformative = cell_vars_uninf_3['OTH'].sum() - - # expected genotype 1/1 - expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] - hom_alt_sites = set(expected_hom_alt['ids']) - cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] - cell_vars_inf_4 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_alt_sites)] - cell_vars_uninf_4 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_alt_sites)] - # DP + OTH - AD - ad_hom_alt = cell_vars4['AD'].sum() - dp_hom_alt = cell_vars4['DP'].sum() - oth_hom_alt = cell_vars4['OTH'].sum() - discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt - ad_hom_alt_inf = cell_vars_inf_4['AD'].sum() - dp_hom_alt_inf = cell_vars_inf_4['DP'].sum() - oth_hom_alt_inf = cell_vars_inf_4['OTH'].sum() - discordant_hom_alt_informative = (dp_hom_alt_inf + oth_hom_alt_inf) - ad_hom_alt_inf - ad_hom_alt_uninf = cell_vars_uninf_4['AD'].sum() - dp_hom_alt_uninf = cell_vars_uninf_4['DP'].sum() - oth_hom_alt_uninf = cell_vars_uninf_4['OTH'].sum() - discordant_hom_alt_uninformative = (dp_hom_alt_uninf + oth_hom_alt_uninf) - ad_hom_alt_uninf - - discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt - discordant_reads_informative = discordant_hom_ref_informative + discordant_het_informative + discordant_hom_alt_informative - discordant_reads_uninformative = discordant_hom_ref_uninformative + discordant_het_uninformative + discordant_hom_alt_uninformative - - return total_sites, informative_sites, uninformative_sites, 
total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative - + def norm_genotypes(self,expected_vars): + expected_vars = pd.DataFrame(expected_vars) + if len(expected_vars) > 0: + split_str=expected_vars[0].str.split("_") + expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] + expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] + expected_vars['vars'] = split_str.str[4] + expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) + expected_vars = expected_vars[expected_vars['vars']!='./.'] + expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' + expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] + return expected_vars + + def reset(self): + self.cell_concordance_table ={} + + # def get_sites_from_tsv(self, sites_file): + # """ + # get sites frm a tsv file where cols are chrom, pos, id, ref, alt + # assumes no multiallelics + # """ + # sites = set() + # with open(sites_file, 'r') as f: + # lines = f.readlines() + # for l in lines: + # linedata = l.split('\t') + # var = ('_').join([linedata[0], linedata[1], linedata[3], linedata[4]]) + # sites.add(var) + # return sites + + + def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes): + ''' + take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant + sites and relaxed concordant sites + 1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype + 2) if you have a 0/0 you can not get a 1/1 or 0/1 + 3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 1/1 + So - each obversed cellsnp allele must be in the array SNP gtype + ''' + true_discordant = 0 + relaxed_concordant = 0 + relaxed_concordant_informative = 0 + relaxed_concordant_uninformative = 0 + true_discordant_informative = 0 + true_discordant_uninformative = 0 + + for i in range(0, len(snp_gtypes)): + discordant = False + snp_data = snp_gtypes[i].split('_') + cellsnp_data = cellsnp_gtypes[i].split('_') + + # the below will no longer work due to differing length of input strings + # snp_alleles = [snp_gtypes[i][-3], snp_gtypes[i][-1]] + # cellsnp_alleles = [cellsnp_gtypes[i][-3], cellsnp_gtypes[i][-1]] - def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): - # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. - # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. 
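norm_genotypes above relies on every variant being encoded as one underscore-delimited string, chrom_pos_ref_alt_genotype (cellSNP records carry extra DP/AD/OTH fields after the genotype). From that string it builds an 'ids' key (chrom_pos_ref_alt), a 'pos' key (chrom_pos) and a normalised genotype in which '|' becomes '/', './.' rows are dropped and '0/1' is folded into '1/0', so that 'combo' (ids plus genotype) can later be compared with plain set operations. A small worked example under that assumption (the two input strings are invented):

import pandas as pd

records = pd.DataFrame(['1_12345_A_G_0|1', '1_99999_C_T_./.'])

split_str = records[0].str.split('_')
records['ids'] = split_str.str[0] + '_' + split_str.str[1] + '_' + split_str.str[2] + '_' + split_str.str[3]
records['pos'] = split_str.str[0] + '_' + split_str.str[1]
records['vars'] = split_str.str[4].str.replace('|', '/', regex=False)
records = records[records['vars'] != './.']             # drop missing genotypes
records.loc[records['vars'] == '0/1', 'vars'] = '1/0'   # treat 0/1 and 1/0 as the same het call
records['combo'] = records['ids'] + '_' + records['vars']
print(records['combo'].tolist())                        # ['1_12345_A_G_1/0']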
- # Author: M.Ozols + + snp_alleles = [snp_data[4][0], snp_data[4][2]] + cellsnp_alleles = [cellsnp_data[4][0], cellsnp_data[4][2]] + + snp_alleles_set = set(snp_alleles) + cellsnp_alleles_set = set(cellsnp_alleles) - cell_vars_norm = self.norm_genotypes(cell_vars) - - if len(cell_vars_norm) > 0: - Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) - expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] - cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] - # print(cell_vars_norm) - # print(expected_vars2) - # print(cell_vars2) - # exit(0) - Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) - Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) - disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) - df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') - disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') - disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] - disc_sites = ';'.join(disc2['expected_retrieved']) - #find truly discordant sites - true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count = self.get_strict_discordance(disc2['0_y'], disc2['0_x']) - #find discordant reads - total_sites, informative_sites, uninformative_sites, total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative = self.read_condordance(expected_vars2, cell_vars2) + snp_var = ('_').join(snp_data[0:4]) + cellsnp_var = ('_').join(cellsnp_data[0:4]) + + if not cellsnp_var == snp_var: + print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i]) + exit(1) else: - Total_Overlapping_sites = set() - Concordant_Sites = set() - Discordant_sites = set() - disc_sites = '' - true_discordant_count = 0 - relaxed_concordant_count = 0 - total_sites = 0 - - discordant_reads = 0 - - return Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites,cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, total_sites, informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative - + for allele in cellsnp_alleles_set: + if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant + discordant = True + + if discordant == True: + true_discordant+=1 + if snp_var in self.uninformative_sites: + true_discordant_uninformative+=1 + elif snp_var in self.informative_sites: + true_discordant_informative+=1 + else: + relaxed_concordant+=1 + if snp_var in self.uninformative_sites: + relaxed_concordant_uninformative+=1 + elif snp_var in self.informative_sites: + relaxed_concordant_informative+=1 + + return true_discordant, relaxed_concordant, relaxed_concordant_informative, relaxed_concordant_uninformative, true_discordant_informative, true_discordant_uninformative - def set_results(self,to_set,id): - # Recod to disk to save the loading mmeory time. 
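The allele-subset test in get_strict_discordance encodes the relaxed-concordance rule from the docstring: a cellSNP call is kept as (relaxed) concordant as long as every allele observed by cellSNP is also present in the array genotype, and counted as truly discordant otherwise. A compact restatement of that rule (a sketch, not the function used by the script):

def is_relaxed_concordant(array_gt: str, cellsnp_gt: str) -> bool:
    # array 0/1 vs cellSNP 1/1 -> True  (a het donor can legitimately yield only ALT reads)
    # array 0/0 vs cellSNP 0/1 -> False (an ALT allele was observed that the donor cannot carry)
    array_alleles = set(array_gt.replace('|', '/').split('/'))
    cellsnp_alleles = set(cellsnp_gt.replace('|', '/').split('/'))
    return cellsnp_alleles.issubset(array_alleles)

assert is_relaxed_concordant('0/1', '1/1')
assert not is_relaxed_concordant('0/0', '0/1')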
- with open(f'tmp_{id}.pkl', 'wb') as f: - pickle.dump(to_set, f) - self.record_dict[id]=f'tmp_{id}.pkl' + + def read_condordance(self, expected_vars, cell_vars): + ''' + get read level concordance using DP, AD and OTH format fields + ##FORMAT= + ##FORMAT= + ##FORMAT= + ''' + if not len(expected_vars) == len(cell_vars): + print("length mismatch between expected vars and cell vars") + exit(1) + + total_sites = len(expected_vars) + #add cols for DP, AD< OTH + cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) + cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) + cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) + #split to informative and uninformative sites + mask_i = cell_vars['ids'].isin(self.informative_sites) + cell_vars_informative = cell_vars[mask_i] + mask_u = cell_vars['ids'].isin(self.uninformative_sites) + cell_vars_uninformative = cell_vars[mask_u] + informative_sites = len(cell_vars_informative) + uninformative_sites = len(cell_vars_uninformative) + + total_dp = cell_vars['DP'].sum() + total_oth = cell_vars['OTH'].sum() + total_reads = total_dp + total_oth + total_dp_inf = cell_vars_informative['DP'].sum() + total_oth_inf = cell_vars_informative['OTH'].sum() + total_reads_informative = total_dp_inf + total_oth_inf + total_dp_uninf = cell_vars_uninformative['DP'].sum() + total_oth_uninf = cell_vars_uninformative['OTH'].sum() + total_reads_uninformative = total_dp_uninf + total_oth_uninf + + # expected genotype 0/0 + expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] + hom_ref_sites = set(expected_hom_ref['ids']) + cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] + cell_vars_inf_2 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_ref_sites)] + cell_vars_uninf_2 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_ref_sites)] + ad_hom_ref = cell_vars2['AD'].sum() + oth_hom_ref = cell_vars2['OTH'].sum() + discordant_hom_ref = ad_hom_ref + oth_hom_ref + ad_hom_ref_inf = cell_vars_inf_2['AD'].sum() + oth_hom_ref_inf = cell_vars_inf_2['OTH'].sum() + discordant_hom_ref_informative = ad_hom_ref_inf + oth_hom_ref_inf + ad_hom_ref_uninf = cell_vars_uninf_2['AD'].sum() + oth_hom_ref_uninf = cell_vars_uninf_2['OTH'].sum() + discordant_hom_ref_uninformative = ad_hom_ref_uninf + oth_hom_ref_uninf + + # expected genotype 0/1 or 1/0 + hets = ['0/1', '1/0'] + expected_het = expected_vars[expected_vars['vars'].isin(hets)] + het_sites = set(expected_het['ids']) + cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] + cell_vars_inf_3 = cell_vars_informative[cell_vars_informative['ids'].isin(het_sites)] + cell_vars_uninf_3 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(het_sites)] + discordant_het = cell_vars3['OTH'].sum() + discordant_het_informative = cell_vars_inf_3['OTH'].sum() + discordant_het_uninformative = cell_vars_uninf_3['OTH'].sum() + + # expected genotype 1/1 + expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] + hom_alt_sites = set(expected_hom_alt['ids']) + cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] + cell_vars_inf_4 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_alt_sites)] + cell_vars_uninf_4 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_alt_sites)] + # DP + OTH - AD + ad_hom_alt = cell_vars4['AD'].sum() + dp_hom_alt = cell_vars4['DP'].sum() + oth_hom_alt = cell_vars4['OTH'].sum() + discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt + ad_hom_alt_inf = cell_vars_inf_4['AD'].sum() + dp_hom_alt_inf 
= cell_vars_inf_4['DP'].sum() + oth_hom_alt_inf = cell_vars_inf_4['OTH'].sum() + discordant_hom_alt_informative = (dp_hom_alt_inf + oth_hom_alt_inf) - ad_hom_alt_inf + ad_hom_alt_uninf = cell_vars_uninf_4['AD'].sum() + dp_hom_alt_uninf = cell_vars_uninf_4['DP'].sum() + oth_hom_alt_uninf = cell_vars_uninf_4['OTH'].sum() + discordant_hom_alt_uninformative = (dp_hom_alt_uninf + oth_hom_alt_uninf) - ad_hom_alt_uninf + + discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt + discordant_reads_informative = discordant_hom_ref_informative + discordant_het_informative + discordant_hom_alt_informative + discordant_reads_uninformative = discordant_hom_ref_uninformative + discordant_het_uninformative + discordant_hom_alt_uninformative + + return total_sites, informative_sites, uninformative_sites, total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative + + + def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): + # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. + # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. + # Author: M.Ozols - def append_results_cell_concordances(self,result): - count=result[13] - try: - percent_concordant = result[2]/(result[3]+result[2])*100 - except: - percent_concordant = 0 - - try: - percent_discordant = result[3]/(result[3]+result[2])*100 - except: - percent_discordant = 0 + cell_vars_norm = self.norm_genotypes(cell_vars) + + if len(cell_vars_norm) > 0: + Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) + expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] + Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) + Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) + disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) + df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') + disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') + disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] + disc_sites = ';'.join(disc2['expected_retrieved']) + #find truly discordant sites + true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count = self.get_strict_discordance(disc2['0_y'], disc2['0_x']) + #find discordant reads + total_sites, informative_sites, uninformative_sites, total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative = self.read_condordance(expected_vars2, cell_vars2) + else: + Total_Overlapping_sites = set() + Concordant_Sites = set() + Discordant_sites = set() + disc_sites = '' + true_discordant_count = 0 + relaxed_concordant_count = 0 + total_sites = 0 + + discordant_reads = 0 - try: - percent_relaxed_concordant = result[4]/(result[4]+result[5])*100 - except: - percent_relaxed_concordant = 0 - - try: - percent_strict_discordant = 
result[5]/(result[4]+result[5])*100 - except: - percent_strict_discordant = 0 + return Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites,cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, total_sites, informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative + - try: - read_discordance = result[21]/result[15] - except: - read_discordance = 0 - - donor = result[1] - cohort = 'UNKNOWN' - donor_split = donor.split("_") - if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): - cohort = 'UKB' - elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): - cohort = 'ELGH' - - print(count) - self.cell_concordance_table[f'{result[0]} --- {result[1]}'] = {'GT 1':result[0], - 'GT 2':result[1], - 'cohort': cohort, - 'Nr_Concordant':result[2], - 'Nr_Discordant':result[3], - 'Nr_Relaxed_concordant':result[4], - 'Nr_strict_discordant':result[5], - 'Percent Concordant':percent_concordant, - 'Percent Discordant':percent_discordant, - 'Percent_relaxed_concordant': percent_relaxed_concordant, - 'Percent_strict_discordant': percent_strict_discordant, - 'Nr_concordant_informative': result[6], - 'Nr_concordant_uninformative': result[7], - 'Nr_discordant_informative': result[8], - 'Nr_discordant_uninformative': result[9], - 'NrTotal_Overlapping_sites_between_two_genotypes':result[10], - 'Nr_donor_distinct_sites_within_pool_individuals':result[12], - 'Number_of_sites_that_are_donor_concordant_and_exclusive':result[11], - 'Discordant_Site_Identities':result[14], - 'Total_sites': result[15], - 'Total_informative_sites': result[16], - 'Total_uninformative_sites': result[17], - 'Total_reads': result[18], - 'Total_reads_informative': result[19], - 'Total_reads_uninformative': result[20], - 'Discordant_reads': result[21], - 'Discordant_reads_informtive': result[22], - 'Discordant_reads_uninformtive': result[23], - 'Discordant_reads_by_n_sites': read_discordance - } - - if (count % 200 == 0): - print(f'recording and resetting memory {count}') - # self.record_dict[count]=self.exclusive_donor_variants - self.set_results(self.cell_concordance_table,count) - self.reset() - _="" + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. + with open(f'tmp_{id}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}.pkl' + + def append_results_cell_concordances(self,result): + count=result[13] + try: + percent_concordant = result[2]/(result[3]+result[2])*100 + except: + percent_concordant = 0 - def combine_written_files(self):#this one is for concordance class - to_export = self.cell_concordance_table - for val1 in self.record_dict.values(): - # here remove the int files. 
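The cohort label recorded above is inferred purely from the shape of the matched donor identifier: two identical underscore-separated tokens are treated as a UKB-style id, three tokens whose first part is 14 characters long as an ELGH-style id, and anything else falls back to UNKNOWN. The ids in the example below are invented purely to illustrate the heuristic:

def infer_cohort(donor_id: str) -> str:
    parts = donor_id.split('_')
    if len(parts) == 2 and parts[0] == parts[1]:
        return 'UKB'
    if len(parts) == 3 and len(parts[0]) == 14:
        return 'ELGH'
    return 'UNKNOWN'

print(infer_cohort('S2023001_S2023001'))     # UKB     (two identical tokens)
print(infer_cohort('ABCDEFGHIJKLMN_01_A1'))  # ELGH    (first token is 14 characters long)
print(infer_cohort('donor0'))                # UNKNOWN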
- print(f"merging temp file: {val1}") - with open(val1, 'rb') as f: - loaded_dict = pickle.load(f) - for k1 in loaded_dict.keys(): - to_export[k1]=loaded_dict[k1] - os.remove(val1) - return to_export + try: + percent_discordant = result[3]/(result[3]+result[2])*100 + except: + percent_discordant = 0 + + try: + percent_relaxed_concordant = result[4]/(result[4]+result[5])*100 + except: + percent_relaxed_concordant = 0 + try: + percent_strict_discordant = result[5]/(result[4]+result[5])*100 + except: + percent_strict_discordant = 0 + + try: + read_discordance = result[21]/result[15] + except: + read_discordance = 0 + + donor = result[1] + cohort = 'UNKNOWN' + donor_split = donor.split("_") + if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): + cohort = 'UKB' + elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): + cohort = 'ELGH' + + print(count) + self.cell_concordance_table[f'{result[0]} --- {result[1]}'] = {'GT 1':result[0], + 'GT 2':result[1], + 'cohort': cohort, + 'Nr_Concordant':result[2], + 'Nr_Discordant':result[3], + 'Nr_Relaxed_concordant':result[4], + 'Nr_strict_discordant':result[5], + 'Percent Concordant':percent_concordant, + 'Percent Discordant':percent_discordant, + 'Percent_relaxed_concordant': percent_relaxed_concordant, + 'Percent_strict_discordant': percent_strict_discordant, + 'Nr_concordant_informative': result[6], + 'Nr_concordant_uninformative': result[7], + 'Nr_discordant_informative': result[8], + 'Nr_discordant_uninformative': result[9], + 'NrTotal_Overlapping_sites_between_two_genotypes':result[10], + 'Nr_donor_distinct_sites_within_pool_individuals':result[12], + 'Number_of_sites_that_are_donor_concordant_and_exclusive':result[11], + 'Discordant_Site_Identities':result[14], + 'Total_sites': result[15], + 'Total_informative_sites': result[16], + 'Total_uninformative_sites': result[17], + 'Total_reads': result[18], + 'Total_reads_informative': result[19], + 'Total_reads_uninformative': result[20], + 'Discordant_reads': result[21], + 'Discordant_reads_informtive': result[22], + 'Discordant_reads_uninformtive': result[23], + 'Discordant_reads_by_n_sites': read_discordance + } + + if (count % 200 == 0): + print(f'recording and resetting memory {count}') + # self.record_dict[count]=self.exclusive_donor_variants + self.set_results(self.cell_concordance_table,count) + self.reset() + _="" + + def combine_written_files(self):#this one is for concordance class + to_export = self.cell_concordance_table + for val1 in self.record_dict.values(): + # here remove the int files. 
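The try/except blocks around the percentage metrics above are only there to avoid a ZeroDivisionError when a cell has no sites in the relevant category; each metric is the numerator expressed as a share of the corresponding site total, or 0 when that total is 0. An equivalent guarded helper (hypothetical, not part of the script) makes the intent explicit:

def percent_of_total(numerator, *denominator_terms):
    total = sum(denominator_terms)
    return numerator / total * 100 if total > 0 else 0

# e.g. Percent_relaxed_concordant for 90 relaxed-concordant and 10 strictly discordant sites:
print(percent_of_total(90, 90, 10))   # 90.0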
+ print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + for k1 in loaded_dict.keys(): + to_export[k1]=loaded_dict[k1] + os.remove(val1) + return to_export + + + def conc_table(self): + donor_assignments_table=self.donor_assignments_table + cell_assignments_table=self.cell_assignments_table + exclusive_don_variants=self.exclusive_don_variants + exclusive_cell_variants= self.exclusive_cell_variants - def conc_table(self): - donor_assignments_table=self.donor_assignments_table - cell_assignments_table=self.cell_assignments_table - exclusive_don_variants=self.exclusive_don_variants - exclusive_cell_variants= self.exclusive_cell_variants + pool = mp.Pool(cpus) + count = 0 + for i,row1 in donor_assignments_table.iterrows(): + donor_in_question = row1['donor_query'] + donor_gt_match = row1['donor_gt'] + if (donor_gt_match=='NONE'): + continue + Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell'])) + try: + expected_vars = exclusive_don_variants[donor_gt_match] + except: + _='here we have specifically excluded the donor that has been assigned as it is not expected genotype, because of this we can not calculate the concordances' + continue + expected_vars_norm = self.norm_genotypes(expected_vars) + try: + # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor. + dds = self.donor_distinct_sites[donor_gt_match] + except: + continue - pool = mp.Pool(cpus) - count = 0 - for i,row1 in donor_assignments_table.iterrows(): - donor_in_question = row1['donor_query'] - donor_gt_match = row1['donor_gt'] - if (donor_gt_match=='NONE'): - continue - Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell'])) - try: - expected_vars = exclusive_don_variants[donor_gt_match] - except: - _='here we have specifically excluded the donor that has been assigned as it is not expected genotype, because of this we can not calculate the concordances' - continue - expected_vars_norm = self.norm_genotypes(expected_vars) - try: - # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor. 
- dds = self.donor_distinct_sites[donor_gt_match] - except: - continue + for cell1 in Cells_to_keep_pre: + count+=1 + # if count>800: + # break + cell_vars = exclusive_cell_variants[cell1] + # cell_vars_dp = exclusive_cell_variants_dp[cell1] + + self.cell_concordance_table[f'{cell1} --- {donor_gt_match}']={} + # pool.apply_async(self.concordance_dable_production, args=([expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count]),callback=self.append_results_cell_concordances) + result1 = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count) + self.append_results_cell_concordances(result1) - for cell1 in Cells_to_keep_pre: - count+=1 - # if count>800: - # break - cell_vars = exclusive_cell_variants[cell1] - # cell_vars_dp = exclusive_cell_variants_dp[cell1] - - self.cell_concordance_table[f'{cell1} --- {donor_gt_match}']={} - # pool.apply_async(self.concordance_dable_production, args=([expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count]),callback=self.append_results_cell_concordances) - result1 = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count) - self.append_results_cell_concordances(result1) - - pool.close() - pool.join() - output = self.combine_written_files() - return output - - def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count): - Nr_donor_distinct_sites = len(dds) - Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites, cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, total_sites, informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) - Nr_Concordant = len(Concordant_Sites) - Nr_Relaxed_concordant = Nr_Concordant + relaxed_concordant_count - Nr_Discordant = len(Discordant_sites) - Nr_Total_Overlapping_sites = len(Total_Overlapping_sites) - Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Concordant_Sites))) - Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) - - return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,Nr_Relaxed_concordant, true_discordant_count, relaxed_concordant_informative_count, - relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites, - Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,disc_sites, total_sites, informative_sites, - uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative] - - + pool.close() + pool.join() + output = self.combine_written_files() + return output + + def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count): + Nr_donor_distinct_sites = len(dds) + Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites, cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, total_sites, 
informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) + Nr_Concordant = len(Concordant_Sites) + Nr_Relaxed_concordant = Nr_Concordant + relaxed_concordant_count + Nr_Discordant = len(Discordant_sites) + Nr_Total_Overlapping_sites = len(Total_Overlapping_sites) + Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Concordant_Sites))) + Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) + + return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,Nr_Relaxed_concordant, true_discordant_count, relaxed_concordant_informative_count, + relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites, + Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,disc_sites, total_sites, informative_sites, + uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative] + + class VCF_Loader: def __init__(self, vcf_file, biallelic_only=True, @@ -436,7 +432,6 @@ def load_sample_mp(self,line,obs_ids,count,format_list): elif list_val[3] == 'A' and list_val[4] == 'G':#remove A>G pass elif list_val[3] == 'T' and list_val[4] == 'C':#also remove T>C - pass else: list_val2 = list_val[9:] @@ -787,5 +782,5 @@ def donor_exclusive_sites(exclusive_don_variants2): if len(result)>0: result.to_csv(outfile,sep='\t') - print('Processing Done') + print('Processing Done') \ No newline at end of file diff --git a/bin/concordance_calculations_subsample_informative.py b/bin/concordance_calculations_subsample_informative.py index 4aae389a..b2605971 100755 --- a/bin/concordance_calculations_subsample_informative.py +++ b/bin/concordance_calculations_subsample_informative.py @@ -18,506 +18,507 @@ class Concordances: - def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites,informative_sites, uninformative_sites): - self.reset() - self.donor_assignments_table=donor_assignments_table - self.cell_assignments_table=cell_assignments_table - self.exclusive_don_variants=exclusive_don_variants - self.exclusive_cell_variants=exclusive_cell_variants - self.donor_distinct_sites=donor_distinct_sites - self.informative_sites = informative_sites - self.uninformative_sites = uninformative_sites - self.record_dict={} - - def norm_genotypes(self,expected_vars): - expected_vars = pd.DataFrame(expected_vars) - if len(expected_vars) > 0: - split_str=expected_vars[0].str.split("_") - expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] - expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] - expected_vars['vars'] = split_str.str[4] - expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) - expected_vars = expected_vars[expected_vars['vars']!='./.'] - expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' - expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] - return expected_vars - - def reset(self): - self.cell_concordance_table ={} - - # def get_sites_from_tsv(self, sites_file): - # """ - # get sites frm a tsv file where cols are chrom, pos, id, ref, alt - # assumes no 
multiallelics - # """ - # sites = set() - # with open(sites_file, 'r') as f: - # lines = f.readlines() - # for l in lines: - # linedata = l.split('\t') - # var = ('_').join([linedata[0], linedata[1], linedata[3], linedata[4]]) - # sites.add(var) - # return sites - - - def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes): - ''' - take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant - sites and relaxed concordant sites - 1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype - 2) if you have a 0/0 you can not get a 1/1 or 0/1 - 3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 1/1 - So - each obversed cellsnp allele must be in the array SNP gtype - ''' - true_discordant = 0 - relaxed_concordant = 0 - relaxed_concordant_informative = 0 - relaxed_concordant_uninformative = 0 - true_discordant_informative = 0 - true_discordant_uninformative = 0 - subset_informative_concordant = 0 - subset_informative_discordant = 0 - - #print(self.uninformative_sites) - #print(self.informative_sites) - - #create sets of the ids (chrom, pos, ref, alt) in each set of genotypes. Filter to the ids present in both - #then filter to informative and uninformative. If uninformative >0 then create a subset of informative - # with the same number of vars (at random) - split_snp_gts=snp_gtypes.str.split("_") - snp_gtypes_ids = set(split_snp_gts.str[0]+'_'+split_snp_gts.str[1]+'_'+split_snp_gts.str[2]+'_'+split_snp_gts.str[3]) - - split_cellsnp_gts=cellsnp_gtypes.str.split("_") - cellsnp_gtypes_ids = set(split_cellsnp_gts.str[0]+'_'+split_cellsnp_gts.str[1]+'_'+split_cellsnp_gts.str[2]+'_'+split_cellsnp_gts.str[3]) - - shared_gts = snp_gtypes_ids.intersection(cellsnp_gtypes_ids) - - shared_informative = shared_gts.intersection(self.informative_sites) - shared_uninformative = shared_gts.intersection(self.uninformative_sites) - # print("shared informative " + str(len(shared_informative))) - # print("shared uninformative " + str(len(shared_uninformative))) - - #store the numbers of informative and uninformative sites shared between cellSNP and gt data as these - #are the sites used for concordance - self.informative_covered = len(shared_informative) - self.uninformative_covered = len(shared_uninformative) + def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites,informative_sites, uninformative_sites): + self.reset() + self.donor_assignments_table=donor_assignments_table + self.cell_assignments_table=cell_assignments_table + self.exclusive_don_variants=exclusive_don_variants + self.exclusive_cell_variants=exclusive_cell_variants + self.donor_distinct_sites=donor_distinct_sites + self.informative_sites = informative_sites + self.uninformative_sites = uninformative_sites + self.record_dict={} - if len(shared_uninformative) > 0: - #print(len(shared_uninformative)) - # print(len(shared_informative)) - if len(shared_uninformative) <= len(shared_informative): - informative_subset = set(random.sample(shared_informative, len(shared_uninformative))) - else: - informative_subset = set()#if there are more shared uninformative than shared informative we will not subset - # print(informative_subset) - # exit(0) - else: - informative_subset = set() + def norm_genotypes(self,expected_vars): + expected_vars = pd.DataFrame(expected_vars) + if len(expected_vars) > 0: + split_str=expected_vars[0].str.split("_") + expected_vars['ids'] = 
split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] + expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] + expected_vars['vars'] = split_str.str[4] + expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) + expected_vars = expected_vars[expected_vars['vars']!='./.'] + expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' + expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] + return expected_vars + def reset(self): + self.cell_concordance_table ={} + + # def get_sites_from_tsv(self, sites_file): + # """ + # get sites frm a tsv file where cols are chrom, pos, id, ref, alt + # assumes no multiallelics + # """ + # sites = set() + # with open(sites_file, 'r') as f: + # lines = f.readlines() + # for l in lines: + # linedata = l.split('\t') + # var = ('_').join([linedata[0], linedata[1], linedata[3], linedata[4]]) + # sites.add(var) + # return sites + + + def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes): + ''' + take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant + sites and relaxed concordant sites + 1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype + 2) if you have a 0/0 you can not get a 1/1 or 0/1 + 3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 1/1 + So - each obversed cellsnp allele must be in the array SNP gtype + ''' + true_discordant = 0 + relaxed_concordant = 0 + relaxed_concordant_informative = 0 + relaxed_concordant_uninformative = 0 + true_discordant_informative = 0 + true_discordant_uninformative = 0 + subset_informative_concordant = 0 + subset_informative_discordant = 0 + + #print(self.uninformative_sites) + #print(self.informative_sites) + + #create sets of the ids (chrom, pos, ref, alt) in each set of genotypes. Filter to the ids present in both + #then filter to informative and uninformative. 
If uninformative >0 then create a subset of informative + # with the same number of vars (at random) + split_snp_gts=snp_gtypes.str.split("_") + snp_gtypes_ids = set(split_snp_gts.str[0]+'_'+split_snp_gts.str[1]+'_'+split_snp_gts.str[2]+'_'+split_snp_gts.str[3]) + + split_cellsnp_gts=cellsnp_gtypes.str.split("_") + cellsnp_gtypes_ids = set(split_cellsnp_gts.str[0]+'_'+split_cellsnp_gts.str[1]+'_'+split_cellsnp_gts.str[2]+'_'+split_cellsnp_gts.str[3]) + + shared_gts = snp_gtypes_ids.intersection(cellsnp_gtypes_ids) + + shared_informative = shared_gts.intersection(self.informative_sites) + shared_uninformative = shared_gts.intersection(self.uninformative_sites) + # print("shared informative " + str(len(shared_informative))) + # print("shared uninformative " + str(len(shared_uninformative))) + + #store the numbers of informative and uninformative sites shared between cellSNP and gt data as these + #are the sites used for concordance + self.informative_covered = len(shared_informative) + self.uninformative_covered = len(shared_uninformative) + + if len(shared_uninformative) > 0: + #print(len(shared_uninformative)) + # print(len(shared_informative)) + if len(shared_uninformative) <= len(shared_informative): + informative_subset = set(random.sample(shared_informative, len(shared_uninformative))) + else: + informative_subset = set()#if there are more shared uninformative than shared informative we will not subset # print(informative_subset) - self.informative_subset = informative_subset + # exit(0) + else: + informative_subset = set() - snp_gtypes_set = set(snp_gtypes) - snp_gtypes_set = sorted(snp_gtypes_set) + # print(informative_subset) + self.informative_subset = informative_subset - cellsnp_gtypes_set = set(cellsnp_gtypes) - cellsnp_gtypes_set = sorted(cellsnp_gtypes_set) + snp_gtypes_set = set(snp_gtypes) + snp_gtypes_set = sorted(snp_gtypes_set) - #for i in range(0, len(snp_gtypes)): - for i in range(0, len(snp_gtypes_set)): - discordant = False - # snp_data = snp_gtypes[i].split('_') - # cellsnp_data = cellsnp_gtypes[i].split('_') - snp_data = snp_gtypes_set[i].split('_') - cellsnp_data = cellsnp_gtypes_set[i].split('_') + cellsnp_gtypes_set = set(cellsnp_gtypes) + cellsnp_gtypes_set = sorted(cellsnp_gtypes_set) - # the below will no longer work due to differing length of input strings - # snp_alleles = [snp_gtypes[i][-3], snp_gtypes[i][-1]] - # cellsnp_alleles = [cellsnp_gtypes[i][-3], cellsnp_gtypes[i][-1]] + #for i in range(0, len(snp_gtypes)): + for i in range(0, len(snp_gtypes_set)): + discordant = False + # snp_data = snp_gtypes[i].split('_') + # cellsnp_data = cellsnp_gtypes[i].split('_') + snp_data = snp_gtypes_set[i].split('_') + cellsnp_data = cellsnp_gtypes_set[i].split('_') + # the below will no longer work due to differing length of input strings + # snp_alleles = [snp_gtypes[i][-3], snp_gtypes[i][-1]] + # cellsnp_alleles = [cellsnp_gtypes[i][-3], cellsnp_gtypes[i][-1]] - snp_alleles = [snp_data[4][0], snp_data[4][2]] - cellsnp_alleles = [cellsnp_data[4][0], cellsnp_data[4][2]] - snp_alleles_set = set(snp_alleles) - cellsnp_alleles_set = set(cellsnp_alleles) - - snp_var = ('_').join(snp_data[0:4]) - cellsnp_var = ('_').join(cellsnp_data[0:4]) + snp_alleles = [snp_data[4][0], snp_data[4][2]] + cellsnp_alleles = [cellsnp_data[4][0], cellsnp_data[4][2]] - if not cellsnp_var == snp_var: - print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i]) - exit(1) - else: - for allele in cellsnp_alleles_set: - if not allele in snp_alleles_set:#if a 
cellSNP allele is found that is not in the array data this is discordant - discordant = True - - if discordant == True: - true_discordant+=1 - if snp_var in self.uninformative_sites: - true_discordant_uninformative+=1 - elif snp_var in self.informative_sites: - true_discordant_informative+=1 - else: - relaxed_concordant+=1 - if snp_var in self.uninformative_sites: - relaxed_concordant_uninformative+=1 - elif snp_var in self.informative_sites: - relaxed_concordant_informative+=1 + snp_alleles_set = set(snp_alleles) + cellsnp_alleles_set = set(cellsnp_alleles) + + snp_var = ('_').join(snp_data[0:4]) + cellsnp_var = ('_').join(cellsnp_data[0:4]) - - if len(shared_uninformative) > 0: - if snp_var in informative_subset: - if discordant == True: - subset_informative_discordant+=1 - else: - subset_informative_concordant+=1 - - # print("conc inf " + str(relaxed_concordant_informative)) - # print("disc inf " + str(true_discordant_informative)) - - return true_discordant, relaxed_concordant, relaxed_concordant_informative, relaxed_concordant_uninformative, true_discordant_informative, true_discordant_uninformative, subset_informative_concordant, subset_informative_discordant - - - def read_condordance(self, expected_vars, cell_vars): - ''' - get read level concordance using DP, AD and OTH format fields - ##FORMAT= - ##FORMAT= - ##FORMAT= - ''' - # print(len(expected_vars)) - # print(len(cell_vars)) - - if not len(expected_vars) == len(cell_vars): - print("length mismatch between expected vars and cell vars") + if not cellsnp_var == snp_var: + print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i]) exit(1) + else: + for allele in cellsnp_alleles_set: + if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant + discordant = True + + if discordant == True: + true_discordant+=1 + if snp_var in self.uninformative_sites: + true_discordant_uninformative+=1 + elif snp_var in self.informative_sites: + true_discordant_informative+=1 + else: + relaxed_concordant+=1 + if snp_var in self.uninformative_sites: + relaxed_concordant_uninformative+=1 + elif snp_var in self.informative_sites: + relaxed_concordant_informative+=1 - total_sites = len(expected_vars) - #add cols for DP, AD< OTH - cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) - cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) - cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) - #split to informative and uninformative sites - mask_i = cell_vars['ids'].isin(self.informative_sites) - cell_vars_informative = cell_vars[mask_i] - mask_u = cell_vars['ids'].isin(self.uninformative_sites) - cell_vars_uninformative = cell_vars[mask_u] - informative_sites = len(cell_vars_informative) - uninformative_sites = len(cell_vars_uninformative) - mask_s = cell_vars['ids'].isin(self.informative_subset) - cell_vars_informative_subset = cell_vars[mask_s] - informative_subset_sites = len(cell_vars_informative_subset) - # print("Informative sites " + str(len(self.informative_sites))) - # print("uninformative sites " + str(len(self.uninformative_sites))) - # print("informative sites in cell vars " + str(len(cell_vars_informative))) - # print("uninformative sites in cell vars " + str(len(cell_vars_uninformative))) - # print("Informative subset " + str(informative_subset_sites)) - # print(cell_vars_informative_subset) - # exit(0) + + if len(shared_uninformative) > 0: + if snp_var in informative_subset: + if discordant == True: + 
subset_informative_discordant+=1 + else: + subset_informative_concordant+=1 - total_dp = cell_vars['DP'].sum() - total_oth = cell_vars['OTH'].sum() - total_reads = total_dp + total_oth - total_dp_inf = cell_vars_informative['DP'].sum() - total_oth_inf = cell_vars_informative['OTH'].sum() - total_reads_informative = total_dp_inf + total_oth_inf - total_dp_uninf = cell_vars_uninformative['DP'].sum() - total_oth_uninf = cell_vars_uninformative['OTH'].sum() - total_reads_uninformative = total_dp_uninf + total_oth_uninf - total_dp_inf_subset = cell_vars_informative_subset['DP'].sum() - total_oth_inf_subset = cell_vars_informative_subset['OTH'].sum() - total_reads_informative_subset = total_dp_inf_subset + total_oth_inf_subset - - # expected genotype 0/0 - expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] - hom_ref_sites = set(expected_hom_ref['ids']) - cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] - cell_vars_inf_2 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_ref_sites)] - cell_vars_uninf_2 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_ref_sites)] - cell_vars_inf_subset_2 = cell_vars_informative_subset[cell_vars_informative_subset['ids'].isin(hom_ref_sites)] - ad_hom_ref = cell_vars2['AD'].sum() - oth_hom_ref = cell_vars2['OTH'].sum() - discordant_hom_ref = ad_hom_ref + oth_hom_ref - ad_hom_ref_inf = cell_vars_inf_2['AD'].sum() - oth_hom_ref_inf = cell_vars_inf_2['OTH'].sum() - discordant_hom_ref_informative = ad_hom_ref_inf + oth_hom_ref_inf - ad_hom_ref_uninf = cell_vars_uninf_2['AD'].sum() - oth_hom_ref_uninf = cell_vars_uninf_2['OTH'].sum() - discordant_hom_ref_uninformative = ad_hom_ref_uninf + oth_hom_ref_uninf - ad_hom_ref_inf_subset = cell_vars_inf_subset_2['AD'].sum() - oth_hom_ref_inf_subset = cell_vars_inf_subset_2['OTH'].sum() - discordant_hom_ref_informative_subset = ad_hom_ref_inf_subset + oth_hom_ref_inf_subset - - # expected genotype 0/1 or 1/0 - hets = ['0/1', '1/0'] - expected_het = expected_vars[expected_vars['vars'].isin(hets)] - het_sites = set(expected_het['ids']) - cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] - cell_vars_inf_3 = cell_vars_informative[cell_vars_informative['ids'].isin(het_sites)] - cell_vars_uninf_3 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(het_sites)] - cell_vars_inf_subset_3 = cell_vars_informative_subset[cell_vars_informative_subset['ids'].isin(het_sites)] - discordant_het = cell_vars3['OTH'].sum() - discordant_het_informative = cell_vars_inf_3['OTH'].sum() - discordant_het_uninformative = cell_vars_uninf_3['OTH'].sum() - discordant_het_informative_subset = cell_vars_inf_subset_3['OTH'].sum() - - # expected genotype 1/1 - expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] - hom_alt_sites = set(expected_hom_alt['ids']) - cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] - cell_vars_inf_4 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_alt_sites)] - cell_vars_uninf_4 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_alt_sites)] - cell_vars_inf_subset_4 = cell_vars_informative_subset[cell_vars_informative_subset['ids'].isin(hom_alt_sites)] - # DP + OTH - AD - ad_hom_alt = cell_vars4['AD'].sum() - dp_hom_alt = cell_vars4['DP'].sum() - oth_hom_alt = cell_vars4['OTH'].sum() - discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt - ad_hom_alt_inf = cell_vars_inf_4['AD'].sum() - dp_hom_alt_inf = cell_vars_inf_4['DP'].sum() - oth_hom_alt_inf = cell_vars_inf_4['OTH'].sum() - discordant_hom_alt_informative 
= (dp_hom_alt_inf + oth_hom_alt_inf) - ad_hom_alt_inf - ad_hom_alt_uninf = cell_vars_uninf_4['AD'].sum() - dp_hom_alt_uninf = cell_vars_uninf_4['DP'].sum() - oth_hom_alt_uninf = cell_vars_uninf_4['OTH'].sum() - discordant_hom_alt_uninformative = (dp_hom_alt_uninf + oth_hom_alt_uninf) - ad_hom_alt_uninf - ad_hom_alt_inf_subset = cell_vars_inf_subset_4['AD'].sum() - dp_hom_alt_inf_subset = cell_vars_inf_subset_4['DP'].sum() - oth_hom_alt_inf_subset = cell_vars_inf_subset_4['OTH'].sum() - discordant_hom_alt_informative_subset = (dp_hom_alt_inf_subset + oth_hom_alt_inf_subset) - ad_hom_alt_inf_subset - - discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt - discordant_reads_informative = discordant_hom_ref_informative + discordant_het_informative + discordant_hom_alt_informative - discordant_reads_uninformative = discordant_hom_ref_uninformative + discordant_het_uninformative + discordant_hom_alt_uninformative - discordant_reads_informative_subset = discordant_hom_ref_informative_subset + discordant_het_informative_subset + discordant_hom_alt_informative_subset - - return total_sites, self.informative_covered, self.uninformative_covered, total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative, informative_subset_sites, total_reads_informative_subset, discordant_reads_informative_subset - - + # print("conc inf " + str(relaxed_concordant_informative)) + # print("disc inf " + str(true_discordant_informative)) - def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): - # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. - # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. 
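read_condordance turns the per-site DP/AD/OTH counts into discordant-read totals with a different rule for each expected genotype: for an expected 0/0 every ALT or other-allele read is discordant (AD + OTH), for an expected het only other-allele reads are discordant (OTH), and for an expected 1/1 everything that is not an ALT read is discordant (DP + OTH - AD). A per-site restatement of those rules, assuming DP counts REF plus ALT reads as in the cellSNP FORMAT fields quoted above:

def discordant_reads_at_site(expected_gt: str, dp: int, ad: int, oth: int) -> int:
    if expected_gt == '0/0':
        return ad + oth              # any ALT or third-allele read contradicts hom-ref
    if expected_gt in ('0/1', '1/0'):
        return oth                   # REF and ALT reads are both compatible with a het
    if expected_gt == '1/1':
        return (dp + oth) - ad       # everything except ALT reads contradicts hom-alt
    raise ValueError(f'unexpected genotype: {expected_gt}')

# expected 1/1 with DP=10 (2 REF + 8 ALT reads) and OTH=1  ->  3 discordant reads
print(discordant_reads_at_site('1/1', dp=10, ad=8, oth=1))   # 3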
- # Author: M.Ozols - - cell_vars_norm = self.norm_genotypes(cell_vars) - - if len(cell_vars_norm) > 0: - Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) - expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] - cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] - # print(cell_vars_norm) - # print(expected_vars2) - # print(cell_vars2) - # exit(0) - Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) - Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) - disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) - df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') - disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') - # print(len(disc2)) - # exit(0) - disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] - disc_sites = ';'.join(disc2['expected_retrieved']) - #find truly discordant sites - #true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count = self.get_strict_discordance(disc2['0_y'], disc2['0_x']) - true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count = self.get_strict_discordance(expected_vars2[0], cell_vars2[0]) - #find discordant reads - total_sites, informative_sites, uninformative_sites, total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative, informative_subset_sites, total_reads_informative_subset, discordant_reads_informative_subset = self.read_condordance(expected_vars2, cell_vars2) - else: - Total_Overlapping_sites = set() - Concordant_Sites = set() - Discordant_sites = set() - disc_sites = '' - true_discordant_count = 0 - relaxed_concordant_count = 0 - total_sites = 0 - discordant_reads = 0 - - informative_subset_sites = 0 - subset_informative_sites_concordant_count = 0 - subset_informative_sites_discordant_count = 0 - total_reads_informative_subset = 0 - discordant_reads_informative_subset = 0 - relaxed_concordant_informative_count = 0 - relaxed_concordant_uninformative_count = 0 - true_discordant_informative_count = 0 - true_discordant_uninformative_count = 0 - total_reads = 0 - total_reads_informative = 0 - total_reads_uninformative = 0 - discordant_reads = 0 - discordant_reads_informative = 0 - discordant_reads_uninformative = 0 - informative_sites = 0 - uninformative_sites = 0 - - #print(total_sites, informative_sites, uninformative_sites, relaxed_concordant_informative_count, true_discordant_informative_count, self.informative_covered, self.uninformative_covered) - #exit(0) + return true_discordant, relaxed_concordant, relaxed_concordant_informative, relaxed_concordant_uninformative, true_discordant_informative, true_discordant_uninformative, subset_informative_concordant, subset_informative_discordant - return Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites,cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, 
true_discordant_informative_count, true_discordant_uninformative_count, total_sites, informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative, informative_subset_sites, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset - - def set_results(self,to_set,id): - # Recod to disk to save the loading mmeory time. - with open(f'tmp_{id}.pkl', 'wb') as f: - pickle.dump(to_set, f) - self.record_dict[id]=f'tmp_{id}.pkl' + def read_condordance(self, expected_vars, cell_vars): + ''' + get read level concordance using DP, AD and OTH format fields + ##FORMAT= + ##FORMAT= + ##FORMAT= + ''' + # print(len(expected_vars)) + # print(len(cell_vars)) + + if not len(expected_vars) == len(cell_vars): + print("length mismatch between expected vars and cell vars") + exit(1) + + total_sites = len(expected_vars) + #add cols for DP, AD< OTH + cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) + cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) + cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) + #split to informative and uninformative sites + mask_i = cell_vars['ids'].isin(self.informative_sites) + cell_vars_informative = cell_vars[mask_i] + mask_u = cell_vars['ids'].isin(self.uninformative_sites) + cell_vars_uninformative = cell_vars[mask_u] + informative_sites = len(cell_vars_informative) + uninformative_sites = len(cell_vars_uninformative) + mask_s = cell_vars['ids'].isin(self.informative_subset) + cell_vars_informative_subset = cell_vars[mask_s] + informative_subset_sites = len(cell_vars_informative_subset) + # print("Informative sites " + str(len(self.informative_sites))) + # print("uninformative sites " + str(len(self.uninformative_sites))) + # print("informative sites in cell vars " + str(len(cell_vars_informative))) + # print("uninformative sites in cell vars " + str(len(cell_vars_uninformative))) + # print("Informative subset " + str(informative_subset_sites)) + # print(cell_vars_informative_subset) + # exit(0) + + total_dp = cell_vars['DP'].sum() + total_oth = cell_vars['OTH'].sum() + total_reads = total_dp + total_oth + total_dp_inf = cell_vars_informative['DP'].sum() + total_oth_inf = cell_vars_informative['OTH'].sum() + total_reads_informative = total_dp_inf + total_oth_inf + total_dp_uninf = cell_vars_uninformative['DP'].sum() + total_oth_uninf = cell_vars_uninformative['OTH'].sum() + total_reads_uninformative = total_dp_uninf + total_oth_uninf + total_dp_inf_subset = cell_vars_informative_subset['DP'].sum() + total_oth_inf_subset = cell_vars_informative_subset['OTH'].sum() + total_reads_informative_subset = total_dp_inf_subset + total_oth_inf_subset + + + # expected genotype 0/0 + expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] + hom_ref_sites = set(expected_hom_ref['ids']) + cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] + cell_vars_inf_2 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_ref_sites)] + cell_vars_uninf_2 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_ref_sites)] + cell_vars_inf_subset_2 = cell_vars_informative_subset[cell_vars_informative_subset['ids'].isin(hom_ref_sites)] + ad_hom_ref = cell_vars2['AD'].sum() + oth_hom_ref = cell_vars2['OTH'].sum() + discordant_hom_ref = ad_hom_ref + oth_hom_ref + ad_hom_ref_inf = cell_vars_inf_2['AD'].sum() + 
oth_hom_ref_inf = cell_vars_inf_2['OTH'].sum() + discordant_hom_ref_informative = ad_hom_ref_inf + oth_hom_ref_inf + ad_hom_ref_uninf = cell_vars_uninf_2['AD'].sum() + oth_hom_ref_uninf = cell_vars_uninf_2['OTH'].sum() + discordant_hom_ref_uninformative = ad_hom_ref_uninf + oth_hom_ref_uninf + ad_hom_ref_inf_subset = cell_vars_inf_subset_2['AD'].sum() + oth_hom_ref_inf_subset = cell_vars_inf_subset_2['OTH'].sum() + discordant_hom_ref_informative_subset = ad_hom_ref_inf_subset + oth_hom_ref_inf_subset + + # expected genotype 0/1 or 1/0 + hets = ['0/1', '1/0'] + expected_het = expected_vars[expected_vars['vars'].isin(hets)] + het_sites = set(expected_het['ids']) + cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] + cell_vars_inf_3 = cell_vars_informative[cell_vars_informative['ids'].isin(het_sites)] + cell_vars_uninf_3 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(het_sites)] + cell_vars_inf_subset_3 = cell_vars_informative_subset[cell_vars_informative_subset['ids'].isin(het_sites)] + discordant_het = cell_vars3['OTH'].sum() + discordant_het_informative = cell_vars_inf_3['OTH'].sum() + discordant_het_uninformative = cell_vars_uninf_3['OTH'].sum() + discordant_het_informative_subset = cell_vars_inf_subset_3['OTH'].sum() + + # expected genotype 1/1 + expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] + hom_alt_sites = set(expected_hom_alt['ids']) + cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] + cell_vars_inf_4 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_alt_sites)] + cell_vars_uninf_4 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_alt_sites)] + cell_vars_inf_subset_4 = cell_vars_informative_subset[cell_vars_informative_subset['ids'].isin(hom_alt_sites)] + # DP + OTH - AD + ad_hom_alt = cell_vars4['AD'].sum() + dp_hom_alt = cell_vars4['DP'].sum() + oth_hom_alt = cell_vars4['OTH'].sum() + discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt + ad_hom_alt_inf = cell_vars_inf_4['AD'].sum() + dp_hom_alt_inf = cell_vars_inf_4['DP'].sum() + oth_hom_alt_inf = cell_vars_inf_4['OTH'].sum() + discordant_hom_alt_informative = (dp_hom_alt_inf + oth_hom_alt_inf) - ad_hom_alt_inf + ad_hom_alt_uninf = cell_vars_uninf_4['AD'].sum() + dp_hom_alt_uninf = cell_vars_uninf_4['DP'].sum() + oth_hom_alt_uninf = cell_vars_uninf_4['OTH'].sum() + discordant_hom_alt_uninformative = (dp_hom_alt_uninf + oth_hom_alt_uninf) - ad_hom_alt_uninf + ad_hom_alt_inf_subset = cell_vars_inf_subset_4['AD'].sum() + dp_hom_alt_inf_subset = cell_vars_inf_subset_4['DP'].sum() + oth_hom_alt_inf_subset = cell_vars_inf_subset_4['OTH'].sum() + discordant_hom_alt_informative_subset = (dp_hom_alt_inf_subset + oth_hom_alt_inf_subset) - ad_hom_alt_inf_subset + + + discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt + discordant_reads_informative = discordant_hom_ref_informative + discordant_het_informative + discordant_hom_alt_informative + discordant_reads_uninformative = discordant_hom_ref_uninformative + discordant_het_uninformative + discordant_hom_alt_uninformative + discordant_reads_informative_subset = discordant_hom_ref_informative_subset + discordant_het_informative_subset + discordant_hom_alt_informative_subset + + return total_sites, self.informative_covered, self.uninformative_covered, total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative, informative_subset_sites, total_reads_informative_subset, 
discordant_reads_informative_subset + + + + def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): + # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. + # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. + # Author: M.Ozols - def append_results_cell_concordances(self,result): - count=result[13] - try: - percent_concordant = result[2]/(result[3]+result[2])*100 - except: - percent_concordant = 0 + cell_vars_norm = self.norm_genotypes(cell_vars) + + if len(cell_vars_norm) > 0: + Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) + expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] + # print(cell_vars_norm) + # print(expected_vars2) + # print(cell_vars2) + # exit(0) + Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) + Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) + disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) + df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') + disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') + # print(len(disc2)) + # exit(0) + disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] + disc_sites = ';'.join(disc2['expected_retrieved']) + #find truly discordant sites + true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count = self.get_strict_discordance(expected_vars2[0], cell_vars2[0]) + #find discordant reads + total_sites, informative_sites, uninformative_sites, total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative, informative_subset_sites, total_reads_informative_subset, discordant_reads_informative_subset = self.read_condordance(expected_vars2, cell_vars2) + else: + Total_Overlapping_sites = set() + Concordant_Sites = set() + Discordant_sites = set() + disc_sites = '' + true_discordant_count = 0 + relaxed_concordant_count = 0 + total_sites = 0 + discordant_reads = 0 + + informative_subset_sites = 0 + subset_informative_sites_concordant_count = 0 + subset_informative_sites_discordant_count = 0 + total_reads_informative_subset = 0 + discordant_reads_informative_subset = 0 + relaxed_concordant_informative_count = 0 + relaxed_concordant_uninformative_count = 0 + true_discordant_informative_count = 0 + true_discordant_uninformative_count = 0 + total_reads = 0 + total_reads_informative = 0 + total_reads_uninformative = 0 + discordant_reads = 0 + discordant_reads_informative = 0 + discordant_reads_uninformative = 0 + informative_sites = 0 + uninformative_sites = 0 - try: - percent_discordant = result[3]/(result[3]+result[2])*100 - except: - percent_discordant = 0 + #print(total_sites, informative_sites, uninformative_sites, relaxed_concordant_informative_count, true_discordant_informative_count, self.informative_covered, self.uninformative_covered) + #exit(0) - try: - 
percent_relaxed_concordant = result[4]/(result[4]+result[5])*100 - except: - percent_relaxed_concordant = 0 - - try: - percent_strict_discordant = result[5]/(result[4]+result[5])*100 - except: - percent_strict_discordant = 0 + return Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites,cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, total_sites, informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative, informative_subset_sites, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset + - try: - read_discordance = result[21]/result[15] - except: - read_discordance = 0 - - donor = result[1] - cohort = 'UNKNOWN' - donor_split = donor.split("_") - if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): - cohort = 'UKB' - elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): - cohort = 'ELGH' - - print(count) - self.cell_concordance_table[f'{result[0]} --- {result[1]}'] = {'GT 1':result[0], - 'GT 2':result[1], - 'cohort': cohort, - 'Nr_Concordant':result[2], - 'Nr_Discordant':result[3], - 'Nr_Relaxed_concordant':result[4], - 'Nr_strict_discordant':result[5], - 'Percent Concordant':percent_concordant, - 'Percent Discordant':percent_discordant, - 'Percent_relaxed_concordant': percent_relaxed_concordant, - 'Percent_strict_discordant': percent_strict_discordant, - 'Nr_concordant_informative': result[6], - 'Nr_concordant_uninformative': result[7], - 'Nr_discordant_informative': result[8], - 'Nr_discordant_uninformative': result[9], - 'NrTotal_Overlapping_sites_between_two_genotypes':result[10], - 'Nr_donor_distinct_sites_within_pool_individuals':result[12], - 'Number_of_sites_that_are_donor_concordant_and_exclusive':result[11], - 'Discordant_Site_Identities':result[14], - 'Total_sites': result[15], - 'Total_informative_sites': result[16], - 'Total_uninformative_sites': result[17], - 'Total_reads': result[18], - 'Total_reads_informative': result[19], - 'Total_reads_uninformative': result[20], - 'Discordant_reads': result[21], - 'Discordant_reads_informtive': result[22], - 'Discordant_reads_uninformtive': result[23], - 'Discordant_reads_by_n_sites': read_discordance, - 'informative_subset_sites': result[24], - 'subset_informative_sites_concordant_count': result[25], - 'subset_informative_sites_discordant_count': result[26], - 'total_reads_informative_subset': result[27], - 'discordant_reads_informative_subset': result[28] - } - #informative_subset_sites, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset] - if (count % 200 == 0): - print(f'recording and resetting memory {count}') - # self.record_dict[count]=self.exclusive_donor_variants - self.set_results(self.cell_concordance_table,count) - self.reset() - _="" + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. 
+ with open(f'tmp_{id}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}.pkl' + + def append_results_cell_concordances(self,result): + count=result[13] + try: + percent_concordant = result[2]/(result[3]+result[2])*100 + except: + percent_concordant = 0 - def combine_written_files(self):#this one is for concordance class - to_export = self.cell_concordance_table - for val1 in self.record_dict.values(): - # here remove the int files. - print(f"merging temp file: {val1}") - with open(val1, 'rb') as f: - loaded_dict = pickle.load(f) - for k1 in loaded_dict.keys(): - to_export[k1]=loaded_dict[k1] - os.remove(val1) - return to_export + try: + percent_discordant = result[3]/(result[3]+result[2])*100 + except: + percent_discordant = 0 + + try: + percent_relaxed_concordant = result[4]/(result[4]+result[5])*100 + except: + percent_relaxed_concordant = 0 + try: + percent_strict_discordant = result[5]/(result[4]+result[5])*100 + except: + percent_strict_discordant = 0 + + try: + read_discordance = result[21]/result[15] + except: + read_discordance = 0 + + donor = result[1] + cohort = 'UNKNOWN' + donor_split = donor.split("_") + if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): + cohort = 'UKB' + elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): + cohort = 'ELGH' + + print(count) + self.cell_concordance_table[f'{result[0]} --- {result[1]}'] = {'GT 1':result[0], + 'GT 2':result[1], + 'cohort': cohort, + 'Nr_Concordant':result[2], + 'Nr_Discordant':result[3], + 'Nr_Relaxed_concordant':result[4], + 'Nr_strict_discordant':result[5], + 'Percent Concordant':percent_concordant, + 'Percent Discordant':percent_discordant, + 'Percent_relaxed_concordant': percent_relaxed_concordant, + 'Percent_strict_discordant': percent_strict_discordant, + 'Nr_concordant_informative': result[6], + 'Nr_concordant_uninformative': result[7], + 'Nr_discordant_informative': result[8], + 'Nr_discordant_uninformative': result[9], + 'NrTotal_Overlapping_sites_between_two_genotypes':result[10], + 'Nr_donor_distinct_sites_within_pool_individuals':result[12], + 'Number_of_sites_that_are_donor_concordant_and_exclusive':result[11], + 'Discordant_Site_Identities':result[14], + 'Total_sites': result[15], + 'Total_informative_sites': result[16], + 'Total_uninformative_sites': result[17], + 'Total_reads': result[18], + 'Total_reads_informative': result[19], + 'Total_reads_uninformative': result[20], + 'Discordant_reads': result[21], + 'Discordant_reads_informtive': result[22], + 'Discordant_reads_uninformtive': result[23], + 'Discordant_reads_by_n_sites': read_discordance, + 'informative_subset_sites': result[24], + 'subset_informative_sites_concordant_count': result[25], + 'subset_informative_sites_discordant_count': result[26], + 'total_reads_informative_subset': result[27], + 'discordant_reads_informative_subset': result[28] + } +#informative_subset_sites, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset] + if (count % 200 == 0): + print(f'recording and resetting memory {count}') + # self.record_dict[count]=self.exclusive_donor_variants + self.set_results(self.cell_concordance_table,count) + self.reset() + _="" + + def combine_written_files(self):#this one is for concordance class + to_export = self.cell_concordance_table + for val1 in self.record_dict.values(): + # here remove the int files. 
+ print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + for k1 in loaded_dict.keys(): + to_export[k1]=loaded_dict[k1] + os.remove(val1) + return to_export + + + def conc_table(self): + donor_assignments_table=self.donor_assignments_table + cell_assignments_table=self.cell_assignments_table + exclusive_don_variants=self.exclusive_don_variants + exclusive_cell_variants= self.exclusive_cell_variants - def conc_table(self): - donor_assignments_table=self.donor_assignments_table - cell_assignments_table=self.cell_assignments_table - exclusive_don_variants=self.exclusive_don_variants - exclusive_cell_variants= self.exclusive_cell_variants + pool = mp.Pool(cpus) + count = 0 + for i,row1 in donor_assignments_table.iterrows(): + donor_in_question = row1['donor_query'] + donor_gt_match = row1['donor_gt'] + if (donor_gt_match=='NONE'): + continue + Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell'])) + try: + expected_vars = exclusive_don_variants[donor_gt_match] + except: + _='here we have specifically excluded the donor that has been assigned as it is not expected genotype, because of this we can not calculate the concordances' + continue + expected_vars_norm = self.norm_genotypes(expected_vars) + try: + # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor. + dds = self.donor_distinct_sites[donor_gt_match] + except: + continue - pool = mp.Pool(cpus) - count = 0 - for i,row1 in donor_assignments_table.iterrows(): - donor_in_question = row1['donor_query'] - donor_gt_match = row1['donor_gt'] - if (donor_gt_match=='NONE'): - continue - Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell'])) - try: - expected_vars = exclusive_don_variants[donor_gt_match] - except: - _='here we have specifically excluded the donor that has been assigned as it is not expected genotype, because of this we can not calculate the concordances' - continue - expected_vars_norm = self.norm_genotypes(expected_vars) - try: - # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor. 
- dds = self.donor_distinct_sites[donor_gt_match] - except: - continue + for cell1 in Cells_to_keep_pre: + count+=1 + # if count>800: + # break + cell_vars = exclusive_cell_variants[cell1] + # cell_vars_dp = exclusive_cell_variants_dp[cell1] + + self.cell_concordance_table[f'{cell1} --- {donor_gt_match}']={} + # pool.apply_async(self.concordance_dable_production, args=([expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count]),callback=self.append_results_cell_concordances) + result1 = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count) + self.append_results_cell_concordances(result1) - for cell1 in Cells_to_keep_pre: - count+=1 - # if count>800: - # break - cell_vars = exclusive_cell_variants[cell1] - # cell_vars_dp = exclusive_cell_variants_dp[cell1] - - self.cell_concordance_table[f'{cell1} --- {donor_gt_match}']={} - # pool.apply_async(self.concordance_dable_production, args=([expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count]),callback=self.append_results_cell_concordances) - result1 = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count) - self.append_results_cell_concordances(result1) - - pool.close() - pool.join() - output = self.combine_written_files() - return output - - def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count): - Nr_donor_distinct_sites = len(dds) - Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites, cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, total_sites, informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative, informative_subset_sites, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) - Nr_Concordant = len(Concordant_Sites) - #Nr_Relaxed_concordant = Nr_Concordant + relaxed_concordant_count - Nr_Discordant = len(Discordant_sites) - Nr_Total_Overlapping_sites = len(Total_Overlapping_sites) - Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Concordant_Sites))) - #Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) - - return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,relaxed_concordant_count, true_discordant_count, relaxed_concordant_informative_count, - relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites, - Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,disc_sites, total_sites, informative_sites, - uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative, - informative_subset_sites, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset] - - + pool.close() + pool.join() + output = self.combine_written_files() + return output + + def 
concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count): + Nr_donor_distinct_sites = len(dds) + Concordant_Sites, Discordant_sites, Total_Overlapping_sites, disc_sites, cell_vars_norm, true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, total_sites, informative_sites, uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative, informative_subset_sites, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) + Nr_Concordant = len(Concordant_Sites) + #Nr_Relaxed_concordant = Nr_Concordant + relaxed_concordant_count + Nr_Discordant = len(Discordant_sites) + Nr_Total_Overlapping_sites = len(Total_Overlapping_sites) + Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Concordant_Sites))) + #Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) + + return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,relaxed_concordant_count, true_discordant_count, relaxed_concordant_informative_count, + relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites, + Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,disc_sites, total_sites, informative_sites, + uninformative_sites, total_reads, total_reads_informative, total_reads_uninformative, discordant_reads, discordant_reads_informative, discordant_reads_uninformative, + informative_subset_sites, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count, total_reads_informative_subset, discordant_reads_informative_subset] + + class VCF_Loader: def __init__(self, vcf_file, biallelic_only=True, diff --git a/bin/find_discordant_sites_in_other_donors_find_best_donor.py b/bin/find_discordant_sites_in_other_donors_find_best_donor.py index 2cc45b21..becc8733 100755 --- a/bin/find_discordant_sites_in_other_donors_find_best_donor.py +++ b/bin/find_discordant_sites_in_other_donors_find_best_donor.py @@ -221,6 +221,22 @@ def load_VCF_batch_paralel(self): output = self.combine_written_files() return output +def get_options(): + ''' + Get options from the command line + ''' + parser = argparse.ArgumentParser() + parser.add_argument('--cpus', action='store', required=True, type=int) + parser.add_argument('--cell_vcf', action='store', required=True) + parser.add_argument('--cell_assignments', action='store', required=True) + parser.add_argument('--donor_assignments', action='store', required=True) + parser.add_argument('--gt_match_vcf', action='store', required=True) + parser.add_argument('--expected_vcf', action='store', required=True) + parser.add_argument('--outfile', action='store', required=True) + parser.add_argument('--debug', action='store_true') + args = parser.parse_args() + + return args class Concordances: def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites): @@ -232,7 +248,6 @@ def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_ 
self.donor_distinct_sites=donor_distinct_sites self.record_dict={} - def norm_genotypes(self,expected_vars): expected_vars = pd.DataFrame(expected_vars) if len(expected_vars) > 0: @@ -245,7 +260,6 @@ def norm_genotypes(self,expected_vars): expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] return expected_vars - def reset(self): self.cell_concordance_table ={} @@ -580,8 +594,6 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g def find(lst, a): return [i for i, x in enumerate(lst) if x==a ] - - def norm_genotypes(expected_vars): expected_vars = pd.DataFrame(expected_vars) split_str=expected_vars[0].str.split("_") @@ -625,22 +637,7 @@ def donor_exclusive_sites(exclusive_don_variants2): return donor_distinct_sites -def get_options(): - ''' - Get options from the command line - ''' - parser = argparse.ArgumentParser() - parser.add_argument('--cpus', action='store', required=True, type=int) - parser.add_argument('--cell_vcf', action='store', required=True) - parser.add_argument('--cell_assignments', action='store', required=True) - parser.add_argument('--donor_assignments', action='store', required=True) - parser.add_argument('--gt_match_vcf', action='store', required=True) - parser.add_argument('--expected_vcf', action='store', required=True) - parser.add_argument('--outfile', action='store', required=True) - parser.add_argument('--debug', action='store_true') - args = parser.parse_args() - return args if __name__ == "__main__": @@ -731,13 +728,11 @@ def get_options(): pickle.dump(donor_distinct_sites, f) print('---donor_distinct_sites calculated----') - - + conc1 = Concordances(donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites) cell_concordance_table = conc1.conc_table() result = pd.DataFrame(cell_concordance_table).T - # result.to_csv(outfile,sep='\t') try: site_identities = result[['Concordant_Site_Identities','Discordant_Site_Identities']] result.drop(columns=['Concordant_Site_Identities'],inplace=True) diff --git a/bin/find_discordant_sites_in_other_donors_noA2G.py b/bin/find_discordant_sites_in_other_donors_noA2G.py index b747a749..10de94c8 100755 --- a/bin/find_discordant_sites_in_other_donors_noA2G.py +++ b/bin/find_discordant_sites_in_other_donors_noA2G.py @@ -15,228 +15,34 @@ __date__ = '2023-07-24' __version__ = '0.0.1' import argparse +import sys +import importlib.util import pickle import pandas as pd +import gzip import random import numpy as np +import time import multiprocessing as mp from multiprocessing import Lock +import logging import os import gzip import time - -class VCF_Loader: - - def __init__(self, vcf_file, biallelic_only=True, - sparse=False, format_list=['GT']): - self.vcf_file = vcf_file - self.load_sample = True - self.biallelic_only = biallelic_only - self.sparse = sparse - self.record_dict={} - self.reset() - self.format_list = format_list - self.exclusive_donor_variants = {} - self.curently_pushing =[] #this is a lock value to check if rhe curent field is updated so to avaid the race for update - self.last_count=-1 - self.reset_c() - - def reset_c(self): - self.record_times=0 - - def reset(self): - self.exclusive_donor_variants ={} - - def myfunc(self): - print(f"Hello my name is {self.biallelic_only}" ) - - def load_sample_mp(self,line,obs_ids,count,format_list): - ''' - takes VCF lines and extracts all format fields for those where GT !='.' 
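One body line is turned into per-cell strings of the form chrom_pos_ref_alt_GT_DP_AD_OTH, keeping only samples whose GT is called. A stripped-down sketch of that parsing, assuming the loader is run with format_list ['GT', 'DP', 'AD', 'OTH'] (the field order the read counting expects); the line and barcodes are invented:

# A single cellSNP-style VCF body line (tab-separated), with two cell barcodes as samples.
line = "1\t100\t.\tA\tC\t.\tPASS\t.\tGT:AD:DP:OTH\t0/1:3:7:0\t.:.:.:."
fields = line.rstrip().split("\t")
fmt = fields[8].split(":")
gt_idx, ad_idx, dp_idx, oth_idx = (fmt.index(k) for k in ("GT", "AD", "DP", "OTH"))

barcodes = ["AAACCTG-1", "TTTGGTA-1"]                  # invented ids from the #CHROM header
site_id = "_".join(fields[x] for x in (0, 1, 3, 4))    # chrom_pos_ref_alt

per_cell = {}
for barcode, sample in zip(barcodes, fields[9:]):
    vals = sample.split(":")
    if vals[gt_idx] == ".":                            # skip cells with no genotype call here
        continue
    per_cell[barcode] = f"{site_id}_{vals[gt_idx]}_{vals[dp_idx]}_{vals[ad_idx]}_{vals[oth_idx]}"

print(per_cell)   # {'AAACCTG-1': '1_100_A_C_0/1_7_3_0'}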
- ''' - list_val = line.rstrip().split("\t") #[:5] #:8 - idx = find(list_val[8].split(':'),'GT')[0]#find index of GT field as GT will tell us what variants are called - if len(list_val[3]) > 1 or len(list_val[4]) > 1: - # CURRENTLY DEALS ONLY WITH BIALELIC - print(f'{idx} var not bialelic') - elif list_val[3] == 'A' and list_val[4] == 'G':#remove A>G - pass - elif list_val[3] == 'T' and list_val[4] == 'C':#also remove T>C - pass - else: - list_val2 = list_val[9:] - obs = pd.DataFrame(obs_ids) - lv = pd.DataFrame(list_val2) - lv_proc =lv[0].str.split(':').str[idx] - gt_exists = lv_proc[lv_proc != '.'] - idx2 = gt_exists.index - obs_with_gt = obs.loc[idx2.values] - obs_with_gt = list(obs_with_gt[0].values) - list_val_with_gt = lv.loc[idx2.values] - list_val_with_gt = list(list_val_with_gt[0].values) - random.seed(count) - c = list(zip(obs_with_gt, list_val_with_gt)) - random.shuffle(c) - obs_with_gt, list_val_with_gt = zip(*c) - # self.append_results([obs_with_gt,list_val_with_gt,idx,list_val,count]) - - return [obs_with_gt,list_val_with_gt,idx,list_val,count,format_list]#add format_list to the return value as we need this for the next step - - - def set_results(self,to_set,id): - # Recod to disk to save the loading mmeory time. - with open(f'tmp_{id}.pkl', 'wb') as f: - pickle.dump(to_set, f) - self.record_dict[id]=f'tmp_{id}.pkl' - - - def append_results(self,result): - # exclusive_donor_variants - obs_with_gt= result[0] - list_val_with_gt= result[1] - idx = result[2] - list_val = result[3] - count = result[4] - format_list = result[5]#list of required format fields - #get indexes of required format fields (apart from GT which has already been taken care of) - additional_field_idxs = [] - for fmt in format_list: - if not fmt == 'GT': - idx_addn = find(list_val[8].split(':'), fmt)[0] - additional_field_idxs.append(idx_addn) - # print(additional_field_idxs) - # exit(0) - - count11=0 - # r = random.random() - # Issue is that this slows down after number of entries is recorded. So recoding takes longer and longer. - # every 500 itterations we push the data to a dictionary, later we combine these together. - if (count % 200 == 0): - print(f'recording and resetting memory {count}') - # self.record_dict[count]=self.exclusive_donor_variants - self.set_results(self.exclusive_donor_variants,count) - self.reset() - self.reset_c() - - for ob_id in obs_with_gt: - donor_loc_in_list = count11 - alleles = list_val_with_gt[donor_loc_in_list].split(':')[idx] - #append any additional format fields to alleles - if len(additional_field_idxs) > 0: - for idx_addnl in additional_field_idxs: - fmt_val = list_val_with_gt[donor_loc_in_list].split(':')[idx_addnl] - alleles = alleles + '_' + fmt_val - - if not alleles.startswith('.'): - ids = "_".join([list_val[x] for x in [0, 1, 3, 4]]) - donor_var = f"{ids}_{alleles}" - while ob_id in self.curently_pushing: - time.sleep(r*0.01) - self.curently_pushing.append(ob_id) - try: - self.exclusive_donor_variants[ob_id].add(donor_var) - self.record_times=self.record_times+1 - except: - self.exclusive_donor_variants[ob_id]=set() - self.exclusive_donor_variants[ob_id].add(donor_var) - self.record_times=self.record_times+1 - self.curently_pushing.remove(ob_id) - # self.exclusive_donor_variants['CTGAAACGTAAGTTCC-1'] - count11+=1 - - def combine_written_files(self):#this is for VCF loader class - to_export = self.exclusive_donor_variants - for val1 in self.record_dict.values(): - # here remove the int files. 
- print(f"merging temp file: {val1}") - with open(val1, 'rb') as f: - loaded_dict = pickle.load(f) - for k1 in loaded_dict.keys(): - try: - to_export[k1]=to_export[k1].union(loaded_dict[k1]) - except: - to_export[k1]=set() - to_export[k1]=to_export[k1].union(loaded_dict[k1]) - os.remove(val1) - return to_export - - - def load_VCF_batch_paralel(self): - """ - Load whole VCF file by utilising multiple cores to speed up loading of large cell files - ------------------- - Initially designed to load VCF from cellSNP output, requiring - 1) all variants have the same format list; - 2) a line starting with "#CHROM", with sample ids. - If these two requirements are satisfied, this function also supports general - VCF files, e.g., genotype for multiple samples. - - Note, it may take a large memory, please filter the VCF with bcftools first. - """ - - vcf_file = self.vcf_file - biallelic_only = self.biallelic_only - load_sample= self.load_sample - sparse = self.sparse - format_list= self.format_list - pool = mp.Pool(cpus) - - - import time - if vcf_file[-3:] == ".gz" or vcf_file[-4:] == ".bgz": - infile = gzip.open(vcf_file, "rb") - is_gzip = True - else: - infile = open(vcf_file, "r") - is_gzip = False - - FixedINFO = {} - contig_lines = [] - comment_lines = [] - var_ids, obs_ids, obs_dat = [], [], [] - count=0 #57077 - for line in infile: - count+=1 - # if count>10000: - # break - if is_gzip: - line = line.decode('utf-8') - if line.startswith("#"): - if line.startswith("##contig="): - contig_lines.append(line.rstrip()) - if line.startswith("#CHROM"): - if load_sample: - obs_ids = line.rstrip().split("\t")[9:] - for ob_id in obs_ids: - self.exclusive_donor_variants[ob_id]=set() - key_ids = line[1:].rstrip().split("\t")[:8] - for _key in key_ids: - FixedINFO[_key] = [] - else: - comment_lines.append(line.rstrip()) - else: - pool.apply_async(self.load_sample_mp, args=([line,obs_ids,count,format_list]),callback=self.append_results) - del line - self.last_count=count - pool.close() - pool.join() - - output = self.combine_written_files() - return output - - +remove_ag=True class Concordances: - def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites): + def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites,informative_sites, uninformative_sites): self.reset() self.donor_assignments_table=donor_assignments_table self.cell_assignments_table=cell_assignments_table self.exclusive_don_variants=exclusive_don_variants self.exclusive_cell_variants=exclusive_cell_variants self.donor_distinct_sites=donor_distinct_sites + self.informative_sites = informative_sites + self.uninformative_sites = uninformative_sites self.record_dict={} - def norm_genotypes(self,expected_vars): expected_vars = pd.DataFrame(expected_vars) if len(expected_vars) > 0: @@ -249,7 +55,6 @@ def norm_genotypes(self,expected_vars): expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] return expected_vars - def reset(self): self.cell_concordance_table ={} @@ -271,38 +76,76 @@ def read_condordance(self, expected_vars, cell_vars): cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) + #split to informative and uninformative sites + mask_i = 
cell_vars['ids'].isin(self.informative_sites) + cell_vars_informative = cell_vars[mask_i] + mask_u = cell_vars['ids'].isin(self.uninformative_sites) + cell_vars_uninformative = cell_vars[mask_u] + informative_sites = len(cell_vars_informative) + uninformative_sites = len(cell_vars_uninformative) + total_dp = cell_vars['DP'].sum() total_oth = cell_vars['OTH'].sum() total_reads = total_dp + total_oth + total_dp_inf = cell_vars_informative['DP'].sum() + total_oth_inf = cell_vars_informative['OTH'].sum() + total_reads_informative = total_dp_inf + total_oth_inf + total_dp_uninf = cell_vars_uninformative['DP'].sum() + total_oth_uninf = cell_vars_uninformative['OTH'].sum() + total_reads_uninformative = total_dp_uninf + total_oth_uninf # expected genotype 0/0 expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] hom_ref_sites = set(expected_hom_ref['ids']) cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] + cell_vars_inf_2 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_ref_sites)] + cell_vars_uninf_2 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_ref_sites)] ad_hom_ref = cell_vars2['AD'].sum() oth_hom_ref = cell_vars2['OTH'].sum() discordant_hom_ref = ad_hom_ref + oth_hom_ref + ad_hom_ref_inf = cell_vars_inf_2['AD'].sum() + oth_hom_ref_inf = cell_vars_inf_2['OTH'].sum() + discordant_hom_ref_informative = ad_hom_ref_inf + oth_hom_ref_inf + ad_hom_ref_uninf = cell_vars_uninf_2['AD'].sum() + oth_hom_ref_uninf = cell_vars_uninf_2['OTH'].sum() + discordant_hom_ref_uninformative = ad_hom_ref_uninf + oth_hom_ref_uninf # expected genotype 0/1 or 1/0 hets = ['0/1', '1/0'] expected_het = expected_vars[expected_vars['vars'].isin(hets)] het_sites = set(expected_het['ids']) cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] + cell_vars_inf_3 = cell_vars_informative[cell_vars_informative['ids'].isin(het_sites)] + cell_vars_uninf_3 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(het_sites)] discordant_het = cell_vars3['OTH'].sum() + discordant_het_informative = cell_vars_inf_3['OTH'].sum() + discordant_het_uninformative = cell_vars_uninf_3['OTH'].sum() # expected genotype 1/1 expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] hom_alt_sites = set(expected_hom_alt['ids']) cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] + cell_vars_inf_4 = cell_vars_informative[cell_vars_informative['ids'].isin(hom_alt_sites)] + cell_vars_uninf_4 = cell_vars_uninformative[cell_vars_uninformative['ids'].isin(hom_alt_sites)] # DP + OTH - AD ad_hom_alt = cell_vars4['AD'].sum() dp_hom_alt = cell_vars4['DP'].sum() oth_hom_alt = cell_vars4['OTH'].sum() discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt + ad_hom_alt_inf = cell_vars_inf_4['AD'].sum() + dp_hom_alt_inf = cell_vars_inf_4['DP'].sum() + oth_hom_alt_inf = cell_vars_inf_4['OTH'].sum() + discordant_hom_alt_informative = (dp_hom_alt_inf + oth_hom_alt_inf) - ad_hom_alt_inf + ad_hom_alt_uninf = cell_vars_uninf_4['AD'].sum() + dp_hom_alt_uninf = cell_vars_uninf_4['DP'].sum() + oth_hom_alt_uninf = cell_vars_uninf_4['OTH'].sum() + discordant_hom_alt_uninformative = (dp_hom_alt_uninf + oth_hom_alt_uninf) - ad_hom_alt_uninf discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt + discordant_reads_informative = discordant_hom_ref_informative + discordant_het_informative + discordant_hom_alt_informative + discordant_reads_uninformative = discordant_hom_ref_uninformative + discordant_het_uninformative + discordant_hom_alt_uninformative - return total_sites, 
total_reads, discordant_reads + return total_sites, informative_sites, uninformative_sites, total_reads, discordant_reads, total_reads_informative, discordant_reads_informative, total_reads_uninformative, discordant_reads_uninformative def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes): @@ -316,7 +159,12 @@ def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes): ''' true_discordant = 0 relaxed_concordant = 0 + relaxed_concordant_informative = 0 + relaxed_concordant_uninformative = 0 + true_discordant_informative = 0 + true_discordant_uninformative = 0 discordant_vars = [] + concordant_vars = [] for i in range(0, len(snp_gtypes)): discordant = False @@ -348,20 +196,34 @@ def get_strict_discordance(self, snp_gtypes, cellsnp_gtypes): if discordant == True: true_discordant+=1 discordant_vars.append(cellsnp_var) + if snp_var in self.uninformative_sites: + true_discordant_uninformative+=1 + elif snp_var in self.informative_sites: + true_discordant_informative+=1 else: relaxed_concordant+=1 + concordant_vars.append(cellsnp_var) + if snp_var in self.uninformative_sites: + relaxed_concordant_uninformative+=1 + elif snp_var in self.informative_sites: + relaxed_concordant_informative+=1 + + return true_discordant, relaxed_concordant, relaxed_concordant_informative, relaxed_concordant_uninformative, true_discordant_informative, true_discordant_uninformative,discordant_vars - return true_discordant, relaxed_concordant, discordant_vars - - def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): + + def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): + # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. + # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. 
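The read counting added to read_condordance follows one rule per expected genotype: for an expected 0/0 every ALT (AD) or other-base (OTH) read is discordant, for a het only OTH reads are, and for an expected 1/1 every read that is not ALT (DP + OTH - AD) is. A worked sketch of that arithmetic on an invented three-site table:

import pandas as pd

def discordant_reads_per_site(expected_gt, dp, ad, oth):
    # expected_gt is the donor genotype at the site: '0/0', '1/0' (or '0/1'), '1/1'
    if expected_gt == '0/0':
        return ad + oth            # any ALT or other-base read contradicts hom-ref
    if expected_gt in ('0/1', '1/0'):
        return oth                 # both REF and ALT reads are compatible with a het
    if expected_gt == '1/1':
        return (dp + oth) - ad     # any read that is not ALT contradicts hom-alt
    return 0

# illustrative counts only
sites = pd.DataFrame({
    'expected': ['0/0', '1/0', '1/1'],
    'DP':       [10,     8,     12],
    'AD':       [1,      4,     11],
    'OTH':      [0,      1,      0],
})
sites['discordant'] = [
    discordant_reads_per_site(e, d, a, o)
    for e, d, a, o in zip(sites['expected'], sites['DP'], sites['AD'], sites['OTH'])
]
print(sites['discordant'].sum())   # 1 + 1 + 1 = 3 discordant reads in total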
+ # Author: M.Ozols + cell_vars_norm = self.norm_genotypes(cell_vars) + if len(cell_vars_norm) > 0: - Total_Overlappin_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) - expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlappin_sites)] - cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlappin_sites)] + Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) + expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) - #find sites that may be discordant - this will include hets which are not truly discordant Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') @@ -369,12 +231,12 @@ def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] #disc_sites = ';'.join(disc2['expected_retrieved']) - true_discordant_count, relaxed_concordant_count, discordant_vars = self.get_strict_discordance(disc2['0_y'], disc2['0_x']) + true_discordant_count, relaxed_concordant_count, discordant_vars, concordant_vars = self.get_strict_discordance(disc2['0_y'], disc2['0_x']) total_concordant_sites = len(Concordant_Sites) + relaxed_concordant_count #find discordant reads total_sites, total_reads, discordant_reads = self.read_condordance(expected_vars2, cell_vars2) - return total_sites, true_discordant_count, total_concordant_sites, total_reads, discordant_reads, discordant_vars + return total_sites, true_discordant_count, total_concordant_sites, total_reads, discordant_reads, discordant_vars, concordant_vars def set_results(self,to_set,id): @@ -382,8 +244,8 @@ def set_results(self,to_set,id): with open(f'tmp_{id}.pkl', 'wb') as f: pickle.dump(to_set, f) self.record_dict[id]=f'tmp_{id}.pkl' - - def append_results_cell_concordances(self,result): + + def append_results_cell_concordances(self,result,cell_concordance_table): #[cell1, donor_gt_match, donor_gt_match_cohort, total_sites, true_discordant_count, total_concordant_sites, total_reads, # discordant_reads, discordant_vars,discordant_vars_in_pool_str, count] count=result[10] @@ -398,28 +260,38 @@ def append_results_cell_concordances(self,result): except: read_discordance = 0 - print(count) - self.cell_concordance_table[f'{result[0]} --- {result[1]}'] = {'GT 1':result[0], - 'GT 2':result[1], - 'Cohort': result[2], - 'Nr_Concordant':result[5], - 'Nr_Discordant':result[4], - 'Percent_Discordant':percent_discordant, - 'Total_sites': result[3], - 'Total_reads': result[6], - 'Discordant_reads': result[7], - 'Discordant_reads_by_n_sites': read_discordance, - 'Discordant_sites_in_pool': result[9], - 'Discordant_Site_Identities':(';').join(result[8]) + # print(count) + same_as_asigned_donor = result[12]==result[1] + cell_concordance_table[f'{result[0]} --- {result[1]}'] = { 'GT 1':result[0], + 'GT 2':result[1], + 'Cohort': result[2], + 'Nr_Concordant':result[5], + 'Nr_Discordant':result[4], + 'Percent_Discordant':percent_discordant, + 'Total_sites': result[3], + 'Total_reads': result[6], + 'Discordant_reads': result[7], + 'Discordant_reads_by_n_sites': read_discordance, + 'Discordant_sites_in_pool': result[9], + 
'Discordant_Site_Identities':(';').join(result[8]), + 'Lowest_Disconcordance_value_in_all_donors':result[11], + 'Donor_With_Lowest_DisConcordance':result[12], + 'Concordant_Site_Identities':result[13], + 'same_as_asigned_donor':same_as_asigned_donor, + 'Donor_With_Highest_Concordance':result[14], + 'Highest_Concordance_value_in_all_donors':result[15], + 'Total_sites_other_donor':result[16], + 'Total_reads_other_donor':result[17] } - if (count % 200 == 0): - print(f'recording and resetting memory {count}') - # self.record_dict[count]=self.exclusive_donor_variants - self.set_results(self.cell_concordance_table,count) - self.reset() - _="" - + # if (count % 200 == 0): + # print(f'recording and resetting memory {count}') + # # self.record_dict[count]=self.exclusive_donor_variants + # self.set_results(self.cell_concordance_table,count) + # self.reset() + # _="" + return cell_concordance_table + def combine_written_files(self):#this one is for concordance class to_export = self.cell_concordance_table for val1 in self.record_dict.values(): @@ -432,6 +304,27 @@ def combine_written_files(self):#this one is for concordance class os.remove(val1) return to_export + def analyse_donor(self,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm): + donor_concordance_table = {} + for cell1 in Cells_to_keep_pre: + count+=1 + # if count>10: + # break + cell_vars = exclusive_cell_variants[cell1] + # cell_vars_dp = exclusive_cell_variants_dp[cell1] + + # self.cell_concordance_table[f'{cell1} --- {donor_gt_match}']={} + # pool.apply_async(self.concordance_dable_production, args=([expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count]),callback=self.append_results_cell_concordances) + result1 = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data) + # if (result1==None): + # _='test' + donor_concordance_table = self.append_results_cell_concordances(result1,donor_concordance_table) + # print('Done') + return donor_concordance_table + + def combine_concordances(self,result): + # print('res') + self.cell_concordance_table = self.cell_concordance_table | result def conc_table(self): donor_assignments_table=self.donor_assignments_table @@ -463,6 +356,15 @@ def conc_table(self): cohort = 'ELGH' donor_cohorts[don_id] = cohort + all_donor_data={} + # here we calvculate all the expected donor datasets + for row1 in exclusive_don_variants.keys(): + # donor_in_question = row1['donor_query'] + donor_gt_match = row1 + expected_vars_of_other_donor = self.exclusive_don_variants[donor_gt_match] + expected_vars_norm_of_other_donor = self.norm_genotypes(expected_vars_of_other_donor) + all_donor_data[donor_gt_match]=expected_vars_norm_of_other_donor + for i,row1 in donor_assignments_table.iterrows(): donor_in_question = row1['donor_query'] donor_gt_match = row1['donor_gt'] @@ -480,36 +382,30 @@ def conc_table(self): dds = self.donor_distinct_sites[donor_gt_match] except: continue - - for cell1 in Cells_to_keep_pre: - count+=1 - # if count>800: - # break - cell_vars = exclusive_cell_variants[cell1] - # cell_vars_dp = exclusive_cell_variants_dp[cell1] - - self.cell_concordance_table[f'{cell1} --- {donor_gt_match}']={} - # pool.apply_async(self.concordance_dable_production, args=([expected_vars_norm,cell_vars,cell1,donor_gt_match,dds,count]),callback=self.append_results_cell_concordances) - result1 = 
self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count) - self.append_results_cell_concordances(result1) + if cpus==1: + result_conc = self.analyse_donor(Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm) + self.combine_concordances(result_conc) + else: + pool.apply_async(self.analyse_donor, args=([Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm]),callback=self.combine_concordances) pool.close() pool.join() output = self.combine_written_files() return output + - - def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match, donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count): + def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match, donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data): #Nr_donor_distinct_sites = len(dds) - total_sites, true_discordant_count, total_concordant_sites, total_reads, discordant_reads, discordant_vars = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) - #Nr_Concordant = len(Concordant_Sites) - #Nr_Relaxed_concordant = Nr_Concordant + relaxed_concordant_count - #Nr_Discordant = len(Discordant_sites) - #Nr_Total_Overlapping_sites = len(Total_Overlappin_sites) - #Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Concordant_Sites))) - #Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) + total_sites, true_discordant_count, total_concordant_sites, total_reads, discordant_reads, discordant_vars, concordant_vars = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) + Nr_Concordant = len(Concordant_Sites) + Nr_Relaxed_concordant = Nr_Concordant + relaxed_concordant_count + Nr_Discordant = len(Discordant_sites) + Nr_Total_Overlapping_sites = len(Total_Overlapping_sites) + Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Concordant_Sites))) + Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) #find if the discordant vars are in any of the other donors discordant_vars_in_pool = [] + donor_table_of_concordances = [] for donor in vars_per_donor_gt: if not donor == donor_gt_match: try: @@ -521,17 +417,273 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g common_var_count = str(len(common_vars)) donor_cohort_common = donor + ":" + donor_cohort + ":" + common_var_count discordant_vars_in_pool.append(donor_cohort_common) + + # Here we want to calculate the number of discordant sites in other donors and see if in terms of concordance the same donor is picked as per GT assignment. + # We do this to investigate the potential of a cell coming from this other donor. 
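That comparison amounts to scoring the cell against every other donor's expected genotypes and keeping the donors at the extremes, ties included. A compact sketch of the selection step over an invented score table, mirroring the ';'.join() reporting used in this function:

import pandas as pd

# per-donor scores for one cell (invented values); in the pipeline these come from
# running retrieve_concordant_discordant_sites against each other donor's genotypes
scores = pd.DataFrame([
    {'donor': 'donor_A', 'concordant_pct': 97.5, 'discordant_pct': 2.5},
    {'donor': 'donor_B', 'concordant_pct': 64.0, 'discordant_pct': 36.0},
    {'donor': 'donor_C', 'concordant_pct': 97.5, 'discordant_pct': 2.5},
])

best       = scores[scores['concordant_pct'] == scores['concordant_pct'].max()]
least_disc = scores[scores['discordant_pct'] == scores['discordant_pct'].min()]

# ties are kept and reported together
print(';'.join(best['donor']))         # donor_A;donor_C
print(';'.join(least_disc['donor']))   # donor_A;donor_C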
+ + expected_vars_norm_of_other_donor = all_donor_data[donor] + total_sites_otherDonor, true_discordant_count_otherDonor, total_concordant_sites_otherDonor, total_reads_otherDonor, discordant_reads_otherDonor, discordant_vars_otherDonor, concordant_vars_otherDonor = self.retrieve_concordant_discordant_sites(expected_vars_norm_of_other_donor,cell_vars) + concordant_percent_in_other_donor= total_concordant_sites_otherDonor/total_sites_otherDonor*100 + discordant_percent_in_other_donor= true_discordant_count_otherDonor/total_sites_otherDonor*100 + donor_table_of_concordances.append({'donor':donor,'concordant_percent_in_other_donor':concordant_percent_in_other_donor,'discordant_percent_in_other_donor':discordant_percent_in_other_donor,'total_sites_otherDonor':total_sites_otherDonor,'total_reads_otherDonor':total_reads_otherDonor}) + discordant_vars_in_pool_str = (";").join(discordant_vars_in_pool) - - return [cell1, donor_gt_match, donor_gt_match_cohort, total_sites, true_discordant_count, total_concordant_sites, total_reads, discordant_reads, discordant_vars,discordant_vars_in_pool_str, count] + concordant_vars_in_pool_str = (";").join(concordant_vars) + DF = pd.DataFrame(donor_table_of_concordances) + Donor_With_Lowest_DisConcordance = ';'.join(DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['donor'].values) + Lowest_Disconcordance_value_in_all_donors= DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['discordant_percent_in_other_donor'].values[0] + + Donor_With_Highest_Concordance = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['donor'].values) + Highest_Concordance_value_in_all_donors= DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['concordant_percent_in_other_donor'].values[0] + Total_sites_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_sites_otherDonor'].astype(str).values) + Total_reads_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_reads_otherDonor'].astype(str).values) + + return [cell1, donor_gt_match, donor_gt_match_cohort, total_sites, true_discordant_count, total_concordant_sites, total_reads, discordant_reads, discordant_vars,discordant_vars_in_pool_str, count,Lowest_Disconcordance_value_in_all_donors,Donor_With_Lowest_DisConcordance,concordant_vars_in_pool_str,Donor_With_Highest_Concordance,Highest_Concordance_value_in_all_donors,Total_sites_other_donor,Total_reads_other_donor] #return [cell1,donor_gt_match,Nr_Concordant,Nr_Discordant,Nr_Relaxed_concordant, Nr_strict_discordant, relaxed_concordant_informative_count, true_discordant_uninformative_count, Nr_Total_Overlapping_sites, # Number_of_sites_that_are_donor_concordant_and_exclusive, Nr_donor_distinct_sites,count,discordant_sites, total_sites, total_reads, discordant_reads] -def find(lst, a): - return [i for i, x in enumerate(lst) if x==a ] + +class VCF_Loader: + def __init__(self, vcf_file, biallelic_only=True, + sparse=False, format_list=['GT']): + self.vcf_file = vcf_file + self.load_sample = True + self.biallelic_only = biallelic_only + self.sparse = sparse + self.record_dict={} + self.reset() + self.format_list = format_list + self.exclusive_donor_variants = {} + self.curently_pushing =[] #this is a lock value to check if rhe curent field is updated so to avaid the race for update + self.last_count=-1 
+ self.reset_c() + def reset_c(self): + self.record_times=0 + + def reset(self): + self.exclusive_donor_variants ={} + + def myfunc(self): + print(f"Hello my name is {self.biallelic_only}" ) + + def load_sample_mp(self,line,obs_ids,count,format_list): + ''' + takes VCF lines and extracts all format fields for those where GT !='.' + ''' + list_val = line.rstrip().split("\t") #[:5] #:8 + idx = find(list_val[8].split(':'),'GT')[0]#find index of GT field as GT will tell us what variants are called + if len(list_val[3]) > 1 or len(list_val[4]) > 1: + # CURRENTLY DEALS ONLY WITH BIALELIC + print(f'{idx} var not bialelic') + if remove_ag: + if list_val[3] == 'A' and list_val[4] == 'G':#remove A>G + pass + elif list_val[3] == 'T' and list_val[4] == 'C':#also remove T>C + pass + else: + list_val2 = list_val[9:] + obs = pd.DataFrame(obs_ids) + lv = pd.DataFrame(list_val2) + lv_proc =lv[0].str.split(':').str[idx] + gt_exists = lv_proc[lv_proc != '.'] + idx2 = gt_exists.index + obs_with_gt = obs.loc[idx2.values] + obs_with_gt = list(obs_with_gt[0].values) + list_val_with_gt = lv.loc[idx2.values] + list_val_with_gt = list(list_val_with_gt[0].values) + random.seed(count) + c = list(zip(obs_with_gt, list_val_with_gt)) + random.shuffle(c) + obs_with_gt, list_val_with_gt = zip(*c) + # self.append_results([obs_with_gt,list_val_with_gt,idx,list_val,count]) + + return [obs_with_gt,list_val_with_gt,idx,list_val,count,format_list]#add format_list to the return value as we need this for the next step + + + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. + with open(f'tmp_{id}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}.pkl' + + + def append_results(self,result): + # exclusive_donor_variants + obs_with_gt= result[0] + list_val_with_gt= result[1] + idx = result[2] + list_val = result[3] + count = result[4] + format_list = result[5]#list of required format fields + #get indexes of required format fields (apart from GT which has already been taken care of) + additional_field_idxs = [] + for fmt in format_list: + if not fmt == 'GT': + idx_addn = find(list_val[8].split(':'), fmt)[0] + additional_field_idxs.append(idx_addn) + # print(additional_field_idxs) + # exit(0) + + count11=0 + # r = random.random() + # Issue is that this slows down after number of entries is recorded. So recoding takes longer and longer. + # every 500 itterations we push the data to a dictionary, later we combine these together. 
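# The comment above describes the flush pattern that follows: partial results
# are pickled to temporary files every few hundred records and unioned back
# together at the end. A minimal, self-contained sketch of that pattern
# (function names and file names here are illustrative, not the real ones):
import os
import pickle

record_dict = {}

def set_results_sketch(partial, batch_id):
    path = f'tmp_{batch_id}.pkl'
    with open(path, 'wb') as f:
        pickle.dump(partial, f)
    record_dict[batch_id] = path

def combine_written_files_sketch(current):
    merged = dict(current)
    for path in record_dict.values():
        with open(path, 'rb') as f:
            loaded = pickle.load(f)
        for cell, variants in loaded.items():
            merged[cell] = merged.get(cell, set()) | variants
        os.remove(path)  # temp file is no longer needed once merged
    return merged

set_results_sketch({'AAACCTG-1': {'1_100_A_C_0/1'}}, 200)
print(combine_written_files_sketch({'AAACCTG-1': {'1_200_G_T_1/1'}}))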
+ if (count % 200 == 0): + print(f'recording and resetting memory {count}') + # self.record_dict[count]=self.exclusive_donor_variants + self.set_results(self.exclusive_donor_variants,count) + self.reset() + self.reset_c() + + for ob_id in obs_with_gt: + donor_loc_in_list = count11 + alleles = list_val_with_gt[donor_loc_in_list].split(':')[idx] + #append any additional format fields to alleles + if len(additional_field_idxs) > 0: + for idx_addnl in additional_field_idxs: + fmt_val = list_val_with_gt[donor_loc_in_list].split(':')[idx_addnl] + alleles = alleles + '_' + fmt_val + + if not alleles.startswith('.'): + ids = "_".join([list_val[x] for x in [0, 1, 3, 4]]) + donor_var = f"{ids}_{alleles}" + while ob_id in self.curently_pushing: + time.sleep(r*0.01) + self.curently_pushing.append(ob_id) + try: + self.exclusive_donor_variants[ob_id].add(donor_var) + self.record_times=self.record_times+1 + except: + self.exclusive_donor_variants[ob_id]=set() + self.exclusive_donor_variants[ob_id].add(donor_var) + self.record_times=self.record_times+1 + self.curently_pushing.remove(ob_id) + # self.exclusive_donor_variants['CTGAAACGTAAGTTCC-1'] + count11+=1 + + def combine_written_files(self):#this is for VCF loader class + to_export = self.exclusive_donor_variants + for val1 in self.record_dict.values(): + # here remove the int files. + print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + for k1 in loaded_dict.keys(): + try: + to_export[k1]=to_export[k1].union(loaded_dict[k1]) + except: + to_export[k1]=set() + to_export[k1]=to_export[k1].union(loaded_dict[k1]) + os.remove(val1) + return to_export + + + def load_VCF_batch_paralel(self): + """ + Load whole VCF file by utilising multiple cores to speed up loading of large cell files + ------------------- + Initially designed to load VCF from cellSNP output, requiring + 1) all variants have the same format list; + 2) a line starting with "#CHROM", with sample ids. + If these two requirements are satisfied, this function also supports general + VCF files, e.g., genotype for multiple samples. + + Note, it may take a large memory, please filter the VCF with bcftools first. 
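        Example (minimal usage sketch; the file name is a placeholder, the
        format fields mirror how the loader is called later in this script):

            loader = VCF_Loader('cellSNP.cells.vcf.gz', biallelic_only=True,
                                sparse=False, format_list=['GT', 'DP', 'AD', 'OTH'])
            cell_variants = loader.load_VCF_batch_paralel()
            # cell_variants: dict of cell barcode -> set of
            # 'chrom_pos_ref_alt_GT_DP_AD_OTH' strings, one entry per covered site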
+ """ + + vcf_file = self.vcf_file + biallelic_only = self.biallelic_only + load_sample= self.load_sample + sparse = self.sparse + format_list= self.format_list + pool = mp.Pool(cpus) + + + import time + if vcf_file[-3:] == ".gz" or vcf_file[-4:] == ".bgz": + infile = gzip.open(vcf_file, "rb") + is_gzip = True + else: + infile = open(vcf_file, "r") + is_gzip = False + + FixedINFO = {} + contig_lines = [] + comment_lines = [] + var_ids, obs_ids, obs_dat = [], [], [] + count=0 #57077 + for line in infile: + count+=1 + # if count>10000: + # break + if is_gzip: + line = line.decode('utf-8') + if line.startswith("#"): + if line.startswith("##contig="): + contig_lines.append(line.rstrip()) + if line.startswith("#CHROM"): + if load_sample: + obs_ids = line.rstrip().split("\t")[9:] + for ob_id in obs_ids: + self.exclusive_donor_variants[ob_id]=set() + key_ids = line[1:].rstrip().split("\t")[:8] + for _key in key_ids: + FixedINFO[_key] = [] + else: + comment_lines.append(line.rstrip()) + else: + pool.apply_async(self.load_sample_mp, args=([line,obs_ids,count,format_list]),callback=self.append_results) + del line + self.last_count=count + pool.close() + pool.join() + + output = self.combine_written_files() + return output + +"""Run CLI.""" + +def get_options(): + ''' + Get options from the command line + ''' + parser = argparse.ArgumentParser() + parser.add_argument('--version', action='version', version='%(prog)s {version}'.format(version=__version__)) + parser.add_argument('--cpus', action='store', required=True, type=int) + parser.add_argument('--cell_vcf', action='store', required=True) + parser.add_argument('--cell_assignments', action='store', required=True) + parser.add_argument('--donor_assignments', action='store', required=True) + parser.add_argument('--gt_match_vcf', action='store', required=True) + parser.add_argument('--expected_vcf', action='store', required=True) + parser.add_argument('--informative_sites', action='store', required=True) + parser.add_argument('--uninformative_sites', action='store', required=True) + parser.add_argument('--outfile', action='store', required=True) + parser.add_argument('--debug', action='store_true') + args = parser.parse_args() + + return args + + +def get_sites_from_tsv(sites_file): + """ + get sites frm a tsv file where cols are chrom, pos, id, ref, alt + assumes no multiallelics + """ + sites = set() + with open(sites_file, 'r') as f: + lines = f.readlines() + for l in lines: + linedata = l.split('\t') + var = ('_').join([linedata[0], linedata[1], linedata[3], linedata[4]]) + sites.add(var) + return sites + + +def find(lst, a): + return [i for i, x in enumerate(lst) if x==a ] def norm_genotypes(expected_vars): expected_vars = pd.DataFrame(expected_vars) split_str=expected_vars[0].str.split("_") @@ -575,34 +727,23 @@ def donor_exclusive_sites(exclusive_don_variants2): return donor_distinct_sites -def get_options(): - ''' - Get options from the command line - ''' - parser = argparse.ArgumentParser() - parser.add_argument('--cpus', action='store', required=True, type=int) - parser.add_argument('--cell_vcf', action='store', required=True) - parser.add_argument('--cell_assignments', action='store', required=True) - parser.add_argument('--donor_assignments', action='store', required=True) - parser.add_argument('--gt_match_vcf', action='store', required=True) - parser.add_argument('--expected_vcf', action='store', required=True) - parser.add_argument('--outfile', action='store', required=True) - parser.add_argument('--debug', action='store_true') - args = 
parser.parse_args() - - return args if __name__ == "__main__": options = get_options() cpus = options.cpus + outfile = options.outfile cell_vcf=options.cell_vcf donor_assignments=options.donor_assignments gt_match_vcf=options.gt_match_vcf expected_vcf=options.expected_vcf cell_assignments=options.cell_assignments - outfile = options.outfile + informative_sites_file = options.informative_sites + uninformative_sites_file = options.uninformative_sites + + informative_sites = get_sites_from_tsv(informative_sites_file) + uninformative_sites = get_sites_from_tsv(uninformative_sites_file) exclusive_donor_variants = {} #This is where results are populated when mp process i used. curently_pushing =[] #this is a lock value to check if rhe curent field is updated so to avaid the race for update @@ -637,16 +778,12 @@ def get_options(): pickle.dump(GT_Matched_variants, f) print('---Loading cell VCF----') - tic = time.perf_counter() loader1 = VCF_Loader(cell_vcf, biallelic_only=True, sparse=False, format_list=['GT', 'DP', 'AD', 'OTH']) exclusive_cell_variants = loader1.load_VCF_batch_paralel() del loader1 - toc = time.perf_counter() - with open(f'tmp_exclusive_cell_variants.pkl', 'wb') as f: pickle.dump(exclusive_cell_variants, f) - print(f"Loading took {toc - tic:0.4f} seconds") print('---Loading expected VCF----') loader3 = VCF_Loader(expected_vcf, biallelic_only=True, @@ -679,13 +816,15 @@ def get_options(): donor_distinct_sites = donor_exclusive_sites(exclusive_don_variants) with open(f'tmp_donor_distinct_sites.pkl', 'wb') as f: pickle.dump(donor_distinct_sites, f) - - print('---donor_distinct_sites calculated----') - - conc1 = Concordances(donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites) - cell_concordance_table = conc1.conc_table() - + cell_concordance_table = Concordances(donor_assignments_table,cell_assignments_table,exclusive_don_variants,exclusive_cell_variants,donor_distinct_sites, informative_sites, uninformative_sites).conc_table() result = pd.DataFrame(cell_concordance_table).T + try: + site_identities = result[['Concordant_Site_Identities','Discordant_Site_Identities']] + result.drop(columns=['Concordant_Site_Identities'],inplace=True) + site_identities.to_csv(f"site_identities_{outfile}",sep='\t') + except: + _='sample_hasnt_matched_any_gt --- most likely too little cells assigned' result.to_csv(outfile,sep='\t') + print('Processing Done') \ No newline at end of file From 4377f68236a15f2e823e706cf8f628b6bb3d9cd1 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Tue, 14 Nov 2023 18:57:00 +0000 Subject: [PATCH 3/7] all done, lets prduce the metrics --- bin/concordance_calculations.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/bin/concordance_calculations.py b/bin/concordance_calculations.py index 8d0c41f2..c1b0b309 100644 --- a/bin/concordance_calculations.py +++ b/bin/concordance_calculations.py @@ -12,7 +12,7 @@ # list of donors in the pool, how many of the discordant sites are found in the donor, cohort each belongs to # list of discordant sites -__date__ = '2023-14-11' +__date__ = '2023-07-24' __version__ = '0.0.1' import argparse import sys @@ -532,17 +532,17 @@ def set_results(self,to_set,id): pickle.dump(to_set, f) self.record_dict[id]=f'tmp_{id}.pkl' - def analyse_donor(self,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm): + def 
analyse_donor(self,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm,donor_assignments_table): donor_concordance_table = {} other_donor_concordance_table = [] for cell1 in Cells_to_keep_pre: count+=1 cell_vars = exclusive_cell_variants[cell1] - result1, other_donor_concordances = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data) + result1, other_donor_concordances = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data,donor_assignments_table) cell_concordance_table,other_donor_concordance_table = self.append_results_cell_concordances(result1,donor_concordance_table,other_donor_concordances,other_donor_concordance_table) - if count>300: - break + # if count>300: + # break # here we should write these independently to the files if (count % 50 == 0): self.set_results(other_donor_concordance_table,f"{count}--{donor_gt_match}") @@ -600,6 +600,7 @@ def conc_table(self): expected_vars_norm_of_other_donor = self.norm_genotypes(expected_vars_of_other_donor) all_donor_data[donor_gt_match]=expected_vars_norm_of_other_donor + for i,row1 in donor_assignments_table.iterrows(): donor_in_question = row1['donor_query'] donor_gt_match = row1['donor_gt'] @@ -620,10 +621,10 @@ def conc_table(self): except: continue if cpus==1: - result = self.analyse_donor(Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm) + result = self.analyse_donor(Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm,donor_assignments_table) self.combine_concordances(result) else: - pool.apply_async(self.analyse_donor, args=([Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm]),callback=self.combine_concordances) + pool.apply_async(self.analyse_donor, args=([Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm,donor_assignments_table]),callback=self.combine_concordances) pool.close() pool.join() @@ -633,7 +634,7 @@ def conc_table(self): return self.cell_concordance_table - def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match, donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data): + def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match, donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data,donor_assignments_table): Concordant_Sites, \ Discordant_sites, \ @@ -677,7 +678,7 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() total_cordant_sites_that_are_concordant_with_other_donors_in_pool = set() - for donor in vars_per_donor_gt: + for donor in set(donor_assignments_table['donor_gt']): expected_vars_norm_of_other_donor = all_donor_data[donor] @@ -769,9 +770,9 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g 'total_sites_otherDonor':total_sites_otherDonor, \ 
'discordant_reads_otherDonor':discordant_reads_otherDonor, \ 'total_reads_otherDonor':total_reads_otherDonor, \ - 'discordant_read_fraction_in_concordant_sites_otherDonor':discordant_read_fraction_in_concordant_sites_otherDonor, \ - 'discordant_read_fraction_in_discordant_sites_otherDonor':discordant_read_fraction_in_discordant_sites_otherDonor, \ - 'concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor':concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor + # 'discordant_read_fraction_in_concordant_sites_otherDonor':discordant_read_fraction_in_concordant_sites_otherDonor, \ + # 'discordant_read_fraction_in_discordant_sites_otherDonor':discordant_read_fraction_in_discordant_sites_otherDonor, \ + 'concordant_reads_For_discordant_sites_that_are_Concordant_with_other_donor':concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor }) discordant_vars_in_pool_str = (";").join(discordant_vars_in_pool) From aa30bb8b9881c57a0b82b533dee933bf7bd2e771 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Thu, 16 Nov 2023 16:20:28 +0000 Subject: [PATCH 4/7] cross cohort contamination --- bin/concordance_calculations.py | 138 +++++++++++++++++++++++++------- 1 file changed, 111 insertions(+), 27 deletions(-) diff --git a/bin/concordance_calculations.py b/bin/concordance_calculations.py index c1b0b309..1aa49ac8 100644 --- a/bin/concordance_calculations.py +++ b/bin/concordance_calculations.py @@ -308,7 +308,7 @@ def get_discordance(self,expected_vars2,cell_vars2): return Concordant_Sites,Discordant_sites,disc_sites - def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars): + def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars,donor_cohort=False): # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. 
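# A toy illustration of the comparison this function performs (heavily
# simplified -- the real implementation also handles relaxed concordance,
# informative sites and read counts): expected and cellSNP genotypes are
# matched on their chrom_pos_ref_alt ids and compared. Data below are invented.
import pandas as pd

expected = pd.DataFrame({'ids':  ['1_100_A_C', '1_200_G_T', '2_50_C_A'],
                         'vars': ['1/0',        '1/1',       '0/0']})
cell     = pd.DataFrame({'ids':  ['1_100_A_C', '1_200_G_T'],
                         'vars': ['1/0',        '0/0']})

shared = expected.merge(cell, on='ids', suffixes=('_expected', '_cell'))
concordant = int((shared['vars_expected'] == shared['vars_cell']).sum())
discordant = len(shared) - concordant
print(len(shared), concordant, discordant)  # 2 1 1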
# Author: M.Ozols @@ -443,12 +443,12 @@ def append_results_cell_concordances(self,result,cell_concordance_table,other_do 'GT 2':result['donor_gt_match'], 'cohort': cohort, - 'Nr_Concordant':result['Nr_Concordant'], - 'Nr_Discordant':result['Nr_Discordant'], + # 'Nr_Concordant':result['Nr_Concordant'], + # 'Nr_Discordant':result['Nr_Discordant'], 'Nr_Relaxed_concordant':result['Nr_Relaxed_concordant'], 'Nr_strict_discordant':result['true_discordant_count'], - 'Percent Concordant':percent_concordant, - 'Percent Discordant':percent_discordant, + # 'Percent Concordant':percent_concordant, + # 'Percent Discordant':percent_discordant, 'Percent_relaxed_concordant': percent_relaxed_concordant, 'Percent_strict_discordant': percent_strict_discordant, 'Nr_concordant_informative': len(result['relaxed_concordant_informative_count']), @@ -468,7 +468,6 @@ def append_results_cell_concordances(self,result,cell_concordance_table,other_do 'Discordant_reads_informtive': result['discordant_reads_informative'], 'Discordant_reads_uninformtive': result['discordant_reads_uninformative'], 'Discordant_reads_by_n_sites': read_discordance, - 'Discordant_sites_in_pool': len(result['Discordant_sites_in_pool']), 'Lowest_Disconcordance_value_in_all_donors':result['Lowest_Disconcordance_value_in_all_donors'], 'Donor_With_Lowest_DisConcordance':result['Donor_With_Lowest_DisConcordance'], @@ -481,7 +480,10 @@ def append_results_cell_concordances(self,result,cell_concordance_table,other_do 'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['total_discordant_sites_that_are_concordant_with_other_donors_in_pool'], 'discordant_read_fraction_in_concordant_site':result['discordant_read_fraction_in_concordant_sites'], 'discordant_read_fraction_in_discordant_sites':result['discordant_read_fraction_in_discordant_sites'], - 'Discordant_Site_Identities':result['discordant_sites'], + 'Whithin_Cohort__total_number_of_potential_contaminent_reads':result['Whithin_Cohort__total_number_of_potential_contaminent_reads'], + 'Out_of_Cohort__total_number_of_potential_contaminent_reads':result['Out_of_Cohort__total_number_of_potential_contaminent_reads'], + 'NrDonors_contributing_to_out_of_cohort':result['NrDonors_contributing_to_out_of_cohort'], + 'NrDonors_contributing_to_Whithin_Cohort':result['NrDonors_contributing_to_Whithin_Cohort'] } return [cell_concordance_table,other_donor_concordance_table] @@ -634,6 +636,21 @@ def conc_table(self): return self.cell_concordance_table + + def read_extraction(self,DonorDiscordant_Sites_that_are_atributed_to_other_donor,expected_vars_norm,cell_vars_norm): + # we need this function wrapper to calculate the concordant, discordant read + # counts for each of the discordant sites that are concordant with another donor. 
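# A small sketch of what the wrapper described above boils down to: the packed
# cellSNP strings are chrom_pos_ref_alt_GT_DP_AD_OTH, so DP/AD/OTH sit at fixed
# positions after splitting on '_', and concordant reads are total minus
# discordant. The two entries below are invented.
import pandas as pd

cell_vars_norm = pd.DataFrame({0: ['1_100_A_C_0/1_12_5_0',
                                   '1_200_G_T_1/1_8_8_1']})
parts = cell_vars_norm[0].str.split('_')
cell_vars_norm['ids'] = parts.str[0] + '_' + parts.str[1] + '_' + parts.str[2] + '_' + parts.str[3]
cell_vars_norm['DP'] = parts.str[5].astype(int)   # total depth at the site
cell_vars_norm['AD'] = parts.str[6].astype(int)   # reads supporting the alt allele
cell_vars_norm['OTH'] = parts.str[7].astype(int)  # reads supporting other alleles
print(cell_vars_norm[['ids', 'DP', 'AD', 'OTH']])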
+ + Total_Overlapping_sites = set(DonorDiscordant_Sites_that_are_atributed_to_other_donor) + expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2['DP'] = cell_vars2[0].str.split("_").str[5].astype(int) + cell_vars2['AD'] = cell_vars2[0].str.split("_").str[6].astype(int) + cell_vars2['OTH'] = cell_vars2[0].str.split("_").str[7].astype(int) + total_reads,_,_,discordant_reads = self.read_concordance_calc(expected_vars2,cell_vars2) + concordant_reads = total_reads - discordant_reads + return total_reads,discordant_reads,concordant_reads + def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match, donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data,donor_assignments_table): Concordant_Sites, \ @@ -676,11 +693,23 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g discordant_vars_in_pool = [] donor_table_of_concordances = [] total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown = {} + informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() total_cordant_sites_that_are_concordant_with_other_donors_in_pool = set() + donor_gt_match_cohort = donor_cohorts[donor_gt_match] + donors_contributing_to_out_of_cohort= [] + donors_contributing_to_Whithin_Cohort=[] + for donor in set(donor_assignments_table['donor_gt']): expected_vars_norm_of_other_donor = all_donor_data[donor] + + try: + donor_cohort = donor_cohorts[donor] + donor_vars = vars_per_donor_gt[donor] + except: + continue Concordant_Sites_otherDonor, \ Discordant_sites_otherDonor, \ @@ -707,7 +736,7 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g discordant_read_fraction_in_concordant_sites_otherDonor, \ discordant_read_fraction_in_discordant_sites_otherDonor, \ discordant_reads_uninformative_fraction_otherDonor, \ - discordant_reads_informative_fraction_otherDonor = self.retrieve_concordant_discordant_sites(expected_vars_norm_of_other_donor,cell_vars) + discordant_reads_informative_fraction_otherDonor = self.retrieve_concordant_discordant_sites(expected_vars_norm_of_other_donor,cell_vars,donor_cohort=donor_cohort) # here we also need to know : # how many reads of the desired donor discordant sites could be yielded @@ -719,28 +748,57 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(discordant_vars).intersection(set(concordant_vars_otherDonor)) Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(true_discordant_informative_count).intersection(set(relaxed_concordant_informative_count_otherDonor)) DonorCordant_Sites_that_are_atributed_to_other_donor = set(concordant_vars).intersection(set(concordant_vars_otherDonor)) - + # We now count the concordant reads that may contribute to particular cell at this cell. # to do this we take the discordant sites that have been deamed to be concordant with the other donor and quantify the reads thta are concordant. 
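# Toy version of the attribution step above: the cell's discordant sites are
# intersected with the sites that are concordant with some other donor, and the
# union of those sites is accumulated across all other donors in the pool.
# The variant ids below are invented.
discordant_vars = {'1_100_A_C', '1_200_G_T', '2_50_C_A'}
concordant_vars_otherDonor = {'1_200_G_T', '3_70_T_G'}

attributed_to_other_donor = discordant_vars & concordant_vars_otherDonor
total_attributed_in_pool = set()
total_attributed_in_pool |= attributed_to_other_donor
print(attributed_to_other_donor, len(total_attributed_in_pool))  # {'1_200_G_T'} 1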
- Total_Overlapping_sites = set(DonorDiscordant_Sites_that_are_atributed_to_other_donor) - expected_vars2 = expected_vars_norm_of_other_donor[expected_vars_norm_of_other_donor['ids'].isin(Total_Overlapping_sites)] - cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] - cell_vars2['DP'] = cell_vars2[0].str.split("_").str[5].astype(int) - cell_vars2['AD'] = cell_vars2[0].str.split("_").str[6].astype(int) - cell_vars2['OTH'] = cell_vars2[0].str.split("_").str[7].astype(int) + # Total_Overlapping_sites = set(DonorDiscordant_Sites_that_are_atributed_to_other_donor) + # expected_vars2 = expected_vars_norm[expected_vars_norm_of_other_donor['ids'].isin(Total_Overlapping_sites)] + # cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] + # cell_vars2['DP'] = cell_vars2[0].str.split("_").str[5].astype(int) + # cell_vars2['AD'] = cell_vars2[0].str.split("_").str[6].astype(int) + # cell_vars2['OTH'] = cell_vars2[0].str.split("_").str[7].astype(int) - total_reads_for_discordant_sites_that_are_concordant_with_other_donor,total_dp_for_discordant_sites_that_are_concordant_with_other_donor,total_oth_for_discordant_sites_that_are_concordant_with_other_donor,discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor = self.read_concordance_calc(expected_vars2,cell_vars2) - concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor = total_reads_for_discordant_sites_that_are_concordant_with_other_donor - discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor + # total_reads_for_discordant_sites_that_are_concordant_with_other_donor,_,_,discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor = self.read_concordance_calc(expected_vars2,cell_vars2) + # concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor = total_reads_for_discordant_sites_that_are_concordant_with_other_donor - discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor - try: - donor_cohort = donor_cohorts[donor] - donor_vars = vars_per_donor_gt[donor] - except: - continue + total_reads_for_discordant_sites_that_are_concordant_with_other_donor,discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor,concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor = self.read_extraction(DonorDiscordant_Sites_that_are_atributed_to_other_donor,expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) + # if discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor>0: + # print('yes1') + if not donor == donor_gt_match: # We want to kow how many of these discordant site - + if donor_gt_match_cohort == donor_cohort: + coh = 'Whithin_Cohort' + if len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)>0: + donors_contributing_to_Whithin_Cohort.append(donor) + else: + coh = 'Out_of_Cohort' + if len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)>0: + donors_contributing_to_out_of_cohort.append(donor) + total_discordant_sites_that_are_concordant_with_other_donors_in_pool = total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(DonorDiscordant_Sites_that_are_atributed_to_other_donor)) + # now we addit for a cohort since the biggest issue comes from cohort cross-contamination + # for each of these sites now we calculate the number of reads that it accounts: + # tree level set: cohort: site: counts + + + for site in DonorDiscordant_Sites_that_are_atributed_to_other_donor: + 
total_reads_for_site,discordant_reads_for_site,concordant_for_site = self.read_extraction([site],expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) + # if discordant_reads_for_site>0: + # print('here') + if concordant_for_site==0: + pass + try: + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) + except: + try: + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) + except: + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh]={} + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) + # to get the total reads that can be atributed to the other donor i have to check if site is already covered in the total_discordant_sites_that_are_concordant_with_other_donors_in_pool. # the ones that havent, i have to add the reads up for them. informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor)) @@ -774,8 +832,30 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g # 'discordant_read_fraction_in_discordant_sites_otherDonor':discordant_read_fraction_in_discordant_sites_otherDonor, \ 'concordant_reads_For_discordant_sites_that_are_Concordant_with_other_donor':concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor }) - - discordant_vars_in_pool_str = (";").join(discordant_vars_in_pool) + + + #here now we want to see overall how many reads potentially come from different cohorts. 
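# Worked example of how the cohort -> site -> read-count breakdown built in the
# loop above is turned into the "potential contaminant reads" totals further
# down: per site the maximum concordant read count over the donors of that
# cohort is kept, and the per-site maxima are summed within each cohort.
# Numbers are invented; the cohort keys keep the spelling used in the code.
breakdown = {
    'Whithin_Cohort': {'1_100_A_C': [3, 5], '1_200_G_T': [2]},
    'Out_of_Cohort':  {'2_50_C_A':  [4]},
}
totals = {cohort: sum(max(counts) for counts in sites.values())
          for cohort, sites in breakdown.items()}
print(totals)  # {'Whithin_Cohort': 7, 'Out_of_Cohort': 4}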
+ cohort_specific_site_quant_string="" + cohort_specific_read_quant_string="" + + Whithin_Cohort__total_number_of_potential_contaminent_reads=0 + try: + for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys(): + Whithin_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'][k1]) + except: + _='Doesnt Exist' + + Out_of_Cohort__total_number_of_potential_contaminent_reads=0 + try: + for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys(): + Out_of_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'][k1]) + except: + _='Doesnt Exist' + + + # total_reads_for_site,discordant_reads_for_site,concordant_for_site = self.read_extraction(set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()),expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) + + # discordant_vars_in_pool_str = (";").join(discordant_vars_in_pool) concordant_vars_in_pool_str = (";").join(concordant_vars) DF = pd.DataFrame(donor_table_of_concordances) @@ -786,7 +866,7 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g Highest_Concordance_value_in_all_donors= DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['concordant_percent_in_other_donor'].values[0] Total_sites_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_sites_otherDonor'].astype(str).values) Total_reads_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_reads_otherDonor'].astype(str).values) - + return [{ 'cell1':cell1, 'donor_gt_match':donor_gt_match, @@ -823,7 +903,11 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g 'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(discordant_vars)}", 'informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(true_discordant_informative_count)}", 'discordant_read_fraction_in_concordant_sites':discordant_read_fraction_in_concordant_sites, \ - 'discordant_read_fraction_in_discordant_sites':discordant_read_fraction_in_discordant_sites + 'discordant_read_fraction_in_discordant_sites':discordant_read_fraction_in_discordant_sites, \ + 'Whithin_Cohort__total_number_of_potential_contaminent_reads':Whithin_Cohort__total_number_of_potential_contaminent_reads, \ + 'Out_of_Cohort__total_number_of_potential_contaminent_reads':Out_of_Cohort__total_number_of_potential_contaminent_reads, \ + 'NrDonors_contributing_to_out_of_cohort':len(set(donors_contributing_to_out_of_cohort)), \ + 'NrDonors_contributing_to_Whithin_Cohort':len(set(donors_contributing_to_Whithin_Cohort)) }, donor_table_of_concordances] From d77379c9035ddfb066c4aaa91908eaf7c7c4ff0b Mon Sep 17 00:00:00 2001 From: maxozo Date: Fri, 17 Nov 2023 19:26:05 +0000 Subject: [PATCH 5/7] added --- bin/concordance_calculations.py | 50 +++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git 
a/bin/concordance_calculations.py b/bin/concordance_calculations.py index 1aa49ac8..c2433e4f 100644 --- a/bin/concordance_calculations.py +++ b/bin/concordance_calculations.py @@ -702,9 +702,10 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g donors_contributing_to_Whithin_Cohort=[] for donor in set(donor_assignments_table['donor_gt']): - - expected_vars_norm_of_other_donor = all_donor_data[donor] - + try: + expected_vars_norm_of_other_donor = all_donor_data[donor] + except: + continue try: donor_cohort = donor_cohorts[donor] donor_vars = vars_per_donor_gt[donor] @@ -767,6 +768,11 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g if not donor == donor_gt_match: # We want to kow how many of these discordant site + if 'U937' in donor: + continue + if 'THP1' in donor: + continue + if donor_gt_match_cohort == donor_cohort: coh = 'Whithin_Cohort' if len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)>0: @@ -839,7 +845,10 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g cohort_specific_read_quant_string="" Whithin_Cohort__total_number_of_potential_contaminent_reads=0 + Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool=0 + Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool=0 try: + Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool = len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys(): Whithin_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'][k1]) except: @@ -847,12 +856,30 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g Out_of_Cohort__total_number_of_potential_contaminent_reads=0 try: + Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool = len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys(): Out_of_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'][k1]) except: _='Doesnt Exist' + try: + Out_of_Cohort__sites = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) + Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) - set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) + except: + Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() + Out_of_Cohort__sites = set() + + Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = total_reads_for_site,_,_ = self.read_extraction(Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool,expected_vars_norm,cell_vars_norm) + + try: + Whithin_Cohort__sites = 
set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) + Whithin_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) - set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) + except: + Whithin_Cohort__sites = set() + Whithin_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() + + Total__discordant_sites_that_are_concordant_with_other_donors_in_pool = Whithin_Cohort__sites.union(Out_of_Cohort__sites) # total_reads_for_site,discordant_reads_for_site,concordant_for_site = self.read_extraction(set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()),expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) # discordant_vars_in_pool_str = (";").join(discordant_vars_in_pool) @@ -907,7 +934,14 @@ def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_g 'Whithin_Cohort__total_number_of_potential_contaminent_reads':Whithin_Cohort__total_number_of_potential_contaminent_reads, \ 'Out_of_Cohort__total_number_of_potential_contaminent_reads':Out_of_Cohort__total_number_of_potential_contaminent_reads, \ 'NrDonors_contributing_to_out_of_cohort':len(set(donors_contributing_to_out_of_cohort)), \ - 'NrDonors_contributing_to_Whithin_Cohort':len(set(donors_contributing_to_Whithin_Cohort)) + 'NrDonors_contributing_to_Whithin_Cohort':len(set(donors_contributing_to_Whithin_Cohort)), \ + + 'Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool, \ + 'Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool, \ + 'Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool), \ + 'Whithin_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Whithin_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool), \ + 'Total_Reads_for_Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool':Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool, \ + 'Total__discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Total__discordant_sites_that_are_concordant_with_other_donors_in_pool) }, donor_table_of_concordances] @@ -1294,12 +1328,18 @@ def donor_exclusive_sites(exclusive_don_variants2): result = pd.DataFrame(cell_concordance_table).T + try: site_identities = result[['Concordant_Site_Identities','Discordant_Site_Identities']] - result.drop(columns=['Concordant_Site_Identities','Discordant_Site_Identities'],inplace=True) + result.drop(columns=['Concordant_Site_Identities'],inplace=True) site_identities.to_csv(f"site_identities_{outfile}",sep='\t') except: _='sample_hasnt_matched_any_gt --- most likely too little cells assigned' + try: + result.drop(columns=['Discordant_Site_Identities'],inplace=True) + except: + _='sample_hasnt_matched_any_gt --- most likely too little cells assigned' + result.to_csv(outfile,sep='\t') print('Processing Done') \ No newline at end of 
file From 1961a5ae82242666161806022e339d12bb618330 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Tue, 28 Nov 2023 15:48:24 +0000 Subject: [PATCH 6/7] harriets changes --- assets/deploy_scripts/bsub.sh | 2 +- assets/deploy_scripts/bsub__removeWork.sh | 2 +- assets/deploy_scripts/bsub_test.sh | 2 +- assets/deploy_scripts/bsub_test_celltypes.sh | 2 +- assets/deploy_scripts/bsub_test_recluster.sh | 29 + .../input_setups/recluster_profile.nf | 138 +++++ .../nohup_start_nextflow_lsf.sh | 2 +- .../nohup_start_nextflow_lsf__removeWork.sh | 2 +- .../nohup_start_nextflow_lsf_celltypes.sh | 2 +- .../nohup_start_nextflow_lsf_recluster.sh | 27 + .../nohup_start_nextflow_lsf_test.sh | 2 +- bin/0026-plot_filtered_cells.py | 17 +- bin/0028-plot_predicted_sex.py | 5 +- bin/0030-estimate_pca_elbow.py | 5 +- bin/0035-scanpy_normalize_pca.py | 143 +---- ...canpy_cluster_validate_resolution-keras.py | 3 +- bin/pca_anndata.py | 556 ++++++++++++++++++ conf/base.conf | 15 +- main.nf | 13 +- .../nf-core/modules/clustering/functions.nf | 2 +- modules/nf-core/modules/clustering/main.nf | 82 ++- .../modules/estimate_pca_elbow/main.nf | 3 - .../nf-core/modules/normalise_and_pca/main.nf | 95 +-- subworkflows/qc.nf | 84 ++- workflows/yascp.nf | 21 +- 25 files changed, 954 insertions(+), 300 deletions(-) create mode 100755 assets/deploy_scripts/bsub_test_recluster.sh create mode 100644 assets/deploy_scripts/input_setups/recluster_profile.nf create mode 100755 assets/deploy_scripts/nohup_start_nextflow_lsf_recluster.sh create mode 100755 bin/pca_anndata.py diff --git a/assets/deploy_scripts/bsub.sh b/assets/deploy_scripts/bsub.sh index bcbe9189..0d9012b8 100755 --- a/assets/deploy_scripts/bsub.sh +++ b/assets/deploy_scripts/bsub.sh @@ -21,5 +21,5 @@ if ["$varname" = '']; fi sample="$RUN_ID" echo -e "\n Submitting yascp (https://github.com/wtsi-hgi/yascp) with input file $INPUT_FILE" -bsub -R'select[mem>8000] rusage[mem=8000]' -J $sample -n 1 -M 8000 -o $sample.o -e $sample.e -q long bash /software/hgi/pipelines/yascp_versions/yascp_v1.2/assets/deploy_scripts/nohup_start_nextflow_lsf.sh $INPUT_FILE +bsub -R'select[mem>8000] rusage[mem=8000]' -J $sample -n 1 -M 8000 -o $sample.o -e $sample.e -q long bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf.sh $INPUT_FILE echo "Submitted job can be killed with: bkill -J $sample" \ No newline at end of file diff --git a/assets/deploy_scripts/bsub__removeWork.sh b/assets/deploy_scripts/bsub__removeWork.sh index 1f2e5dfa..f1bffef1 100755 --- a/assets/deploy_scripts/bsub__removeWork.sh +++ b/assets/deploy_scripts/bsub__removeWork.sh @@ -5,5 +5,5 @@ INPUT_FILE=$1 export RUN_ID="${PWD##*/}" sample="$RUN_ID.yascp" echo "Cleaning the work directory (https://github.com/wtsi-hgi/yascp) with input file $INPUT_FILE by using '-entry WORK_DIR_REMOVAL --remove_work_dir' " -bsub -R'select[mem>4000] rusage[mem=4000]' -J $sample -n 1 -M 4000 -o $sample.o -e $sample.e -q long bash /software/hgi/pipelines/yascp_versions/yascp_v1.2/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh $INPUT_FILE +bsub -R'select[mem>4000] rusage[mem=4000]' -J $sample -n 1 -M 4000 -o $sample.o -e $sample.e -q long bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh $INPUT_FILE echo "Submitted job can be killed with: bkill -J $sample" \ No newline at end of file diff --git a/assets/deploy_scripts/bsub_test.sh b/assets/deploy_scripts/bsub_test.sh index 8a163fff..52b474dd 100755 --- 
a/assets/deploy_scripts/bsub_test.sh +++ b/assets/deploy_scripts/bsub_test.sh @@ -25,5 +25,5 @@ fi sample="$RUN_ID.yascp" echo -e "\nSubmitting yascp (https://github.com/wtsi-hgi/yascp) in test mode withsample OneK1k dataset" -bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_test -n 1 -M 4000 -o yascp_test.o -e yascp_test.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.2/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh +bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_test -n 1 -M 4000 -o yascp_test.o -e yascp_test.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh echo "Submitted job can be killed with: bkill -J yascp_test" \ No newline at end of file diff --git a/assets/deploy_scripts/bsub_test_celltypes.sh b/assets/deploy_scripts/bsub_test_celltypes.sh index 3c6dc200..7a12a9ac 100755 --- a/assets/deploy_scripts/bsub_test_celltypes.sh +++ b/assets/deploy_scripts/bsub_test_celltypes.sh @@ -25,5 +25,5 @@ fi sample="$RUN_ID.yascp" echo -e "\nSubmitting yascp (https://github.com/wtsi-hgi/yascp) in JUST_CELLTYPES mode with input file $INPUT_FILE" -bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_celltypes -n 1 -M 4000 -o yascp_celltypes.o -e yascp_celltypes.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.2/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh $INPUT_FILE +bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_celltypes -n 1 -M 4000 -o yascp_celltypes.o -e yascp_celltypes.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh $INPUT_FILE echo "Submitted job can be killed with: bkill -J yascp_celltypes" \ No newline at end of file diff --git a/assets/deploy_scripts/bsub_test_recluster.sh b/assets/deploy_scripts/bsub_test_recluster.sh new file mode 100755 index 00000000..7c8d6b97 --- /dev/null +++ b/assets/deploy_scripts/bsub_test_recluster.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +CWD1="$PWD" +parentdir="$(dirname "$CWD1")" +INPUT_FILE=$1 +export RUN_ID="${PWD##*/}" + +# export SINGULARITY_CACHEDIR='/software/hgi/containers/yascp' + +export NXF_OPTS="-Xms5G -Xmx5G" +export SINGULARITY_TMPDIR=$PWD/work/tmp +export TEMP=$PWD/work/tmp +export TMP_DIR=$PWD/work/tmp + +echo press ENTER to NOT fetch containers, otherwise provide writable path: +read varname + +if ["$varname" = '']; + then + export NXF_SINGULARITY_CACHEDIR='/software/hgi/containers/yascp' + export SINGULARITY_DISABLE_CACHE=0 + else + echo Yascp Will fetch the containers and place them in $varname + export NXF_SINGULARITY_CACHEDIR=$varname +fi + +sample="$RUN_ID.yascp" +echo -e "\nSubmitting yascp (https://github.com/wtsi-hgi/yascp) in JUST_RECLUSTER mode with input file $INPUT_FILE" +bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_cluster -n 1 -M 4000 -o yascp_cluster.o -e yascp_cluster.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf_recluster.sh $INPUT_FILE +echo "Submitted job can be killed with: bkill -J yascp_cluster" \ No newline at end of file diff --git a/assets/deploy_scripts/input_setups/recluster_profile.nf b/assets/deploy_scripts/input_setups/recluster_profile.nf new file mode 100644 index 00000000..cd84a1b0 --- /dev/null +++ b/assets/deploy_scripts/input_setups/recluster_profile.nf @@ -0,0 +1,138 @@ +params { + + lisi{ + run_process=true + } + replace_genotype_ids=false + write_h5=true + cluster_validate_resolution_keras = true 
+ // run_celltype_assignment = true + project_name = 'T_Cell_Bio_Response' + filter_outliers = false + extra_sample_metadata ="" + output_dir = outdir= "${launchDir}/recluster_resolutions" + cellex_cluster_markers=true + cluster_markers = false + normalise_andata = false + skip_handover = true + // output_dir = outdir= "${launchDir}/results" + // run_celltype_assignment=true + split_ad_per_bach=true //if not splitting the celltype assignment will be run on full tranche + // input_data_table = "$outdir/handover/Summary_plots/$RUN_ID/Fetch Pipeline/Input/input_table.tsv" + // cellbender_location="${output_dir}/nf-preprocessing/cellbender" //!!!!! if cellbender is run already then can skip this by selecting input = 'existing_cellbender' instead input = 'cellbender' + // existing_cellsnp="${output_dir}/cellsnp" + cellbender_location="/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/harriet/qc/results_11_09_2023/nf-preprocessing/cellbender" //!!!!! if cellbender is run already then can skip this by selecting input = 'existing_cellbender' instead input = 'cellbender' + existing_cellsnp="/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/harriet/qc/results/cellsnp" + + skip_preprocessing = true + // file__anndata_merged = '/lustre/scratch126/humgen/projects/sc-eqtl-ibd/analysis/harriet_analysis/230313_hb58_yascp_analysis/231114_h5ad_files_for_MCC/231120_TCs_only_regressed_counts_HVGs.h5ad' + + harmony{ + run_process= true + } + umap{ + run_process = true + colors_quantitative{ + description = 'Comma separated string of quantitative variables that will be used to color points.' + value = 'n_cells,total_counts,pct_counts_gene_group__mito_transcript,prob_doublet,pct_counts_gene_group__ribo_rna,Azimuth:predicted.celltype.l2.score,Azimuth:mapping.score,log10_ngenes_by_count' + } + colors_categorical{ + description = 'Comma separated string of categorical variables that will be used to color points.' + value = 'cell_passes_qc,cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2,experiment_id,Azimuth:predicted.celltype.l2,Celltypist:Immune_All_Low:predicted_labels,Celltypist:Immune_All_High:predicted_labels,donor_id' + } + } + + mads_categories ='pct_counts_gene_group__mito_transcript,pct_counts_gene_group__mito_protein,pct_counts_gene_group__ribo_protein,pct_counts_gene_group__ribo_rna,total_counts,n_genes_by_counts,log10_ngenes_by_count' + // hard_filters_file = "${projectDir}/../sample_qc.yml" + // hard_filters_drop = false //#This indicates whether we want to drop the cells that fail hard filters of just flag them + + cluster{ + description = """Parameters for clustering. All pairwise combinations of + method and resolution will be performed.""" + number_neighbors{ + description = """Number of neighbors. If <= 0, uses number of unique + experiment_id.""" + value = 15 + } + methods{ + description = 'Clustering method. Valid options [leiden|louvain].' + value = 'leiden' + } + resolutions{ + description = 'Clustering resolution.' + value = [0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0] + } + + variables_boxplot{ + decription = 'Generate boxplots of these variables for each cluster.' + value ='n_cells,total_counts,pct_counts_gene_group__mito_transcript' + } + + known_markers{ + run_process = false + description = """Files with markers that will be used to generate + dotplots. Each marker file should be the full path and have the + following columns: cell_type, hgnc_symbol. The following columns + are optional: p_value_adj. 
Use "" for a single entry in the + file_id and file value to indicate no plots.""" + value = [ + [ file_id: 'SmillieCS_31348891', file: '/lustre/scratch119/humgen/projects/sc-eqtl-ibd/data/marker_gene_db-raw_data/database/celltypes/colon/SmillieCS-31348891/database.tsv' ], + [ file_id: 'ParikhK_30814735', file: '/lustre/scratch119/humgen/projects/sc-eqtl-ibd/data/marker_gene_db-raw_data/database/celltypes/colon/ParikhK-30814735/database.tsv' ], + [ file_id: 'JamesKR_32066951', file: '/lustre/scratch119/humgen/projects/sc-eqtl-ibd/data/marker_gene_db-raw_data/database/celltypes/colon-immune/JamesKR-32066951/database.tsv' ] + ] + } + + + + + } + bbknn{ + run_process = true + } + + celltype_assignment{ + run_celltype_assignment=false + run_azimuth=true + run_keras=false + run_celltypist=true + } + reduced_dims{ + vars_to_regress{ + value = '' + } + } + +} + +process { + + withName: plot_distributions{ + containerOptions = "--containall --cleanenv --workdir /tmp -B /tmp" + } + + withName: cellex_cluster_markers{ + maxForks=7 + memory = 300.GB + } + + withName: GATHER_DATA{ + maxForks=7 + memory = 100.GB + } + withName: LISI{ + maxForks=7 + memory = 300.GB + } + withName: cluster_validate_resolution_keras{ + memory = 300.GB + } + + withName: umap_calculate_and_plot{ + memory = 300.GB + } + + withName: sccaf_assess_clustering{ + memory = 300.GB + } + +} diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf.sh index f1dfcbc0..ee8066ab 100755 --- a/assets/deploy_scripts/nohup_start_nextflow_lsf.sh +++ b/assets/deploy_scripts/nohup_start_nextflow_lsf.sh @@ -17,7 +17,7 @@ parentdir="$(dirname "$CWD1")" export RUN_ID="${PWD##*/}" mkdir $PWD/work || echo 'exists' mkdir $PWD/work/tmp || echo 'exists' -echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.2 -profile sanger -c $INPUT_FILE --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & +echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger -c $INPUT_FILE --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & # get process PID sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR") diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh index f640bbf3..28db82dc 100755 --- a/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh +++ b/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh @@ -21,7 +21,7 @@ export RUN_ID="${PWD##*/}" # export TEMP=$PWD/tmp # export TMP_DIR=$PWD/tmp -echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.2 -profile sanger -c $INPUT_FILE --nf_ci_loc $PWD -entry WORK_DIR_REMOVAL --remove_work_dir -resume > nextflow.nohup.log 2>&1 & +echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger -c $INPUT_FILE --nf_ci_loc $PWD -entry WORK_DIR_REMOVAL --remove_work_dir -resume > nextflow.nohup.log 2>&1 & # get process PID sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR") diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh index 800475d7..295cf5c7 100755 --- a/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh +++ b/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh @@ -17,7 +17,7 @@ parentdir="$(dirname "$CWD1")" export RUN_ID="${PWD##*/}" mkdir $PWD/work || echo 'exists' mkdir $PWD/work/tmp || echo 'exists' -echo $RUN_ID | nextflow 
run /software/hgi/pipelines/yascp_versions/yascp_v1.2 -profile sanger -entry JUST_CELLTYPES -c $INPUT_FILE --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & +echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger -entry JUST_CELLTYPES -c $INPUT_FILE --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & # get process PID sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR") diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf_recluster.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf_recluster.sh new file mode 100755 index 00000000..995e07e5 --- /dev/null +++ b/assets/deploy_scripts/nohup_start_nextflow_lsf_recluster.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +INPUT_FILE=$1 +dt=`date +"%Y_%m_%d_%T"` +cp nextflow.nohup.log ./nextflow.nohup_$dt.log2 || echo 'first time running' +# activate Nextflow conda env + +# clean up previous run files +rm -f *.log +rm -f nextflow.nohup.PID.txt + +# start Nextflow in background: +export NXF_OPTS="-Xms5G -Xmx5G" + +CWD1="$PWD" +parentdir="$(dirname "$CWD1")" +# export RUN_ID="${parentdir##*/}" +export RUN_ID="${PWD##*/}" +mkdir $PWD/work || echo 'exists' +mkdir $PWD/work/tmp || echo 'exists' +echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger -entry JUST_RECLUSTER -c /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/input_setups/recluster_profile.nf -c $INPUT_FILE --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & + +# get process PID +sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR") +echo $PID > nextflow.nohup.PID.txt +echo "Nextflow PID is $PID (saved in ./nextflow.nohup.PID.txt)" +echo kill with \"kill $PID\" +echo "check logs files nextflow.nohup.log and .nextflow.log" diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh index 6a8e1946..cc5fd45d 100755 --- a/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh +++ b/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh @@ -16,7 +16,7 @@ parentdir="$(dirname "$CWD1")" export RUN_ID="${PWD##*/}" mkdir $PWD/work || echo 'exists' mkdir $PWD/work/tmp || echo 'exists' -echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.2 -profile sanger,test --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & +echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger,test --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & # get process PID sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR") diff --git a/bin/0026-plot_filtered_cells.py b/bin/0026-plot_filtered_cells.py index 1752b769..7295070b 100755 --- a/bin/0026-plot_filtered_cells.py +++ b/bin/0026-plot_filtered_cells.py @@ -67,13 +67,16 @@ def main(): # Check if any difference between before and after filters. If not, # return early. df_after_filters = df[df.filter_type.isin(['after_filters'])] - filt = df_after_filters.n_cells_left_in_adata == df_before_filters.loc[ - df_after_filters.experiment_id, - 'n_cells_left_in_adata' - ].values - if all(filt): - print("No difference detected before and after filters. No plots.") - return() + try: + filt = df_after_filters.n_cells_left_in_adata == df_before_filters.loc[ + df_after_filters.experiment_id, + 'n_cells_left_in_adata' + ].values + if all(filt): + print("No difference detected before and after filters. 
No plots.") + return() + except: + return() # Set some plotting parameters plt_height = 16 # 1.5 * df.experiment_id.nunique() diff --git a/bin/0028-plot_predicted_sex.py b/bin/0028-plot_predicted_sex.py index 700e98d3..fad7fab6 100755 --- a/bin/0028-plot_predicted_sex.py +++ b/bin/0028-plot_predicted_sex.py @@ -60,7 +60,10 @@ def main(): # Load the AnnData file adata = sc.read_h5ad(filename=options.h5) - + try: + adata.X=adata.layers['counts'] + except: + _='counts may be already set' # If we have a flag for cells that pass QC then filter down to them if 'cell_passes_qc' in adata.obs: adata = adata[adata.obs['cell_passes_qc'], :] diff --git a/bin/0030-estimate_pca_elbow.py b/bin/0030-estimate_pca_elbow.py index 75c1e490..0a952160 100755 --- a/bin/0030-estimate_pca_elbow.py +++ b/bin/0030-estimate_pca_elbow.py @@ -78,7 +78,10 @@ def main(): # Read in the dataframe adata = sc.read_h5ad(filename=options.h5) - + try: + adata.X=adata.layers['counts'] + except: + _='counts may be already set' kneedle_dict = {} output_dict = {} diff --git a/bin/0035-scanpy_normalize_pca.py b/bin/0035-scanpy_normalize_pca.py index 99747ec5..93c6ba67 100755 --- a/bin/0035-scanpy_normalize_pca.py +++ b/bin/0035-scanpy_normalize_pca.py @@ -372,7 +372,6 @@ def scanpy_normalize_and_pca( sc.pp.filter_genes(adata, min_cells=5) # Only consider genes expressed in more than 0.5% of cells: # sc.pp.filter_genes(adata, min_cells=0.005*len(adata.obs.index)) - # Total-count normalize (library-size correct) the data matrix X to # counts per million, so that counts become comparable among cells. sc.pp.normalize_total( @@ -385,26 +384,8 @@ def scanpy_normalize_and_pca( # Logarithmize the data: X = log(X + 1) where log = natural logorithm. # Numpy has a nice function to undo this np.expm1(adata.X). sc.pp.log1p(adata) - # Delete automatically added uns - UPDATE: bad idea to delete as this slot - # is used in _highly_variable_genes_single_batch. - # del adata.uns['log1p'] - # Add record of this operation. - # adata.layers['log1p_cpm'] = adata.X.copy() - # adata.uns['log1p_cpm'] = {'transformation': 'ln(CPM+1)'} adata.layers['log1p_cp10k'] = adata.X.copy() adata.uns['log1p_cp10k'] = {'transformation': 'ln(CP10k+1)'} - - # Stash the unprocessed data in the raw slot. - # adata.raw.X.data is now ln(CPM+1). - # NOTE: - Layers are not preserved in adata.raw, though obs, var, uns are. - # - If genes are filtered (e.g., - # sc.pp.filter_genes(adata, min_cells=1)), the full dataset will - # remain in the raw slot. - # - We store in the raw slot because later for UMAP and marker gene - # analysis, we can easily tell scanpy to use the raw slot via the - # use_raw = True flag. Raw was specifically designed for this use - # case of ln(CPM+1), - # Can be deleted later: del adata.raw adata.raw = adata # adata_raw = adata.raw.to_adata() @@ -433,32 +414,7 @@ def scanpy_normalize_and_pca( batch_key=variable_feature_batch_key, inplace=True ) - if verbose: - print('{}: {} (all batches); {} ({})'.format( - 'Number of variable features detected', - adata.var['highly_variable_intersection'].sum(), - adata.var['highly_variable'].sum(), - 'after ranking the number of batches where a feature is variable' - )) - # If n_top_genes = None, then one needs to set 'highly_variable'. 
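The bare try/except guards added above in 0028-plot_predicted_sex.py and 0030-estimate_pca_elbow.py restore raw counts into adata.X whenever a 'counts' layer is present. A minimal, more explicit equivalent of that guard (a sketch only; the input file name is a placeholder, not part of the patch):

    import scanpy as sc

    adata = sc.read_h5ad('input.h5ad')  # placeholder path
    # Use the raw counts layer when it exists; otherwise leave .X unchanged.
    if 'counts' in adata.layers:
        adata.X = adata.layers['counts']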
- # Here, highly_variable_intersection is only true for genes variable across - # all batch keys (i.e., 'highly_variable_nbatches' = n_batch_keys): - # adata.var.loc[ - # adata.var["highly_variable_intersection"], - # ["highly_variable_nbatches"] - # ] - # - # If n_top_genes = None, then one also needs needs to set highly_variable'. - # Fix bug in PCA when we have set batch_key. More below: - # https://github.com/theislab/scanpy/issues/1032 - # adata.var['highly_variable'] = adata.var['highly_variable_intersection'] - # - # Alternatively, if one specifies n_top_genes, then genes are ranked by - # 'highly_variable_nbatches' and highly_variable is set to the top n. - # adata.var.loc[ - # adata.var["highly_variable"], - # ["highly_variable_nbatches"] - # ] + if plot: # Plot highly variable genes. @@ -609,106 +565,15 @@ def scanpy_normalize_and_pca( copy=False ) + # Keep a record of the different gene scores if score_genes_df is not None: adata.uns['df_score_genes'] = score_genes_df_updated - # Calculate PCs. - - seed_value = 0 - # 0. Set `PYTHONHASHSEED` environment variable at a fixed value - os.environ['PYTHONHASHSEED'] = str(seed_value) - # 1. Set `python` built-in pseudo-random generator at a fixed value - random.seed(seed_value) - # 2. Set `numpy` pseudo-random generator at a fixed value - np.random.seed(seed_value) - - sc.tl.pca( - adata, - n_comps=min(200, adata.var['highly_variable'].sum()), - zero_center=True, # Set to true for standard PCA - svd_solver='arpack', # arpack reproducible when zero_center = True - use_highly_variable=True, - copy=False, - random_state=np.random.RandomState(0), - chunked=False - ) - # pca( - # adata, - # n_comps=min(200, adata.var['highly_variable'].sum()), - # svd_solver='arpack', # lobpcg not found in current sklearn - # use_highly_variable=True, - # copy=False - # ) - - # Save PCs to a seperate file for Harmony. - pca_df = pd.DataFrame( - adata.obsm['X_pca'], - index=adata.obs_names, - columns=[ - 'PC{}'.format(x) for x in range(1, adata.obsm['X_pca'].shape[1]+1) - ] - ) - pca_df.to_csv( - '{}-pcs.tsv.gz'.format(output_file), - sep='\t', - index=True, - index_label='cell_barcode', - na_rep='', - compression=compression_opts - ) - - # Save the metadata to a seperate file for Harmony. - adata.obs.to_csv( - '{}-metadata.tsv.gz'.format(output_file), - sep='\t', - index=True, - quoting=csv.QUOTE_NONNUMERIC, - index_label='cell_barcode', - na_rep='', - compression=compression_opts - ) - # Save the data. adata.write( - '{}-normalized_pca.h5ad'.format(output_file), + '{}-normalized.h5ad'.format(output_file), compression='gzip' - #compression_opts=anndata_compression_opts - ) - # adata_merged.write_csvs(output_file) - # adata_merged.write_loom(output_file+".loom")) - - # Plot the PC info. - if plot: - # Plot the vanilla PCs. 
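With the PCA call, the PC/metadata table exports and the counts h5ad removed here, 0035-scanpy_normalize_pca.py now stops after writing '{output_file}-normalized.h5ad'; those steps reappear in the new bin/pca_anndata.py introduced further down this patch. A quick sanity check of the intermediate object (a sketch; the file name matches the adata-normalized.h5ad emitted by the NORMALISE_AND_PCA process, and the layer names follow the calls retained above):

    import scanpy as sc

    adata = sc.read_h5ad('adata-normalized.h5ad')
    # The object is expected to carry the raw counts and the ln(CP10k+1) layer.
    print(list(adata.layers))            # e.g. ['counts', 'log1p_cp10k']
    print(adata.uns.get('log1p_cp10k'))  # {'transformation': 'ln(CP10k+1)'}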
- # sc.pl.pca( - # adata, - # color='experiment_id', - # components=['1,2', '3,4'] - # ) - _ = sc.pl.pca_variance_ratio( - adata, - n_pcs=adata.obsm['X_pca'].shape[1], - log=False, - show=False, - save='-{}.pdf'.format(output_file) - ) - _ = sc.pl.pca_variance_ratio( - adata, - n_pcs=adata.obsm['X_pca'].shape[1], - log=True, - show=False, - save='-{}-log.pdf'.format(output_file) - ) - - # Save the filtered count matrix for input to other software like scVI - adata.X = adata.layers['counts'] - del adata.layers['counts'] - del adata.raw - adata.write( - '{}-normalized_pca-counts.h5ad'.format(output_file), - compression='gzip' - #compression_opts=anndata_compression_opts ) return(output_file) @@ -852,7 +717,7 @@ def main(): drop_cell_passes_qc_from_clustering=options.drop_cell_passes_qc_from_clustering # Load the AnnData file adata = sc.read_h5ad(filename=options.h5) - + # adata_comp = sc.read_h5ad(filename='/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/harriet/test_recluster/work/6f/e30114c18a6dc6f620da63e187f348/f9d037b7109a2a7f96cb3ad63b97ff/outlier_filtered_adata.h5ad') # if this is the subclustering analysis, the count matrix should be used # by default, the analysis is "conventional" and thus will be skipped if options.layer != "none": diff --git a/bin/0057-scanpy_cluster_validate_resolution-keras.py b/bin/0057-scanpy_cluster_validate_resolution-keras.py index 0dafeafa..f5579e9a 100755 --- a/bin/0057-scanpy_cluster_validate_resolution-keras.py +++ b/bin/0057-scanpy_cluster_validate_resolution-keras.py @@ -519,7 +519,8 @@ def main(): # Virtual devices must be set before GPUs have been initialized print(e) else: - raise Exception('ERROR: no GPUs detected.') + _ = 'running without gpus' + # raise Exception('ERROR: no GPUs detected.') # Get additional data we are going to append to the output model info dict_add = {} diff --git a/bin/pca_anndata.py b/bin/pca_anndata.py new file mode 100755 index 00000000..591c9ad4 --- /dev/null +++ b/bin/pca_anndata.py @@ -0,0 +1,556 @@ +#!/usr/bin/env python + + +__date__ = '2020-03-13' +__version__ = '0.0.1' + +import argparse +from distutils.version import LooseVersion +import os +os.environ['NUMBA_CACHE_DIR']='/tmp' +os.environ['MPLCONFIGDIR']='/tmp' +import random +import numpy as np +import scipy as sp +# import sklearn.utils +import sklearn.decomposition +import pandas as pd +import scanpy as sc +import csv +import time +from datetime import timedelta + +# Set seed for reproducibility +seed_value = 0 +# 0. Set `PYTHONHASHSEED` environment variable at a fixed value +os.environ['PYTHONHASHSEED'] = str(seed_value) +# 1. Set `python` built-in pseudo-random generator at a fixed value +random.seed(seed_value) +# 2. Set `numpy` pseudo-random generator at a fixed value +np.random.seed(seed_value) + +# Set scanpy settings +# sc verbosity: errors (0), warnings (1), info (2), hints (3) +# sc.settings.verbosity = 3 +# sc.logging.print_versions() +# sc.settings.set_figure_params(dpi=80) + + +def pca( + data, + n_comps=None, + svd_solver='arpack', + use_highly_variable=None, + copy=False +): + """Compute PCA coordinates, loadings and variance decomposition. + + Derived from scanpy 1.5.1. + Principal component analysis [Pedregosa11]_.] + Uses the implementation of *scikit-learn* [Pedregosa11]_. + + Parameters + ---------- + data + The (annotated) data matrix of shape `n_obs` × `n_vars`. + Rows correspond to cells and columns to genes. + n_comps + Number of principal components to compute. 
Defaults to 50, or 1 - + minimum dimension size of selected representation. + svd_solver + SVD solver to use: + `'arpack'` (the default) + for the ARPACK wrapper in SciPy (:func:`~scipy.sparse.linalg.svds`) + `'randomized'` + for the randomized algorithm due to Halko (2009). + `'auto'` + chooses automatically depending on the size of the problem. + `'lobpcg'` + An alternative SciPy solver. + .. versionchanged:: 1.4.5 + Default value changed from `'auto'` to `'arpack'`. + Efficient computation of the principal components of a sparse matrix + currently only works with the `'arpack`' or `'lobpcg'` solvers. + use_highly_variable + Whether to use highly variable genes only, stored in + `.var['highly_variable']`. + By default uses them if they have been determined beforehand. + copy + If an :class:`~anndata.AnnData` is passed, determines whether a copy + is returned. Is ignored otherwise. + Returns + ------- + adata : anndata.AnnData + …otherwise if `copy=True` it returns or else adds fields to `adata`: + `.obsm['X_pca']` + PCA representation of data. + `.varm['PCs']` + The principal components containing the loadings. + `.uns['pca']['variance_ratio']` + Ratio of explained variance. + `.uns['pca']['variance']` + Explained variance, equivalent to the eigenvalues of the + covariance matrix. + """ + adata = data.copy() if copy else data + + if use_highly_variable and 'highly_variable' not in adata.var.keys(): + raise ValueError( + 'Did not find adata.var[\'highly_variable\']. ' + 'Either your data already only consists of highly-variable genes ' + 'or consider running `pp.highly_variable_genes` first.' + ) + if use_highly_variable is None: + if 'highly_variable' in adata.var.keys(): + use_highly_variable = True + else: + use_highly_variable = False + + if use_highly_variable: + adata_comp = ( + adata[:, adata.var['highly_variable']] + ) + else: + adata_comp = adata + + if n_comps is None: + min_dim = min(adata_comp.n_vars, adata_comp.n_obs) + n_comps = min_dim - 1 + + # random_state = sklearn.utils.check_random_state(random_state) + X = adata_comp.X + + # If sparse, make dense. + # Another option: + # output = _pca_with_sparse( + # X, n_comps, solver=svd_solver, random_state=random_state + # ) + if sp.sparse.issparse(X): + X = X.toarray() + + # Sort out the solver + if svd_solver == 'auto': + svd_solver = 'arpack' + if svd_solver not in {'arpack', 'randomized'}: + raise ValueError( + 'svd_solver: {svd_solver} can not be used with sparse input.' + ) + + pca_ = sklearn.decomposition.PCA( + n_components=n_comps, + svd_solver=svd_solver, + random_state=0 + ) + X_pca = pca_.fit_transform(X) + + # Cast to whatever datatype. + # dtype = 'float32' + # dtype + # Numpy data type string to which to convert the result. 
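A minimal usage sketch of the pca() helper being defined here (assumes it is called from within this script or imported from it; the input file name is a placeholder and n_comps must be smaller than the number of cells and genes):

    import scanpy as sc

    adata = sc.read_h5ad('adata-normalized.h5ad')  # placeholder input
    pca(adata, n_comps=50, svd_solver='arpack', use_highly_variable=False, copy=False)
    print(adata.obsm['X_pca'].shape)               # (n_cells, 50)
    print(adata.uns['pca']['variance_ratio'][:5])  # explained-variance ratios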
+ # if X_pca.dtype.descr != np.dtype(dtype).descr: + # X_pca = X_pca.astype(dtype) + + # Update the adata frame (if copy=False, then this is the same input adata + # that the user provided) + adata.obsm['X_pca'] = X_pca + adata.uns['pca'] = {} + adata.uns['pca']['params'] = { + 'zero_center': True, + 'use_highly_variable': use_highly_variable, + } + if use_highly_variable: + adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps)) + adata.varm['PCs'][adata.var['highly_variable']] = pca_.components_.T + else: + adata.varm['PCs'] = pca_.components_.T + adata.uns['pca']['variance'] = pca_.explained_variance_ + adata.uns['pca']['variance_ratio'] = pca_.explained_variance_ratio_ + + return adata if copy else None + + +def score_cells( + adata, + score_genes_df, + score_genes_df_column='ensembl_gene_id', + only_use_variable_genes=False +): + """Scores each cell. + + Parameters + ---------- + adata : AnnData + Input AnnData object. Assume adata.X is norm->log1p->scaled data. + score_genes_df : pd.DataFrame + Dataframe of marker genes. Needs to have score_genes_df_column and + score_id column. If one score_id == 'cell_cycle', then requires a + grouping_id column with 'G2/M' and 'S'. + score_genes_df_column : string + Column in score_genes_df to use for gene ids (e.g., hgnc_symbol, + ensembl_gene_id) + only_use_variable_genes : boolean + Only use variable genes to calculate scores. If True, score_id will + be changed to __hvg_only. Note this flage does not apply + to score_id == 'cell_cycle'. + + + Returns + ------- + adata : AnnData + AnnData object with scores calculated and stored in + adata.obs[]. + score_genes_df : pd.DataFrame + The score_genes_df with the following columns added: + gene_found_in_adata, gene_found_is_highly_variable. It is suggested + that this dataframe is added to the adata.uns slot. + """ + verbose = False # For debugging purposes. + + # Update the score_genes_df with details on the genes and if they were + # found in adata and if they are highly variable. + score_genes_df['gene_found_in_adata'] = np.in1d( + score_genes_df[score_genes_df_column], + adata.var.index + ) + score_genes_df['gene_found_is_highly_variable'] = np.in1d( + score_genes_df[score_genes_df_column], + adata.var.index[adata.var['highly_variable']] + ) + + # Set the gene pool parameter. + gene_pool = None # If None, all genes are randomly sampled for background + if only_use_variable_genes: + gene_pool = adata.var.index[adata.var['highly_variable']] + + # Loop over each score_id in score_genes_df, updating adata. + for score_id, df_group in score_genes_df.groupby('score_id'): + # Downsample to only those genes found in the data. + df_group = df_group.loc[ + df_group['gene_found_in_adata'], : + ] + if df_group.shape[0] == 0: + continue + + # If we are supposed to use only_use_variable_genes, then do so. + if only_use_variable_genes: + if score_id == 'cell_cycle': + continue + score_id = '{}__hvg_only'.format(score_id) + df_group = df_group.loc[ + df_group['gene_found_is_highly_variable'], : + ] + if df_group.shape[0] == 0: + continue + if verbose: + print('Scoring {}'.format(score_id)) + + # Set the number of control genes. + ctrl_size = 50 + if df_group.shape[0] > 50: + ctrl_size = df_group.shape[0] + if gene_pool is not None: + if ctrl_size > len(gene_pool): + raise Exception( + 'Error in gene scoring ctrl_size > len(gene_pool)' + ) + + # If the score_id is cell_cycle, then use the specific cell cycle + # scoring function. 
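The cell-cycle branch below pulls s_genes and g2m_genes out of score_genes_df by grouping_id. For reference, a table satisfying the requirements in the score_cells() docstring above could be built as follows; the Ensembl IDs are placeholders only, and the to_csv call simply shows the tab-delimited form this data is normally supplied in:

    import pandas as pd

    scores = pd.DataFrame({
        'ensembl_gene_id': ['ENSG00000000003', 'ENSG00000000005', 'ENSG00000000419'],
        'score_id':        ['cell_cycle',      'cell_cycle',      'my_signature'],
        'grouping_id':     ['S',               'G2/M',            ''],
    })
    # Tab-delimited file, as expected by the pipeline's score-genes input.
    scores.to_csv('score_genes.tsv', sep='\t', index=False)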
+ if score_id == 'cell_cycle': + # NOTE: Setting ctrl_size` is not possible, as it's set as + # `min(len(s_genes), len(g2m_genes))`. + sc.tl.score_genes_cell_cycle( + adata, + s_genes=df_group.loc[ + df_group['grouping_id'] == 'S', score_genes_df_column + ], + g2m_genes=df_group.loc[ + df_group['grouping_id'] == 'G2/M', score_genes_df_column + ], + copy=False, + gene_pool=gene_pool, # Default is None (aka, use all) + n_bins=25, # Default is 25 + use_raw=False + ) + else: + sc.tl.score_genes( + adata, + df_group[score_genes_df_column], + ctrl_size=ctrl_size, # Default is 50 + gene_pool=gene_pool, # Default is None (aka, use all) + n_bins=25, # Default is 25 + score_name=score_id, + random_state=0, # Default is 0 + copy=False, + use_raw=False + ) + + return adata, score_genes_df + + +def pca_analysis( + adata, + output_file, + variable_feature_batch_key='experiment_id', + n_variable_features=2000, + exclude_hv_gene_df=[], + score_genes_df=None, + verbose=True, + plot=True, + anndata_compression_opts=4 +): + + # Calculate PCs. + seed_value = 0 + # 0. Set `PYTHONHASHSEED` environment variable at a fixed value + os.environ['PYTHONHASHSEED'] = str(seed_value) + # 1. Set `python` built-in pseudo-random generator at a fixed value + random.seed(seed_value) + # 2. Set `numpy` pseudo-random generator at a fixed value + np.random.seed(seed_value) + + sc.tl.pca( + adata, + n_comps=min(200, adata.var['highly_variable'].sum()), + zero_center=True, # Set to true for standard PCA + svd_solver='arpack', # arpack reproducible when zero_center = True + use_highly_variable=True, + copy=False, + random_state=np.random.RandomState(0), + chunked=False + ) + + # Save PCs to a seperate file for Harmony. + pca_df = pd.DataFrame( + adata.obsm['X_pca'], + index=adata.obs_names, + columns=[ + 'PC{}'.format(x) for x in range(1, adata.obsm['X_pca'].shape[1]+1) + ] + ) + + compression_opts = 'gzip' + if LooseVersion(pd.__version__) > '1.0.0': + compression_opts = dict( + method='gzip', + compresslevel=9 + ) + + pca_df.to_csv( + '{}-pcs.tsv.gz'.format(output_file), + sep='\t', + index=True, + index_label='cell_barcode', + na_rep='', + compression=compression_opts + ) + + # Save the metadata to a seperate file for Harmony. + adata.obs.to_csv( + '{}-metadata.tsv.gz'.format(output_file), + sep='\t', + index=True, + quoting=csv.QUOTE_NONNUMERIC, + index_label='cell_barcode', + na_rep='', + compression=compression_opts + ) + + # Save the data. + adata.write( + '{}-normalized_pca.h5ad'.format(output_file), + compression='gzip' + ) + # Plot the PC info. + if plot: + # Plot the vanilla PCs. + sc.pl.pca_variance_ratio( + adata, + n_pcs=adata.obsm['X_pca'].shape[1], + log=False, + show=False, + save='-{}.pdf'.format(output_file) + ) + + sc.pl.pca_variance_ratio( + adata, + n_pcs=adata.obsm['X_pca'].shape[1], + log=True, + show=False, + save='-{}-log.pdf'.format(output_file) + ) + + # Save the filtered count matrix for input to other software like scVI + adata.X = adata.layers['counts'] + del adata.layers['counts'] + del adata.raw + adata.write( + '{}-normalized_pca-counts.h5ad'.format(output_file), + compression='gzip' + #compression_opts=anndata_compression_opts + ) + + +def main(): + """Run CLI.""" + parser = argparse.ArgumentParser( + description=""" + Read anndata object. Normalize, calculate PCs. Save new anndata + object along with csv file of PCs. 
+ """ + ) + + parser.add_argument( + '-v', '--version', + action='version', + version='%(prog)s {version}'.format(version=__version__) + ) + + parser.add_argument( + '-h5', '--h5_anndata', + action='store', + dest='h5', + required=True, + help='H5 AnnData file.' + ) + + parser.add_argument( + '-layer', '--overwrite_x_with_layer', + action='store', + dest='layer', + default='none', + help='Specify a layer of the AnnData file, which should be used for \ + the following normalization and downstream analysis. This should \ + go together with the analysis mode of the pipeline as \ + "conventional" or "subclustering". \ + (default: %(default)s)' + ) + + parser.add_argument( + '-bk', '--batch_key', + action='store', + dest='bk', + default='experiment_id', + help='Batch key for highly-variable feature (e.g., gene) detection.\ + If specified, highly-variable features are selected within each\ + batch separately and merged.\ + (default: %(default)s)' + ) + + parser.add_argument( + '-nvf', '--number_variable_features', + action='store', + dest='nvf', + default=2000, + type=int, + help='After calculating variable features within each batch set via\ + , rank features by number of batches where they are\ + variable and select the top .\ + (default: %(default)s)' + ) + + parser.add_argument( + '-vge', '--variable_genes_exclude', + action='store', + dest='vge', + default='', + help='Tab-delimited file with genes to exclude from the highly\ + variable gene list. Must contain ensembl_gene_id column.\ + (default: None - keep all variable genes)' + ) + + parser.add_argument( + '-vr', '--vars_to_regress', + action='store', + dest='vr', + default='', + help='Comma seperated list of metadata variables to regress prior to\ + calculating PCs. Example: gene_group__mito_transcript,n_count.\ + (default: "" and sc.pp.regress_out is not called)' + ) + + parser.add_argument( + '-sg', '--score_genes', + action='store', + dest='sg', + default='', + help='Tab-delimited file of genes for scores. Needs to have\ + ensembl_gene_id and score_id column. If one\ + score_id == "cell_cycle", then requires a grouping_id column with\ + "G2/M" and "S".' + ) + + parser.add_argument( + '-drop_cell_passes_qc_from_clustering', '--drop_cell_passes_qc_from_clusteringdrop_cell_passes_qc_from_clustering', + action='store', + dest='drop_cell_passes_qc_from_clustering', + default=False, + help='Whether we want to drop cells before clustering based on the cell_passes_qc filter established by outlier filter part of pipeline' + ) + + + parser.add_argument( + '-ncpu', '--number_cpu', + action='store', + dest='ncpu', + default=4, + type=int, + help='Number of CPUs to use.\ + (default: %(default)s)' + ) + + parser.add_argument( + '--anndata_compression_opts', + action='store', + dest='anndata_compression_opts', + default=4, + type=int, + help='Compression level in anndata. A larger value decreases disk \ + space requirements at the cost of compression time. \ + (default: %(default)s)' + ) + + parser.add_argument( + '-of', '--output_file', + action='store', + dest='of', + default='adata-normalize_pca', + help='Directory and basename of output files.\ + (default: %(default)s)' + ) + + options = parser.parse_args() + + # Scanpy settings + sc.settings.figdir = os.getcwd() # figure output directory to match base. 
+ sc.settings.n_jobs = options.ncpu # number CPUs + # sc.settings.max_memory = 500 # in Gb + # sc.set_figure_params(dpi_save = 300) + drop_cell_passes_qc_from_clustering=options.drop_cell_passes_qc_from_clustering + # Load the AnnData file + adata = sc.read_h5ad(filename=options.h5) + try: + del adata.uns + except: + _='still there' + # adata_comp = sc.read_h5ad(filename='/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/harriet/test_recluster/work/6f/e30114c18a6dc6f620da63e187f348/f9d037b7109a2a7f96cb3ad63b97ff/outlier_filtered_adata.h5ad') + # adata_comp = sc.read_h5ad(filename='/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/harriet/test_recluster/work/91/676237d4521fd78b293a8c4e548394/adata-normalized_pca.h5ad') + + start_time = time.time() + + pca_analysis( + adata, + output_file=options.of, + variable_feature_batch_key=options.bk, + n_variable_features=options.nvf, + anndata_compression_opts=options.anndata_compression_opts + ) + execution_summary = "Analysis execution time [{}]:\t{}".format( + "pca.py", + str(timedelta(seconds=time.time()-start_time)) + ) + print(execution_summary) + + +if __name__ == '__main__': + main() diff --git a/conf/base.conf b/conf/base.conf index d31eed54..cc41cee9 100755 --- a/conf/base.conf +++ b/conf/base.conf @@ -36,15 +36,16 @@ params{ mem1= 12000 copy_mode = "rellink" split_bam = false + cluster_markers = true existing_cellsnp="${projectDir}/assets/existing_cellsnp" existing_vireo='' - skip_preprocessing{ - value=false - gt_match_file="" // #We prvide this if we want to exclude a particular samples matched to a ceirtain GT cohortc from the adaptive qc - gt_match_based_adaptive_qc_exclusion_pattern = '' // #We run the adaptive QC on these patterns independently regardless on assigned celltype. - file__anndata_merged = '' - file__cells_filtered = '' - } + normalise_andata = true + skip_preprocessing=false + gt_match_file="" // #We prvide this if we want to exclude a particular samples matched to a ceirtain GT cohortc from the adaptive qc + gt_match_based_adaptive_qc_exclusion_pattern = '' // #We run the adaptive QC on these patterns independently regardless on assigned celltype. + file__anndata_merged = '' + file__cells_filtered = '' + id_in='experiment_id' genotype_phenotype_mapping_file ='' extra_sample_metadata = '' use_phenotype_ids_for_gt_match = true //#if false this will keep the genotype ids, for this to be used have to set a genotype_phenotype_mapping_file to a path to csv where firs column contains genotype ids and second contains phenotype ids to replace these to. diff --git a/main.nf b/main.nf index c6dea7ed..138101bf 100755 --- a/main.nf +++ b/main.nf @@ -13,6 +13,9 @@ include { YASCP } from "$projectDir/workflows/yascp" include { RETRIEVE_RECOURSES;RETRIEVE_RECOURSES_TEST_DATASET } from "$projectDir/subworkflows/local/retrieve_recourses" include {RSYNC_RESULTS_REMOVE_WORK_DIR} from "$projectDir/modules/local/rsync_results_remove_work_dir/main" include {celltype} from "$projectDir/subworkflows/celltype" +include {qc} from "$projectDir/subworkflows/qc" +include {dummy_filtered_channel} from "$projectDir/modules/nf-core/modules/merge_samples/functions" + ////// WORKFLOW: Run main nf-core/yascp analysis pipeline // This is the default entry point, we have others to update ceirtain parts of the results. // Please go to ./workflows/yascp to see the main Yascp workflow. 
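Note the conf/base.conf change above: the nested skip_preprocessing{} scope is flattened into top-level params. A user config that previously set skip_preprocessing.file__anndata_merged would presumably now look roughly like this (a sketch in the pipeline's own params syntax; the path is a placeholder):

    params {
        skip_preprocessing   = true
        file__anndata_merged = '/path/to/merged.h5ad'
        file__cells_filtered = ''
        gt_match_file        = ''
        id_in                = 'experiment_id'
    }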
@@ -46,11 +49,19 @@ workflow { workflow JUST_CELLTYPES{ - file__anndata_merged = Channel.from(params.skip_preprocessing.file__anndata_merged) + file__anndata_merged = Channel.from(params.file__anndata_merged) celltype(file__anndata_merged) } +workflow JUST_RECLUSTER{ + file__anndata_merged = Channel.from(params.file__anndata_merged) + gt_outlier_input = Channel.from("$projectDir/assets/fake_file.fq") + dummy_filtered_channel(file__anndata_merged,params.id_in) + file__cells_filtered = dummy_filtered_channel.out.anndata_metadata + qc(file__anndata_merged,file__cells_filtered,gt_outlier_input) //This runs the Clusterring and qc assessments of the datasets. + +} ////// You do not need to concern about the workflows bellow as these are Cardinal Specific and used for development diff --git a/modules/nf-core/modules/clustering/functions.nf b/modules/nf-core/modules/clustering/functions.nf index 5f6efa32..b54847fb 100755 --- a/modules/nf-core/modules/clustering/functions.nf +++ b/modules/nf-core/modules/clustering/functions.nf @@ -341,7 +341,7 @@ process cluster_validate_resolution_keras { // ------------------------------------------------------------------------ //cache false // cache results from run //maxForks 2 // hard to control memory usage. limit to 3 concurrent - label 'gpu' // use GPU + label 'process_low' // use GPU scratch false // use tmp directory if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://yascp.cog.sanger.ac.uk/public/singularity_images/wtsihgi_nf_scrna_qc_6bb6af5-2021-12-23-3270149cf265.sif" diff --git a/modules/nf-core/modules/clustering/main.nf b/modules/nf-core/modules/clustering/main.nf index ea4e0b2a..2c850608 100755 --- a/modules/nf-core/modules/clustering/main.nf +++ b/modules/nf-core/modules/clustering/main.nf @@ -71,37 +71,34 @@ workflow CLUSTERING { // cluster_validate_resolution__sparsity, // cluster_validate_resolution__train_size_cells // ) - if (params.utilise_gpu){ - if (params.cluster_validate_resolution_keras){ + + if (params.cluster_validate_resolution_keras){ - - cluster_validate_resolution_keras( - cluster.out.outdir, - cluster.out.anndata, - cluster.out.metadata, - cluster.out.pcs, - cluster.out.reduced_dims, - cluster.out.clusters, - cluster_validate_resolution__sparsity, - cluster_validate_resolution__train_size_cells, - cluster.out.outdir__reduced_dims - ) + + cluster_validate_resolution_keras( + cluster.out.outdir, + cluster.out.anndata, + cluster.out.metadata, + cluster.out.pcs, + cluster.out.reduced_dims, + cluster.out.clusters, + cluster_validate_resolution__sparsity, + cluster_validate_resolution__train_size_cells, + cluster.out.outdir__reduced_dims + ) - plot_resolution_validate( - cluster_validate_resolution_keras.out.plot_input.groupTuple() - ) - } - + plot_resolution_validate( + cluster_validate_resolution_keras.out.plot_input.groupTuple() + ) } + + SCCAF(cluster.out.outdir, cluster.out.anndata, cluster.out.clusters, sccaf_minacc) - - - // // Generate UMAPs of the results. 
umap_calculate_and_plot( cluster.out.outdir, @@ -118,28 +115,29 @@ workflow CLUSTERING { ) dummy_output=umap_calculate_and_plot.out.dummy_output // // Find marker genes for clusters - cluster_markers( - cluster.out.outdir, - cluster.out.anndata, - cluster.out.metadata, - cluster.out.pcs, - cluster.out.reduced_dims, - cluster.out.clusters, - cluster_marker__methods - ) - - // // Find marker genes for clusters using CELLEX - cellex_cluster_markers( - cluster.out.outdir, - cluster.out.anndata - ) + if (params.cluster_markers){ + cluster_markers( + cluster.out.outdir, + cluster.out.anndata, + cluster.out.metadata, + cluster.out.pcs, + cluster.out.reduced_dims, + cluster.out.clusters, + cluster_marker__methods + ) - // Prep adata file for cellxgene website - prep_cellxgene( - cluster.out.outdir, - cluster.out.anndata - ) + // // Find marker genes for clusters using CELLEX + cellex_cluster_markers( + cluster.out.outdir, + cluster.out.anndata + ) + // Prep adata file for cellxgene website + prep_cellxgene( + cluster.out.outdir, + cluster.out.anndata + ) + } emit: dummy_output diff --git a/modules/nf-core/modules/estimate_pca_elbow/main.nf b/modules/nf-core/modules/estimate_pca_elbow/main.nf index 05d3d9ac..2251e3af 100755 --- a/modules/nf-core/modules/estimate_pca_elbow/main.nf +++ b/modules/nf-core/modules/estimate_pca_elbow/main.nf @@ -43,10 +43,7 @@ process ESTIMATE_PCA_ELBOW { script: outdir = "${outdir_prev}" - log.info("""outdir = ${outdir}""") - // from the file__anndata job. outfile = "${file__anndata}".minus(".h5ad") - .split("-").drop(1).join("-") outfile = "${outfile}-knee" """ rm -fr plots diff --git a/modules/nf-core/modules/normalise_and_pca/main.nf b/modules/nf-core/modules/normalise_and_pca/main.nf index b0f81031..696128c9 100755 --- a/modules/nf-core/modules/normalise_and_pca/main.nf +++ b/modules/nf-core/modules/normalise_and_pca/main.nf @@ -3,6 +3,59 @@ def random_hex(n) { Long.toUnsignedString(new Random().nextLong(), n).toUpperCase() } +process PCA { + + label 'process_medium' + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://yascp.cog.sanger.ac.uk/public/singularity_images/nf_qc_scrna_v1.img" + // /software/hgi/containers/nf_qc_scrna_v1.img + } else { + container "mercury/nf_qc_scrna:v1" + } + + publishDir path: "${outdir}", + saveAs: {filename -> filename.replaceAll("-", "")}, + mode: "${params.copy_mode}", + overwrite: "true" + + input: + path(file__anndata) + val(outdir) + val(layer) + + output: + val(outdir, emit: outdir) + val("${outdir}", emit: outdir3) + path("adata-normalized_pca.h5ad", emit: anndata) + path("adata-metadata.tsv.gz", emit: metadata) + path("adata-pcs.tsv.gz", emit: pcs) + path( + "adata-normalized_pca-counts.h5ad", + emit: anndata_filtered_counts + ) + val("${param_details}", emit: param_details) + path("plots/*.pdf") + path("plots/*.png") optional true + + script: + + """ + rm -fr plots + pca_anndata.py \ + --h5_anndata ${file__anndata} \ + --overwrite_x_with_layer ${layer} \ + --output_file adata \ + --number_cpu ${task.cpus} \ + --drop_cell_passes_qc_from_clustering ${params.drop_cell_passes_qc_from_clustering} + mkdir plots + + mv *pdf plots/ 2>/dev/null || true + mv *png plots/ 2>/dev/null || true + """ + +} + + process NORMALISE_AND_PCA { // Takes annData object, nomalizes across samples, calculates PCs. 
// NOTE: Once normalization is set, it would be faster to normalize per @@ -25,7 +78,7 @@ process NORMALISE_AND_PCA { overwrite: "true" input: - val(outdir_prev) + path(file__anndata) val(analysis_mode) val(layer) @@ -36,16 +89,8 @@ process NORMALISE_AND_PCA { output: val(outdir, emit: outdir) - val("${outdir}", emit: outdir3) - path("adata-normalized_pca.h5ad", emit: anndata) - path("adata-metadata.tsv.gz", emit: metadata) - path("adata-pcs.tsv.gz", emit: pcs) - - path( - "adata-normalized_pca-counts.h5ad", - emit: anndata_filtered_counts - ) + path("adata-normalized.h5ad", emit: anndata) val("${param_details}", emit: param_details) path("plots/*.pdf") path("plots/*.png") optional true @@ -66,19 +111,14 @@ process NORMALISE_AND_PCA { cmd__vars_to_regress = "--vars_to_regress ${vars_to_regress}" } - // todo - mo11 - these paths are confusing - - outdir = "${outdir_prev}/normalize=total_count.${param_details}" + outdir = "${params.outdir}/clustering/normalize=total_count.${param_details}" // Add details on the genes we are exlcuding from hgv list. file_vge = "${file__genes_exclude_hvg.getSimpleName()}" outdir = "${outdir}.hvg_exclude=${file_vge}" // Add details on the scores we are using. file_score = "${file__genes_score.getSimpleName()}" outdir = "${outdir}.scores=${file_score}" - - // this is where the subfolder 1 is determined - // Customize command for optional files. cmd__genes_exclude_hvg = "" if (file__genes_exclude_hvg.name != "no_file__genes_exclude_hvg") { @@ -89,7 +129,6 @@ process NORMALISE_AND_PCA { cmd__genes_score = "--score_genes ${file__genes_score}" } - """ rm -fr plots 0035-scanpy_normalize_pca.py \ @@ -106,26 +145,4 @@ process NORMALISE_AND_PCA { mv *pdf plots/ 2>/dev/null || true mv *png plots/ 2>/dev/null || true """ - // Old version with bash evaluation of optional commands - // - // echo "normalize_pca: ${process_info}" - // # If there are entries in the variable_genes_exclude file, add it to - // # the call. - // cmd__vg_exclude="--variable_genes_exclude ${file__genes_exclude_hvg}" - // val=\$(cat ${file__genes_exclude_hvg} | wc -l) - // if [ \$val -eq 0 ]; then cmd__vg_exclude=""; fi - // # If there are entries in the score_genes file, add it to the call. 
- // cmd__score_genes="--score_genes ${file__genes_score}" - // val=\$(cat ${file__genes_score} | wc -l) - // if [ \$val -eq 0 ]; then cmd__score_genes=""; fi - // 0035-scanpy_normalize_pca.py \ - // --h5_anndata ${file__anndata} \ - // --output_file adata \ - // --number_cpu ${task.cpus} \ - // ${cmd__vars_to_regress} \ - // \${cmd__vg_exclude} \ - // \${cmd__score_genes} - // mkdir plots - // mv *pdf plots/ 2>/dev/null || true - // mv *png plots/ 2>/dev/null || true } diff --git a/subworkflows/qc.nf b/subworkflows/qc.nf index 9ea2ff74..2d88024f 100755 --- a/subworkflows/qc.nf +++ b/subworkflows/qc.nf @@ -5,7 +5,7 @@ include {OUTLIER_FILTER} from "$projectDir/modules/nf-core/modules/outlier_filte include {PLOT_STATS} from "$projectDir/modules/nf-core/modules/plot_stats/main" include {ESTIMATE_PCA_ELBOW} from "$projectDir/modules/nf-core/modules/estimate_pca_elbow/main" include {SUBSET_PCS} from "$projectDir/modules/nf-core/modules/subset_pcs/main" -include {NORMALISE_AND_PCA} from "$projectDir/modules/nf-core/modules/normalise_and_pca/main" +include {NORMALISE_AND_PCA; PCA} from "$projectDir/modules/nf-core/modules/normalise_and_pca/main" include {HARMONY} from "$projectDir/modules/nf-core/modules/harmony/main" include {BBKNN} from "$projectDir/modules/nf-core/modules/bbknn/main" include {ADD_EXTRA_METADATA_TO_H5AD} from "$projectDir/modules/nf-core/modules/adata_manipulations/main" @@ -19,7 +19,7 @@ workflow qc { take: file__anndata_merged file__cells_filtered - assignments_all_pools + gt_outlier_input main: log.info "--- Running QC metrics --- " // if(params.extra_metadata!=''){ @@ -29,26 +29,14 @@ workflow qc { // }else{ // log.info '''--- No extra metadata to add to h5ad ---''' // } - file__anndata_merged.map{val1 -> tuple('full', val1)}.set{out1} CELL_HARD_FILTERS(file__anndata_merged,params.hard_filters_file,params.hard_filters_drop) if(params.hard_filters_file != "no_file__file_sample_qc"){ file__anndata_merged = CELL_HARD_FILTERS.out.anndata } - // Next we define an input channel to outlier filtering strategy in case if params.skip_preprocessing.gt_match_based_adaptive_qc_exclusion_pattern !='' - // i.e - if we want to exclude a particular cohort that has been matched by gt match from the adaptive qc we feed this in the outlier_filter() - if(params.skip_preprocessing.gt_match_based_adaptive_qc_exclusion_pattern !=''){ - gt_outlier_input = assignments_all_pools - }else{ - gt_outlier_input = Channel.from("$projectDir/assets/fake_file.fq") - } - - file__anndata_merged.subscribe { println "value1: $it" } - file__cells_filtered.subscribe { println "value2: $it" } - gt_outlier_input.subscribe { println "value3: $it" } //FILTERING OUTLIER CELLS - if (params.sample_qc.cell_filters.filter_outliers.run_process) { + if (params.filter_outliers) { log.info """---Running automatic outlier cell filtering.----""" OUTLIER_FILTER( params.outdir, @@ -67,18 +55,28 @@ workflow qc { } + if (params.normalise_andata){ + NORMALISE_AND_PCA( + file__anndata_merged, + params.mode, + params.layer, + params.genes_exclude_hvg, + params.genes_score, + params.reduced_dims.vars_to_regress.value) + andata = NORMALISE_AND_PCA.out.anndata + outdir = NORMALISE_AND_PCA.out.outdir + + }else{ + andata = file__anndata_merged + outdir = "${params.outdir}" + LI4 = Channel.of([1, 'dummy_lisi']) + } - NORMALISE_AND_PCA(params.outdir+'/clustering', - file__anndata_merged, - params.mode, - params.layer, - params.genes_exclude_hvg, - params.genes_score, - params.reduced_dims.vars_to_regress.value) + 
PCA(andata,params.outdir,params.layer) ESTIMATE_PCA_ELBOW( - NORMALISE_AND_PCA.out.outdir, - NORMALISE_AND_PCA.out.anndata, + PCA.out.outdir, + PCA.out.anndata, params.reduced_dims.n_dims.add_n_to_estimate ) @@ -91,21 +89,21 @@ workflow qc { } SUBSET_PCS( - NORMALISE_AND_PCA.out.outdir, - NORMALISE_AND_PCA.out.anndata, - NORMALISE_AND_PCA.out.metadata, - NORMALISE_AND_PCA.out.pcs, - NORMALISE_AND_PCA.out.param_details, + PCA.out.outdir, + PCA.out.anndata, + PCA.out.metadata, + PCA.out.pcs, + PCA.out.param_details, n_pcs ) - file__anndata_merged.subscribe { println "PLOT_STATS input: $it" } + PCA.out.outdir.subscribe { println "outdir input: $it" } PLOT_STATS(file__anndata_merged, file__cells_filtered, SUBSET_PCS.out.outdir, SUBSET_PCS.out.anndata, n_pcs) - file__anndata_merged = NORMALISE_AND_PCA.out.anndata + file__anndata_merged = PCA.out.anndata LI4 = PLOT_STATS.out.LI @@ -120,11 +118,11 @@ workflow qc { // "Correct" PCs using Harmony or BBKNN if (params.harmony.run_process) { HARMONY( - NORMALISE_AND_PCA.out.outdir, - NORMALISE_AND_PCA.out.anndata, - NORMALISE_AND_PCA.out.metadata, - NORMALISE_AND_PCA.out.pcs, - NORMALISE_AND_PCA.out.param_details, + PCA.out.outdir, + PCA.out.anndata, + PCA.out.metadata, + PCA.out.pcs, + PCA.out.param_details, n_pcs, Channel.fromList( params.harmony.variables_and_thetas.value) ) @@ -187,11 +185,11 @@ workflow qc { if (params.bbknn.run_process) { BBKNN( - NORMALISE_AND_PCA.out.outdir, - NORMALISE_AND_PCA.out.anndata, - NORMALISE_AND_PCA.out.metadata, - NORMALISE_AND_PCA.out.pcs, - NORMALISE_AND_PCA.out.param_details, + PCA.out.outdir, + PCA.out.anndata, + PCA.out.metadata, + PCA.out.pcs, + PCA.out.param_details, n_pcs, params.bbknn.batch_variable.value ) @@ -256,8 +254,8 @@ workflow qc { lisi_input_second = lisi_input_first.mix(lisi_input3) LISI( - NORMALISE_AND_PCA.out.outdir, - NORMALISE_AND_PCA.out.metadata, + PCA.out.outdir, + PCA.out.metadata, params.lisi.variables.value, lisi_input_second.collect() ) diff --git a/workflows/yascp.nf b/workflows/yascp.nf index 6e425d2f..f3009565 100755 --- a/workflows/yascp.nf +++ b/workflows/yascp.nf @@ -57,7 +57,7 @@ workflow YASCP { if(!params.just_reports){ // sometimes we just want to rerun report generation as a result of alterations, hence if we set params.just_reports =True pipeline will use the results directory and generate a new reports. - if (!params.skip_preprocessing.value){ + if (!params.skip_preprocessing){ // The input table should contain the folowing columns - experiment_id n_pooled donor_vcf_ids data_path_10x_format // prepearing the inputs from a standard 10x dataset folders. prepare_inputs(input_channel) @@ -125,23 +125,23 @@ workflow YASCP { }else{ // This option skips all the deconvolution and and takes a preprocessed yascp h5ad file to run the downstream clustering and celltype annotation. 
log.info '''----Skipping Preprocessing since we already have prepeared h5ad input file----''' - file__anndata_merged = Channel.from(params.skip_preprocessing.file__anndata_merged) + file__anndata_merged = Channel.from(params.file__anndata_merged) if("${mode}"!='default'){ // Here we have rerun GT matching upstream - done for freeze1 assignments_all_pools = mode }else{ - if (params.skip_preprocessing.file__anndata_merged !=''){ - assignments_all_pools = Channel.from(params.skip_preprocessing.gt_match_file) + if (params.file__anndata_merged !=''){ + assignments_all_pools = Channel.from(params.gt_match_file) }else{ assignments_all_pools = Channel.from("$projectDir/assets/fake_file.fq") } } - if (params.skip_preprocessing.file__cells_filtered ==''){ + if (params.file__cells_filtered ==''){ log.info '''--- No cells filtered input ----''' - dummy_filtered_channel(file__anndata_merged,params.skip_preprocessing.id_in) + dummy_filtered_channel(file__anndata_merged,params.id_in) file__cells_filtered = dummy_filtered_channel.out.anndata_metadata }else{ file__cells_filtered = Channel.from(params.skip_preprocessing.file__cells_filtered) @@ -175,7 +175,14 @@ workflow YASCP { // ################################### if (!params.skip_qc){ - qc(file__anndata_merged,file__cells_filtered,assignments_all_pools) //This runs the Clusterring and qc assessments of the datasets. + + if(params.skip_preprocessing.gt_match_based_adaptive_qc_exclusion_pattern !=''){ + gt_outlier_input = assignments_all_pools + }else{ + gt_outlier_input = Channel.from("$projectDir/assets/fake_file.fq") + } + + qc(file__anndata_merged,file__cells_filtered,gt_outlier_input) //This runs the Clusterring and qc assessments of the datasets. process_finish_check_channel = qc.out.LI file__anndata_merged = qc.out.file__anndata_merged }else{ From edca4bf42bfd3dd667cc5b849bbb9eefccaa0712 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Thu, 30 Nov 2023 08:50:38 +0000 Subject: [PATCH 7/7] tested --- workflows/yascp.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/yascp.nf b/workflows/yascp.nf index f3009565..1211ba2a 100755 --- a/workflows/yascp.nf +++ b/workflows/yascp.nf @@ -176,7 +176,7 @@ workflow YASCP { if (!params.skip_qc){ - if(params.skip_preprocessing.gt_match_based_adaptive_qc_exclusion_pattern !=''){ + if(params.gt_match_based_adaptive_qc_exclusion_pattern !=''){ gt_outlier_input = assignments_all_pools }else{ gt_outlier_input = Channel.from("$projectDir/assets/fake_file.fq")