From 140140634e2b8af0285123f38f4abbee116a4030 Mon Sep 17 00:00:00 2001
From: Matiss Ozols
Date: Tue, 3 Dec 2024 13:29:10 +0000
Subject: [PATCH 1/2] cardinal F3 finalised with this merge

---
 bin/cohort_report.py                        |  10 +-
 bin/gather_minimal_dataset.py               | 338 ++++++++++++--------
 conf/base.conf                              |   5 +
 conf/modules.conf                           |   2 +-
 main.nf                                     |   3 +
 modules/nf-core/modules/gather_data/main.nf |   8 +-
 subworkflows/data_handover.nf               |   8 +-
 7 files changed, 231 insertions(+), 143 deletions(-)

diff --git a/bin/cohort_report.py b/bin/cohort_report.py
index 9ee64a0c..a31fccb5 100755
--- a/bin/cohort_report.py
+++ b/bin/cohort_report.py
@@ -58,7 +58,7 @@

     # Split Donor Report by cohort
-    Donor_Report2 = Donor_Report.set_index('Pool_ID.Donor_Id')
+    Donor_Report2 = Donor_Report.set_index(Donor_Report['Pool_ID.Donor_Id'])

     try:
         Donor_Report2.insert(0,'Vacutainer ID','NONE')
@@ -176,7 +176,10 @@ def Generate_Report(GT_MATCH_CONFIDENT,pan):
         return Total_Report

     GT_MATCH['final_panel']=GT_MATCH['final_panel'].str.replace("_[1-9][0-9]*$","", regex=True)
+    GT_MATCH['donor_gt'] = GT_MATCH['donor_gt'].astype(str)
+    GT_MATCH['donor_gt original'] = GT_MATCH['donor_gt'].astype(str)
     for confident_panel in set(GT_MATCH['final_panel']):
+        print(confident_panel)
         if (confident_panel!='NONE' and confident_panel!='GT_cell_lines'):
             print(confident_panel)
             pan=confident_panel.replace('GT_','')
@@ -214,7 +217,10 @@ def Generate_Report(GT_MATCH_CONFIDENT,pan):
             Donor_Report2.loc[Total_Report2.index,'viability']=Total_Report2.loc[Total_Report2.index,'viability']
             Donor_Report2.loc[Total_Report2.index,'Match Expected']=Total_Report2.loc[Total_Report2.index,'Match Expected']
             Donor_Report2.loc[Total_Report2.index,'Sequencing time']=Total_Report2.loc[Total_Report2.index,'Sequencing time']
-    Donor_Report2 = Donor_Report2.reset_index().set_index('Pool ID')
+    try:
+        Donor_Report2 = Donor_Report2.reset_index().set_index('Pool ID')
+    except:
+        Donor_Report2 = Donor_Report2.set_index('Pool ID')

     GT_MATCH2 = GT_MATCH.set_index('pool')
     GT_MATCH2=GT_MATCH2[['Emergency_ids expected','Good_ids expected']]
diff --git a/bin/gather_minimal_dataset.py b/bin/gather_minimal_dataset.py
index e3f88f35..a16c766d 100755
--- a/bin/gather_minimal_dataset.py
+++ b/bin/gather_minimal_dataset.py
@@ -14,6 +14,7 @@
 import pandas
 import anndata
 import re
+import scanpy as sc
 import os
 os.environ['NUMBA_CACHE_DIR']='/tmp'
 os.environ['MPLCONFIGDIR']='/tmp'
@@ -45,12 +46,18 @@
 SCRUBLET_ASSIGNMENTS_FNSUFFIX = 'scrublet.tsv'

 COLUMNS_AZIMUTH = {
+    'Azimuth:L0_predicted.celltype.l2': 'l2.mapped.azimuth.celltyp.l0',
+    'Azimuth:L1_predicted.celltype.l2': 'l2.mapped.azimuth.celltyp.l1',
+    'Azimuth:mapping.score.celltype.l1': 'azimuth.pred.score.l1',
+    'Azimuth:mapping.score.celltype.l3': 'azimuth.pred.score.l3',
+    'Azimuth:predicted.celltype.l1': 'azimuth.celltyp.l1',
+    'Azimuth:predicted.celltype.l1.score': 'azimuth.pred.score.l1',
     'Azimuth:predicted.celltype.l2': 'azimuth.celltyp.l2',
     'Azimuth:predicted.celltype.l3': 'azimuth.celltyp.l3',
-    'Azimuth:predicted.celltype.l1': 'azimuth.celltyp.l1',
+    'Azimuth:predicted.celltype.l3.score': 'azimuth.pred.score.l3',
     'Azimuth:predicted.celltype.l2.score': 'azimuth.pred.score.l2',
     'Azimuth:mapping.score.celltype.l2': 'azimuth.map.score',
-    'Celltypist*':'Celltypist*',
+    'Celltypist*':'Celltypist*'
 }

 COLUMNS_DECONV = {
@@ -59,8 +72,20 @@
     'prob_doublet': 'vireo.prob.doublet'
 }
 COLUMNS_QC = {
-    'cell_passes_hard_filters': 'cell_passes_hard_filters',
-    'cell_passes_qc': 'qc.filter.pass',
+    'cell_passes_hard_filters': 'cell_passes_hard_filters',
+    'State': 'State',
+    'Donor id': 'Donor id',
+    'Vacutainer ID': 'Vacutainer ID',
+    'live_cell_count': 'live_cell_count',
+    'viability': 'viability',
+    'SITE': 'SITE',
+    'GEM_BATCH': 'GEM_BATCH',
+    'Gender': 'Gender',
+    'DRAW_DATE': 'DRAW_DATE',
+    'customer_measured_volume': 'customer_measured_volume',
+    'cell_passes_qc': 'qc.filter.pass',
+    'id_pool_lims': 'lims.pool.id',
+    'chromium_channel': 'chromium.run.id',
     'cell_passes_qc:score':'qc.filter.pass:score',
     'cell_passes_qc-per:all_together::exclude':'qc.filter.pass.spikein_exclude',
     'cell_passes_qc-per:all_together::exclude:score':'qc.filter.pass.spikein_exclude:score',
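The COLUMNS_* dictionaries above are rename maps: gather_donor (further down in this patch) keeps only the obs columns present as map keys and renames them to the export-facing names. Note that duplicate keys in a Python dict literal collapse silently (the last occurrence wins), which is why the COLUMNS_AZIMUTH entries must each appear once. A minimal sketch of that intersect-then-rename pattern; the column names and data here are invented stand-ins, not pipeline output:

```python
# Sketch only: how a COLUMNS_* rename map is consumed downstream
# (mirrors the intersect-then-rename pattern in gather_donor below).
import pandas as pd

COLUMNS_QC_EXAMPLE = {
    'cell_passes_qc': 'qc.filter.pass',
    'chromium_channel': 'chromium.run.id',
}

obs = pd.DataFrame({
    'cell_passes_qc': [True, False],
    'chromium_channel': ['run1', 'run1'],
    'unrelated_column': [1, 2],
})

# Keep only columns the map knows about, then rename to the export names.
keep = sorted(set(COLUMNS_QC_EXAMPLE).intersection(obs.columns))
exported = obs[keep].rename(columns=COLUMNS_QC_EXAMPLE)
print(exported.columns.tolist())  # ['qc.filter.pass', 'chromium.run.id']
```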
@@ -261,36 +274,17 @@ def load_scrublet_assignments(expid, datadir_scrublet):
     scb = pandas.read_table(filpath).set_index('cell_barcode', drop = True)
     return scb

-def fetch_qc_obs_from_anndata(adqc, expid, cell_bender_path = None,Resolution='0pt5'):
-
-    s = adqc.obs['convoluted_samplename'] == expid
-    ad = adqc[s]
-    df = get_df_from_mangled_index(ad.obs, expid)
-    #df.insert(0, 'barcode', df.index.values)
-    df_pre = pandas.concat([df,ad.obs], axis = 1)
-    df_pre['mengled_index'] = df_pre.index
-    df = df_pre.set_index("barcode", drop = True)
-
-    if cell_bender_path is not None:
-        # cellbender removes the barcodes
-
-        dfcb = fetch_cellbender_annotation(cell_bender_path, expid,Resolution)
-        dc = pandas.concat([df, dfcb], axis = 1, join = 'inner')
-        if dc.shape[0] != df.shape[0]:
-            sys.exit("ERROR: barcodes missing in cellbender file.")
-        df = dc.copy()
-        dc=dc.set_index('mengled_index')
-
-        ad.obs['cellbender_latent_probability']=dc['cellbender_latent_probability']
-    return df,ad
-
 def fetch_cellbender_annotation(dirpath, expid,Resolution):
     try:
         h5_path = f"{args.results_dir}/{os.path.dirname(dirpath)}/cellbender_FPR_{Resolution}_filtered.h5"
         f = h5py.File(h5_path, 'r')
     except:
-
-        h5_path = glob.glob(f"{os.path.dirname(dirpath)}/cellbender*{Resolution}_filtered.h5")[0]
+        try:
+            h5_path = glob.glob(f"{os.path.dirname(dirpath)}/cellbender*{Resolution}_filtered.h5")[0]
+        except:
+            Resolution=Resolution.replace('pt','.')
+            h5_path = glob.glob(f"{os.path.dirname(dirpath)}/cellbender*{Resolution}_filtered.h5")[0]
         f = h5py.File(h5_path, 'r')
     # ad = scanpy.read_10x_h5(h5_path, genome='background_removed')
     # interesting data is in /matrix/barcodes and matrix/latent_cell_probability
@@ -333,7 +327,7 @@ def write_subsample(adat, oufnam, sample_size_perc = DEFAULT_SAMPLE_SIZE_PERC):
         ad.obs.to_csv(oufnam + '.tsv', sep = "\t", na_rep = "N/A")
     return

-def gather_donor(donor_id, ad, ad_lane_raw, azimuth_annot, qc_obs, columns_output = COLUMNS_OUTPUT,outdir = os.curdir,oufh = sys.stdout,lane_id=1):
+def gather_donor(donor_id, ad, ad_lane_raw, qc_obs, columns_output = COLUMNS_OUTPUT,outdir = os.curdir,oufh = sys.stdout,lane_id=1):
     oufnam = "{}.{}".format(expid, donor_id)
     sys.stderr.write("processing {} {} ...\n".format(expid, donor_id))

@@ -341,45 +335,30 @@ def gather_donor(donor_id, ad, ad_lane_raw, azimuth_annot, qc_obs, columns_outpu
     oufh.write("{}\t{}\t{}.h5ad\t{}.tsv\n".format(expid, donor_id, oufnam, oufnam))
     # loading deconvoluted dataset
-
-
+    try:
+        ad=ad.to_memory()
+    except:
+        _='already loaded'
     ad.var.index.name = "ensembl_id"
     ad.raw = ad_lane_raw[ad.obs.index, :]
     if donor_id != "unassigned" and donor_id != "doublet":
-        # add annotation from QC
-        donor_azt = azimuth_annot.loc[azimuth_annot.Donor == donor_id]
-        donor_azt = donor_azt.loc[donor_azt.Exp == expid]
-        donor_azt['barcode'] = donor_azt.index.str.split('-').str[0]+'-'+donor_azt.index.str.split('-').str[1]
-        donor_azt = donor_azt.set_index('barcode')
-        df = pandas.concat([ad.obs, donor_azt], axis = 1, join = 'outer')
-        df =df.loc[:,~df.columns.duplicated()]
-        df = df[['experiment_id'] +
-            list(set(df.columns).intersection(set(COLUMNS_DECONV.keys()))) +
-            list(set(df.columns).intersection(set(COLUMNS_AZIMUTH.keys())))]
+
+        dt = qc_obs[qc_obs.donor == donor_id]
         try:
-            df = get_lane_and_runid_from_experiment_id(df, insert_pos = 1)
+            dt = dt.set_index(dt['barcode'])
         except:
-            # here we do not know the lane ID.
-            df['chromium_lane']=lane_id
-            df['chromium_run_id']=df['experiment_id'][0]
-
-
-        dfqc = qc_obs[qc_obs.donor == donor_id]
-        dt = pandas.concat([df,dfqc], axis = 1, join = 'inner')
-
+            _='barcode is already the index'
+        df = pandas.concat([ad.obs, dt], axis = 1, join = 'outer')
+        df =df.loc[:,~df.columns.duplicated(keep='last')]
+        df['tranche.id']=args.experiment_name
         colnams = list(columns_output.keys())
-        colnams_overlap = sorted(set(colnams).intersection(set(dt.columns)))
-        ad.obs = dt[colnams_overlap].rename(columns = columns_output)
-        dt = pandas.concat([df, dfqc], axis = 1, join = 'outer')[colnams_overlap]
-        dt.rename(columns = columns_output, inplace = True)
-
+        colnams_overlap = sorted(set(colnams).intersection(set(df.columns)))
+        ad.obs = df[colnams_overlap].rename(columns = columns_output)
+        dt=df[colnams_overlap].rename(columns = columns_output)
     # Stats
     print('Performing the stats analysis')
-    experiment_id = list(set(df.experiment_id))[0]
-    try:
-        pool_id = list(set(df.chromium_run_id))[0] #???
-    except:
-        pool_id=' '
+    experiment_id = args.experiment_name
+    pool_id = list(set(df.experiment_id))[0]
     try:
         chromium_channel_number = list(set(df.chromium_channel))[0]
     except:
@@ -422,7 +401,6 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
     except:
         print('dir exists')
-
     ######################
     #Cellranger datasets
     ######################
@@ -435,7 +413,6 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
                 os.symlink(f"{df_raw.loc[expid, 'data_path_10x_format']}/raw_feature_bc_matrix.h5", f"./{outdir}/Cellranger_raw_feature_bc_matrix__{expid}.h5")
             except:
                 print('File already linked')
-
     except:
         adata_cellranger_raw = scanpy.read_10x_mtx(f"{df_raw.loc[expid, 'data_path_10x_format']}/raw_feature_bc_matrix")
         try:
@@ -448,6 +425,7 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane

     # Reading filtered cellranger files
     adata_cellranger_filtered = scanpy.read_10x_mtx(f"{df_raw.loc[expid, 'data_path_10x_format']}/filtered_feature_bc_matrix")
+
     try:
         if write_h5:
             os.symlink(f"{df_raw.loc[expid, 'data_path_10x_format']}/filtered_feature_bc_matrix.h5",f"{outdir}/Cellranger_filtered_feature_bc_matrix__{expid}.h5")
@@ -468,30 +446,33 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
         df_cellbender = df_cellbender.reset_index()
         df_cellbender = df_cellbender.drop_duplicates(subset=['experiment_id'])
         df_cellbender = df_cellbender.set_index('experiment_id')
+        df2 = glob.glob(f'{args.results_dir}/data_modalities_split/preprocess/{expid}/Gene_Expression-{expid}.h5ad')[0]
+
         f=df_cellbender.loc[expid, 'data_path_10x_format']
         if (type(f) == str):
             f=[f]
         for id in f:
             print(id)
-            try:
-                cell_bender_path = id
-                cellbender_h5 = f"{cell_bender_path}/../cellbender_FPR_{Resolution}_filtered.h5"
-                ad_lane_filtered = scanpy.read_10x_mtx(cell_bender_path)
-                print('loaded')
-                break
-            except:
-                print('failed')
-                continue
+            cell_bender_path = id
+            try:
+                ad_lane_filtered = anndata.read_h5ad(df2)
+                if write_h5:
+                    try:
+                        path2 = os.path.realpath(df2)
+                        os.symlink(path2, f"./{outdir}/Cellbender_filtered_{Resolution}__{expid}.h5ad")
+                    except:
+                        print('File already linked')
+            except:
+                ad_lane_filtered = scanpy.read_10x_mtx(cell_bender_path)
+                if write_h5:
+                    ad_lane_filtered.write(
+                        f"./{outdir}/Cellbender_filtered_{Resolution}__{expid}.h5ad" ,
                        compression='gzip'
+                    )
-        if write_h5:
-            try:
-                path2 = os.path.realpath(cellbender_h5)
-                os.symlink(path2, f"./{outdir}/Cellbender_filtered_{Resolution}__{expid}.h5")
-            except:
-                print('File already linked')
         columns_output = {**columns_output, **COLUMNS_CELLBENDER}
     else:
-        ad_lane_filtered = scanpy.read_10x_mtx(f"{df_raw.loc[expid, 'data_path_10x_format']}/filtered_feature_bc_matrix")
+        ad_lane_filtered = adata_cellranger_filtered
         df_cellbender=None
         cell_bender_path=None
@@ -508,6 +489,42 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane

     metrics = pd.read_csv(df_raw.loc[expid, 'data_path_10x_format']+'/metrics_summary.csv')

+    try:
+        Donor_Cohort_Assignments = pd.read_csv(f'{args.results_dir}/gtmatch/{expid}/{expid}_gt_donor_assignments.csv')
+        All_assignments = pd.read_csv(f'{args.results_dir}/gtmatch/assignments_all_pools.tsv',sep='\t')
+        All_assignments = All_assignments.set_index(All_assignments['pool']+'__'+All_assignments['donor_gt'].astype(str).str.replace('^0*', '', regex=True).str.replace('.*THP1.*', 'THP1', regex=True).str.replace('.*U937.*', 'U937', regex=True))
+        All_assignments['tp2'] = All_assignments.index
+        all_dubs = list(set(All_assignments[All_assignments['tp2'].duplicated()].index))
+        all_dubs2 = All_assignments.loc[all_dubs]
+        All_assignments = All_assignments.drop_duplicates(subset=['tp2'],keep=False)
+    except:
+        Donor_Cohort_Assignments = pd.DataFrame(columns=['panel'])
+        All_assignments = pd.DataFrame(columns=['panel','donor_query'])
+        all_dubs2 = pd.DataFrame()
+
+    if (args.extra_meta):
+        Metadata = pd.read_csv(f"{args.extra_meta}",sep='\t')
+        Metadata = Metadata.set_index(Metadata['Pool ID']+'__'+Metadata['donor'].astype(str).str.replace('^0*', '', regex=True).str.replace('.*THP1.*', 'THP1', regex=True).str.replace('.*U937.*', 'U937', regex=True))
+        Metadata['donor2']=All_assignments['donor_query']
+        Metadata = Metadata.set_index(Metadata['Pool ID']+'__'+Metadata['donor2'])
+
+        if len(all_dubs2)>0:
+            for i,n in all_dubs2.iterrows():
+                print(i)
+                new =Metadata[Metadata['experiment_id']==i]
+                new['donor2']=n['donor_query']
+                Metadata = pd.concat([Metadata, new], ignore_index=True)
+            Metadata = Metadata.drop_duplicates()
+            Metadata = Metadata.set_index(Metadata['Pool ID']+'__'+Metadata['donor2'])
+        Metadata['tp2'] = Metadata.index
+        Metadata = Metadata.drop_duplicates(subset=['tp2'],keep=False)
+        Metadata = Metadata.drop('tp2',axis=1)
+        Metadata = Metadata.drop('donor2',axis=1)
+        Metadata = Metadata.drop('donor',axis=1)
+        Metadata = Metadata.drop('experiment_id',axis=1)
+    else:
+        Metadata = pd.DataFrame()
+
     #############
     #Cell-type assignments
     #############
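The block above keys both tables on a composite "<pool>__<donor>" string (leading zeros stripped, cell-line names collapsed) so the extra sample metadata can be aligned to the GT-match assignments purely by index. A toy sketch of that alignment; only the column names follow the patch, all values are invented:

```python
# Sketch (toy data): composite-key alignment of extra metadata to GT-match
# assignments, as the hunk above does. Values here are made up.
import pandas as pd

assignments = pd.DataFrame({
    'pool': ['Pool1', 'Pool1'],
    'donor_gt': ['007', 'THP1_cellline'],
    'donor_query': ['donor0', 'donor1'],
})

# Same normalisation as the patch: strip leading zeros, collapse cell lines.
norm = (assignments['donor_gt'].astype(str)
        .str.replace('^0*', '', regex=True)
        .str.replace('.*THP1.*', 'THP1', regex=True))
assignments.index = assignments['pool'] + '__' + norm

meta = pd.DataFrame(
    {'chromium_channel': ['chanA', 'chanA']},
    index=['Pool1__7', 'Pool1__THP1'],
)

# Index-aligned assignment: rows match on the composite key, no merge needed.
meta['donor_query'] = assignments['donor_query']
print(meta)
```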
@@ -545,30 +561,31 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane

         columns_output = {**columns_output, **COLUMNS_SCRUBLET}

-
-    # doublet_data_combined.iloc[0]
-    # datadir_scrublet=glob.glob(f'{args.results_dir}/*/multiplet.method=scrublet')[0]
-    # if os.path.isdir(datadir_scrublet):
-    #     # Scrublet loading QC
-    #     try:
-    #         scb = load_scrublet_assignments(
-    #             expid,
-    #             datadir_scrublet=datadir_scrublet
-    #         )
-    #         columns_output = {**columns_output, **COLUMNS_SCRUBLET}
-    #         scb = pd.concat([scb,doublet_data_combined.loc[scb.index]],axis=1)
-    #     except:
-    #         print('Scrublet was not performed for this pool - potential reason is that there are not enough cells for assignment')
-    #         scb = None
-    # else:
-    #     scb = None
-
-
+    columns_output = {k: v for k, v in columns_output.items() if 'idx1' not in k and 'idx' not in v}
+
     ############################################################
     # Loading deconvoluted data including unassigned and doublets
     ###########################################
-    obsqc,all_QC_lane = fetch_qc_obs_from_anndata(adqc, expid, cell_bender_path = cell_bender_path,Resolution=Resolution)
+    s = adqc.obs['convoluted_samplename'] == expid
+    ad = adqc[s]
+    df = get_df_from_mangled_index(ad.obs, expid)
+    #df.insert(0, 'barcode', df.index.values)
+    df_pre = pandas.concat([df,ad.obs], axis = 1)
+    df_pre['mengled_index'] = df_pre.index
+    df = df_pre.set_index("barcode", drop = True)
+
+    if cell_bender_path is not None:
+        # cellbender removes the barcodes
+        dfcb = fetch_cellbender_annotation(cell_bender_path, expid,Resolution)
+        dc = pandas.concat([df, dfcb], axis = 1, join = 'inner')
+        if dc.shape[0] != df.shape[0]:
+            sys.exit("ERROR: barcodes missing in cellbender file.")
+        df = dc.copy()
+        dc=dc.set_index('mengled_index')
+
+    obsqc = df
+    all_QC_lane = ad

     donor_tables=pd.DataFrame([])
     for d1 in set(obsqc['donor']):
@@ -584,8 +601,20 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
     df_donors = donor_tables
     df_donors = df_donors.reset_index(drop=True)
     if scb is not None:
-        obsqc = pandas.concat([obsqc,scb], axis = 1, join = 'outer')
-
+        obsqc = pandas.concat([obsqc,scb], axis = 1, join = 'inner')
+
+    if len(Metadata)>0:
+        obsqc['barcode']= obsqc.index
+        obsqc = obsqc.set_index(obsqc['convoluted_samplename'].astype(str)+'__'+obsqc['donor_id'].astype(str))
+        obsqc['chromium_channel']=Metadata['chromium_channel']
+        obsqc.update(Metadata)
+        for col in Metadata.columns:
+            if col not in obsqc.columns:
+                obsqc[col] = Metadata[col]
+    if len(All_assignments)>0:
+        All_assignments = All_assignments.set_index(All_assignments['pool']+'__'+All_assignments['donor_query'])
+        obsqc['Donor id']=All_assignments['donor_gt original']
+        obsqc['Vacutainer ID']=All_assignments['donor_gt']

     fctr = 0

     #####################
@@ -677,7 +706,7 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
     try:
         Valid_Droplet_percentage = metrics['Valid Barcodes'].values[0]
    except:
-        Valid_Droplet_percentage = metrics['Valid barcodes'].values[0]
+        Valid_Droplet_percentage = None  # metric not available in this metrics_summary.csv
     df1 = ad_lane_filtered.to_df()
     Number_of_cells = len(set(df1.index))
     Total_UMIs_before_10x_filter = np.sum(ad_lane_raw.X) #this may be after the normalisation
@@ -697,11 +726,8 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
     UMIS_mapped_to_ribo_genes = sum(all_QC_lane.obs['total_counts_gene_group__ribo_protein'])
     UMIS_mapped_to_ribo_rna = sum(all_QC_lane.obs['total_counts_gene_group__ribo_rna'])
     UMIs_mapped_to_genes = sum(all_QC_lane.obs['total_counts'])
-    try:
-        Donor_Cohort_Assignments = pd.read_csv(f'{args.results_dir}/gtmatch/{expid}/{expid}_gt_donor_assignments.csv')
-    except:
-        Donor_Cohort_Assignments = pd.DataFrame(columns=['panel'])
-
+
+
     Total_donors_deconvoluted_in_pool = len(Donor_Cohort_Assignments[Donor_Cohort_Assignments['panel']!='NONE'].index)
     ELGH_Donors_Deconvoluted_in_pool = len(Donor_Cohort_Assignments[Donor_Cohort_Assignments['panel']=='GT_ELGH'].index)
     UKB_Donors_Deconvoluted_in_pool = len(Donor_Cohort_Assignments[Donor_Cohort_Assignments['panel']=='GT_UKBB'].index)
@@ -731,14 +758,25 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
     except:
         _='cant validate reasoning'

+    Summary_check = pd.read_csv(f'{args.results_dir}/deconvolution/vireo/{expid}/vireo_{expid}/summary.tsv',sep='\t')
+    try:
+        Doublets_donor = int(Summary_check[Summary_check['Var1']=='doublet']['Freq'].values[0])
+    except:
+        Doublets_donor = 0
+    try:
+        Unassigned_donor = int(Summary_check[Summary_check['Var1']=='unassigned']['Freq'].values[0])
+    except:
+        Unassigned_donor = 0
     for i in df_donors.index:
         print(i)
         Donor_Stats=[]
         row = df_donors.loc[i]
         print(row)
         path1 = row['file_path_h5ad']
         if(path1=='all_QC_lane'):
-            Deconvoluted_Donor_Data = all_QC_lane[all_QC_lane.obs['donor_id']==row['donor_id']]
+            tp1 = pd.DataFrame(all_QC_lane.obs)
+            tp1 = tp1[tp1['donor_id']==row['donor_id']].index
+            Deconvoluted_Donor_Data = adqc[tp1]
             Donor_barcodes = Deconvoluted_Donor_Data.obs.index.str.split('-').str[:2]
             Donor_barcodes = Donor_barcodes.str[0]+'-'+Donor_barcodes.str[1]
             Deconvoluted_Donor_Data.obs.index = Donor_barcodes
@@ -771,11 +813,11 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
             intersect_set = set(Donor_barcodes).intersection(set(ad_lane_filtered.obs.index))
             all_probs = pd.concat([all_probs,pd.DataFrame(Deconvoluted_Donor_Data.obs['prob_doublet'])])
-            Donor_qc_files = all_QC_lane[Mengled_barcodes_donor]
+            Donor_qc_files = Deconvoluted_Donor_Data
             UMIs = np.sum(Donor_qc_files.X)
-            Donor_cells_for_donor=len(all_QC_lane[Mengled_barcodes_donor].obs)
-            Donor_cells_passes_qc = len(all_QC_lane[Mengled_barcodes_donor].obs[all_QC_lane.obs['cell_passes_qc']])
-            Donor_cells_fails_qc = len(all_QC_lane[Mengled_barcodes_donor].obs[all_QC_lane.obs['cell_passes_qc']==False])
+            Donor_cells_for_donor=len(Deconvoluted_Donor_Data.obs)
+            Donor_cells_passes_qc = len(Deconvoluted_Donor_Data.obs[Deconvoluted_Donor_Data.obs['cell_passes_qc']])
+            Donor_cells_fails_qc = len(Deconvoluted_Donor_Data.obs[Deconvoluted_Donor_Data.obs['cell_passes_qc']==False])

             data_donor_for_stats['cells before QC filters'].append(Donor_cells_for_donor)
             data_donor_for_stats['cells failing QC'].append(Donor_cells_fails_qc)
@@ -801,10 +843,9 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane

         Donor_Stats = gather_donor(
-            row["donor_id"],
-            Deconvoluted_Donor_Data,
-            ad_lane_raw,
-            azimuth_annot = azt,
+            donor_id=row["donor_id"],
+            ad= Donor_qc_files,
+            ad_lane_raw=ad_lane_raw,
             qc_obs = obsqc,
             columns_output = columns_output,
             outdir = outdir,
@@ -975,6 +1016,9 @@ def set_argument_parser():
     parser.add_argument("--experiment_name", required = False,default='default',
                         help="Cellbender resolution used",
                         dest='experiment_name')
+    parser.add_argument("--extra_meta", required = False,default=None,
+                        help="Extra metadata TSV to merge into the final files",
+                        dest='extra_meta')

     return parser.parse_args()
@@ -1008,15 +1052,40 @@ def set_argument_parser():
     Resolution = args.resolution

     # Load the final QCd dataset
+    a1 = glob.glob(f'{args.results_dir}/*/*/*outlier_filtered_adata.h5ad')
+    a2 = glob.glob(f'{args.results_dir}/*/*outlier_filtered_adata.h5ad')
+    a3 = glob.glob(f'{args.results_dir}/*outlier_filtered_adata.h5ad')
+    all_inter = list(set(a3).union(set(a2)).union(set(a1)))[0]
+    adqc = anndata.read_h5ad(all_inter, backed='r')
+
+    if 'log1p_cp10k' not in adqc.layers:
+        adqc = adqc.to_memory()
+        normalized_counts = sc.pp.normalize_total(
+            adqc,
+            target_sum=1e4,
+            exclude_highly_expressed=False,
+            key_added='normalization_factor',  # add to adata.obs
+            inplace=False
+        )['X']
+        log1p_cp10k = np.log1p(normalized_counts)
+        adqc.layers['log1p_cp10k'] = log1p_cp10k
+        adqc.uns['log1p_cp10k'] = {'transformation': 'ln(CP10k+1)'}
+        del normalized_counts
+        del log1p_cp10k
+
     try:
-        adqc = anndata.read_h5ad(f'{args.results_dir}/merged_h5ad/outlier_filtered_adata.h5ad')
+        a1 = glob.glob(f'{args.results_dir}/*/*/*adata-normalized_*.h5ad')
+        a2 = glob.glob(f'{args.results_dir}/*/*adata-normalized_*.h5ad')
+        a3 = glob.glob(f'{args.results_dir}/*adata-normalized_*.h5ad')
+        all_inter = list(set(a3).union(set(a2)).union(set(a1)))[0]
+        adqc_norm = anndata.read_h5ad(all_inter, backed='r')
+        # Here we want to add any metadata columns that may be missed for donors.
+        adqc.obs['phase'] = adqc_norm.obs['phase']
+        adqc.obs['S_score'] = adqc_norm.obs['S_score']
+        adqc.obs['G2M_score'] = adqc_norm.obs['G2M_score']
     except:
-        try:
-            adqc = anndata.read_h5ad(f'{args.results_dir}/adata.h5ad')
-        except:
-            d2 = glob.glob(f'{args.results_dir}/*/*/adatanormalized.h5ad')[0]
-            adqc = anndata.read_h5ad(d2)
-    # adqc.obs['experiment_id'] = adqc.obs['experiment_id'].str.split("__").str[0]
+        _='not normalised'

     fctr = 0
     data_tranche_all=[]
     data_donor_all=[]
@@ -1025,33 +1094,20 @@ def set_argument_parser():
     Sample_metadata = pd.DataFrame()

     # SETTING TRANCHE NAME
-    try:
-        adqc.obs['cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2'] = adqc.obs['cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2:score'].astype(float,errors='ignore')
-    except:
-        _='no values associated'
-    try:
-        adqc.obs['cell_passes_qc-per:all_together::exclude:score']= adqc.obs['cell_passes_qc-per:all_together::exclude:score'].astype(float,errors='ignore')
-    except:
-        _='no values associated'

     # Now we calculate all the statistics for each of the pools.
     for expid in df_raw.index:
         print(expid)
         s = adqc.obs['convoluted_samplename'] == expid
         ad = adqc[s]
         if ad.n_obs == 0:
             continue #Here no cells has passed the qc thresholds.
         nf, data_tranche, data_donor = gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = oufh, lane_id=count,Resolution=Resolution)
-        # add the stuff to the adata.
-
-        # All_probs_and_celltypes=pd.concat([All_probs_and_celltypes,azt])
         data_tranche_all.append(data_tranche)
         data_donor_all= data_donor_all+data_donor
         count += 1
         fctr += nf
-        # except:
-        #     print(f"pool {expid} was ignored as it did not contain deconvoluted donors.")
-
+
     Donor_Report = pd.DataFrame(data_donor_all)
     Tranche_Report = pd.DataFrame(data_tranche_all)
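For reference, the layer added above stores ln(CP10k+1): counts scaled so each cell sums to 10,000, then log1p-transformed. A self-contained sketch of the same two-step transform on a toy matrix (not pipeline data):

```python
# Sketch: what the log1p_cp10k layer computed above contains, on toy counts.
import anndata
import numpy as np
import scanpy as sc

adata = anndata.AnnData(np.array([[1.0, 9.0], [10.0, 90.0]]))

# Scale each cell to 10,000 total counts, then apply ln(x + 1).
norm = sc.pp.normalize_total(adata, target_sum=1e4, inplace=False)['X']
adata.layers['log1p_cp10k'] = np.log1p(norm)

# Both toy cells have identical proportions, so the layer rows agree:
# each row is [ln(1001), ln(9001)] up to float precision.
print(adata.layers['log1p_cp10k'])
```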
diff --git a/conf/base.conf b/conf/base.conf
index 94855bc0..92557dc2 100755
--- a/conf/base.conf
+++ b/conf/base.conf
@@ -264,6 +264,11 @@ process {
         time   = 12.h
         maxRetries = 3
     }
+    withName: PREPROCESS_GENOTYPES{
+        cpus   = 1
+        time   = 12.h
+        maxRetries = 3
+    }

     withName: HARMONY{
         cpus   = 1
diff --git a/conf/modules.conf b/conf/modules.conf
index e9855860..8b403b27 100755
--- a/conf/modules.conf
+++ b/conf/modules.conf
@@ -292,7 +292,7 @@ process {
     }

     withName: DOUBLET_FINDER{
-        memory = { 100.GB * task.attempt }
+        memory = { 20.GB * task.attempt }
     }

diff --git a/main.nf b/main.nf
index b5b72839..8c090b03 100755
--- a/main.nf
+++ b/main.nf
@@ -54,6 +54,9 @@ workflow MAIN {
         RETRIEVE_RECOURSES_TEST_DATASET(out_ch)
         input_channel = RETRIEVE_RECOURSES_TEST_DATASET.out.input_channel
         vcf_inputs = RETRIEVE_RECOURSES_TEST_DATASET.out.vcf_inputs
+        vcf_inputs.splitCsv(header: true, sep: '\t')
+            .map { row -> tuple(row.label, file(row.vcf_file_path), file("${row.vcf_file_path}.csi")) }
+            .set { vcf_inputs }
     }else{
         input_channel = Channel.fromPath(params.input_data_table, followLinks: true, checkIfExists: true)
         if (params.genotype_input.run_with_genotype_input) {
diff --git a/modules/nf-core/modules/gather_data/main.nf b/modules/nf-core/modules/gather_data/main.nf
index 6876ed02..dd5ac647 100755
--- a/modules/nf-core/modules/gather_data/main.nf
+++ b/modules/nf-core/modules/gather_data/main.nf
@@ -32,6 +32,12 @@ process GATHER_DATA{
         cellbender_input='cellbender'
     }

+    if ("${params.extra_sample_metadata}" != ''){
+        extra_meta = "--extra_meta=${params.extra_sample_metadata}"
+    }else{
+        extra_meta = ""
+    }
+
    """
        echo ${dummy_val}
        gather_minimal_dataset.py \
@@ -41,7 +47,7 @@ process GATHER_DATA{
            --cellbender=${cellbender_input} \
            --resolution=${params.cellbender_resolution_to_use} \
            --write_h5=${params.write_h5} \
-            --experiment_name=${params.RUN}
+            --experiment_name=${params.RUN} ${extra_meta}
    """
}
diff --git a/subworkflows/data_handover.nf b/subworkflows/data_handover.nf
index f8c8d60e..a0c9de6f 100755
--- a/subworkflows/data_handover.nf
+++ b/subworkflows/data_handover.nf
@@ -2,7 +2,7 @@ include { GATHER_DATA; SPLIT_DATA_BY_STUDY} from "$projectDir/modules/nf-core/m
 include { ENCRYPT_DIR; ENCRYPT_TARGET } from "$projectDir/modules/local/encrypt"
 include { TRANSFER;SUMMARY_STATISTICS_PLOTS } from "$projectDir/modules/nf-core/modules/summary_statistics_plots/main"
 include { split_bam_by_donor } from "$projectDir/modules/local/cellranger_bam_per_donor"
-include { SUBSET_BAM_PER_BARCODES } from "$projectDir/modules/nf-core/modules/subset_bam_per_barcodes_and_variants/main"
+// include { SUBSET_BAM_PER_BARCODES } from "$projectDir/modules/nf-core/modules/subset_bam_per_barcodes_and_variants/main"

 workflow data_handover{
     take:
@@ -27,11 +27,13 @@ workflow data_handover{

         if (params.split_bam){
             // val(sample), path(barcodes), path(bam)
+            GATHER_DATA.out.barcodes_files.subscribe{ println "barcodes_files: $it" }
             GATHER_DATA.out.barcodes_files.flatten().map{sample -> tuple("${sample}".replaceFirst(/.*\//,"").replaceFirst(/\..*/,""),"${sample}".replaceFirst(/.*\//,"").replaceFirst(/\.tsv.*/,""),sample)}.set{barcodes}
             barcodes.combine(sample_possorted_bam_vireo_donor_ids, by: 0).set{full_split_chanel_input}
-            // full_split_chanel_input.subscribe { println "full_split_chanel_input: $it" }
+            barcodes.subscribe { println "barcodes: $it" }
+            full_split_chanel_input.subscribe { println "full_split_chanel_input: $it" }
             // genome.subscribe { println "genome: $it" }
-            SUBSET_BAM_PER_BARCODES(full_split_chanel_input,genome)
+            // SUBSET_BAM_PER_BARCODES(full_split_chanel_input,genome)
             // split_bam_by_donor(sample_possorted_bam_vireo_donor_ids, genome)
             // ENCRYPT_TARGET(split_bam_by_donor.out.possorted_cram_files)
             // cram_encrypted_dirs = ENCRYPT_TARGET.out.encrypted_dir

From 158ab1e9c76ed9917154b4d7b3c008c5754c1c60 Mon Sep 17 00:00:00 2001
From: Matiss Ozols
Date: Fri, 13 Dec 2024 14:54:17 +0000
Subject: [PATCH 2/2] add

---
 bin/strip_citeseq.py | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/bin/strip_citeseq.py b/bin/strip_citeseq.py
index c5c0cb58..e7c16360 100755
--- a/bin/strip_citeseq.py
+++ b/bin/strip_citeseq.py
@@ -22,6 +22,7 @@
 import click
 import logging
 import os
+import re

 compression_opts = 'gzip'
 filter_0_count_cells=False
@@ -286,6 +287,7 @@ def main():
     adata_cellranger_filtered = sc.read_10x_mtx(
         options.raw_data, var_names='gene_symbols', make_unique=True,
         cache=False, cache_compression=compression_opts,gex_only=False)
+    all_feature_types = set(adata_cellranger_filtered.var['feature_types'])

     for modality1 in set(adata_cellranger_filtered.var.feature_types):
         # {'Gene Expression', 'Multiplexing Capture', 'Antibody Capture'}
@@ -296,16 +298,30 @@ def main():
         # adata2 = adata_antibody[adata_antibody.obs_names.difference(zero_count_cells, sort=False)]
         # if(adata2.shape[0]>0):
         #     # Here we have actually captured some of the reads in the antibody dataset.
+
         if (modality=='Gene_Expression'):
             adata_antibody.write(
                 f'{modality}-{options.outname}.h5ad',
                 compression='gzip'
             )
-        _ = cellbender_to_tenxmatrix(
-            adata_antibody,
-            out_file='',
-            out_dir=f'{options.outname}__{modality}'
-        )
+        elif (modality=='Multiplexing_Capture'):
+            all_indexes_multiplexing = set(adata_antibody.var.index).union(set(multiplexing_capure.columns))
+            adata_antibody = adata_cellranger_filtered[:,list(all_indexes_multiplexing)]
+            df = pd.DataFrame(adata_antibody.X.toarray(), index=adata_antibody.obs_names, columns=adata_antibody.var_names)
+            df.to_csv(f'{options.outname}__{modality}.tsv',sep='\t')
+        else:
+            df = pd.DataFrame(adata_antibody.X.toarray(), index=adata_antibody.obs_names, columns=adata_antibody.var_names)
+            df.to_csv(f'{options.outname}__{modality}.tsv',sep='\t')
+
+        if len(all_feature_types)>1:
+            _ = cellbender_to_tenxmatrix(
+                adata_antibody,
+                out_file='',
+                out_dir=f'{options.outname}__{modality}'
+            )
+        else:
+            os.system(f"ln -s {options.raw_data} {options.outname}__{modality}")

     # adata_gex = adata_cellranger_filtered[:,adata_cellranger_filtered.var.query('feature_types=="Gene Expression"').index]
     # adata_cellbender = anndata_from_h5('/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/ania/analysis_trego/work/5d/6a30871e864ed7bc03e949ef846a1d/cellbender_FPR_0.1_filtered.h5',
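The strip_citeseq.py change above routes each modality of a combined CITE-seq matrix differently, keying off var['feature_types'] (Gene Expression to .h5ad, the capture modalities to TSVs). A toy sketch of the underlying per-modality split; the feature names and matrix here are invented, not taken from the pipeline:

```python
# Sketch (toy data): splitting a combined 10x matrix into per-modality
# AnnData objects on var['feature_types'], as strip_citeseq.py does above.
import anndata
import numpy as np
import pandas as pd

adata = anndata.AnnData(
    X=np.ones((3, 4)),
    var=pd.DataFrame(
        {'feature_types': ['Gene Expression', 'Gene Expression',
                           'Antibody Capture', 'Multiplexing Capture']},
        index=['geneA', 'geneB', 'CD3_ab', 'hashtag1'],
    ),
)

for modality in set(adata.var['feature_types']):
    # Boolean mask over var selects the columns belonging to this modality.
    sub = adata[:, (adata.var['feature_types'] == modality).values]
    # The script writes Gene_Expression as .h5ad and the rest as TSVs.
    print(modality.replace(' ', '_'), sub.shape)
```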