From 140140634e2b8af0285123f38f4abbee116a4030 Mon Sep 17 00:00:00 2001
From: Matiss Ozols
Date: Tue, 3 Dec 2024 13:29:10 +0000
Subject: [PATCH 1/2] cardinal F3 finalised with this merge

---
 bin/cohort_report.py                        |  10 +-
 bin/gather_minimal_dataset.py               | 338 ++++++++++++--------
 conf/base.conf                              |   5 +
 conf/modules.conf                           |   2 +-
 main.nf                                     |   3 +
 modules/nf-core/modules/gather_data/main.nf |   8 +-
 subworkflows/data_handover.nf               |   8 +-
 7 files changed, 231 insertions(+), 143 deletions(-)

diff --git a/bin/cohort_report.py b/bin/cohort_report.py
index 9ee64a0c..a31fccb5 100755
--- a/bin/cohort_report.py
+++ b/bin/cohort_report.py
@@ -58,7 +58,7 @@

     # Split Donor Report by cohort
-    Donor_Report2 = Donor_Report.set_index('Pool_ID.Donor_Id')
+    Donor_Report2 = Donor_Report.set_index(Donor_Report['Pool_ID.Donor_Id'])

     try:
         Donor_Report2.insert(0,'Vacutainer ID','NONE')
@@ -176,7 +176,10 @@ def Generate_Report(GT_MATCH_CONFIDENT,pan):
         return Total_Report

     GT_MATCH['final_panel']=GT_MATCH['final_panel'].str.replace("_[1-9][0-9]*$","", regex=True)
+    GT_MATCH['donor_gt'] = GT_MATCH['donor_gt'].astype(str)
+    GT_MATCH['donor_gt original'] = GT_MATCH['donor_gt'].astype(str)
     for confident_panel in set(GT_MATCH['final_panel']):
+        print(confident_panel)
         if (confident_panel!='NONE' and confident_panel!='GT_cell_lines'):
             print(confident_panel)
             pan=confident_panel.replace('GT_','')
@@ -214,7 +217,10 @@ def Generate_Report(GT_MATCH_CONFIDENT,pan):
             Donor_Report2.loc[Total_Report2.index,'viability']=Total_Report2.loc[Total_Report2.index,'viability']
             Donor_Report2.loc[Total_Report2.index,'Match Expected']=Total_Report2.loc[Total_Report2.index,'Match Expected']
             Donor_Report2.loc[Total_Report2.index,'Sequencing time']=Total_Report2.loc[Total_Report2.index,'Sequencing time']
-    Donor_Report2 = Donor_Report2.reset_index().set_index('Pool ID')
+    try:
+        Donor_Report2 = Donor_Report2.reset_index().set_index('Pool ID')
+    except:
+        Donor_Report2 = Donor_Report2.set_index('Pool ID')

     GT_MATCH2 = GT_MATCH.set_index('pool')
     GT_MATCH2=GT_MATCH2[['Emergency_ids expected','Good_ids expected']]
diff --git a/bin/gather_minimal_dataset.py b/bin/gather_minimal_dataset.py
index e3f88f35..a16c766d 100755
--- a/bin/gather_minimal_dataset.py
+++ b/bin/gather_minimal_dataset.py
@@ -14,6 +14,7 @@
 import pandas
 import anndata
 import re
+import scanpy as sc
 import os
 os.environ['NUMBA_CACHE_DIR']='/tmp'
 os.environ['MPLCONFIGDIR']='/tmp'
@@ -45,12 +46,18 @@
 SCRUBLET_ASSIGNMENTS_FNSUFFIX = 'scrublet.tsv'

 COLUMNS_AZIMUTH = {
+    'Azimuth:L0_predicted.celltype.l2': 'l2.mapped.azimuth.celltyp.l0',
+    'Azimuth:L1_predicted.celltype.l2': 'l2.mapped.azimuth.celltyp.l1',
+    'Azimuth:mapping.score.celltype.l1': 'azimuth.pred.score.l1',
+    'Azimuth:mapping.score.celltype.l3': 'azimuth.pred.score.l3',
+    'Azimuth:predicted.celltype.l1': 'azimuth.celltyp.l1',
+    'Azimuth:predicted.celltype.l1.score': 'azimuth.pred.score.l1',
     'Azimuth:predicted.celltype.l2': 'azimuth.celltyp.l2',
     'Azimuth:predicted.celltype.l3': 'azimuth.celltyp.l3',
-    'Azimuth:predicted.celltype.l1': 'azimuth.celltyp.l1',
+    'Azimuth:predicted.celltype.l3.score': 'azimuth.pred.score.l3',
     'Azimuth:predicted.celltype.l2.score': 'azimuth.pred.score.l2',
     'Azimuth:mapping.score.celltype.l2': 'azimuth.map.score',
-    'Celltypist*':'Celltypist*',
+    'Celltypist*':'Celltypist*'
 }

 COLUMNS_DECONV = {
@@ -59,8 +72,20 @@
     'prob_doublet': 'vireo.prob.doublet'
 }
 COLUMNS_QC = {
-    'cell_passes_hard_filters': 'cell_passes_hard_filters',
-    'cell_passes_qc': 'qc.filter.pass',
+    'cell_passes_hard_filters': 'cell_passes_hard_filters',
+    'State': 'State',
+    'Donor id': 'Donor id',
+    'Vacutainer ID': 'Vacutainer ID',
+    'live_cell_count': 'live_cell_count',
+    'viability': 'viability',
+    'SITE': 'SITE',
+    'GEM_BATCH': 'GEM_BATCH',
+    'Gender': 'Gender',
+    'DRAW_DATE': 'DRAW_DATE',
+    'customer_measured_volume': 'customer_measured_volume',
+    'cell_passes_qc': 'qc.filter.pass',
+    'id_pool_lims': 'lims.pool.id',
+    'chromium_channel': 'chromium.run.id',
     'cell_passes_qc:score':'qc.filter.pass:score',
     'cell_passes_qc-per:all_together::exclude':'qc.filter.pass.spikein_exclude',
     'cell_passes_qc-per:all_together::exclude:score':'qc.filter.pass.spikein_exclude:score',
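The COLUMNS_* dictionaries above are rename maps: gather_donor (further down in this patch) keeps only the obs columns present as map keys and renames them to the export-facing names. Note that duplicate keys in a Python dict literal collapse silently (the last occurrence wins), which is why the COLUMNS_AZIMUTH entries must each appear once. A minimal sketch of that intersect-then-rename pattern; the column names and data here are invented stand-ins, not pipeline output:

```python
# Sketch only: how a COLUMNS_* rename map is consumed downstream
# (mirrors the intersect-then-rename pattern in gather_donor below).
import pandas as pd

COLUMNS_QC_EXAMPLE = {
    'cell_passes_qc': 'qc.filter.pass',
    'chromium_channel': 'chromium.run.id',
}

obs = pd.DataFrame({
    'cell_passes_qc': [True, False],
    'chromium_channel': ['run1', 'run1'],
    'unrelated_column': [1, 2],
})

# Keep only columns the map knows about, then rename to the export names.
keep = sorted(set(COLUMNS_QC_EXAMPLE).intersection(obs.columns))
exported = obs[keep].rename(columns=COLUMNS_QC_EXAMPLE)
print(exported.columns.tolist())  # ['qc.filter.pass', 'chromium.run.id']
```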
@@ -261,36 +274,17 @@ def load_scrublet_assignments(expid, datadir_scrublet):
     scb = pandas.read_table(filpath).set_index('cell_barcode', drop = True)
     return scb

-def fetch_qc_obs_from_anndata(adqc, expid, cell_bender_path = None,Resolution='0pt5'):
-
-    s = adqc.obs['convoluted_samplename'] == expid
-    ad = adqc[s]
-    df = get_df_from_mangled_index(ad.obs, expid)
-    #df.insert(0, 'barcode', df.index.values)
-    df_pre = pandas.concat([df,ad.obs], axis = 1)
-    df_pre['mengled_index'] = df_pre.index
-    df = df_pre.set_index("barcode", drop = True)
-
-    if cell_bender_path is not None:
-        # cellbender removes the barcodes
-
-        dfcb = fetch_cellbender_annotation(cell_bender_path, expid,Resolution)
-        dc = pandas.concat([df, dfcb], axis = 1, join = 'inner')
-        if dc.shape[0] != df.shape[0]:
-            sys.exit("ERROR: barcodes missing in cellbender file.")
-        df = dc.copy()
-        dc=dc.set_index('mengled_index')
-
-        ad.obs['cellbender_latent_probability']=dc['cellbender_latent_probability']
-    return df,ad
-
 def fetch_cellbender_annotation(dirpath, expid,Resolution):
     try:
         h5_path = f"{args.results_dir}/{os.path.dirname(dirpath)}/cellbender_FPR_{Resolution}_filtered.h5"
         f = h5py.File(h5_path, 'r')
     except:
-
-        h5_path = glob.glob(f"{os.path.dirname(dirpath)}/cellbender*{Resolution}_filtered.h5")[0]
+        try:
+            h5_path = glob.glob(f"{os.path.dirname(dirpath)}/cellbender*{Resolution}_filtered.h5")[0]
+        except:
+            Resolution=Resolution.replace('pt','.')
+            h5_path = glob.glob(f"{os.path.dirname(dirpath)}/cellbender*{Resolution}_filtered.h5")[0]
         f = h5py.File(h5_path, 'r')
     # ad = scanpy.read_10x_h5(h5_path, genome='background_removed')
     # interesting data is in /matrix/barcodes and matrix/latent_cell_probability
@@ -333,7 +327,7 @@ def write_subsample(adat, oufnam, sample_size_perc = DEFAULT_SAMPLE_SIZE_PERC):
         ad.obs.to_csv(oufnam + '.tsv', sep = "\t", na_rep = "N/A")
     return

-def gather_donor(donor_id, ad, ad_lane_raw, azimuth_annot, qc_obs, columns_output = COLUMNS_OUTPUT,outdir = os.curdir,oufh = sys.stdout,lane_id=1):
+def gather_donor(donor_id, ad, ad_lane_raw, qc_obs, columns_output = COLUMNS_OUTPUT,outdir = os.curdir,oufh = sys.stdout,lane_id=1):
     oufnam = "{}.{}".format(expid, donor_id)
     sys.stderr.write("processing {} {} ...\n".format(expid, donor_id))

@@ -341,45 +335,30 @@ def gather_donor(donor_id, ad, ad_lane_raw, azimuth_annot, qc_obs, columns_outpu
     oufh.write("{}\t{}\t{}.h5ad\t{}.tsv\n".format(expid, donor_id, oufnam, oufnam))
     # loading deconvoluted dataset
-
-
+    try:
+        ad=ad.to_memory()
+    except:
+        _='already loaded'
     ad.var.index.name = "ensembl_id"
     ad.raw = ad_lane_raw[ad.obs.index, :]
     if donor_id != "unassigned" and donor_id != "doublet":
-        # add annotation from QC
-        donor_azt = azimuth_annot.loc[azimuth_annot.Donor == donor_id]
-        donor_azt = donor_azt.loc[donor_azt.Exp == expid]
-        donor_azt['barcode'] = donor_azt.index.str.split('-').str[0]+'-'+donor_azt.index.str.split('-').str[1]
-        donor_azt = donor_azt.set_index('barcode')
-        df = pandas.concat([ad.obs, donor_azt], axis = 1, join = 'outer')
-        df =df.loc[:,~df.columns.duplicated()]
-        df = df[['experiment_id'] +
-            list(set(df.columns).intersection(set(COLUMNS_DECONV.keys()))) +
-            list(set(df.columns).intersection(set(COLUMNS_AZIMUTH.keys())))]
+
+        dt = qc_obs[qc_obs.donor == donor_id]
         try:
-            df = get_lane_and_runid_from_experiment_id(df, insert_pos = 1)
+            dt = dt.set_index(dt['barcode'])
         except:
-            # here we do not know the lane ID.
-            df['chromium_lane']=lane_id
-            df['chromium_run_id']=df['experiment_id'][0]
-
-
-        dfqc = qc_obs[qc_obs.donor == donor_id]
-        dt = pandas.concat([df,dfqc], axis = 1, join = 'inner')
-
+            _='barcode is already the index'
+        df = pandas.concat([ad.obs, dt], axis = 1, join = 'outer')
+        df =df.loc[:,~df.columns.duplicated(keep='last')]
+        df['tranche.id']=args.experiment_name
         colnams = list(columns_output.keys())
-        colnams_overlap = sorted(set(colnams).intersection(set(dt.columns)))
-        ad.obs = dt[colnams_overlap].rename(columns = columns_output)
-        dt = pandas.concat([df, dfqc], axis = 1, join = 'outer')[colnams_overlap]
-        dt.rename(columns = columns_output, inplace = True)
-
+        colnams_overlap = sorted(set(colnams).intersection(set(df.columns)))
+        ad.obs = df[colnams_overlap].rename(columns = columns_output)
+        dt=df[colnams_overlap].rename(columns = columns_output)
     # Stats
     print('Performing the stats analysis')
-    experiment_id = list(set(df.experiment_id))[0]
-    try:
-        pool_id = list(set(df.chromium_run_id))[0] #???
-    except:
-        pool_id=' '
+    experiment_id = args.experiment_name
+    pool_id = list(set(df.experiment_id))[0]
     try:
         chromium_channel_number = list(set(df.chromium_channel))[0]
     except:
@@ -422,7 +401,6 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
     except:
         print('dir exists')
-
     ######################
     #Cellranger datasets
     ######################
@@ -435,7 +413,6 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
                 os.symlink(f"{df_raw.loc[expid, 'data_path_10x_format']}/raw_feature_bc_matrix.h5", f"./{outdir}/Cellranger_raw_feature_bc_matrix__{expid}.h5")
             except:
                 print('File already linked')
-
     except:
         adata_cellranger_raw = scanpy.read_10x_mtx(f"{df_raw.loc[expid, 'data_path_10x_format']}/raw_feature_bc_matrix")
         try:
@@ -448,6 +425,7 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane

     # Reading filtered cellranger files
     adata_cellranger_filtered = scanpy.read_10x_mtx(f"{df_raw.loc[expid, 'data_path_10x_format']}/filtered_feature_bc_matrix")
+
     try:
         if write_h5:
             os.symlink(f"{df_raw.loc[expid, 'data_path_10x_format']}/filtered_feature_bc_matrix.h5",f"{outdir}/Cellranger_filtered_feature_bc_matrix__{expid}.h5")
@@ -468,30 +446,33 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
         df_cellbender = df_cellbender.reset_index()
         df_cellbender = df_cellbender.drop_duplicates(subset=['experiment_id'])
         df_cellbender = df_cellbender.set_index('experiment_id')
+        df2 = glob.glob(f'{args.results_dir}/data_modalities_split/preprocess/{expid}/Gene_Expression-{expid}.h5ad')[0]
+
         f=df_cellbender.loc[expid, 'data_path_10x_format']
         if (type(f) == str):
             f=[f]
         for id in f:
             print(id)
-            try:
-                cell_bender_path = id
-                cellbender_h5 = f"{cell_bender_path}/../cellbender_FPR_{Resolution}_filtered.h5"
-                ad_lane_filtered = scanpy.read_10x_mtx(cell_bender_path)
-                print('loaded')
-                break
-            except:
-                print('failed')
-                continue
+            cell_bender_path = id
+            try:
+                ad_lane_filtered = anndata.read_h5ad(df2)
+                if write_h5:
+                    try:
+                        path2 = os.path.realpath(df2)
+                        os.symlink(path2, f"./{outdir}/Cellbender_filtered_{Resolution}__{expid}.h5ad")
+                    except:
+                        print('File already linked')
+            except:
+                ad_lane_filtered = scanpy.read_10x_mtx(cell_bender_path)
+                if write_h5:
+                    ad_lane_filtered.write(
+                        f"./{outdir}/Cellbender_filtered_{Resolution}__{expid}.h5ad" ,
                        compression='gzip'
+                    )
-        if write_h5:
-            try:
-                path2 = os.path.realpath(cellbender_h5)
-                os.symlink(path2, f"./{outdir}/Cellbender_filtered_{Resolution}__{expid}.h5")
-            except:
-                print('File already linked')
         columns_output = {**columns_output, **COLUMNS_CELLBENDER}
     else:
-        ad_lane_filtered = scanpy.read_10x_mtx(f"{df_raw.loc[expid, 'data_path_10x_format']}/filtered_feature_bc_matrix")
+        ad_lane_filtered = adata_cellranger_filtered
         df_cellbender=None
         cell_bender_path=None
@@ -508,6 +489,42 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane

     metrics = pd.read_csv(df_raw.loc[expid, 'data_path_10x_format']+'/metrics_summary.csv')

+    try:
+        Donor_Cohort_Assignments = pd.read_csv(f'{args.results_dir}/gtmatch/{expid}/{expid}_gt_donor_assignments.csv')
+        All_assignments = pd.read_csv(f'{args.results_dir}/gtmatch/assignments_all_pools.tsv',sep='\t')
+        All_assignments = All_assignments.set_index(All_assignments['pool']+'__'+All_assignments['donor_gt'].astype(str).str.replace('^0*', '', regex=True).str.replace('.*THP1.*', 'THP1', regex=True).str.replace('.*U937.*', 'U937', regex=True))
+        All_assignments['tp2'] = All_assignments.index
+        all_dubs = list(set(All_assignments[All_assignments['tp2'].duplicated()].index))
+        all_dubs2 = All_assignments.loc[all_dubs]
+        All_assignments = All_assignments.drop_duplicates(subset=['tp2'],keep=False)
+    except:
+        Donor_Cohort_Assignments = pd.DataFrame(columns=['panel'])
+        All_assignments = pd.DataFrame(columns=['panel','donor_query'])
+        all_dubs2 = pd.DataFrame()
+
+    if (args.extra_meta):
+        Metadata = pd.read_csv(f"{args.extra_meta}",sep='\t')
+        Metadata = Metadata.set_index(Metadata['Pool ID']+'__'+Metadata['donor'].astype(str).str.replace('^0*', '', regex=True).str.replace('.*THP1.*', 'THP1', regex=True).str.replace('.*U937.*', 'U937', regex=True))
+        Metadata['donor2']=All_assignments['donor_query']
+        Metadata = Metadata.set_index(Metadata['Pool ID']+'__'+Metadata['donor2'])
+
+        if len(all_dubs2)>0:
+            for i,n in all_dubs2.iterrows():
+                print(i)
+                new =Metadata[Metadata['experiment_id']==i]
+                new['donor2']=n['donor_query']
+                Metadata = pd.concat([Metadata, new], ignore_index=True)
+            Metadata = Metadata.drop_duplicates()
+            Metadata = Metadata.set_index(Metadata['Pool ID']+'__'+Metadata['donor2'])
+        Metadata['tp2'] = Metadata.index
+        Metadata = Metadata.drop_duplicates(subset=['tp2'],keep=False)
+        Metadata = Metadata.drop('tp2',axis=1)
+        Metadata = Metadata.drop('donor2',axis=1)
+        Metadata = Metadata.drop('donor',axis=1)
+        Metadata = Metadata.drop('experiment_id',axis=1)
+    else:
+        Metadata = pd.DataFrame()
+
     #############
     #Cell-type assignments
     #############
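The block above keys both tables on a composite "<pool>__<donor>" string (leading zeros stripped, cell-line names collapsed) so the extra sample metadata can be aligned to the GT-match assignments purely by index. A toy sketch of that alignment; only the column names follow the patch, all values are invented:

```python
# Sketch (toy data): composite-key alignment of extra metadata to GT-match
# assignments, as the hunk above does. Values here are made up.
import pandas as pd

assignments = pd.DataFrame({
    'pool': ['Pool1', 'Pool1'],
    'donor_gt': ['007', 'THP1_cellline'],
    'donor_query': ['donor0', 'donor1'],
})

# Same normalisation as the patch: strip leading zeros, collapse cell lines.
norm = (assignments['donor_gt'].astype(str)
        .str.replace('^0*', '', regex=True)
        .str.replace('.*THP1.*', 'THP1', regex=True))
assignments.index = assignments['pool'] + '__' + norm

meta = pd.DataFrame(
    {'chromium_channel': ['chanA', 'chanA']},
    index=['Pool1__7', 'Pool1__THP1'],
)

# Index-aligned assignment: rows match on the composite key, no merge needed.
meta['donor_query'] = assignments['donor_query']
print(meta)
```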
@@ -545,30 +561,31 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane

         columns_output = {**columns_output, **COLUMNS_SCRUBLET}

-
-    # doublet_data_combined.iloc[0]
-    # datadir_scrublet=glob.glob(f'{args.results_dir}/*/multiplet.method=scrublet')[0]
-    # if os.path.isdir(datadir_scrublet):
-    #     # Scrublet loading QC
-    #     try:
-    #         scb = load_scrublet_assignments(
-    #             expid,
-    #             datadir_scrublet=datadir_scrublet
-    #         )
-    #         columns_output = {**columns_output, **COLUMNS_SCRUBLET}
-    #         scb = pd.concat([scb,doublet_data_combined.loc[scb.index]],axis=1)
-    #     except:
-    #         print('Scrublet was not performed for this pool - potential reason is that there are not enough cells for assignment')
-    #         scb = None
-    # else:
-    #     scb = None
-
-
+    columns_output = {k: v for k, v in columns_output.items() if 'idx1' not in k and 'idx' not in v}
+
     ############################################################
     # Loading deconvoluted data including unassigned and doublets
     ###########################################
-    obsqc,all_QC_lane = fetch_qc_obs_from_anndata(adqc, expid, cell_bender_path = cell_bender_path,Resolution=Resolution)
+    s = adqc.obs['convoluted_samplename'] == expid
+    ad = adqc[s]
+    df = get_df_from_mangled_index(ad.obs, expid)
+    #df.insert(0, 'barcode', df.index.values)
+    df_pre = pandas.concat([df,ad.obs], axis = 1)
+    df_pre['mengled_index'] = df_pre.index
+    df = df_pre.set_index("barcode", drop = True)
+
+    if cell_bender_path is not None:
+        # cellbender removes the barcodes
+        dfcb = fetch_cellbender_annotation(cell_bender_path, expid,Resolution)
+        dc = pandas.concat([df, dfcb], axis = 1, join = 'inner')
+        if dc.shape[0] != df.shape[0]:
+            sys.exit("ERROR: barcodes missing in cellbender file.")
+        df = dc.copy()
+        dc=dc.set_index('mengled_index')
+
+    obsqc = df
+    all_QC_lane = ad

     donor_tables=pd.DataFrame([])
     for d1 in set(obsqc['donor']):
@@ -584,8 +601,20 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
     df_donors = donor_tables
     df_donors = df_donors.reset_index(drop=True)
     if scb is not None:
-        obsqc = pandas.concat([obsqc,scb], axis = 1, join = 'outer')
-
+        obsqc = pandas.concat([obsqc,scb], axis = 1, join = 'inner')
+
+    if len(Metadata)>0:
+        obsqc['barcode']= obsqc.index
+        obsqc = obsqc.set_index(obsqc['convoluted_samplename'].astype(str)+'__'+obsqc['donor_id'].astype(str))
+        obsqc['chromium_channel']=Metadata['chromium_channel']
+        obsqc.update(Metadata)
+        for col in Metadata.columns:
+            if col not in obsqc.columns:
+                obsqc[col] = Metadata[col]
+    if len(All_assignments)>0:
+        All_assignments = All_assignments.set_index(All_assignments['pool']+'__'+All_assignments['donor_query'])
+        obsqc['Donor id']=All_assignments['donor_gt original']
+        obsqc['Vacutainer ID']=All_assignments['donor_gt']

     fctr = 0

     #####################
@@ -677,7 +706,7 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
     try:
         Valid_Droplet_percentage = metrics['Valid Barcodes'].values[0]
    except:
-        Valid_Droplet_percentage = metrics['Valid barcodes'].values[0]
+        Valid_Droplet_percentage = None  # metric not available in this metrics_summary.csv
     df1 = ad_lane_filtered.to_df()
     Number_of_cells = len(set(df1.index))
     Total_UMIs_before_10x_filter = np.sum(ad_lane_raw.X) #this may be after the normalisation
@@ -697,11 +726,8 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
     UMIS_mapped_to_ribo_genes = sum(all_QC_lane.obs['total_counts_gene_group__ribo_protein'])
     UMIS_mapped_to_ribo_rna = sum(all_QC_lane.obs['total_counts_gene_group__ribo_rna'])
     UMIs_mapped_to_genes = sum(all_QC_lane.obs['total_counts'])
-    try:
-        Donor_Cohort_Assignments = pd.read_csv(f'{args.results_dir}/gtmatch/{expid}/{expid}_gt_donor_assignments.csv')
-    except:
-        Donor_Cohort_Assignments = pd.DataFrame(columns=['panel'])
-
+
+
     Total_donors_deconvoluted_in_pool = len(Donor_Cohort_Assignments[Donor_Cohort_Assignments['panel']!='NONE'].index)
     ELGH_Donors_Deconvoluted_in_pool = len(Donor_Cohort_Assignments[Donor_Cohort_Assignments['panel']=='GT_ELGH'].index)
     UKB_Donors_Deconvoluted_in_pool = len(Donor_Cohort_Assignments[Donor_Cohort_Assignments['panel']=='GT_UKBB'].index)
@@ -731,14 +758,25 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
     except:
         _='cant validate reasoning'

+    Summary_check = pd.read_csv(f'{args.results_dir}/deconvolution/vireo/{expid}/vireo_{expid}/summary.tsv',sep='\t')
+    try:
+        Doublets_donor = int(Summary_check[Summary_check['Var1']=='doublet']['Freq'].values[0])
+    except:
+        Doublets_donor = 0
+    try:
+        Unassigned_donor = int(Summary_check[Summary_check['Var1']=='unassigned']['Freq'].values[0])
+    except:
+        Unassigned_donor = 0
     for i in df_donors.index:
         print(i)
         Donor_Stats=[]
         row = df_donors.loc[i]
         print(row)
         path1 = row['file_path_h5ad']
         if(path1=='all_QC_lane'):
-            Deconvoluted_Donor_Data = all_QC_lane[all_QC_lane.obs['donor_id']==row['donor_id']]
+            tp1 = pd.DataFrame(all_QC_lane.obs)
+            tp1 = tp1[tp1['donor_id']==row['donor_id']].index
+            Deconvoluted_Donor_Data = adqc[tp1]
             Donor_barcodes = Deconvoluted_Donor_Data.obs.index.str.split('-').str[:2]
             Donor_barcodes = Donor_barcodes.str[0]+'-'+Donor_barcodes.str[1]
             Deconvoluted_Donor_Data.obs.index = Donor_barcodes
@@ -771,11 +813,11 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
             intersect_set = set(Donor_barcodes).intersection(set(ad_lane_filtered.obs.index))
             all_probs = pd.concat([all_probs,pd.DataFrame(Deconvoluted_Donor_Data.obs['prob_doublet'])])
-            Donor_qc_files = all_QC_lane[Mengled_barcodes_donor]
+            Donor_qc_files = Deconvoluted_Donor_Data
             UMIs = np.sum(Donor_qc_files.X)
-            Donor_cells_for_donor=len(all_QC_lane[Mengled_barcodes_donor].obs)
-            Donor_cells_passes_qc = len(all_QC_lane[Mengled_barcodes_donor].obs[all_QC_lane.obs['cell_passes_qc']])
-            Donor_cells_fails_qc = len(all_QC_lane[Mengled_barcodes_donor].obs[all_QC_lane.obs['cell_passes_qc']==False])
+            Donor_cells_for_donor=len(Deconvoluted_Donor_Data.obs)
+            Donor_cells_passes_qc = len(Deconvoluted_Donor_Data.obs[Deconvoluted_Donor_Data.obs['cell_passes_qc']])
+            Donor_cells_fails_qc = len(Deconvoluted_Donor_Data.obs[Deconvoluted_Donor_Data.obs['cell_passes_qc']==False])

             data_donor_for_stats['cells before QC filters'].append(Donor_cells_for_donor)
             data_donor_for_stats['cells failing QC'].append(Donor_cells_fails_qc)
@@ -801,10 +843,9 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane

         Donor_Stats = gather_donor(
-            row["donor_id"],
-            Deconvoluted_Donor_Data,
-            ad_lane_raw,
-            azimuth_annot = azt,
+            donor_id=row["donor_id"],
+            ad= Donor_qc_files,
+            ad_lane_raw=ad_lane_raw,
             qc_obs = obsqc,
             columns_output = columns_output,
             outdir = outdir,
@@ -975,6 +1016,9 @@ def set_argument_parser():
     parser.add_argument("--experiment_name", required = False,default='default',
                         help="Cellbender resolution used",
                         dest='experiment_name')
+    parser.add_argument("--extra_meta", required = False,default=None,
+                        help="Extra metadata TSV to merge into the final files",
+                        dest='extra_meta')

     return parser.parse_args()
@@ -1008,15 +1052,40 @@ def set_argument_parser():
     Resolution = args.resolution

     # Load the final QCd dataset
+    a1 = glob.glob(f'{args.results_dir}/*/*/*outlier_filtered_adata.h5ad')
+    a2 = glob.glob(f'{args.results_dir}/*/*outlier_filtered_adata.h5ad')
+    a3 = glob.glob(f'{args.results_dir}/*outlier_filtered_adata.h5ad')
+    all_inter = list(set(a3).union(set(a2)).union(set(a1)))[0]
+    adqc = anndata.read_h5ad(all_inter, backed='r')
+
+    if 'log1p_cp10k' not in adqc.layers:
+        adqc = adqc.to_memory()
+        normalized_counts = sc.pp.normalize_total(
+            adqc,
+            target_sum=1e4,
+            exclude_highly_expressed=False,
+            key_added='normalization_factor',  # add to adata.obs
+            inplace=False
+        )['X']
+        log1p_cp10k = np.log1p(normalized_counts)
+        adqc.layers['log1p_cp10k'] = log1p_cp10k
+        adqc.uns['log1p_cp10k'] = {'transformation': 'ln(CP10k+1)'}
+        del normalized_counts
+        del log1p_cp10k
+
     try:
-        adqc = anndata.read_h5ad(f'{args.results_dir}/merged_h5ad/outlier_filtered_adata.h5ad')
+        a1 = glob.glob(f'{args.results_dir}/*/*/*adata-normalized_*.h5ad')
+        a2 = glob.glob(f'{args.results_dir}/*/*adata-normalized_*.h5ad')
+        a3 = glob.glob(f'{args.results_dir}/*adata-normalized_*.h5ad')
+        all_inter = list(set(a3).union(set(a2)).union(set(a1)))[0]
+        adqc_norm = anndata.read_h5ad(all_inter, backed='r')
+        # Here we want to add any metadata columns that may be missed for donors.
+        adqc.obs['phase'] = adqc_norm.obs['phase']
+        adqc.obs['S_score'] = adqc_norm.obs['S_score']
+        adqc.obs['G2M_score'] = adqc_norm.obs['G2M_score']
     except:
-        try:
-            adqc = anndata.read_h5ad(f'{args.results_dir}/adata.h5ad')
-        except:
-            d2 = glob.glob(f'{args.results_dir}/*/*/adatanormalized.h5ad')[0]
-            adqc = anndata.read_h5ad(d2)
-    # adqc.obs['experiment_id'] = adqc.obs['experiment_id'].str.split("__").str[0]
+        _='not normalised'

     fctr = 0
     data_tranche_all=[]
     data_donor_all=[]
@@ -1025,33 +1094,20 @@ def set_argument_parser():
     Sample_metadata = pd.DataFrame()

     # SETTING TRANCHE NAME
-    try:
-        adqc.obs['cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2'] = adqc.obs['cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2:score'].astype(float,errors='ignore')
-    except:
-        _='no values associated'
-    try:
-        adqc.obs['cell_passes_qc-per:all_together::exclude:score']= adqc.obs['cell_passes_qc-per:all_together::exclude:score'].astype(float,errors='ignore')
-    except:
-        _='no values associated'

     # Now we calculate all the statistics for each of the pools.
     for expid in df_raw.index:
         print(expid)
         s = adqc.obs['convoluted_samplename'] == expid
         ad = adqc[s]
         if ad.n_obs == 0:
             continue #Here no cells has passed the qc thresholds.
         nf, data_tranche, data_donor = gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = oufh, lane_id=count,Resolution=Resolution)
-        # add the stuff to the adata.
-
-        # All_probs_and_celltypes=pd.concat([All_probs_and_celltypes,azt])
         data_tranche_all.append(data_tranche)
         data_donor_all= data_donor_all+data_donor
         count += 1
         fctr += nf
-        # except:
-        #     print(f"pool {expid} was ignored as it did not contain deconvoluted donors.")
-
+
     Donor_Report = pd.DataFrame(data_donor_all)
     Tranche_Report = pd.DataFrame(data_tranche_all)
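For reference, the layer added above stores ln(CP10k+1): counts scaled so each cell sums to 10,000, then log1p-transformed. A self-contained sketch of the same two-step transform on a toy matrix (not pipeline data):

```python
# Sketch: what the log1p_cp10k layer computed above contains, on toy counts.
import anndata
import numpy as np
import scanpy as sc

adata = anndata.AnnData(np.array([[1.0, 9.0], [10.0, 90.0]]))

# Scale each cell to 10,000 total counts, then apply ln(x + 1).
norm = sc.pp.normalize_total(adata, target_sum=1e4, inplace=False)['X']
adata.layers['log1p_cp10k'] = np.log1p(norm)

# Both toy cells have identical proportions, so the layer rows agree:
# each row is [ln(1001), ln(9001)] up to float precision.
print(adata.layers['log1p_cp10k'])
```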
diff --git a/conf/base.conf b/conf/base.conf
index 94855bc0..92557dc2 100755
--- a/conf/base.conf
+++ b/conf/base.conf
@@ -264,6 +264,11 @@ process {
         time   = 12.h
         maxRetries = 3
     }
+    withName: PREPROCESS_GENOTYPES{
+        cpus   = 1
+        time   = 12.h
+        maxRetries = 3
+    }

     withName: HARMONY{
         cpus   = 1
diff --git a/conf/modules.conf b/conf/modules.conf
index e9855860..8b403b27 100755
--- a/conf/modules.conf
+++ b/conf/modules.conf
@@ -292,7 +292,7 @@ process {
     }

     withName: DOUBLET_FINDER{
-        memory = { 100.GB * task.attempt }
+        memory = { 20.GB * task.attempt }
     }

diff --git a/main.nf b/main.nf
index b5b72839..8c090b03 100755
--- a/main.nf
+++ b/main.nf
@@ -54,6 +54,9 @@ workflow MAIN {
         RETRIEVE_RECOURSES_TEST_DATASET(out_ch)
         input_channel = RETRIEVE_RECOURSES_TEST_DATASET.out.input_channel
         vcf_inputs = RETRIEVE_RECOURSES_TEST_DATASET.out.vcf_inputs
+        vcf_inputs.splitCsv(header: true, sep: '\t')
+            .map { row -> tuple(row.label, file(row.vcf_file_path), file("${row.vcf_file_path}.csi")) }
+            .set { vcf_inputs }
     }else{
         input_channel = Channel.fromPath(params.input_data_table, followLinks: true, checkIfExists: true)
         if (params.genotype_input.run_with_genotype_input) {
diff --git a/modules/nf-core/modules/gather_data/main.nf b/modules/nf-core/modules/gather_data/main.nf
index 6876ed02..dd5ac647 100755
--- a/modules/nf-core/modules/gather_data/main.nf
+++ b/modules/nf-core/modules/gather_data/main.nf
@@ -32,6 +32,12 @@ process GATHER_DATA{
         cellbender_input='cellbender'
     }

+    if ("${params.extra_sample_metadata}" != ''){
+        extra_meta = "--extra_meta=${params.extra_sample_metadata}"
+    }else{
+        extra_meta = ""
+    }
+
    """
        echo ${dummy_val}
        gather_minimal_dataset.py \
@@ -41,7 +47,7 @@ process GATHER_DATA{
            --cellbender=${cellbender_input} \
            --resolution=${params.cellbender_resolution_to_use} \
            --write_h5=${params.write_h5} \
-            --experiment_name=${params.RUN}
+            --experiment_name=${params.RUN} ${extra_meta}
    """
}
diff --git a/subworkflows/data_handover.nf b/subworkflows/data_handover.nf
index f8c8d60e..a0c9de6f 100755
--- a/subworkflows/data_handover.nf
+++ b/subworkflows/data_handover.nf
@@ -2,7 +2,7 @@ include { GATHER_DATA; SPLIT_DATA_BY_STUDY} from "$projectDir/modules/nf-core/m
 include { ENCRYPT_DIR; ENCRYPT_TARGET } from "$projectDir/modules/local/encrypt"
 include { TRANSFER;SUMMARY_STATISTICS_PLOTS } from "$projectDir/modules/nf-core/modules/summary_statistics_plots/main"
 include { split_bam_by_donor } from "$projectDir/modules/local/cellranger_bam_per_donor"
-include { SUBSET_BAM_PER_BARCODES } from "$projectDir/modules/nf-core/modules/subset_bam_per_barcodes_and_variants/main"
+// include { SUBSET_BAM_PER_BARCODES } from "$projectDir/modules/nf-core/modules/subset_bam_per_barcodes_and_variants/main"

 workflow data_handover{
     take:
@@ -27,11 +27,13 @@ workflow data_handover{

         if (params.split_bam){
             // val(sample), path(barcodes), path(bam)
+            GATHER_DATA.out.barcodes_files.subscribe{ println "barcodes_files: $it" }
             GATHER_DATA.out.barcodes_files.flatten().map{sample -> tuple("${sample}".replaceFirst(/.*\//,"").replaceFirst(/\..*/,""),"${sample}".replaceFirst(/.*\//,"").replaceFirst(/\.tsv.*/,""),sample)}.set{barcodes}
             barcodes.combine(sample_possorted_bam_vireo_donor_ids, by: 0).set{full_split_chanel_input}
-            // full_split_chanel_input.subscribe { println "full_split_chanel_input: $it" }
+            barcodes.subscribe { println "barcodes: $it" }
+            full_split_chanel_input.subscribe { println "full_split_chanel_input: $it" }
             // genome.subscribe { println "genome: $it" }
-            SUBSET_BAM_PER_BARCODES(full_split_chanel_input,genome)
+            // SUBSET_BAM_PER_BARCODES(full_split_chanel_input,genome)
             // split_bam_by_donor(sample_possorted_bam_vireo_donor_ids, genome)
             // ENCRYPT_TARGET(split_bam_by_donor.out.possorted_cram_files)
             // cram_encrypted_dirs = ENCRYPT_TARGET.out.encrypted_dir

From 158ab1e9c76ed9917154b4d7b3c008c5754c1c60 Mon Sep 17 00:00:00 2001
From: Matiss Ozols
Date: Fri, 13 Dec 2024 14:54:17 +0000
Subject: [PATCH 2/2] add

---
 bin/strip_citeseq.py | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/bin/strip_citeseq.py b/bin/strip_citeseq.py
index c5c0cb58..e7c16360 100755
--- a/bin/strip_citeseq.py
+++ b/bin/strip_citeseq.py
@@ -22,6 +22,7 @@
 import click
 import logging
 import os
+import re

 compression_opts = 'gzip'
 filter_0_count_cells=False
@@ -286,6 +287,7 @@ def main():
     adata_cellranger_filtered = sc.read_10x_mtx(
         options.raw_data, var_names='gene_symbols', make_unique=True,
         cache=False, cache_compression=compression_opts,gex_only=False)
+    all_feature_types = set(adata_cellranger_filtered.var['feature_types'])

     for modality1 in set(adata_cellranger_filtered.var.feature_types):
         # {'Gene Expression', 'Multiplexing Capture', 'Antibody Capture'}
@@ -296,16 +298,30 @@ def main():
         # adata2 = adata_antibody[adata_antibody.obs_names.difference(zero_count_cells, sort=False)]
         # if(adata2.shape[0]>0):
         #     # Here we have actually captured some of the reads in the antibody dataset.
+
         if (modality=='Gene_Expression'):
             adata_antibody.write(
                 f'{modality}-{options.outname}.h5ad',
                 compression='gzip'
             )
-        _ = cellbender_to_tenxmatrix(
-            adata_antibody,
-            out_file='',
-            out_dir=f'{options.outname}__{modality}'
-        )
+        elif (modality=='Multiplexing_Capture'):
+            all_indexes_multiplexing = set(adata_antibody.var.index).union(set(multiplexing_capure.columns))
+            adata_antibody = adata_cellranger_filtered[:,list(all_indexes_multiplexing)]
+            df = pd.DataFrame(adata_antibody.X.toarray(), index=adata_antibody.obs_names, columns=adata_antibody.var_names)
+            df.to_csv(f'{options.outname}__{modality}.tsv',sep='\t')
+        else:
+            df = pd.DataFrame(adata_antibody.X.toarray(), index=adata_antibody.obs_names, columns=adata_antibody.var_names)
+            df.to_csv(f'{options.outname}__{modality}.tsv',sep='\t')
+
+        if len(all_feature_types)>1:
+            _ = cellbender_to_tenxmatrix(
+                adata_antibody,
+                out_file='',
+                out_dir=f'{options.outname}__{modality}'
+            )
+        else:
+            os.system(f"ln -s {options.raw_data} {options.outname}__{modality}")

     # adata_gex = adata_cellranger_filtered[:,adata_cellranger_filtered.var.query('feature_types=="Gene Expression"').index]
     # adata_cellbender = anndata_from_h5('/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/ania/analysis_trego/work/5d/6a30871e864ed7bc03e949ef846a1d/cellbender_FPR_0.1_filtered.h5',
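The strip_citeseq.py change above routes each modality of a combined CITE-seq matrix differently, keying off var['feature_types'] (Gene Expression to .h5ad, the capture modalities to TSVs). A toy sketch of the underlying per-modality split; the feature names and matrix here are invented, not taken from the pipeline:

```python
# Sketch (toy data): splitting a combined 10x matrix into per-modality
# AnnData objects on var['feature_types'], as strip_citeseq.py does above.
import anndata
import numpy as np
import pandas as pd

adata = anndata.AnnData(
    X=np.ones((3, 4)),
    var=pd.DataFrame(
        {'feature_types': ['Gene Expression', 'Gene Expression',
                           'Antibody Capture', 'Multiplexing Capture']},
        index=['geneA', 'geneB', 'CD3_ab', 'hashtag1'],
    ),
)

for modality in set(adata.var['feature_types']):
    # Boolean mask over var selects the columns belonging to this modality.
    sub = adata[:, (adata.var['feature_types'] == modality).values]
    # The script writes Gene_Expression as .h5ad and the rest as TSVs.
    print(modality.replace(' ', '_'), sub.shape)
```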