Merge branch 'v1.7' into v1.8_pre

wtsi-hgi · Dec 13, 2024 · a745839 · a745839
2 parents 1c72cf3 + 6726a96
commit a745839
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 34 deletions.
diff --git a/bin/split_h5ad_per_donor.py b/bin/split_h5ad_per_donor.py
@@ -154,37 +154,13 @@ def split_h5ad_per_donor(vireo_donor_ids_tsv, filtered_matrix_h5, samplename,
     else:
         logging.info('Samples are not deconvoluted')
         adata.obs['convoluted_samplename'] = samplename
-        method = 'Scrublet'
-        # here we add doublets from multiplet scrubblet output
-        # scrublet = '222CC24C93C884F3-Card_Val11211773-scrublet.tsv.gz'
-        try:
-            scrublet_data = pd.read_csv(scrublet,compression='gzip',sep='\t')
-        except:
-            scrublet_data = pd.read_csv(scrublet,sep='\t')
-        doublets = scrublet_data[scrublet_data['scrublet__predicted_multiplet']]
-        doublets_nr = len(doublets)
-        donor_cell_nr = len(scrublet_data)-doublets_nr
+        method = 'After cellbender'
+        donor_cell_nr = len(adata.obs)
         cells_per_donor_count_dic = {}
         cells_per_donor_count_dic[1]={"donor_id":'donor','n_cells':donor_cell_nr}
-        cells_per_donor_count_dic[2]={'donor_id':'doblets','n_cells':doublets_nr}
+        adata.obs['donor_id']='donor'
+        # cells_per_donor_count_dic[2]={'donor_id':'doblets','n_cells':doublets_nr}
         cells_per_donor_count = pd.DataFrame(cells_per_donor_count_dic).T
-        try:
-            idt = 'cell_barcode'
-            scrublet_data=scrublet_data.set_index('cell_barcode')
-        except:
-            idt = 'barcodes'
-            scrublet_data=scrublet_data.set_index('barcodes')
-        scrublet_data['donor_id']='donor'
-        scrublet_data.loc[list(doublets[idt]),'donor_id']='doublet'
-        scrublet_data['prob_doublet']=scrublet_data['scrublet__multiplet_scores']
-        for new_cell_annotation in ['donor_id','prob_max','prob_doublet','n_vars','best_singlet','best_doublet']:
-            try:
-                adata.obs[new_cell_annotation] = scrublet_data[new_cell_annotation]
-            except:
-                adata.obs[new_cell_annotation] = 'nan'
-
-
-
 
     # plot n cells per deconvoluted Vireo donor:
     if plot_n_cells_per_vireo_donor:
@@ -198,7 +174,7 @@ def split_h5ad_per_donor(vireo_donor_ids_tsv, filtered_matrix_h5, samplename,
                                                    axis_text_y=plt9.element_text(colour="black"))
         gplt = gplt + plt9.geom_bar(stat='identity', position='dodge')
         gplt = gplt + plt9.geom_text(plt9.aes(label='n_cells'))
-        gplt = gplt + plt9.labels.ggtitle(f'{method} deconvolution\nnumber of cells per deconvoluted donor\nsample: ' + samplename)
+        gplt = gplt + plt9.labels.ggtitle(f'{method} \nnumber of cells per deconvoluted donor\nsample: ' + samplename)
         gplt = gplt + plt9.labels.xlab('deconvoluted donor')
         gplt = gplt + plt9.labels.ylab(f'Number of cells assigned by {method}')
 

diff --git a/bin/totalVI.py b/bin/totalVI.py
@@ -41,7 +41,7 @@
 
 #import single cell data and CITE-seq data
 # SLEmap = sc.read('adata-normalized.h5ad')
-SLEmap = sc.read(options.h5ad_file)
+SLEmap = sc.read(options.h5ad_file, backed ='r')
 
 all_cite_files = glob.glob("./*/*.matrix.csv")
 CITE = pd.DataFrame()
@@ -59,12 +59,13 @@
 SLEmap.obsm['protein_expression'] = CITE_2
 
 # keep only cells passing QC and highly variable genes
-SLEmap = SLEmap[SLEmap.obs["cell_passes_qc"],:]
-SLEmap = SLEmap[SLEmap.obs["cell_passes_hard_filters"],:]
-SLEmap = SLEmap[:,SLEmap.var["highly_variable"]]
+SLEmap = SLEmap[
+    (SLEmap.obs["cell_passes_qc"] & SLEmap.obs["cell_passes_hard_filters"]),
+    SLEmap.var["highly_variable"]
+]
 
 #run totalVI
-SLEmap = SLEmap.copy()
+SLEmap = SLEmap.to_memory().copy()
 scvi.model.TOTALVI.setup_anndata(SLEmap, protein_expression_obsm_key="protein_expression",batch_key="experiment_id")
 model = scvi.model.TOTALVI(SLEmap,latent_distribution="normal",n_layers_decoder=2)