diff --git a/bin/split_h5ad_per_donor.py b/bin/split_h5ad_per_donor.py index 26a7979..5c3d0f4 100755 --- a/bin/split_h5ad_per_donor.py +++ b/bin/split_h5ad_per_donor.py @@ -154,37 +154,13 @@ def split_h5ad_per_donor(vireo_donor_ids_tsv, filtered_matrix_h5, samplename, else: logging.info('Samples are not deconvoluted') adata.obs['convoluted_samplename'] = samplename - method = 'Scrublet' - # here we add doublets from multiplet scrubblet output - # scrublet = '222CC24C93C884F3-Card_Val11211773-scrublet.tsv.gz' - try: - scrublet_data = pd.read_csv(scrublet,compression='gzip',sep='\t') - except: - scrublet_data = pd.read_csv(scrublet,sep='\t') - doublets = scrublet_data[scrublet_data['scrublet__predicted_multiplet']] - doublets_nr = len(doublets) - donor_cell_nr = len(scrublet_data)-doublets_nr + method = 'After cellbender' + donor_cell_nr = len(adata.obs) cells_per_donor_count_dic = {} cells_per_donor_count_dic[1]={"donor_id":'donor','n_cells':donor_cell_nr} - cells_per_donor_count_dic[2]={'donor_id':'doblets','n_cells':doublets_nr} + adata.obs['donor_id']='donor' + # cells_per_donor_count_dic[2]={'donor_id':'doblets','n_cells':doublets_nr} cells_per_donor_count = pd.DataFrame(cells_per_donor_count_dic).T - try: - idt = 'cell_barcode' - scrublet_data=scrublet_data.set_index('cell_barcode') - except: - idt = 'barcodes' - scrublet_data=scrublet_data.set_index('barcodes') - scrublet_data['donor_id']='donor' - scrublet_data.loc[list(doublets[idt]),'donor_id']='doublet' - scrublet_data['prob_doublet']=scrublet_data['scrublet__multiplet_scores'] - for new_cell_annotation in ['donor_id','prob_max','prob_doublet','n_vars','best_singlet','best_doublet']: - try: - adata.obs[new_cell_annotation] = scrublet_data[new_cell_annotation] - except: - adata.obs[new_cell_annotation] = 'nan' - - - # plot n cells per deconvoluted Vireo donor: if plot_n_cells_per_vireo_donor: @@ -198,7 +174,7 @@ def split_h5ad_per_donor(vireo_donor_ids_tsv, filtered_matrix_h5, samplename, axis_text_y=plt9.element_text(colour="black")) gplt = gplt + plt9.geom_bar(stat='identity', position='dodge') gplt = gplt + plt9.geom_text(plt9.aes(label='n_cells')) - gplt = gplt + plt9.labels.ggtitle(f'{method} deconvolution\nnumber of cells per deconvoluted donor\nsample: ' + samplename) + gplt = gplt + plt9.labels.ggtitle(f'{method} \nnumber of cells per deconvoluted donor\nsample: ' + samplename) gplt = gplt + plt9.labels.xlab('deconvoluted donor') gplt = gplt + plt9.labels.ylab(f'Number of cells assigned by {method}') diff --git a/bin/totalVI.py b/bin/totalVI.py index 5907a64..724a1a3 100755 --- a/bin/totalVI.py +++ b/bin/totalVI.py @@ -41,7 +41,7 @@ #import single cell data and CITE-seq data # SLEmap = sc.read('adata-normalized.h5ad') -SLEmap = sc.read(options.h5ad_file) +SLEmap = sc.read(options.h5ad_file, backed ='r') all_cite_files = glob.glob("./*/*.matrix.csv") CITE = pd.DataFrame() @@ -59,12 +59,13 @@ SLEmap.obsm['protein_expression'] = CITE_2 # keep only cells passing QC and highly variable genes -SLEmap = SLEmap[SLEmap.obs["cell_passes_qc"],:] -SLEmap = SLEmap[SLEmap.obs["cell_passes_hard_filters"],:] -SLEmap = SLEmap[:,SLEmap.var["highly_variable"]] +SLEmap = SLEmap[ + (SLEmap.obs["cell_passes_qc"] & SLEmap.obs["cell_passes_hard_filters"]), + SLEmap.var["highly_variable"] +] #run totalVI -SLEmap = SLEmap.copy() +SLEmap = SLEmap.to_memory().copy() scvi.model.TOTALVI.setup_anndata(SLEmap, protein_expression_obsm_key="protein_expression",batch_key="experiment_id") model = scvi.model.TOTALVI(SLEmap,latent_distribution="normal",n_layers_decoder=2)