From 772664c9bbe4498776eb324d7428635a56ddb946 Mon Sep 17 00:00:00 2001 From: gibbsdavidl Date: Tue, 2 Jan 2024 16:03:34 -0800 Subject: [PATCH 1/4] added return smoothed anndatas --- gssnng/score_cells.py | 7 +++- gssnng/smooth_anndatas.py | 55 +++++++++++++++++++++++++++++ gssnng/test/test_return_smoothed.py | 29 +++++++++++++++ 3 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 gssnng/smooth_anndatas.py create mode 100644 gssnng/test/test_return_smoothed.py diff --git a/gssnng/score_cells.py b/gssnng/score_cells.py index 469c882..51abd82 100644 --- a/gssnng/score_cells.py +++ b/gssnng/score_cells.py @@ -229,7 +229,8 @@ def _proc_data( samp_neighbors: int, noise_trials: int, ranked: bool, - cores: int + cores: int, + return_data: int ): """ In many cases, the neighbors should be defined. If you have mixed clinical endpoints, @@ -247,6 +248,7 @@ def _proc_data( :param noise_trials: number of noisy samples to create, integer :param ranked: whether the gene expression counts should be rank ordered :param cores: number of parallel processes to work through groupby groups + :param return_data: should the smoothed data list be returned? :returns: scores in a dict for each cell in a list. """ @@ -284,6 +286,9 @@ def _proc_data( data_list = _build_data_list(adata, groupby, cats, recompute_neighbors, samp_neighbors, smooth_mode) # then we can start scoring cells # + if return_data == 1: + return(data_list) + # building up the argument list for the parallel call of _score_all_cells_all_sets arglist = [] for smoothed_adata, groupname in data_list: diff --git a/gssnng/smooth_anndatas.py b/gssnng/smooth_anndatas.py new file mode 100644 index 0000000..1c0ead6 --- /dev/null +++ b/gssnng/smooth_anndatas.py @@ -0,0 +1,55 @@ +import anndata +from gssnng.score_cells import _proc_data +#from gssnng.util import error_checking +from typing import Union + +def smooth_anndata( + adata: anndata.AnnData, + groupby: Union[str, list, dict], + smooth_mode: str, + recompute_neighbors: int, + method_params: dict, + cores: int + ) -> anndata.AnnData: + + """ + gene set scoring (all gene sets in file) with nearest neighbor smoothing of the expression matrix + + Improved single cell scoring by: + - smoothing the data matrix + - adding noise to the nearest neighbor smoothing via `samp_neighbors` + - adding noise to the expression data itself (via noise_trials) + + :param adata + anndata.AnnData containing the cells to be scored + :param groupby + either a column label in adata.obs, and all categories taken, or a dict specifies one group. + :param smooth_mode + `adjacency` or `connectivity`, which representation of the neighborhood graph to use. + `adjacency` weights all neighbors equally, `connectivity` weights close neighbors more + :param recompute_neighbors + should neighbors be recomputed within each group, 0 for no, >0 for yes and specifies N + :param method_params + specific params for each method. + :param cores + number of parallel processes to work through groupby groups + + :returns: adata with gene set scores in .obs + """ + + return_data = 1 + noise_trials = 0 ### not used currently + samp_neighbors = None + + #error_checking2(adata, recompute_neighbors, method_params) # UPDATE + + if method_params == None: + method_params = dict() + + # score each cell with the list of gene sets + data_list = _proc_data(adata, None, groupby, smooth_mode, recompute_neighbors, + None, method_params, None, + noise_trials, None, cores, return_data) + + print("**done**") + return(data_list) diff --git a/gssnng/test/test_return_smoothed.py b/gssnng/test/test_return_smoothed.py new file mode 100644 index 0000000..d75b33c --- /dev/null +++ b/gssnng/test/test_return_smoothed.py @@ -0,0 +1,29 @@ +if __name__ == '__main__': + + import scanpy as sc + from gssnng.smooth_anndatas import smooth_anndata + import time + + def test_return_smoothed(adata): + res0 = smooth_anndata(adata=adata, + groupby='louvain', + smooth_mode='adjacency', + recompute_neighbors=32, + method_params={}, + cores=4) + return(res0) + + + def test_score_all_sets(): + q = sc.datasets.pbmc3k_processed() + t0 = time.time() + print('start time: ' + str(t0)) + data_list = test_return_smoothed(q) + print('******DONE*******') + t1 = time.time() + print('end time: ' + str(t1)) + print('TOTAL TIME: ' + str(t1-t0)) + print(len(data_list)) + + test_score_all_sets() + print('test done') From 9e0cdc034c0500d6eb52795aa19a48628ea4352b Mon Sep 17 00:00:00 2001 From: David L Gibbs Date: Wed, 3 Jan 2024 15:40:10 -0800 Subject: [PATCH 2/4] Fix bug in _proc_data return data param missing --- gssnng/score_cells.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gssnng/score_cells.py b/gssnng/score_cells.py index 51abd82..3fbd376 100644 --- a/gssnng/score_cells.py +++ b/gssnng/score_cells.py @@ -89,7 +89,7 @@ def run_gssnng( # score each cell with the list of gene sets all_scores = _proc_data(mat, gs_obj, groupby, smooth_mode, recompute_neighbors, score_method, method_params, samp_neighbors, - noise_trials, ranked, cores) + noise_trials, ranked, cores, 0) # warning: the all_scores rows might have a diferent order! # make sure to resort them according to the mat.obs.index @@ -162,7 +162,7 @@ def with_gene_sets( # score each cell with the list of gene sets all_scores = _proc_data(adata, gs_obj, groupby, smooth_mode, recompute_neighbors, score_method, method_params, samp_neighbors, - noise_trials, ranked, cores) + noise_trials, ranked, cores, 0) ## join in new results adata.obs = adata.obs.join(all_scores, how='left') From 69491dd0d04c8e8dfe2a5d677105a18a2757d272 Mon Sep 17 00:00:00 2001 From: David L Gibbs Date: Wed, 3 Jan 2024 17:10:03 -0800 Subject: [PATCH 3/4] updated the doc strings for smooth anndata --- gssnng/smooth_anndatas.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/gssnng/smooth_anndatas.py b/gssnng/smooth_anndatas.py index 1c0ead6..6f10803 100644 --- a/gssnng/smooth_anndatas.py +++ b/gssnng/smooth_anndatas.py @@ -13,12 +13,7 @@ def smooth_anndata( ) -> anndata.AnnData: """ - gene set scoring (all gene sets in file) with nearest neighbor smoothing of the expression matrix - - Improved single cell scoring by: - - smoothing the data matrix - - adding noise to the nearest neighbor smoothing via `samp_neighbors` - - adding noise to the expression data itself (via noise_trials) + nearest neighbor smoothing of the expression matrix :param adata anndata.AnnData containing the cells to be scored @@ -34,7 +29,7 @@ def smooth_anndata( :param cores number of parallel processes to work through groupby groups - :returns: adata with gene set scores in .obs + :returns: a list of adatas with smoothed data """ return_data = 1 From 51be2a4023dcf61837162980ae33cda403b7cbfd Mon Sep 17 00:00:00 2001 From: gibbsdavidl Date: Thu, 4 Jan 2024 11:10:33 -0800 Subject: [PATCH 4/4] added error checking to return smoothed anndatas --- gssnng/score_cells.py | 4 ++-- gssnng/smooth_anndatas.py | 10 ++++++---- gssnng/util.py | 34 +++++++++++++++++++--------------- 3 files changed, 27 insertions(+), 21 deletions(-) diff --git a/gssnng/score_cells.py b/gssnng/score_cells.py index 3fbd376..69eae1e 100644 --- a/gssnng/score_cells.py +++ b/gssnng/score_cells.py @@ -81,7 +81,7 @@ def run_gssnng( samp_neighbors = None error_checking(mat, samp_neighbors, recompute_neighbors, - gs_obj, score_method, ranked, method_params) + gs_obj, score_method, ranked, method_params, 0) if method_params == None: method_params = dict() @@ -154,7 +154,7 @@ def with_gene_sets( samp_neighbors = None error_checking(adata, samp_neighbors, recompute_neighbors, - gs_obj, score_method, ranked, method_params) + gs_obj, score_method, ranked, method_params, 0) if method_params == None: method_params = dict() diff --git a/gssnng/smooth_anndatas.py b/gssnng/smooth_anndatas.py index 6f10803..f76f34e 100644 --- a/gssnng/smooth_anndatas.py +++ b/gssnng/smooth_anndatas.py @@ -1,6 +1,6 @@ import anndata from gssnng.score_cells import _proc_data -#from gssnng.util import error_checking +from gssnng.util import error_checking from typing import Union def smooth_anndata( @@ -34,16 +34,18 @@ def smooth_anndata( return_data = 1 noise_trials = 0 ### not used currently - samp_neighbors = None + samp_neighbors = None ### also not used + just_smoothing=1 - #error_checking2(adata, recompute_neighbors, method_params) # UPDATE + error_checking(adata, samp_neighbors, recompute_neighbors, + None, None, None, method_params, just_smoothing) if method_params == None: method_params = dict() # score each cell with the list of gene sets data_list = _proc_data(adata, None, groupby, smooth_mode, recompute_neighbors, - None, method_params, None, + None, method_params, samp_neighbors, noise_trials, None, cores, return_data) print("**done**") diff --git a/gssnng/util.py b/gssnng/util.py index b155e89..5a2a104 100644 --- a/gssnng/util.py +++ b/gssnng/util.py @@ -14,7 +14,8 @@ def error_checking( gs_obj, score_method, ranked, - method_params + method_params, + just_smoothing ): """ QC on the adata. Need to make sure there's enough neighbors available given the sampling size. @@ -23,30 +24,33 @@ def error_checking( :param samp_neighbors: integer, number of neighbors to sample """ - if type(method_params) != type(dict()): - raise Exception('ERROR: please use a dictionary to pass method params') - - if any([xi in adata.obs.columns for xi in gs_obj.get_gs_names()]): - #raise Exception('ERROR: gene set names in columns of adata.obs, please drop.') - print("Warning! Dropping gene set names from obs!") - genesetlist = [x.name for x in gs_obj.set_list] - for gsi in genesetlist: - print('dropping: ' + gsi) - adata.obs.drop(columns=[gsi], inplace=True) - if 'gssnng_groupby' in adata.obs.columns: adata.obs.drop(columns='gssnng_groupby', inplace=True) #raise Exception("Error: please drop 'gssnng_groupby' as a column name.") print('... and dropping gssnng_groupby column...') - if ranked == False and score_method == 'singscore': - raise Exception('ERROR: singscore requires ranked data, set ranked parameter to True') - if (recompute_neighbors == None) or (recompute_neighbors == 0): n_neighbors = adata.uns['neighbors']['params']['n_neighbors'] #[0]# in older AnnData versions need this?? else: n_neighbors = recompute_neighbors + if just_smoothing == 0: + # then do all other checks + if type(method_params) != type(dict()): + raise Exception('ERROR: please use a dictionary to pass method params') + + if any([xi in adata.obs.columns for xi in gs_obj.get_gs_names()]): + #raise Exception('ERROR: gene set names in columns of adata.obs, please drop.') + print("Warning! Dropping gene set names from obs!") + genesetlist = [x.name for x in gs_obj.set_list] + for gsi in genesetlist: + print('dropping: ' + gsi) + adata.obs.drop(columns=[gsi], inplace=True) + + if ranked == False and score_method == 'singscore': + raise Exception('ERROR: singscore requires ranked data, set ranked parameter to True') + + #if n_neighbors < samp_neighbors: # print('*******') # print('WARNING: Number of neighbors too low for sampling parameter!')