Merge pull request #24 from IlyaLab/main

update to the docs branch
IlyaLab · Feb 9, 2024 · 1a85476 · 1a85476
2 parents 340d2db + 6bd7951
commit 1a85476
Show file tree

Hide file tree

Showing 4 changed files with 110 additions and 20 deletions.
diff --git a/gssnng/score_cells.py b/gssnng/score_cells.py
@@ -81,15 +81,15 @@ def run_gssnng(
 
     samp_neighbors = None
     error_checking(mat, samp_neighbors, recompute_neighbors,
-                   gs_obj, score_method, ranked, method_params)
+                   gs_obj, score_method, ranked, method_params, 0)
 
     if method_params == None:
         method_params = dict()
 
     # score each cell with the list of gene sets
     all_scores = _proc_data(mat, gs_obj, groupby, smooth_mode, recompute_neighbors,
                                   score_method, method_params, samp_neighbors,
-                                  noise_trials, ranked, cores)
+                                  noise_trials, ranked, cores, 0)
 
     # warning: the all_scores rows might have a different order!
     # make sure to re-sort them according to the mat.obs.index
@@ -154,15 +154,15 @@ def with_gene_sets(
 
     samp_neighbors = None
     error_checking(adata, samp_neighbors, recompute_neighbors,
-                   gs_obj, score_method, ranked, method_params)
+                   gs_obj, score_method, ranked, method_params, 0)
 
     if method_params == None:
         method_params = dict()
 
     # score each cell with the list of gene sets
     all_scores = _proc_data(adata, gs_obj, groupby, smooth_mode, recompute_neighbors,
                                   score_method, method_params, samp_neighbors,
-                                  noise_trials, ranked, cores)
+                                  noise_trials, ranked, cores, 0)
     ## join in new results
     adata.obs = adata.obs.join(all_scores, how='left')
 
@@ -229,7 +229,8 @@ def _proc_data(
         samp_neighbors: int,
         noise_trials: int,
         ranked: bool,
-        cores: int
+        cores: int,
+        return_data: int
                      ):
     """
     In many cases, the neighbors should be defined.  If you have mixed clinical endpoints,
@@ -247,6 +248,7 @@ def _proc_data(
     :param noise_trials: number of noisy samples to create, integer
     :param ranked: whether the gene expression counts should be rank ordered
     :param cores: number of parallel processes to work through groupby groups
+    :param return_data: if 1, return the smoothed data list as soon as it is built (no scoring is done); any other value continues on to scoring
 
     :returns: scores in a dict for each cell in a list, or the smoothed data list when return_data == 1.
     """
@@ -284,6 +286,9 @@ def _proc_data(
     data_list = _build_data_list(adata, groupby, cats, recompute_neighbors, samp_neighbors, smooth_mode)
     # then we can start scoring cells #
 
+    if return_data == 1:
+        return(data_list)
+
     # building up the argument list for the parallel call of _score_all_cells_all_sets
     arglist = []
     for smoothed_adata, groupname in data_list:

diff --git a/gssnng/smooth_anndatas.py b/gssnng/smooth_anndatas.py
@@ -0,0 +1,52 @@
+import anndata
+from gssnng.score_cells import _proc_data
+from gssnng.util import error_checking
+from typing import Union
+
+def smooth_anndata(
+        adata: anndata.AnnData,
+        groupby: Union[str, list, dict],
+        smooth_mode: str,
+        recompute_neighbors: int,
+        method_params: dict,
+        cores: int
+    ) -> anndata.AnnData:
+
+    """
+    nearest neighbor smoothing of the expression matrix
+
+    :param adata
+        anndata.AnnData containing the cells to be smoothed
+    :param groupby
+        either a column label in adata.obs, in which case all of its categories are used, or a dict that specifies one group.
+    :param smooth_mode
+        `adjacency` or `connectivity`, which representation of the neighborhood graph to use.
+        `adjacency` weights all neighbors equally, `connectivity` weights close neighbors more
+    :param recompute_neighbors
+        whether neighbors should be recomputed within each group: 0 for no, >0 for yes, in which case the value is the number of neighbors N
+    :param method_params
+        specific params for each method.
+    :param cores
+        number of parallel processes to work through groupby groups
+
+    :returns: a list of (smoothed adata, group name) pairs
+    """
+
+    return_data = 1
+    noise_trials = 0  ### not used currently
+    samp_neighbors = None ### also not used
+    just_smoothing=1
+
+    error_checking(adata, samp_neighbors, recompute_neighbors,
+                   None, None, None, method_params, just_smoothing)
+
+    if method_params == None:
+        method_params = dict()
+
+    # build the smoothed data list; with return_data == 1, _proc_data returns it before any scoring
+    data_list = _proc_data(adata, None, groupby, smooth_mode, recompute_neighbors,
+                                  None, method_params, samp_neighbors,
+                                  noise_trials, None, cores, return_data)
+
+    print("**done**")
+    return(data_list)
diff --git a/gssnng/test/test_return_smoothed.py b/gssnng/test/test_return_smoothed.py
@@ -0,0 +1,29 @@
+if __name__ == '__main__':
+
+    import scanpy as sc
+    from gssnng.smooth_anndatas import smooth_anndata
+    import time
+
+    def test_return_smoothed(adata):
+        res0 = smooth_anndata(adata=adata,
+                              groupby='louvain',
+                              smooth_mode='adjacency',
+                              recompute_neighbors=32,
+                              method_params={},
+                              cores=4)
+        return(res0)
+
+
+    def test_score_all_sets():
+        q = sc.datasets.pbmc3k_processed()
+        t0 = time.time()
+        print('start time: ' + str(t0))
+        data_list = test_return_smoothed(q)
+        print('******DONE*******')
+        t1 = time.time()
+        print('end time: ' + str(t1))
+        print('TOTAL TIME: ' + str(t1-t0))
+        print(len(data_list))
+
+    test_score_all_sets()
+    print('test done')
diff --git a/gssnng/util.py b/gssnng/util.py
@@ -14,7 +14,8 @@ def error_checking(
         gs_obj,
         score_method,
         ranked,
-        method_params
+        method_params,
+        just_smoothing
 ):
     """
     QC on the adata. Need to make sure there are enough neighbors available given the sampling size.
@@ -23,30 +24,33 @@ def error_checking(
     :param samp_neighbors: integer, number of neighbors to sample
     """
 
-    if type(method_params) != type(dict()):
-        raise Exception('ERROR: please use a dictionary to pass method params')
-
-    if any([xi in adata.obs.columns for xi in gs_obj.get_gs_names()]):
-        #raise Exception('ERROR: gene set names in columns of adata.obs, please drop.')
-        print("Warning! Dropping gene set names from obs!")
-        genesetlist = [x.name for x in gs_obj.set_list]
-        for gsi in genesetlist:
-            print('dropping: ' + gsi)
-            adata.obs.drop(columns=[gsi], inplace=True)
-
     if 'gssnng_groupby' in adata.obs.columns:
         adata.obs.drop(columns='gssnng_groupby', inplace=True)
         #raise Exception("Error: please drop 'gssnng_groupby' as a column name.")
         print('... and dropping gssnng_groupby column...')
 
-    if ranked == False and score_method == 'singscore':
-        raise Exception('ERROR: singscore requires ranked data, set ranked parameter to True')
-
     if (recompute_neighbors == None) or (recompute_neighbors == 0):
         n_neighbors = adata.uns['neighbors']['params']['n_neighbors'] #[0]# in older AnnData versions need this??
     else:
         n_neighbors = recompute_neighbors
 
+    if just_smoothing == 0:
+        # scoring run (not just smoothing): also check method_params, gene set names in obs, and the ranked/score_method combination
+        if type(method_params) != type(dict()):
+            raise Exception('ERROR: please use a dictionary to pass method params')
+
+        if any([xi in adata.obs.columns for xi in gs_obj.get_gs_names()]):
+            #raise Exception('ERROR: gene set names in columns of adata.obs, please drop.')
+            print("Warning! Dropping gene set names from obs!")
+            genesetlist = [x.name for x in gs_obj.set_list]
+            for gsi in genesetlist:
+                print('dropping: ' + gsi)
+                adata.obs.drop(columns=[gsi], inplace=True)
+
+        if ranked == False and score_method == 'singscore':
+            raise Exception('ERROR: singscore requires ranked data, set ranked parameter to True')
+
+
     #if n_neighbors < samp_neighbors:
     #    print('*******')
     #    print('WARNING: Number of neighbors too low for sampling parameter!')