From 772664c9bbe4498776eb324d7428635a56ddb946 Mon Sep 17 00:00:00 2001
From: gibbsdavidl <gibbsdavidl@gmail.com>
Date: Tue, 2 Jan 2024 16:03:34 -0800
Subject: [PATCH 1/4] added return smoothed anndatas

---
 gssnng/score_cells.py               |  7 +++-
 gssnng/smooth_anndatas.py           | 55 +++++++++++++++++++++++++++++
 gssnng/test/test_return_smoothed.py | 29 +++++++++++++++
 3 files changed, 90 insertions(+), 1 deletion(-)
 create mode 100644 gssnng/smooth_anndatas.py
 create mode 100644 gssnng/test/test_return_smoothed.py

diff --git a/gssnng/score_cells.py b/gssnng/score_cells.py
index 469c882..51abd82 100644
--- a/gssnng/score_cells.py
+++ b/gssnng/score_cells.py
@@ -229,7 +229,8 @@ def _proc_data(
         samp_neighbors: int,
         noise_trials: int,
         ranked: bool,
-        cores: int
+        cores: int,
+        return_data: int
                      ):
     """
     In many cases, the neighbors should be defined.  If you have mixed clinical endpoints,
@@ -247,6 +248,7 @@ def _proc_data(
     :param noise_trials: number of noisy samples to create, integer
     :param ranked: whether the gene expression counts should be rank ordered
     :param cores: number of parallel processes to work through groupby groups
+    :param return_data: should the smoothed data list be returned?
 
     :returns: scores in a dict for each cell in a list.
     """
@@ -284,6 +286,9 @@ def _proc_data(
     data_list = _build_data_list(adata, groupby, cats, recompute_neighbors, samp_neighbors, smooth_mode)
     # then we can start scoring cells #
 
+    if return_data == 1:
+        return(data_list)
+
     # building up the argument list for the parallel call of _score_all_cells_all_sets
     arglist = []
     for smoothed_adata, groupname in data_list:
diff --git a/gssnng/smooth_anndatas.py b/gssnng/smooth_anndatas.py
new file mode 100644
index 0000000..1c0ead6
--- /dev/null
+++ b/gssnng/smooth_anndatas.py
@@ -0,0 +1,55 @@
+import anndata
+from gssnng.score_cells import _proc_data
+#from gssnng.util import error_checking
+from typing import Union
+
+def smooth_anndata(
+        adata: anndata.AnnData,
+        groupby: Union[str, list, dict],
+        smooth_mode: str,
+        recompute_neighbors: int,
+        method_params: dict,
+        cores: int
+    ) -> anndata.AnnData:
+
+    """
+    gene set scoring (all gene sets in file) with nearest neighbor smoothing of the expression matrix
+
+    Improved single cell scoring by:
+    - smoothing the data matrix
+        - adding noise to the nearest neighbor smoothing via `samp_neighbors`
+    - adding noise to the expression data itself (via noise_trials)
+
+    :param adata
+        anndata.AnnData containing the cells to be scored
+    :param groupby
+        either a column label in adata.obs, and all categories taken, or a dict specifies one group.
+    :param smooth_mode
+        `adjacency` or `connectivity`, which representation of the neighborhood graph to use.
+        `adjacency` weights all neighbors equally, `connectivity` weights close neighbors more
+    :param recompute_neighbors
+        should neighbors be recomputed within each group, 0 for no, >0 for yes and specifies N
+    :param method_params
+        specific params for each method.
+    :param cores
+        number of parallel processes to work through groupby groups
+
+    :returns: adata with gene set scores in .obs
+    """
+
+    return_data = 1
+    noise_trials = 0  ### not used currently
+    samp_neighbors = None
+
+    #error_checking2(adata, recompute_neighbors, method_params)  # UPDATE
+
+    if method_params == None:
+        method_params = dict()
+
+    # score each cell with the list of gene sets
+    data_list = _proc_data(adata, None, groupby, smooth_mode, recompute_neighbors,
+                                  None, method_params, None,
+                                  noise_trials, None, cores, return_data)
+
+    print("**done**")
+    return(data_list)
diff --git a/gssnng/test/test_return_smoothed.py b/gssnng/test/test_return_smoothed.py
new file mode 100644
index 0000000..d75b33c
--- /dev/null
+++ b/gssnng/test/test_return_smoothed.py
@@ -0,0 +1,29 @@
+if __name__ == '__main__':
+
+    import scanpy as sc
+    from gssnng.smooth_anndatas import smooth_anndata
+    import time
+
+    def test_return_smoothed(adata):
+        res0 = smooth_anndata(adata=adata,
+                              groupby='louvain',
+                              smooth_mode='adjacency',
+                              recompute_neighbors=32,
+                              method_params={},
+                              cores=4)
+        return(res0)
+
+
+    def test_score_all_sets():
+        q = sc.datasets.pbmc3k_processed()
+        t0 = time.time()
+        print('start time: ' + str(t0))
+        data_list = test_return_smoothed(q)
+        print('******DONE*******')
+        t1 = time.time()
+        print('end time: ' + str(t1))
+        print('TOTAL TIME: ' + str(t1-t0))
+        print(len(data_list))
+
+    test_score_all_sets()
+    print('test done')

From 9e0cdc034c0500d6eb52795aa19a48628ea4352b Mon Sep 17 00:00:00 2001
From: David L Gibbs <Gibbsdavidl@gmail.com>
Date: Wed, 3 Jan 2024 15:40:10 -0800
Subject: [PATCH 2/4] Fix bug in _proc_data return data param missing

---
 gssnng/score_cells.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gssnng/score_cells.py b/gssnng/score_cells.py
index 51abd82..3fbd376 100644
--- a/gssnng/score_cells.py
+++ b/gssnng/score_cells.py
@@ -89,7 +89,7 @@ def run_gssnng(
     # score each cell with the list of gene sets
     all_scores = _proc_data(mat, gs_obj, groupby, smooth_mode, recompute_neighbors,
                                   score_method, method_params, samp_neighbors,
-                                  noise_trials, ranked, cores)
+                                  noise_trials, ranked, cores, 0)
 
     # warning: the all_scores rows might have a diferent order!
     # make sure to resort them according to the mat.obs.index
@@ -162,7 +162,7 @@ def with_gene_sets(
     # score each cell with the list of gene sets
     all_scores = _proc_data(adata, gs_obj, groupby, smooth_mode, recompute_neighbors,
                                   score_method, method_params, samp_neighbors,
-                                  noise_trials, ranked, cores)
+                                  noise_trials, ranked, cores, 0)
     ## join in new results
     adata.obs = adata.obs.join(all_scores, how='left')
 

From 69491dd0d04c8e8dfe2a5d677105a18a2757d272 Mon Sep 17 00:00:00 2001
From: David L Gibbs <Gibbsdavidl@gmail.com>
Date: Wed, 3 Jan 2024 17:10:03 -0800
Subject: [PATCH 3/4] updated the doc strings for smooth anndata

---
 gssnng/smooth_anndatas.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/gssnng/smooth_anndatas.py b/gssnng/smooth_anndatas.py
index 1c0ead6..6f10803 100644
--- a/gssnng/smooth_anndatas.py
+++ b/gssnng/smooth_anndatas.py
@@ -13,12 +13,7 @@ def smooth_anndata(
     ) -> anndata.AnnData:
 
     """
-    gene set scoring (all gene sets in file) with nearest neighbor smoothing of the expression matrix
-
-    Improved single cell scoring by:
-    - smoothing the data matrix
-        - adding noise to the nearest neighbor smoothing via `samp_neighbors`
-    - adding noise to the expression data itself (via noise_trials)
+    nearest neighbor smoothing of the expression matrix
 
     :param adata
         anndata.AnnData containing the cells to be scored
@@ -34,7 +29,7 @@ def smooth_anndata(
     :param cores
         number of parallel processes to work through groupby groups
 
-    :returns: adata with gene set scores in .obs
+    :returns: a list of adatas with smoothed data
     """
 
     return_data = 1

From 51be2a4023dcf61837162980ae33cda403b7cbfd Mon Sep 17 00:00:00 2001
From: gibbsdavidl <gibbsdavidl@gmail.com>
Date: Thu, 4 Jan 2024 11:10:33 -0800
Subject: [PATCH 4/4] added error checking to return smoothed anndatas

---
 gssnng/score_cells.py     |  4 ++--
 gssnng/smooth_anndatas.py | 10 ++++++----
 gssnng/util.py            | 34 +++++++++++++++++++---------------
 3 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/gssnng/score_cells.py b/gssnng/score_cells.py
index 3fbd376..69eae1e 100644
--- a/gssnng/score_cells.py
+++ b/gssnng/score_cells.py
@@ -81,7 +81,7 @@ def run_gssnng(
 
     samp_neighbors = None
     error_checking(mat, samp_neighbors, recompute_neighbors,
-                   gs_obj, score_method, ranked, method_params)
+                   gs_obj, score_method, ranked, method_params, 0)
 
     if method_params == None:
         method_params = dict()
@@ -154,7 +154,7 @@ def with_gene_sets(
 
     samp_neighbors = None
     error_checking(adata, samp_neighbors, recompute_neighbors,
-                   gs_obj, score_method, ranked, method_params)
+                   gs_obj, score_method, ranked, method_params, 0)
 
     if method_params == None:
         method_params = dict()
diff --git a/gssnng/smooth_anndatas.py b/gssnng/smooth_anndatas.py
index 6f10803..f76f34e 100644
--- a/gssnng/smooth_anndatas.py
+++ b/gssnng/smooth_anndatas.py
@@ -1,6 +1,6 @@
 import anndata
 from gssnng.score_cells import _proc_data
-#from gssnng.util import error_checking
+from gssnng.util import error_checking
 from typing import Union
 
 def smooth_anndata(
@@ -34,16 +34,18 @@ def smooth_anndata(
 
     return_data = 1
     noise_trials = 0  ### not used currently
-    samp_neighbors = None
+    samp_neighbors = None ### also not used
+    just_smoothing=1
 
-    #error_checking2(adata, recompute_neighbors, method_params)  # UPDATE
+    error_checking(adata, samp_neighbors, recompute_neighbors,
+                   None, None, None, method_params, just_smoothing)
 
     if method_params == None:
         method_params = dict()
 
     # score each cell with the list of gene sets
     data_list = _proc_data(adata, None, groupby, smooth_mode, recompute_neighbors,
-                                  None, method_params, None,
+                                  None, method_params, samp_neighbors,
                                   noise_trials, None, cores, return_data)
 
     print("**done**")
diff --git a/gssnng/util.py b/gssnng/util.py
index b155e89..5a2a104 100644
--- a/gssnng/util.py
+++ b/gssnng/util.py
@@ -14,7 +14,8 @@ def error_checking(
         gs_obj,
         score_method,
         ranked,
-        method_params
+        method_params,
+        just_smoothing
 ):
     """
     QC on the adata. Need to make sure there's enough neighbors available given the sampling size.
@@ -23,30 +24,33 @@ def error_checking(
     :param samp_neighbors: integer, number of neighbors to sample
     """
 
-    if type(method_params) != type(dict()):
-        raise Exception('ERROR: please use a dictionary to pass method params')
-
-    if any([xi in adata.obs.columns for xi in gs_obj.get_gs_names()]):
-        #raise Exception('ERROR: gene set names in columns of adata.obs, please drop.')
-        print("Warning! Dropping gene set names from obs!")
-        genesetlist = [x.name for x in gs_obj.set_list]
-        for gsi in genesetlist:
-            print('dropping: ' + gsi)
-            adata.obs.drop(columns=[gsi], inplace=True)
-
     if 'gssnng_groupby' in adata.obs.columns:
         adata.obs.drop(columns='gssnng_groupby', inplace=True)
         #raise Exception("Error: please drop 'gssnng_groupby' as a column name.")
         print('... and dropping gssnng_groupby column...')
 
-    if ranked == False and score_method == 'singscore':
-        raise Exception('ERROR: singscore requires ranked data, set ranked parameter to True')
-
     if (recompute_neighbors == None) or (recompute_neighbors == 0):
         n_neighbors = adata.uns['neighbors']['params']['n_neighbors'] #[0]# in older AnnData versions need this??
     else:
         n_neighbors = recompute_neighbors
 
+    if just_smoothing == 0:
+        # then do all other checks
+        if type(method_params) != type(dict()):
+            raise Exception('ERROR: please use a dictionary to pass method params')
+
+        if any([xi in adata.obs.columns for xi in gs_obj.get_gs_names()]):
+            #raise Exception('ERROR: gene set names in columns of adata.obs, please drop.')
+            print("Warning! Dropping gene set names from obs!")
+            genesetlist = [x.name for x in gs_obj.set_list]
+            for gsi in genesetlist:
+                print('dropping: ' + gsi)
+                adata.obs.drop(columns=[gsi], inplace=True)
+
+        if ranked == False and score_method == 'singscore':
+            raise Exception('ERROR: singscore requires ranked data, set ranked parameter to True')
+
+
     #if n_neighbors < samp_neighbors:
     #    print('*******')
     #    print('WARNING: Number of neighbors too low for sampling parameter!')