From 35a84e4901197e128fb6f0a157b13463f668b716 Mon Sep 17 00:00:00 2001
From: David L Gibbs <Gibbsdavidl@gmail.com>
Date: Fri, 9 Feb 2024 16:07:19 -0800
Subject: [PATCH] update to docs, examples, and smooth function

---
 docs/decoupler_api_doc.rst              |  2 +-
 docs/gmt_files_doc.rst                  |  2 +-
 docs/smoothing_adatas.rst               | 62 +++++++++++++++++++++++--
 gssnng/nnsmooth.py                      | 51 --------------------
 gssnng/smoothing.py                     | 57 +++++++++++++++++++++++
 gssnng/test/example_smoothing_counts.py | 18 +++----
 6 files changed, 127 insertions(+), 65 deletions(-)
 delete mode 100644 gssnng/nnsmooth.py

diff --git a/docs/decoupler_api_doc.rst b/docs/decoupler_api_doc.rst
index 04c8835..3c993c7 100644
--- a/docs/decoupler_api_doc.rst
+++ b/docs/decoupler_api_doc.rst
@@ -14,9 +14,9 @@ Gene Set Scoring on the Nearest Neighbor Graph (gssnng) for Single Cell RNA-seq
        :maxdepth: 2
 
        Installation
-       Scoring Functions
        Example script
        Usage
+       Scoring Functions
        Parameters
        Groupby
        Gene sets
diff --git a/docs/gmt_files_doc.rst b/docs/gmt_files_doc.rst
index e3d94e7..9b950f1 100644
--- a/docs/gmt_files_doc.rst
+++ b/docs/gmt_files_doc.rst
@@ -14,9 +14,9 @@ Gene Set Scoring on the Nearest Neighbor Graph (gssnng) for Single Cell RNA-seq
        :maxdepth: 2
 
        Installation
-       Scoring Functions
        Example script
        Usage
+       Scoring Functions
        Parameters
        Groupby
        Gene sets
diff --git a/docs/smoothing_adatas.rst b/docs/smoothing_adatas.rst
index 3d4b8e8..c2f239d 100644
--- a/docs/smoothing_adatas.rst
+++ b/docs/smoothing_adatas.rst
@@ -14,7 +14,6 @@ Gene Set Scoring on the Nearest Neighbor Graph (gssnng) for Single Cell RNA-seq
        :maxdepth: 2
 
        Installation
-       Scoring Functions
        Example script
        Usage
        Parameters
@@ -73,13 +72,68 @@ See gssnng/notebooks for examples on all methods.
 
 ::
 
-   from gssnng import nnsmooth
+    from gssnng import smoothing
 
     q = sc.datasets.pbmc3k_processed()
 
-    q_list = nnsmooth.smooth_adata(adata=q,                    # AnnData object
+    q_list = smoothing.smooth_adata(adata=q,                    # AnnData object
                                        groupby='louvain',          # Will sample neighbors within this group, can take a list
                                        smooth_mode='connectivity', # Smooths matrix using distance weights from NN graph.
-                                       recompute_neighbors=32,     # Rebuild nearest neighbor graph with groups, 0 turns off function
+                                       recompute_neighbors=11,     # Rebuild nearest neighbor graph with groups, 0 turns off function
                                        cores=4)                    # Smoothed in parallel.
 
+
+
+Parameters
+==========
+
+These parameters are used with the "smoothing.smooth_adata" function.::
+
+    adata:  AnnData object from scanpy.read_*
+    AnnData containing the cells to be scored
+
+    groupby: [str, list, dict]
+    either a column label in adata.obs (all categories are taken), a list of column labels, or a dict that specifies one group.
+    SEE DESCRIPTION BELOW
+
+    smooth_mode: "adjacency", "connectivity", or "off"
+    Dictates how to use the neighborhood graph.
+    `adjacency` weights all neighbors equally, `connectivity` weights close neighbors more
+
+    recompute_neighbors: int
+    should neighbors be recomputed within each group, 0 for no, >0 for yes and specifies N
+
+    cores: int
+    number of parallel processes to work through groupby groups
+
+
+Groupby
+=======
+
+The specific neighborhood for each cell can be controlled by using the groupby parameter. In the example
+above, by setting groupby='louvain', only cells within a louvain cluster will be considered as being part of the
+neighborhood and will be available for sampling.
+
+Groupby specifies a column name that's found in the AnnData.obs table, and it can also take a list of column names.
+In that case, cells will be grouped as the intersection of categories. For example, using groupby=['louvain','phenotype']
+will take cells that are first in a given louvain cluster and then also in a given phenotype group. By also setting
+recompute_neighbors to a value > 0, the nearest neighbor graph is recomputed within this subset of cells. Controlling the
+neighborhood leads to more controlled smoothing of the count matrix and is more suitable for downstream comparisons.
+
+
+References
+==========
+
+rank biased overlap:  https://arxiv.org/pdf/1408.3587.pdf
+
+singscore:  https://pubmed.ncbi.nlm.nih.gov/30400809/
+
+anndata: https://anndata.readthedocs.io/en/latest/
+
+MSigDB: https://www.gsea-msigdb.org/gsea/msigdb/
+
+ssGSEA: https://gsea-msigdb.github.io/ssGSEA-gpmodule/v10/index.html
+
+decoupler: https://academic.oup.com/bioinformaticsadvances/article/2/1/vbac016/6544613
+
+omnipath: https://omnipathdb.org/
diff --git a/gssnng/nnsmooth.py b/gssnng/nnsmooth.py
deleted file mode 100644
index 85b1be9..0000000
--- a/gssnng/nnsmooth.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import anndata
-from gssnng.score_cells import _proc_data
-from gssnng.util import error_checking
-from typing import Union
-
-def smooth_adata(
-        adata: anndata.AnnData,
-        groupby: Union[str, list, dict],
-        smooth_mode: str,
-        recompute_neighbors: int,
-        cores: int
-    ) -> anndata.AnnData:
-
-    """
-    nearest neighbor smoothing of the expression matrix
-
-    :param adata
-        anndata.AnnData containing the cells to be scored
-    :param groupby
-        either a column label in adata.obs, and all categories taken, or a dict specifies one group.
-    :param smooth_mode
-        `adjacency` or `connectivity`, which representation of the neighborhood graph to use.
-        `adjacency` weights all neighbors equally, `connectivity` weights close neighbors more
-    :param recompute_neighbors
-        should neighbors be recomputed within each group, 0 for no, >0 for yes and specifies N
-    :param method_params
-        specific params for each method.
-    :param cores
-        number of parallel processes to work through groupby groups
-
-    :returns: a list of adatas with smoothed data
-    """
-
-    return_data = 1
-    noise_trials = 0  ### not used currently
-    samp_neighbors = None ### also not used
-    just_smoothing=1
-
-    error_checking(adata, samp_neighbors, recompute_neighbors,
-                   None, None, None, method_params, just_smoothing)
-
-    if method_params == None:
-        method_params = dict()
-
-    # score each cell with the list of gene sets
-    data_list = _proc_data(adata, None, groupby, smooth_mode, recompute_neighbors,
-                                  None, None, samp_neighbors,
-                                  noise_trials, None, cores, return_data)
-
-    print("**done**")
-    return(data_list)
diff --git a/gssnng/smoothing.py b/gssnng/smoothing.py
index 2c02a5c..31265ea 100644
--- a/gssnng/smoothing.py
+++ b/gssnng/smoothing.py
@@ -1,6 +1,11 @@
+#from gssnng.score_cells import _proc_data
+import gssnng
+from gssnng.util import error_checking
+from typing import Union
 import numpy as np
 from scipy import sparse
 import logging
+import anndata
 
 
 NN_DISTANCE_KEY = 'distances'  # scanpy names in .obsp
@@ -10,6 +15,58 @@
 # multiplying should leave a "one-vector" still sum to one
 
 
+# Smoothing-only entry point: returns a list of adatas, each with a nearest neighbor smoothed expression matrix
+def smooth_adata(
+        adata: anndata.AnnData,
+        groupby: Union[str, list, dict],
+        smooth_mode: str,
+        recompute_neighbors: int,
+        cores: int
+    ) -> anndata.AnnData:
+
+    """
+    Return a list of adatas, each with a nearest neighbor smoothed expression matrix.
+
+    :param adata
+        anndata.AnnData containing the cells to be smoothed
+    :param groupby
+        either a column label in adata.obs (all categories are taken), or a dict that specifies one group.
+    :param smooth_mode
+        `adjacency` or `connectivity`, which representation of the neighborhood graph to use.
+        `adjacency` weights all neighbors equally, `connectivity` weights close neighbors more
+    :param recompute_neighbors
+        should neighbors be recomputed within each group, 0 for no, >0 for yes and specifies N
+    :param cores
+        number of parallel processes to work through groupby groups
+    NOTE: method_params is not an argument of this function; an empty dict is built
+    in the body and handed to error_checking and _proc_data.
+
+    :returns: a list of adatas with smoothed data (NOTE(review): the `-> anndata.AnnData` annotation does not say list -- confirm)
+    """
+
+    return_data = 1
+    noise_trials = 0  ### not used currently
+    samp_neighbors = None ### also not used
+    just_smoothing=1
+
+    # no params for now
+    method_params = dict()
+
+    error_checking(adata, samp_neighbors, recompute_neighbors,
+                   None, None, None, method_params, just_smoothing)
+
+
+    # smoothing only: no gene sets are passed (None); return_data=1 presumably makes _proc_data hand back the smoothed adatas -- confirm
+    data_list = gssnng.score_cells._proc_data(adata, None, groupby, smooth_mode, recompute_neighbors,
+                                  None, method_params, samp_neighbors,
+                                  noise_trials, None, cores, return_data)
+
+    print("**done**")
+    return(data_list)
+
+
+
+
 def get_smoothing_matrix(adata, mode, add_diag):
     """
     using the nearest neighbor graph in adata.obsp, calculate the smoothing
diff --git a/gssnng/test/example_smoothing_counts.py b/gssnng/test/example_smoothing_counts.py
index 4f4a529..3ebc929 100644
--- a/gssnng/test/example_smoothing_counts.py
+++ b/gssnng/test/example_smoothing_counts.py
@@ -1,28 +1,30 @@
-if __name__ == '__main__':
+from gssnng import smoothing
+import scanpy as sc
+import time
 
-        from gssnng import nnsmooth
-        import scanpy as sc
-        import time
+if __name__ == '__main__':
 
         print("reading data")
         q = sc.datasets.pbmc3k_processed()
 
         t0 = time.time()
-        print('start time: ' + str(t0))
+        print('starting the smOOthing')
 
-        print("scoring cells")
-        q_list = smooth_anndatas.smooth_anndata(
+        q_list = smoothing.smooth_adata(
                 adata=q,
                 groupby='louvain',
                 smooth_mode='connectivity',
-                recompute_neighbors=0,
+                recompute_neighbors=11,
                 cores=8
             )
 
         t1 = time.time()
 
         print("Adata List with SMooTHed counts.")
+        print("Each is a tuple with groupby category and adata as elements.")
         print(len(q_list))
+        for qi in q_list:
+                print(qi[1] + "  X size: " + str(qi[0].X.shape))
 
         print('end time: ' + str(t1))
         print('TOTAL TIME: ' + str(t1-t0))