From 35a84e4901197e128fb6f0a157b13463f668b716 Mon Sep 17 00:00:00 2001 From: David L Gibbs Date: Fri, 9 Feb 2024 16:07:19 -0800 Subject: [PATCH] update to docs, examples, and smooth function --- docs/decoupler_api_doc.rst | 2 +- docs/gmt_files_doc.rst | 2 +- docs/smoothing_adatas.rst | 62 +++++++++++++++++++++++-- gssnng/nnsmooth.py | 51 -------------------- gssnng/smoothing.py | 57 +++++++++++++++++++++++ gssnng/test/example_smoothing_counts.py | 18 +++---- 6 files changed, 127 insertions(+), 65 deletions(-) delete mode 100644 gssnng/nnsmooth.py diff --git a/docs/decoupler_api_doc.rst b/docs/decoupler_api_doc.rst index 04c8835..3c993c7 100644 --- a/docs/decoupler_api_doc.rst +++ b/docs/decoupler_api_doc.rst @@ -14,9 +14,9 @@ Gene Set Scoring on the Nearest Neighbor Graph (gssnng) for Single Cell RNA-seq :maxdepth: 2 Installation - Scoring Functions Example script Usage + Scoring Functions Parameters Groupby Gene sets diff --git a/docs/gmt_files_doc.rst b/docs/gmt_files_doc.rst index e3d94e7..9b950f1 100644 --- a/docs/gmt_files_doc.rst +++ b/docs/gmt_files_doc.rst @@ -14,9 +14,9 @@ Gene Set Scoring on the Nearest Neighbor Graph (gssnng) for Single Cell RNA-seq :maxdepth: 2 Installation - Scoring Functions Example script Usage + Scoring Functions Parameters Groupby Gene sets diff --git a/docs/smoothing_adatas.rst b/docs/smoothing_adatas.rst index 3d4b8e8..c2f239d 100644 --- a/docs/smoothing_adatas.rst +++ b/docs/smoothing_adatas.rst @@ -14,7 +14,6 @@ Gene Set Scoring on the Nearest Neighbor Graph (gssnng) for Single Cell RNA-seq :maxdepth: 2 Installation - Scoring Functions Example script Usage Parameters @@ -73,13 +72,68 @@ See gssnng/notebooks for examples on all methods. 
:: - from gssnng import nnsmooth + from gssnng import smoothing q = sc.datasets.pbmc3k_processed() - q_list = nnsmooth.smooth_adata(adata=q, # AnnData object + q_list = smoothing.smooth_adata(adata=q, # AnnData object groupby='louvain', # Will sample neighbors within this group, can take a list smooth_mode='connectivity', # Smooths matrix using distance weights from NN graph. - recompute_neighbors=32, # Rebuild nearest neighbor graph with groups, 0 turns off function + recompute_neighbors=11, # Rebuild nearest neighbor graph with groups, 0 turns off function cores=4) # Smoothed in parallel. + + +Parameters +========== + +These parameters are used with the "smoothing.smooth_adata" function.:: + + adata: AnnData object from scanpy.read_* + AnnData containing the cells to be scored + + groupby: [str, list, dict] + either a column label in adata.obs, and all categories taken, or a dict that specifies one group. + SEE DESCRIPTION BELOW + + smooth_mode: "adjacency", "connectivity", or "off" + Dictates how to use the neighborhood graph. + `adjacency` weights all neighbors equally, `connectivity` weights close neighbors more + + recompute_neighbors: int + should neighbors be recomputed within each group, 0 for no, >0 for yes and specifies N + + cores: int + number of parallel processes to work through groupby groups + + +Groupby +======= + +The specific neighborhood for each cell can be controlled by using the groupby parameter. In the example +above, by setting groupby='louvain', only cells within a louvain cluster will be considered as being part of the +neighborhood and will be available for sampling. + +Groupby specifies a column name that's found in the AnnData.obs table, and it can also take a list of column names. +In that case, cells will be grouped as the intersection of categories. For example, using groupby=['louvain','phenotype'] +will take cells that are first in a given louvain cluster and then also in a given phenotype group. 
By also setting +the recompute_neighbors, the nearest neighbor graph is recomputed within this subset of cells. Controlling the +neighborhood leads to more controlled smoothing of the count matrix and is more suitable for downstream comparisons. + + +References +========== + +rank biased overlap: https://arxiv.org/pdf/1408.3587.pdf + +singscore: https://pubmed.ncbi.nlm.nih.gov/30400809/ + +anndata: https://anndata.readthedocs.io/en/latest/ + +MSigDB: https://www.gsea-msigdb.org/gsea/msigdb/ + +ssGSEA: https://gsea-msigdb.github.io/ssGSEA-gpmodule/v10/index.html + +decoupler: https://academic.oup.com/bioinformaticsadvances/article/2/1/vbac016/6544613 + +omnipath: https://omnipathdb.org/ diff --git a/gssnng/nnsmooth.py b/gssnng/nnsmooth.py deleted file mode 100644 index 85b1be9..0000000 --- a/gssnng/nnsmooth.py +++ /dev/null @@ -1,51 +0,0 @@ -import anndata -from gssnng.score_cells import _proc_data -from gssnng.util import error_checking -from typing import Union - -def smooth_adata( - adata: anndata.AnnData, - groupby: Union[str, list, dict], - smooth_mode: str, - recompute_neighbors: int, - cores: int - ) -> anndata.AnnData: - - """ - nearest neighbor smoothing of the expression matrix - - :param adata - anndata.AnnData containing the cells to be scored - :param groupby - either a column label in adata.obs, and all categories taken, or a dict specifies one group. - :param smooth_mode - `adjacency` or `connectivity`, which representation of the neighborhood graph to use. - `adjacency` weights all neighbors equally, `connectivity` weights close neighbors more - :param recompute_neighbors - should neighbors be recomputed within each group, 0 for no, >0 for yes and specifies N - :param method_params - specific params for each method. 
- :param cores - number of parallel processes to work through groupby groups - - :returns: a list of adatas with smoothed data - """ - - return_data = 1 - noise_trials = 0 ### not used currently - samp_neighbors = None ### also not used - just_smoothing=1 - - error_checking(adata, samp_neighbors, recompute_neighbors, - None, None, None, method_params, just_smoothing) - - if method_params == None: - method_params = dict() - - # score each cell with the list of gene sets - data_list = _proc_data(adata, None, groupby, smooth_mode, recompute_neighbors, - None, None, samp_neighbors, - noise_trials, None, cores, return_data) - - print("**done**") - return(data_list) diff --git a/gssnng/smoothing.py b/gssnng/smoothing.py index 2c02a5c..31265ea 100644 --- a/gssnng/smoothing.py +++ b/gssnng/smoothing.py @@ -1,6 +1,11 @@ +#from gssnng.score_cells import _proc_data +import gssnng +from gssnng.util import error_checking +from typing import Union import numpy as np from scipy import sparse import logging +import anndata NN_DISTANCE_KEY = 'distances' # scanpy names in .obsp @@ -10,6 +15,58 @@ # multiplying should leave a "one-vector" still sum to one +# returns a list of adatas, each with a nearest neighbor smoothed expression matrix +def smooth_adata( + adata: anndata.AnnData, + groupby: Union[str, list, dict], + smooth_mode: str, + recompute_neighbors: int, + cores: int + ) -> anndata.AnnData: + + """ + returns a list of adatas, each with a nearest neighbor smoothed expression matrix + + :param adata + anndata.AnnData containing the cells to be scored + :param groupby + either a column label in adata.obs, and all categories taken, or a dict specifies one group. + :param smooth_mode + `adjacency` or `connectivity`, which representation of the neighborhood graph to use. 
+ `adjacency` weights all neighbors equally, `connectivity` weights close neighbors more + :param recompute_neighbors + should neighbors be recomputed within each group, 0 for no, >0 for yes and specifies N + :param method_params + specific params for each method. + :param cores + number of parallel processes to work through groupby groups + + :returns: a list of adatas with smoothed data + """ + + return_data = 1 + noise_trials = 0 ### not used currently + samp_neighbors = None ### also not used + just_smoothing=1 + + # no params for now + method_params = dict() + + error_checking(adata, samp_neighbors, recompute_neighbors, + None, None, None, method_params, just_smoothing) + + + # score each cell with the list of gene sets + data_list = gssnng.score_cells._proc_data(adata, None, groupby, smooth_mode, recompute_neighbors, + None, method_params, samp_neighbors, + noise_trials, None, cores, return_data) + + print("**done**") + return(data_list) + + + + def get_smoothing_matrix(adata, mode, add_diag): """ using the nearest neighbor graph in adata.obsp, calculate the smoothing diff --git a/gssnng/test/example_smoothing_counts.py b/gssnng/test/example_smoothing_counts.py index 4f4a529..3ebc929 100644 --- a/gssnng/test/example_smoothing_counts.py +++ b/gssnng/test/example_smoothing_counts.py @@ -1,28 +1,30 @@ -if __name__ == '__main__': +from gssnng import smoothing +import scanpy as sc +import time - from gssnng import nnsmooth - import scanpy as sc - import time +if __name__ == '__main__': print("reading data") q = sc.datasets.pbmc3k_processed() t0 = time.time() - print('start time: ' + str(t0)) + print('starting the smOOthing') - print("scoring cells") - q_list = smooth_anndatas.smooth_anndata( + q_list = smoothing.smooth_adata( adata=q, groupby='louvain', smooth_mode='connectivity', - recompute_neighbors=0, + recompute_neighbors=11, cores=8 ) t1 = time.time() print("Adata List with SMooTHed counts.") + print("Each is a tuple with groupby category and adata as 
elements.") print(len(q_list)) + for qi in q_list: + print(qi[1] + " X size: " + str(qi[0].X.shape)) print('end time: ' + str(t1)) print('TOTAL TIME: ' + str(t1-t0))