CahanLab · RainHan2001 · Aug 12, 2022 · Aug 12, 2022 · Aug 17, 2022 · Aug 18, 2022
diff --git a/pySingleCellNet/scn_train.py b/pySingleCellNet/scn_train.py
@@ -78,7 +78,7 @@ def scn_train(aTrain,dLevel,nTopGenes = 100,nTopGenePairs = 100,nRand = 100, nTr
     pdTrain= query_transform(expRaw.loc[:,cgenesA], xpairs)
     print("Finished pair transforming the data\n")
     tspRF=sc_makeClassifier(pdTrain.loc[:, xpairs], genes=xpairs, groups=grps, nRand = nRand, ntrees = nTrees, stratify=stratify)
-    return [cgenesA, xpairs, tspRF]
+    return [cgenesA, xpairs, tspRF, cgenes_list]
 
 def scn_classify(adata, cgenes, xpairs, rf_tsp, nrand = 0 ):
     classRes = scn_predict(cgenes, xpairs, rf_tsp, adata, nrand = nrand)
@@ -118,3 +118,48 @@ def rf_classPredict(rfObj,expQuery,numRand=50):
         expQuery=pd.concat([expQuery, randDat])
     xpreds= pd.DataFrame(rfObj.predict_proba(expQuery), columns= rfObj.classes_, index=expQuery.index)
     return xpreds
+
+def scn_train_edit(aTrain,dLevel,nTopGenes = 100,nTopGenePairs = 100,nRand = 100, nTrees = 1000,stratify=False,counts_per_cell_after=1e4, scaleMax=10, limitToHVG=True, normalization = True, include_all_genes = False):
+    warnings.filterwarnings('ignore')
+    stTrain= aTrain.obs
+
+    expRaw = aTrain.to_df()
+    expRaw = expRaw.loc[stTrain.index.values]
+
+    adNorm = aTrain.copy()
+    if normalization == True:
+        sc.pp.normalize_per_cell(adNorm, counts_per_cell_after=counts_per_cell_after)
+        sc.pp.log1p(adNorm)
+
+        print("HVG")
+        if limitToHVG:
+            sc.pp.highly_variable_genes(adNorm, min_mean=0.0125, max_mean=4, min_disp=0.5)
+            adNorm = adNorm[:, adNorm.var.highly_variable]
+
+        sc.pp.scale(adNorm, max_value=scaleMax)
+
+    expTnorm = adNorm.to_df()
+    expTnorm=expTnorm.loc[stTrain.index.values]
+
+    ### expTnorm= pd.DataFrame(data=aTrain.X,  index= aTrain.obs.index.values, columns= aTrain.var.index.values)
+    ### expTnorm=expTnorm.loc[stTrain.index.values]
+    print("Matrix normalized")
+    ### cgenesA, grps, cgenes_list =findClassyGenes(expTnorm,stTrain, dLevel = dLevel, topX = nTopGenes)
+    if include_all_genes == False:
+        cgenesA, grps, cgenes_list =findClassyGenes_edit(adNorm, dLevel = dLevel, topX = nTopGenes)
+    else: 
+        cgenesA = np.array(aTrain.var.index)
+        grps = aTrain.obs[dLevel]
+        cgenes_list = dict()
+        for g in np.unique(grps):
+            cgenes_list[g] = cgenesA
+
+    print("There are ", len(cgenesA), " classification genes\n")
+    ### xpairs= ptGetTop(expTnorm.loc[:,cgenesA], grps, cgenes_list, topX=nTopGenePairs, sliceSize=5000)
+    xpairs= ptGetTop(expTnorm.loc[:,cgenesA], grps, cgenes_list, topX=nTopGenePairs, sliceSize=5000)
+
+    print("There are", len(xpairs), "top gene pairs\n")
+    pdTrain= query_transform(expRaw.loc[:,cgenesA], xpairs)
+    print("Finished pair transforming the data\n")
+    tspRF=sc_makeClassifier(pdTrain.loc[:, xpairs], genes=xpairs, groups=grps, nRand = nRand, ntrees = nTrees, stratify=stratify)
+    return [cgenesA, xpairs, tspRF, cgenes_list]
diff --git a/pySingleCellNet/tsp_rf.py b/pySingleCellNet/tsp_rf.py
@@ -1,5 +1,6 @@
 import pandas as pd
 import numpy as np
+import scanpy as sc
 from sklearn import linear_model
 from itertools import combinations
 from .stats import * 
@@ -223,4 +224,21 @@ def findClassyGenes(expDat, sampTab,dLevel, topX=25, dThresh=0, alpha1=0.05,alph
     cgenes2=np.unique(np.array(res).flatten())
     return [cgenes2, grps, cgenes]
 
-
+def findClassyGenes_edit(adDat, dLevel, topX=25):
+    adTemp = adDat.copy()
+    grps = adDat.obs[dLevel]
+    groups = np.unique(grps)
+
+    sc.tl.rank_genes_groups(adTemp, dLevel, use_raw=False, method='wilcoxon')
+    tempTab = pd.DataFrame(adTemp.uns['rank_genes_groups']['names']).head(topX)
+
+    res = []
+    cgenes = {}
+
+    for g in groups:
+        temp = tempTab[g] 
+        res.append(temp)
+        cgenes[g] = temp.to_numpy()
+    cgenes2 = np.unique(np.array(res).flatten())
+
+    return [cgenes2, grps, cgenes]