From 1220307c0559d27b7c915c8a63257b3288ff5c3a Mon Sep 17 00:00:00 2001
From: RainHan2001 <dajewxeg@gmail.com>
Date: Sat, 13 Aug 2022 01:15:45 +0800
Subject: [PATCH 01/10] Enable limitToHVG by default in scn_train

---
 pySingleCellNet/scn_train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pySingleCellNet/scn_train.py b/pySingleCellNet/scn_train.py
index daf1763..7444cca 100644
--- a/pySingleCellNet/scn_train.py
+++ b/pySingleCellNet/scn_train.py
@@ -35,7 +35,7 @@ def sc_makeClassifier(expTrain, genes, groups, nRand=70, ntrees=2000, stratify=F
     clf.fit(expT.loc[:,ggenes].to_numpy(), ggroups)
     return clf
 
-def scn_train(aTrain,dLevel,nTopGenes = 100,nTopGenePairs = 100,nRand = 100, nTrees = 1000,stratify=False,counts_per_cell_after=1e4, scaleMax=10, limitToHVG=False, normalization = True, include_all_genes = False):
+def scn_train(aTrain,dLevel,nTopGenes = 100,nTopGenePairs = 100,nRand = 100, nTrees = 1000,stratify=False,counts_per_cell_after=1e4, scaleMax=10, limitToHVG=True, normalization = True, include_all_genes = False):
     warnings.filterwarnings('ignore')
     stTrain= aTrain.obs
     

From 7b0cfc9e32cfb68278373fad9a0fbe0300374ac1 Mon Sep 17 00:00:00 2001
From: RainHan2001 <dajewxeg@gmail.com>
Date: Sat, 13 Aug 2022 01:20:20 +0800
Subject: [PATCH 02/10] Add cgenes_list return

---
 pySingleCellNet/scn_train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pySingleCellNet/scn_train.py b/pySingleCellNet/scn_train.py
index 7444cca..d3b6716 100644
--- a/pySingleCellNet/scn_train.py
+++ b/pySingleCellNet/scn_train.py
@@ -78,7 +78,7 @@ def scn_train(aTrain,dLevel,nTopGenes = 100,nTopGenePairs = 100,nRand = 100, nTr
     pdTrain= query_transform(expRaw.loc[:,cgenesA], xpairs)
     print("Finished pair transforming the data\n")
     tspRF=sc_makeClassifier(pdTrain.loc[:, xpairs], genes=xpairs, groups=grps, nRand = nRand, ntrees = nTrees, stratify=stratify)
-    return [cgenesA, xpairs, tspRF]
+    return [cgenesA, xpairs, tspRF, cgenes_list]
 
 def scn_classify(adata, cgenes, xpairs, rf_tsp, nrand = 0 ):
     classRes = scn_predict(cgenes, xpairs, rf_tsp, adata, nrand = nrand)

From 3af1db96e024facb5852607eaf0ad900b6916227 Mon Sep 17 00:00:00 2001
From: RainHan2001 <dajewxeg@gmail.com>
Date: Wed, 17 Aug 2022 11:32:27 +0800
Subject: [PATCH 03/10] Strip trailing blank lines at end of tsp_rf.py

---
 pySingleCellNet/tsp_rf.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pySingleCellNet/tsp_rf.py b/pySingleCellNet/tsp_rf.py
index 5fbcd62..462d228 100644
--- a/pySingleCellNet/tsp_rf.py
+++ b/pySingleCellNet/tsp_rf.py
@@ -221,6 +221,4 @@ def findClassyGenes(expDat, sampTab,dLevel, topX=25, dThresh=0, alpha1=0.05,alph
         cgenes[g]=temp
         res.append(temp)
     cgenes2=np.unique(np.array(res).flatten())
-    return [cgenes2, grps, cgenes]
-
-    
+    return [cgenes2, grps, cgenes]
\ No newline at end of file

From efab8664d62b76faab186eb5cf014e4d0a02499b Mon Sep 17 00:00:00 2001
From: RainHan2001 <dajewxeg@gmail.com>
Date: Fri, 19 Aug 2022 02:10:00 +0800
Subject: [PATCH 04/10] Add scn_train_edit and findClassyGenes_edit

---
 pySingleCellNet/scn_train.py | 45 ++++++++++++++++++++++++++++++++++++
 pySingleCellNet/tsp_rf.py    | 22 +++++++++++++++++-
 2 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/pySingleCellNet/scn_train.py b/pySingleCellNet/scn_train.py
index d3b6716..c191331 100644
--- a/pySingleCellNet/scn_train.py
+++ b/pySingleCellNet/scn_train.py
@@ -118,3 +118,48 @@ def rf_classPredict(rfObj,expQuery,numRand=50):
         expQuery=pd.concat([expQuery, randDat])
     xpreds= pd.DataFrame(rfObj.predict_proba(expQuery), columns= rfObj.classes_, index=expQuery.index)
     return xpreds
+
+def scn_train_edit(aTrain,dLevel,nTopGenes = 100,nTopGenePairs = 100,nRand = 100, nTrees = 1000,stratify=False,counts_per_cell_after=1e4, scaleMax=10, limitToHVG=True, normalization = True, include_all_genes = False):
+    warnings.filterwarnings('ignore')
+    stTrain= aTrain.obs
+    
+    expRaw = aTrain.to_df()
+    expRaw = expRaw.loc[stTrain.index.values]
+
+    adNorm = aTrain.copy()
+    if normalization == True:
+        sc.pp.normalize_per_cell(adNorm, counts_per_cell_after=counts_per_cell_after)
+        sc.pp.log1p(adNorm)
+
+        print("HVG")
+        if limitToHVG:
+            sc.pp.highly_variable_genes(adNorm, min_mean=0.0125, max_mean=4, min_disp=0.5)
+            adNorm = adNorm[:, adNorm.var.highly_variable]
+
+        sc.pp.scale(adNorm, max_value=scaleMax)
+
+    expTnorm = adNorm.to_df()
+    expTnorm=expTnorm.loc[stTrain.index.values]
+
+    ### expTnorm= pd.DataFrame(data=aTrain.X,  index= aTrain.obs.index.values, columns= aTrain.var.index.values)
+    ### expTnorm=expTnorm.loc[stTrain.index.values]
+    print("Matrix normalized")
+    ### cgenesA, grps, cgenes_list =findClassyGenes(expTnorm,stTrain, dLevel = dLevel, topX = nTopGenes)
+    if include_all_genes == False:
+        cgenesA, grps, cgenes_list =findClassyGenes_edit(adNorm, dLevel = dLevel, topX = nTopGenes)
+    else: 
+        cgenesA = np.array(aTrain.var.index)
+        grps = aTrain.obs[dLevel]
+        cgenes_list = dict()
+        for g in np.unique(grps):
+            cgenes_list[g] = cgenesA
+
+    print("There are ", len(cgenesA), " classification genes\n")
+    ### xpairs= ptGetTop(expTnorm.loc[:,cgenesA], grps, cgenes_list, topX=nTopGenePairs, sliceSize=5000)
+    xpairs= ptGetTop(expTnorm.loc[:,cgenesA], grps, cgenes_list, topX=nTopGenePairs, sliceSize=5000)
+
+    print("There are", len(xpairs), "top gene pairs\n")
+    pdTrain= query_transform(expRaw.loc[:,cgenesA], xpairs)
+    print("Finished pair transforming the data\n")
+    tspRF=sc_makeClassifier(pdTrain.loc[:, xpairs], genes=xpairs, groups=grps, nRand = nRand, ntrees = nTrees, stratify=stratify)
+    return [cgenesA, xpairs, tspRF, cgenes_list]
\ No newline at end of file
diff --git a/pySingleCellNet/tsp_rf.py b/pySingleCellNet/tsp_rf.py
index 462d228..860bcbb 100644
--- a/pySingleCellNet/tsp_rf.py
+++ b/pySingleCellNet/tsp_rf.py
@@ -1,5 +1,6 @@
 import pandas as pd
 import numpy as np
+import scanpy as sc
 from sklearn import linear_model
 from itertools import combinations
 from .stats import * 
@@ -221,4 +222,23 @@ def findClassyGenes(expDat, sampTab,dLevel, topX=25, dThresh=0, alpha1=0.05,alph
         cgenes[g]=temp
         res.append(temp)
     cgenes2=np.unique(np.array(res).flatten())
-    return [cgenes2, grps, cgenes]
\ No newline at end of file
+    return [cgenes2, grps, cgenes]
+
+def findClassyGenes_edit(adDat, dLevel, topX=25):
+    adTemp = adDat.copy()
+    grps = adDat.obs[dLevel]
+    groups = np.unique(grps)
+
+    sc.tl.rank_genes_groups(adTemp, dLevel, method='wilcoxon')
+    tempTab = pd.DataFrame(adTemp.uns['rank_genes_groups']['names']).head(topX)
+
+    res = []
+    cgenes = {}
+
+    for g in groups:
+        temp = tempTab[g]
+        res.append(temp)
+        cgenes[g] = temp.to_numpy()
+    cgenes2 = np.unique(np.array(res).flatten())
+
+    return [cgenes2, grps, cgenes]

From 88f6d027a2997eb07b2e474a7f6645f43c084ac1 Mon Sep 17 00:00:00 2001
From: RainHan2001 <dajewxeg@gmail.com>
Date: Fri, 19 Aug 2022 22:09:13 +0800
Subject: [PATCH 05/10] Add a message

---
 pySingleCellNet/tsp_rf.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pySingleCellNet/tsp_rf.py b/pySingleCellNet/tsp_rf.py
index 860bcbb..5ef004e 100644
--- a/pySingleCellNet/tsp_rf.py
+++ b/pySingleCellNet/tsp_rf.py
@@ -241,4 +241,7 @@ def findClassyGenes_edit(adDat, dLevel, topX=25):
         cgenes[g] = temp.to_numpy()
     cgenes2 = np.unique(np.array(res).flatten())
 
-    return [cgenes2, grps, cgenes]
+    print('new functionality run')
+    
+
+    return [cgenes2, grps, cgenes]
\ No newline at end of file

From 8dbb3b6364da8f4bcbcfdf1b0369736c09e14a3c Mon Sep 17 00:00:00 2001
From: RainHan2001 <dajewxeg@gmail.com>
Date: Fri, 19 Aug 2022 22:19:52 +0800
Subject: [PATCH 06/10] Remove whitespace-only line in findClassyGenes_edit

---
 pySingleCellNet/tsp_rf.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pySingleCellNet/tsp_rf.py b/pySingleCellNet/tsp_rf.py
index 5ef004e..182262f 100644
--- a/pySingleCellNet/tsp_rf.py
+++ b/pySingleCellNet/tsp_rf.py
@@ -242,6 +242,5 @@ def findClassyGenes_edit(adDat, dLevel, topX=25):
     cgenes2 = np.unique(np.array(res).flatten())
 
     print('new functionality run')
-    
 
     return [cgenes2, grps, cgenes]
\ No newline at end of file

From 5a64b9e0a9d0e9d95d513866beaee4788314e2eb Mon Sep 17 00:00:00 2001
From: RainHan2001 <dajewxeg@gmail.com>
Date: Fri, 19 Aug 2022 22:23:50 +0800
Subject: [PATCH 07/10] Remove debug print from findClassyGenes_edit

---
 pySingleCellNet/tsp_rf.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pySingleCellNet/tsp_rf.py b/pySingleCellNet/tsp_rf.py
index 182262f..8769659 100644
--- a/pySingleCellNet/tsp_rf.py
+++ b/pySingleCellNet/tsp_rf.py
@@ -241,6 +241,4 @@ def findClassyGenes_edit(adDat, dLevel, topX=25):
         cgenes[g] = temp.to_numpy()
     cgenes2 = np.unique(np.array(res).flatten())
 
-    print('new functionality run')
-
     return [cgenes2, grps, cgenes]
\ No newline at end of file

From b42f768cb51593cc97bce219ccf91aca9ca68bfe Mon Sep 17 00:00:00 2001
From: RainHan2001 <dajewxeg@gmail.com>
Date: Sat, 20 Aug 2022 01:49:22 +0800
Subject: [PATCH 08/10] Do not use raw data for rank_genes_groups

---
 pySingleCellNet/tsp_rf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pySingleCellNet/tsp_rf.py b/pySingleCellNet/tsp_rf.py
index 8769659..dcf244f 100644
--- a/pySingleCellNet/tsp_rf.py
+++ b/pySingleCellNet/tsp_rf.py
@@ -229,7 +229,7 @@ def findClassyGenes_edit(adDat, dLevel, topX=25):
     grps = adDat.obs[dLevel]
     groups = np.unique(grps)
 
-    sc.tl.rank_genes_groups(adTemp, dLevel, method='wilcoxon')
+    sc.tl.rank_genes_groups(adTemp, dLevel, use_raw=False, method='wilcoxon')
     tempTab = pd.DataFrame(adTemp.uns['rank_genes_groups']['names']).head(topX)
 
     res = []

From 9fa2f5cb7dde905a9f77ab25480acf932e69db11 Mon Sep 17 00:00:00 2001
From: RainHan2001 <dajewxeg@gmail.com>
Date: Sat, 20 Aug 2022 01:52:50 +0800
Subject: [PATCH 09/10] Whitespace-only change in findClassyGenes_edit

---
 pySingleCellNet/tsp_rf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pySingleCellNet/tsp_rf.py b/pySingleCellNet/tsp_rf.py
index dcf244f..39f529c 100644
--- a/pySingleCellNet/tsp_rf.py
+++ b/pySingleCellNet/tsp_rf.py
@@ -236,7 +236,7 @@ def findClassyGenes_edit(adDat, dLevel, topX=25):
     cgenes = {}
 
     for g in groups:
-        temp = tempTab[g]
+        temp = tempTab[g] 
         res.append(temp)
         cgenes[g] = temp.to_numpy()
     cgenes2 = np.unique(np.array(res).flatten())

From e1d31f241ddc5047b9e9511fa973a27201bfb440 Mon Sep 17 00:00:00 2001
From: RainHan2001 <110891334+RainHan2001@users.noreply.github.com>
Date: Tue, 23 Aug 2022 02:43:11 +0800
Subject: [PATCH 10/10] Revert scn_train limitToHVG default to False; add newline at end of scn_train.py

---
 pySingleCellNet/scn_train.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pySingleCellNet/scn_train.py b/pySingleCellNet/scn_train.py
index c191331..4c2e9d8 100644
--- a/pySingleCellNet/scn_train.py
+++ b/pySingleCellNet/scn_train.py
@@ -35,7 +35,7 @@ def sc_makeClassifier(expTrain, genes, groups, nRand=70, ntrees=2000, stratify=F
     clf.fit(expT.loc[:,ggenes].to_numpy(), ggroups)
     return clf
 
-def scn_train(aTrain,dLevel,nTopGenes = 100,nTopGenePairs = 100,nRand = 100, nTrees = 1000,stratify=False,counts_per_cell_after=1e4, scaleMax=10, limitToHVG=True, normalization = True, include_all_genes = False):
+def scn_train(aTrain,dLevel,nTopGenes = 100,nTopGenePairs = 100,nRand = 100, nTrees = 1000,stratify=False,counts_per_cell_after=1e4, scaleMax=10, limitToHVG=False, normalization = True, include_all_genes = False):
     warnings.filterwarnings('ignore')
     stTrain= aTrain.obs
     
@@ -162,4 +162,4 @@ def scn_train_edit(aTrain,dLevel,nTopGenes = 100,nTopGenePairs = 100,nRand = 100
     pdTrain= query_transform(expRaw.loc[:,cgenesA], xpairs)
     print("Finished pair transforming the data\n")
     tspRF=sc_makeClassifier(pdTrain.loc[:, xpairs], genes=xpairs, groups=grps, nRand = nRand, ntrees = nTrees, stratify=stratify)
-    return [cgenesA, xpairs, tspRF, cgenes_list]
\ No newline at end of file
+    return [cgenesA, xpairs, tspRF, cgenes_list]