From 1220307c0559d27b7c915c8a63257b3288ff5c3a Mon Sep 17 00:00:00 2001 From: RainHan2001 Date: Sat, 13 Aug 2022 01:15:45 +0800 Subject: [PATCH 01/10] Default Normalization and HVG --- pySingleCellNet/scn_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pySingleCellNet/scn_train.py b/pySingleCellNet/scn_train.py index daf1763..7444cca 100644 --- a/pySingleCellNet/scn_train.py +++ b/pySingleCellNet/scn_train.py @@ -35,7 +35,7 @@ def sc_makeClassifier(expTrain, genes, groups, nRand=70, ntrees=2000, stratify=F clf.fit(expT.loc[:,ggenes].to_numpy(), ggroups) return clf -def scn_train(aTrain,dLevel,nTopGenes = 100,nTopGenePairs = 100,nRand = 100, nTrees = 1000,stratify=False,counts_per_cell_after=1e4, scaleMax=10, limitToHVG=False, normalization = True, include_all_genes = False): +def scn_train(aTrain,dLevel,nTopGenes = 100,nTopGenePairs = 100,nRand = 100, nTrees = 1000,stratify=False,counts_per_cell_after=1e4, scaleMax=10, limitToHVG=True, normalization = True, include_all_genes = False): warnings.filterwarnings('ignore') stTrain= aTrain.obs From 7b0cfc9e32cfb68278373fad9a0fbe0300374ac1 Mon Sep 17 00:00:00 2001 From: RainHan2001 Date: Sat, 13 Aug 2022 01:20:20 +0800 Subject: [PATCH 02/10] Add cgenes_list return --- pySingleCellNet/scn_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pySingleCellNet/scn_train.py b/pySingleCellNet/scn_train.py index 7444cca..d3b6716 100644 --- a/pySingleCellNet/scn_train.py +++ b/pySingleCellNet/scn_train.py @@ -78,7 +78,7 @@ def scn_train(aTrain,dLevel,nTopGenes = 100,nTopGenePairs = 100,nRand = 100, nTr pdTrain= query_transform(expRaw.loc[:,cgenesA], xpairs) print("Finished pair transforming the data\n") tspRF=sc_makeClassifier(pdTrain.loc[:, xpairs], genes=xpairs, groups=grps, nRand = nRand, ntrees = nTrees, stratify=stratify) - return [cgenesA, xpairs, tspRF] + return [cgenesA, xpairs, tspRF, cgenes_list] def scn_classify(adata, cgenes, xpairs, rf_tsp, nrand = 0 ): classRes = scn_predict(cgenes, xpairs, rf_tsp, adata, nrand = nrand) From 3af1db96e024facb5852607eaf0ad900b6916227 Mon Sep 17 00:00:00 2001 From: RainHan2001 Date: Wed, 17 Aug 2022 11:32:27 +0800 Subject: [PATCH 03/10] Add cgenes_list return --- pySingleCellNet/tsp_rf.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pySingleCellNet/tsp_rf.py b/pySingleCellNet/tsp_rf.py index 5fbcd62..462d228 100644 --- a/pySingleCellNet/tsp_rf.py +++ b/pySingleCellNet/tsp_rf.py @@ -221,6 +221,4 @@ def findClassyGenes(expDat, sampTab,dLevel, topX=25, dThresh=0, alpha1=0.05,alph cgenes[g]=temp res.append(temp) cgenes2=np.unique(np.array(res).flatten()) - return [cgenes2, grps, cgenes] - - + return [cgenes2, grps, cgenes] \ No newline at end of file From efab8664d62b76faab186eb5cf014e4d0a02499b Mon Sep 17 00:00:00 2001 From: RainHan2001 Date: Fri, 19 Aug 2022 02:10:00 +0800 Subject: [PATCH 04/10] 'findClassyGenes_edit' --- pySingleCellNet/scn_train.py | 45 ++++++++++++++++++++++++++++++++++++ pySingleCellNet/tsp_rf.py | 22 +++++++++++++++++- 2 files changed, 66 insertions(+), 1 deletion(-) diff --git a/pySingleCellNet/scn_train.py b/pySingleCellNet/scn_train.py index d3b6716..c191331 100644 --- a/pySingleCellNet/scn_train.py +++ b/pySingleCellNet/scn_train.py @@ -118,3 +118,48 @@ def rf_classPredict(rfObj,expQuery,numRand=50): expQuery=pd.concat([expQuery, randDat]) xpreds= pd.DataFrame(rfObj.predict_proba(expQuery), columns= rfObj.classes_, index=expQuery.index) return xpreds + +def scn_train_edit(aTrain,dLevel,nTopGenes = 100,nTopGenePairs = 100,nRand = 100, nTrees = 1000,stratify=False,counts_per_cell_after=1e4, scaleMax=10, limitToHVG=True, normalization = True, include_all_genes = False): + warnings.filterwarnings('ignore') + stTrain= aTrain.obs + + expRaw = aTrain.to_df() + expRaw = expRaw.loc[stTrain.index.values] + + adNorm = aTrain.copy() + if normalization == True: + sc.pp.normalize_per_cell(adNorm, counts_per_cell_after=counts_per_cell_after) + sc.pp.log1p(adNorm) + + print("HVG") + if limitToHVG: + sc.pp.highly_variable_genes(adNorm, min_mean=0.0125, max_mean=4, min_disp=0.5) + adNorm = adNorm[:, adNorm.var.highly_variable] + + sc.pp.scale(adNorm, max_value=scaleMax) + + expTnorm = adNorm.to_df() + expTnorm=expTnorm.loc[stTrain.index.values] + + ### expTnorm= pd.DataFrame(data=aTrain.X, index= aTrain.obs.index.values, columns= aTrain.var.index.values) + ### expTnorm=expTnorm.loc[stTrain.index.values] + print("Matrix normalized") + ### cgenesA, grps, cgenes_list =findClassyGenes(expTnorm,stTrain, dLevel = dLevel, topX = nTopGenes) + if include_all_genes == False: + cgenesA, grps, cgenes_list =findClassyGenes_edit(adNorm, dLevel = dLevel, topX = nTopGenes) + else: + cgenesA = np.array(aTrain.var.index) + grps = aTrain.obs[dLevel] + cgenes_list = dict() + for g in np.unique(grps): + cgenes_list[g] = cgenesA + + print("There are ", len(cgenesA), " classification genes\n") + ### xpairs= ptGetTop(expTnorm.loc[:,cgenesA], grps, cgenes_list, topX=nTopGenePairs, sliceSize=5000) + xpairs= ptGetTop(expTnorm.loc[:,cgenesA], grps, cgenes_list, topX=nTopGenePairs, sliceSize=5000) + + print("There are", len(xpairs), "top gene pairs\n") + pdTrain= query_transform(expRaw.loc[:,cgenesA], xpairs) + print("Finished pair transforming the data\n") + tspRF=sc_makeClassifier(pdTrain.loc[:, xpairs], genes=xpairs, groups=grps, nRand = nRand, ntrees = nTrees, stratify=stratify) + return [cgenesA, xpairs, tspRF, cgenes_list] \ No newline at end of file diff --git a/pySingleCellNet/tsp_rf.py b/pySingleCellNet/tsp_rf.py index 462d228..860bcbb 100644 --- a/pySingleCellNet/tsp_rf.py +++ b/pySingleCellNet/tsp_rf.py @@ -1,5 +1,6 @@ import pandas as pd import numpy as np +import scanpy as sc from sklearn import linear_model from itertools import combinations from .stats import * @@ -221,4 +222,23 @@ def findClassyGenes(expDat, sampTab,dLevel, topX=25, dThresh=0, alpha1=0.05,alph cgenes[g]=temp res.append(temp) cgenes2=np.unique(np.array(res).flatten()) - return [cgenes2, grps, cgenes] \ No newline at end of file + return [cgenes2, grps, cgenes] + +def findClassyGenes_edit(adDat, dLevel, topX=25): + adTemp = adDat.copy() + grps = adDat.obs[dLevel] + groups = np.unique(grps) + + sc.tl.rank_genes_groups(adTemp, dLevel, method='wilcoxon') + tempTab = pd.DataFrame(adTemp.uns['rank_genes_groups']['names']).head(topX) + + res = [] + cgenes = {} + + for g in groups: + temp = tempTab[g] + res.append(temp) + cgenes[g] = temp.to_numpy() + cgenes2 = np.unique(np.array(res).flatten()) + + return [cgenes2, grps, cgenes] From 88f6d027a2997eb07b2e474a7f6645f43c084ac1 Mon Sep 17 00:00:00 2001 From: RainHan2001 Date: Fri, 19 Aug 2022 22:09:13 +0800 Subject: [PATCH 05/10] Add a message --- pySingleCellNet/tsp_rf.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pySingleCellNet/tsp_rf.py b/pySingleCellNet/tsp_rf.py index 860bcbb..5ef004e 100644 --- a/pySingleCellNet/tsp_rf.py +++ b/pySingleCellNet/tsp_rf.py @@ -241,4 +241,7 @@ def findClassyGenes_edit(adDat, dLevel, topX=25): cgenes[g] = temp.to_numpy() cgenes2 = np.unique(np.array(res).flatten()) - return [cgenes2, grps, cgenes] + print('new functionality run') + + + return [cgenes2, grps, cgenes] \ No newline at end of file From 8dbb3b6364da8f4bcbcfdf1b0369736c09e14a3c Mon Sep 17 00:00:00 2001 From: RainHan2001 Date: Fri, 19 Aug 2022 22:19:52 +0800 Subject: [PATCH 06/10] Add a message --- pySingleCellNet/tsp_rf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pySingleCellNet/tsp_rf.py b/pySingleCellNet/tsp_rf.py index 5ef004e..182262f 100644 --- a/pySingleCellNet/tsp_rf.py +++ b/pySingleCellNet/tsp_rf.py @@ -242,6 +242,5 @@ def findClassyGenes_edit(adDat, dLevel, topX=25): cgenes2 = np.unique(np.array(res).flatten()) print('new functionality run') - return [cgenes2, grps, cgenes] \ No newline at end of file From 5a64b9e0a9d0e9d95d513866beaee4788314e2eb Mon Sep 17 00:00:00 2001 From: RainHan2001 Date: Fri, 19 Aug 2022 22:23:50 +0800 Subject: [PATCH 07/10] New findClassyGenes and scn_train added --- pySingleCellNet/tsp_rf.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pySingleCellNet/tsp_rf.py b/pySingleCellNet/tsp_rf.py index 182262f..8769659 100644 --- a/pySingleCellNet/tsp_rf.py +++ b/pySingleCellNet/tsp_rf.py @@ -241,6 +241,4 @@ def findClassyGenes_edit(adDat, dLevel, topX=25): cgenes[g] = temp.to_numpy() cgenes2 = np.unique(np.array(res).flatten()) - print('new functionality run') - return [cgenes2, grps, cgenes] \ No newline at end of file From b42f768cb51593cc97bce219ccf91aca9ca68bfe Mon Sep 17 00:00:00 2001 From: RainHan2001 Date: Sat, 20 Aug 2022 01:49:22 +0800 Subject: [PATCH 08/10] Not to use raw data for rank_genes_groups --- pySingleCellNet/tsp_rf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pySingleCellNet/tsp_rf.py b/pySingleCellNet/tsp_rf.py index 8769659..dcf244f 100644 --- a/pySingleCellNet/tsp_rf.py +++ b/pySingleCellNet/tsp_rf.py @@ -229,7 +229,7 @@ def findClassyGenes_edit(adDat, dLevel, topX=25): grps = adDat.obs[dLevel] groups = np.unique(grps) - sc.tl.rank_genes_groups(adTemp, dLevel, method='wilcoxon') + sc.tl.rank_genes_groups(adTemp, dLevel, use_raw=False, method='wilcoxon') tempTab = pd.DataFrame(adTemp.uns['rank_genes_groups']['names']).head(topX) res = [] From 9fa2f5cb7dde905a9f77ab25480acf932e69db11 Mon Sep 17 00:00:00 2001 From: RainHan2001 Date: Sat, 20 Aug 2022 01:52:50 +0800 Subject: [PATCH 09/10] Not to use raw data for rank_genes_groups --- pySingleCellNet/tsp_rf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pySingleCellNet/tsp_rf.py b/pySingleCellNet/tsp_rf.py index dcf244f..39f529c 100644 --- a/pySingleCellNet/tsp_rf.py +++ b/pySingleCellNet/tsp_rf.py @@ -236,7 +236,7 @@ def findClassyGenes_edit(adDat, dLevel, topX=25): cgenes = {} for g in groups: - temp = tempTab[g] + temp = tempTab[g] res.append(temp) cgenes[g] = temp.to_numpy() cgenes2 = np.unique(np.array(res).flatten()) From e1d31f241ddc5047b9e9511fa973a27201bfb440 Mon Sep 17 00:00:00 2001 From: RainHan2001 <110891334+RainHan2001@users.noreply.github.com> Date: Tue, 23 Aug 2022 02:43:11 +0800 Subject: [PATCH 10/10] Update scn_train.py --- pySingleCellNet/scn_train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pySingleCellNet/scn_train.py b/pySingleCellNet/scn_train.py index c191331..4c2e9d8 100644 --- a/pySingleCellNet/scn_train.py +++ b/pySingleCellNet/scn_train.py @@ -35,7 +35,7 @@ def sc_makeClassifier(expTrain, genes, groups, nRand=70, ntrees=2000, stratify=F clf.fit(expT.loc[:,ggenes].to_numpy(), ggroups) return clf -def scn_train(aTrain,dLevel,nTopGenes = 100,nTopGenePairs = 100,nRand = 100, nTrees = 1000,stratify=False,counts_per_cell_after=1e4, scaleMax=10, limitToHVG=True, normalization = True, include_all_genes = False): +def scn_train(aTrain,dLevel,nTopGenes = 100,nTopGenePairs = 100,nRand = 100, nTrees = 1000,stratify=False,counts_per_cell_after=1e4, scaleMax=10, limitToHVG=False, normalization = True, include_all_genes = False): warnings.filterwarnings('ignore') stTrain= aTrain.obs @@ -162,4 +162,4 @@ def scn_train_edit(aTrain,dLevel,nTopGenes = 100,nTopGenePairs = 100,nRand = 100 pdTrain= query_transform(expRaw.loc[:,cgenesA], xpairs) print("Finished pair transforming the data\n") tspRF=sc_makeClassifier(pdTrain.loc[:, xpairs], genes=xpairs, groups=grps, nRand = nRand, ntrees = nTrees, stratify=stratify) - return [cgenesA, xpairs, tspRF, cgenes_list] \ No newline at end of file + return [cgenesA, xpairs, tspRF, cgenes_list]