Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Edit findclassygenes #5

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
47 changes: 46 additions & 1 deletion pySingleCellNet/scn_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def scn_train(aTrain,dLevel,nTopGenes = 100,nTopGenePairs = 100,nRand = 100, nTr
pdTrain= query_transform(expRaw.loc[:,cgenesA], xpairs)
print("Finished pair transforming the data\n")
tspRF=sc_makeClassifier(pdTrain.loc[:, xpairs], genes=xpairs, groups=grps, nRand = nRand, ntrees = nTrees, stratify=stratify)
return [cgenesA, xpairs, tspRF]
return [cgenesA, xpairs, tspRF, cgenes_list]

def scn_classify(adata, cgenes, xpairs, rf_tsp, nrand = 0 ):
classRes = scn_predict(cgenes, xpairs, rf_tsp, adata, nrand = nrand)
Expand Down Expand Up @@ -118,3 +118,48 @@ def rf_classPredict(rfObj,expQuery,numRand=50):
expQuery=pd.concat([expQuery, randDat])
xpreds= pd.DataFrame(rfObj.predict_proba(expQuery), columns= rfObj.classes_, index=expQuery.index)
return xpreds

def scn_train_edit(aTrain,dLevel,nTopGenes = 100,nTopGenePairs = 100,nRand = 100, nTrees = 1000,stratify=False,counts_per_cell_after=1e4, scaleMax=10, limitToHVG=True, normalization = True, include_all_genes = False):
warnings.filterwarnings('ignore')
stTrain= aTrain.obs

expRaw = aTrain.to_df()
expRaw = expRaw.loc[stTrain.index.values]

adNorm = aTrain.copy()
if normalization == True:
sc.pp.normalize_per_cell(adNorm, counts_per_cell_after=counts_per_cell_after)
sc.pp.log1p(adNorm)

print("HVG")
if limitToHVG:
sc.pp.highly_variable_genes(adNorm, min_mean=0.0125, max_mean=4, min_disp=0.5)
adNorm = adNorm[:, adNorm.var.highly_variable]

sc.pp.scale(adNorm, max_value=scaleMax)

expTnorm = adNorm.to_df()
expTnorm=expTnorm.loc[stTrain.index.values]

### expTnorm= pd.DataFrame(data=aTrain.X, index= aTrain.obs.index.values, columns= aTrain.var.index.values)
### expTnorm=expTnorm.loc[stTrain.index.values]
print("Matrix normalized")
### cgenesA, grps, cgenes_list =findClassyGenes(expTnorm,stTrain, dLevel = dLevel, topX = nTopGenes)
if include_all_genes == False:
cgenesA, grps, cgenes_list =findClassyGenes_edit(adNorm, dLevel = dLevel, topX = nTopGenes)
else:
cgenesA = np.array(aTrain.var.index)
grps = aTrain.obs[dLevel]
cgenes_list = dict()
for g in np.unique(grps):
cgenes_list[g] = cgenesA

print("There are ", len(cgenesA), " classification genes\n")
### xpairs= ptGetTop(expTnorm.loc[:,cgenesA], grps, cgenes_list, topX=nTopGenePairs, sliceSize=5000)
xpairs= ptGetTop(expTnorm.loc[:,cgenesA], grps, cgenes_list, topX=nTopGenePairs, sliceSize=5000)

print("There are", len(xpairs), "top gene pairs\n")
pdTrain= query_transform(expRaw.loc[:,cgenesA], xpairs)
print("Finished pair transforming the data\n")
tspRF=sc_makeClassifier(pdTrain.loc[:, xpairs], genes=xpairs, groups=grps, nRand = nRand, ntrees = nTrees, stratify=stratify)
return [cgenesA, xpairs, tspRF, cgenes_list]
20 changes: 19 additions & 1 deletion pySingleCellNet/tsp_rf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pandas as pd
import numpy as np
import scanpy as sc
from sklearn import linear_model
from itertools import combinations
from .stats import *
Expand Down Expand Up @@ -223,4 +224,21 @@ def findClassyGenes(expDat, sampTab,dLevel, topX=25, dThresh=0, alpha1=0.05,alph
cgenes2=np.unique(np.array(res).flatten())
return [cgenes2, grps, cgenes]


def findClassyGenes_edit(adDat, dLevel, topX=25):
adTemp = adDat.copy()
grps = adDat.obs[dLevel]
groups = np.unique(grps)

sc.tl.rank_genes_groups(adTemp, dLevel, use_raw=False, method='wilcoxon')
tempTab = pd.DataFrame(adTemp.uns['rank_genes_groups']['names']).head(topX)

res = []
cgenes = {}

for g in groups:
temp = tempTab[g]
res.append(temp)
cgenes[g] = temp.to_numpy()
cgenes2 = np.unique(np.array(res).flatten())

return [cgenes2, grps, cgenes]