-
Notifications
You must be signed in to change notification settings - Fork 1
/
CV_H3N2.R
122 lines (112 loc) · 4.74 KB
/
CV_H3N2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
source("~/CrossValidation.R")
load("~/flutree2018-5.Rdata")
tree=flutree
allHeights=node.depth.edgelength(tree)
allD=allDescendants(tree)
df=read.csv("~/df_2018-5.csv",sep= ",",header=T,stringsAsFactors=FALSE)
df=df[,2:ncol(df)]
names(df)=c("Clade","numberTipsClade","numberTipsTrimmed","sackin","colless","Variance","I2","B1","B2",
"avgLadder","ILnumber","pitchforks","maxHeight","MaxWidth","DelW","Stairs1","Stairs2","Cherries",
"BS","descinm","getstattest","skewness","kurtosis","MeanPairwiseDist","MaxPairwiseDist", "diameter",
"WienerIndex", "betweenness", "closeness", "eigenvector","MeadianEp","MaxEp","MeanEp",
"numberTipsTrimmed_3.4","Labels")
ind=which(df$Labels <= 1.1)
length(ind)
df$Labels[ind]=0
df$Labels[-ind]=1
round(length(which(df$Labels==1))/nrow(df),1)
#find the clades which their root is in the last 3.4 years of the tree and their labels are 0
#we are not sure about the labels of these clades since they do not have enough time to growth
res=numeric()
for(i in 1:nrow(df)){
if(df$Labels[i]==0){
print(i)
root=df$Clade[i]
if( max(allHeights)-allHeights[root]<=3.4){res=c(res,i)}
}
}
df=df[-res,]
Clades=df$Clade
df=df[,3:ncol(df)]
df=df[,-32]
ind=which(df$Labels==1)
df$Labels[ind]=0
df$Labels[-ind]=1
#================================================================================================
#================================================================================================
kernel="linear"
r=numeric()
data=Preparedf(df)
folds <- cut(seq(1,nrow(data)),breaks=10,labels=FALSE)
for(cost in c(2^c(-7:7))){
for(gamma in c(2^c(-7:7))){
result_i=numeric()
for(j in 1:10){
print(j)
testIndexes <- which(folds==j,arr.ind=TRUE)
test <- data[testIndexes,]
train <- data[-testIndexes, ]
print(gamma)
print(cost)
svm.fit = svm(data = train, Labels ~ .,
kernel =kernel, degree = 3, gamma = gamma,
coef0 = 0, cost =cost, nu = 0.5, class.weigth=c("0"=0.5,"1"=0.5))
svm.prob <- predict(svm.fit, newdata = test)
agreement <- svm.prob == test$Labels
acc=length(which(svm.prob == test$Labels))/length(test$Labels)
svmmodel.predict<-predict(svm.fit, newdata = test,decision.values=TRUE)
svmmodel.probs<-attr(svmmodel.predict,"decision.values")
svmmodel.class<-predict(svm.fit,test,type="class")
svmmodel.labels<-test$Labels
#roc analysis for test data
svmmodel.prediction<-prediction(svmmodel.probs,svmmodel.labels)
svmmodel.performance<-performance(svmmodel.prediction,"tpr","fpr")
svmmodel.auc<-performance(svmmodel.prediction,"auc")@y.values[[1]]
svmmodel.auc
res=c(acc,svmmodel.auc)
result_i=rbind(result_i,res)
}
r=rbind(r,c(mean(result_i[,1]),mean(result_i[,2]),gamma,cost))
}
}
#================================================================================================
#compute the prediction label for each clade in cross validation
kernel="linear"
r=numeric()
set.seed(123)
data=Preparedf(df)
data_clade=data[,1]
folds <- cut(seq(1,nrow(data)),breaks=10,labels=FALSE)
col=c("red","blue","green","yellow","violet","tomato","orange3","magenta","gray64","goldenrod4")
result_i=numeric()
labels=numeric()
for(j in 1:10){
print(j)
testIndexes <- which(folds==j,arr.ind=TRUE)
test <- data[testIndexes,2:ncol(data) ]
train <- data[-testIndexes,2:ncol(data) ]
gamma=0.03125
cost=64
svm.fit = svm(data = train, Labels ~ .,
kernel =kernel, degree = 3, gamma = gamma,
coef0 = 0, cost =cost, nu = 0.5, class.weigth=c("0"=0.5,"1"=0.5))
svm.prob <- predict(svm.fit, newdata = test)
agreement <- svm.prob == test$Labels
acc=length(which(svm.prob == test$Labels))/length(test$Labels)
svmmodel.predict<-predict(svm.fit, newdata = test,decision.values=TRUE)
svmmodel.probs<-attr(svmmodel.predict,"decision.values")
svmmodel.class<-predict(svm.fit,test,type="class")
svmmodel.labels<-test$Labels
#roc analysis for test data
svmmodel.prediction<-prediction(svmmodel.probs,svmmodel.labels)
svmmodel.performance<-performance(svmmodel.prediction,"tpr","fpr")
svmmodel.auc<-performance(svmmodel.prediction,"auc")@y.values[[1]]
svmmodel.auc
plot(svmmodel.performance,type="l", col=col[j])
par(new=TRUE)
res=c(acc,svmmodel.auc)
result_i=rbind(result_i,res)
labels=c(labels,svm.prob )
}
legend("bottomright", legend=c("fold1:0.73","fold2: 0.78","fold3: 0.89","fold4: 0.84","fold5: 0.90","fold6:0.80","fold7:0.90","fold8:0.80",
"fold9: 0.74","fold10: 0.77"), fill=c("red","blue","green","yellow","violet","tomato","orange3","magenta","gray64","goldenrod4"), cex=0.5)