-
Notifications
You must be signed in to change notification settings - Fork 0
/
RF-AUC_ROC-severity_.R
128 lines (107 loc) · 4.44 KB
/
RF-AUC_ROC-severity_.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
library(ggplot2)
library(corrplot)
library(reshape2)
library(ggthemes)
library(dplyr)
library(randomForest)
library(ROCR)
library(readxl)
library(tidyverse)
#options(expression = 1000000)
#load in file
setwd("~/Library/CloudStorage/Box-Box/Work/Lab/SARS-CoV2/SARS2-severity")
#setwd("/fslhome/bep8/fsl_groups/fslg_PickettLabGroup/RandomForest")
#load(file = "TCGAgeneMaster_RFinput.rda")
#setwd("~/Desktop/RF")
#infile <- "VAccine H+H Final March 25 numbers (David Edits)-bep.txt"
infile <- "count_matrix_human.excell.xlsx"
mydata <- read_excel(path = infile)
##concatenate row1 w/ colnames
mydata1 <- colnames(mydata)
mydata1 <- c("SRR_ID",mydata1)
mydata1 <- gsub("-","_",mydata1)
mydata1 <- as.character(mydata1)
colnames(mydata) <- mydata1
#mydata <- mydata[-c(1,2),]
#mydata <- mydata[-1,]
rm(mydata1)
mydata <- mydata[,c(ncol(mydata),1:(ncol(mydata)-1))] #move phenotype column to first
colnames(mydata)[1] <- "Type"
mydata.narf <- as.data.frame(subset(mydata, Type!="NA"))
mydata.narf$"44256" <- NULL #number in colname
mydata.narf$"44257" <- NULL #number in colname
#mydata.narf[,1366] <- NULL #breaks rf
#mydata.narf$"BMP15" <- NULL #NaN
mydata.narf$SRR_ID <- NULL
#mydata.narf1 <- t(na.omit(t(mydata.narf)))
#mydata.narf1 <- as.data.frame(mydata.narf1)
mydata.narf$Type <- gsub("low","l",mydata.narf$Type)
mydata.narf$Type <- gsub("high","h",mydata.narf$Type)
mytemp <- as.vector(mydata.narf$Type)
mydata.narf <- as.data.frame(scale(mydata.narf[2:ncol(mydata.narf)]))
mydata.narf <- cbind(mytemp,mydata.narf)
colnames(mydata.narf)[1] <- "Type"
#remove columns with either NA or NaN anywhere in the column
mydata.narf <- mydata.narf[ , colSums(is.na(mydata.narf)) == 0]
mydata.narf <- mydata.narf[ , colSums(is.nan(mydata.narf)) == 0]
#colnames(mydata) <- gsub("[?|.| |-|/|'|(|)|’|,|+]","_",colnames(mydata))#remove bad characters
##sapply(mydata.na, class)
#mydata.narf <- na.roughfix(mydata.na) #impute values using column medians, where original values were "NA"
#mydata.narf2 <- mydata.narf
#mydata.narf <- mydata.narf2
#mydata.narf <- mydata.narf[,c(1,1350:1367)]
set.seed(111)
test_size = floor(0.3 * nrow(mydata.narf))
samp = sample(nrow(mydata.narf), test_size,replace = FALSE)
y_train = mydata.narf[-samp,1]
x_train = mydata.narf[-samp,-c(1)]
y_test = mydata.narf[samp,1]
x_test = mydata.narf[samp,-c(1)]
#convert labels to categorical
#y_train = as.factor(y_train)
#y_test = as.factorfactor(y_test)
#Create training set and testing set
train = cbind(y_train,x_train)
test = cbind(y_test,x_test)
colnames(train)[1] = "Type"
colnames(test)[1] = "Type"
train <- as.data.frame(na.omit(train))
test <- as.data.frame(na.omit(test))
numTry <- sqrt(ncol(mydata.narf))
#mydata.narf$Type <- as.character(mydata.narf$Type)
model_1 = randomForest(Type~., data = train, ntree = 10000, importance = TRUE, do.trace = 10, proximity = TRUE, mtry = numTry)#10000
save(model_1, file = "RF-model.rda")
print(model_1)
pred_1 = predict(model_1, x_test)
summary_table <- table(y_test, pred_1)
print("Summary Table: ")
print(summary_table)
accuracy_m1 = mean(y_test == pred_1)
paste0("Accuracy: ",accuracy_m1)
varImpPlot(model_1)
importance = importance(model_1)
varImportance = data.frame(Features = row.names(importance),
Mean_Decrease_Gini =round(importance[, "MeanDecreaseGini"],2))
write.table(varImportance, file = paste0("varImportance_",numTry,"-Importance_test.tsv"), append = FALSE, row.names=TRUE, na="",col.names=TRUE, sep="\t")
#varImportance = data.frame(Variables = row.names(importance),
# Importance =round(importance[, "MeanDecreaseAccuracy"],2))
rankImportance=varImportance%>%mutate(Rank=paste("#",dense_rank(desc(Mean_Decrease_Gini))))
ggplot(rankImportance,aes(x=reorder(Features,Mean_Decrease_Gini),
y=Mean_Decrease_Gini,fill=Mean_Decrease_Gini))+
geom_bar(stat="Identity") +
geom_text(aes(x = Features, y = 0.5, label = Rank),
hjust=0, vjust=0.55, size = 4, colour = "white") +
labs(x = "Features") +
coord_flip() +
theme_classic()
outfile_name <- paste0("Gini_Bars-",numTry,"test.pdf")
ggsave(outfile_name)
#Generate AUC data and ROC curve
predictions=as.vector(model_1$votes[,2])
pred=prediction(predictions,train$Type)
perf_AUC=performance(pred,"auc") #Calculate the AUC value
[email protected][[1]]
paste0("AUC: ", AUC)
perf_ROC=performance(pred,"tpr","fpr") #plot the actual ROC curve
plot(perf_ROC, main="ROC plot")
text(0.5,0.5,paste("AUC = ",format(AUC, digits=5, scientific=FALSE)))