-
Notifications
You must be signed in to change notification settings - Fork 0
/
RF-XGboost.R
150 lines (130 loc) · 6.03 KB
/
RF-XGboost.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
library(xgboost)
library(caret)
library(pROC)
library(caTools)
library(data.table)
library(readr)
library(ggplot2)
#Get args; needs: input file name, name of "case" label from metadata column, # threads, # rounds
args = (commandArgs(TRUE))
### check there are input and output files from command line
if (length(args) < 4) {
stop("Please supply proper arguments: file name, name of 'case' label (from metadata column), # threads, # rounds.", call.=FALSE)
}
### get arguments from command line
infile = args[1]
metadata_label <- args[2]
numThreads <- args[3]
numRounds <- args[4]
#metadata_label <- "TN_Breast_Cancer"
#numThreads <- 8
#numRounds <- 2
setwd("/zgrouphome/fslg_PickettLabGroup/RandomForest/")
#setwd("/Users/bep8/Library/CloudStorage/Box-Box/Work/Proposals/2022/TN_BreastCancer/RF/All")
#infile <- "Naomi_TNBC_original_count_matrix_human.tsv"
#load in file
mydata = as.data.frame(read.table(infile, sep = " "), stringsAsFactors = FALSE)
print("Done reading in data")
#move phenotype column to first
mydata <- mydata[,c(ncol(mydata),1:(ncol(mydata)-1))]
colnames(mydata)[1] <- "Type"
#remove rows where "Type" column is NA
mydata.narf <- as.data.frame(subset(mydata, Type!="NA"), stringsAsFactors = FALSE)
mytemp <- as.data.frame(mydata.narf$Type,drop=FALSE)
#apply z-score normalization across each column of the df
mydata.narf <- as.data.frame(scale(mydata.narf[2:ncol(mydata.narf)]))
mydata.narf <- cbind(mytemp,mydata.narf)
colnames(mydata.narf)[1] <- "Type"
rm(mytemp)
#remove columns with either NA or NaN anywhere in the column
mydata.narf <- mydata.narf[ , colSums(is.na(mydata.narf)) == 0]
print("Done formatting data")
#make the test & train tables
print("Preparing forest analysis")
set.seed(111)
test_size = floor(0.2 * nrow(mydata.narf))
samp = sample(nrow(mydata.narf), test_size,replace = FALSE)
y_train = mydata.narf[-samp,1]
x_train = mydata.narf[-samp,-c(1)]
y_test = mydata.narf[samp,1]
x_test = mydata.narf[samp,-c(1)]
x_train1 <- as.matrix(x_train)
x_test1 <- as.matrix(x_test)
#convert the y's to binary values
#y_train1 <- ifelse(y_train==levels(y_train)[2],1,0)
#y_test1 <- ifelse(y_test==levels(y_test)[2],1,0)
y_train1 <- ifelse(y_train==metadata_label,1,0)
y_test1 <- ifelse(y_test==metadata_label,1,0)
#metadata_label
#convert to xgboost matrices
ex_train = xgb.DMatrix(x_train1, label = y_train1)
ex_test = xgb.DMatrix(x_test1, label = y_test1)
#train model
print("Running Forest Analysis...")
print("Please look for the round number when the AUC value stops decreasing, then re-run using that number for the 'round number' parameter (to prevent over-fitting)")
bstDMatrix <- xgboost(#data = dtrain,
data = ex_train,#$data,
#label = y_train1,#$label,
##booster = "gbtree", #default = gbtree
##max.depth = 4, #default = 6
num_parallel_tree = 10000,
subsample = 0.5,
#colsample_bytree = 0.5,
##eta = 0.3, #range 0-1. Low value is more robust to overfitting; default = 0
##gamma = 0, #minimum loss reduction. Larger number is more conservative; default = 0
##max_depth = 6, #Range 1 to infinity. Maximum depth of tree.
nthread = numThreads, #8
nrounds = numRounds, #2
objective = "binary:logistic",#"reg:linear",
eval_metric = "auc",
set.seed(111),
#prediction = TRUE,
verbose = 1
)
print("Finished Forest Analysis")
#feature importance
print("Calculating Metrics")
importance <- xgb.importance(feature_names = bstDMatrix[["feature_names"]],
model = bstDMatrix)
#gain: improvement in accuracy brought by a feature
#cover: relative quantity of observations concerned by a feature
#frequency: simpler way to measure gain by counting # times feature is used in trees (shouldn't be used)
#head(importance)
write.table(importance, file = paste0(infile, "-Importance_test.tsv"), append = FALSE, row.names=FALSE, na="",col.names=TRUE, sep="\t")
#produce importance figure
pdf("Importance.pdf")
xgb.plot.importance(importance_matrix = importance,
top_n = 20,
plot = TRUE,
)
dev.off()
#generate & save confusion matrix
pred <- predict(bstDMatrix,ex_train)
pred <- as.numeric(pred > 0.5)
cm <- confusionMatrix(factor(pred),factor(y_train1))
#write results to file
#first the confusion matrix
cf_outfile <- paste0(infile, "-Confusion_Matrix_Summary.txt")
write.table("Prediction\tReference", file = cf_outfile, row.names = FALSE, col.names = FALSE)
write.table(cm[["table"]], file = cf_outfile, append = TRUE, row.names=TRUE, col.names=TRUE, sep="\t")
#then save the confusion matrix statistics to the same file
Label <- as.vector(c("Accuracy","95% CI","No Information Rate","P-Value [Acc > NIR]","Kappa","Mcnemar's Test P-Value","Sensitivity","Specificity","Pos Pred Value","Neg Pred Value","Precision","Recall","F1","Prevalence","Detection Rate","Detection Prevalence","Balanced Accuracy","'Positive' Class"))
Value <- as.vector(c(cm[["overall"]][["Accuracy"]],paste0(cm[["overall"]][["AccuracyLower"]]," - ",cm[["overall"]][["AccuracyUpper"]]),cm[["overall"]][["AccuracyNull"]],cm[["overall"]][["AccuracyPValue"]],cm[["overall"]][["Kappa"]],cm[["overall"]][["McnemarPValue"]],cm[["byClass"]][["Sensitivity"]],cm[["byClass"]][["Specificity"]],cm[["byClass"]][["Pos Pred Value"]],cm[["byClass"]][["Neg Pred Value"]],cm[["byClass"]][["Precision"]],cm[["byClass"]][["Recall"]],cm[["byClass"]][["F1"]],cm[["byClass"]][["Prevalence"]],cm[["byClass"]][["Detection Rate"]],cm[["byClass"]][["Detection Prevalence"]],cm[["byClass"]][["Balanced Accuracy"]],cm[["positive"]]))
cf_df <- as.data.frame(cbind(Label,Value))
write.table(cf_df,file = cf_outfile,append=TRUE,col.names = TRUE,row.names = FALSE, sep = "\t")
#Generate ROC for trained model
pdf("ROC.pdf")
roc_test <- roc(y_train1,
pred, algorithm = 6,#auto
plot=TRUE,
print.auc=TRUE,
#ci=TRUE,
#ci.alpha=0.9,
grid=TRUE,
#smooth=TRUE,
max.auc.polygon=TRUE
#transpose = FALSE,
#auc.polygon=TRUE
)
dev.off()
print("Analysis Complete")