xgbTree.R
library(rstudioapi)
library(dplyr)
library(xgboost)
library(caret)
# set the working directory to this script's location (requires RStudio)
current_path <- rstudioapi::getActiveDocumentContext()$path
setwd(dirname(current_path))
# clear the workspace before loading data
rm(list = ls())
# load data
load("class_data.RData")
# encode the response as a factor with syntactically valid level names
# (required by caret when classProbs = TRUE)
y <- as.factor(y)
levels(y) <- make.names(levels(y))
# split data into 70% training and 30% testing sets
set.seed(123)
train <- sample(1:nrow(x), nrow(x) * 0.7)
X_train <- x[train, ]
y_train <- y[train]
X_test <- x[-train, ]
y_test <- y[-train]
# define cross-validation method
ctrl <- trainControl(method = "cv",
number = 10,
classProbs = TRUE,
savePredictions = "final")
# define the hyperparameter grid for tuning
xgb_grid <- expand.grid(
  nrounds = c(100, 200),
  max_depth = c(5, 9, 15),
  eta = c(0.1, 0.01, 0.001),
  gamma = c(0, 1),
  colsample_bytree = c(0.3, 0.5, 0.7, 0.9),
  min_child_weight = c(1, 5, 10),
  subsample = c(0.1, 0.3, 0.5, 0.7, 0.9)
)
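# note: this grid has 2*3*3*2*4*3*5 = 2160 parameter combinations, so the
# 10-fold CV search below fits 21,600 boosters and can take a long time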
start <- Sys.time()
# train the xgboost model, selecting hyperparameters by cross-validated ROC
xgb_model <- train(x = X_train,
                   y = y_train,
                   method = "xgbTree",
                   trControl = ctrl,
                   tuneGrid = xgb_grid,
                   metric = "ROC",
                   verbosity = 0)  # silence xgboost's deprecation warnings
end <- Sys.time()
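# report training wall-clock time and the best hyperparameters found by CV
print(end - start)
print(xgb_model$bestTune)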
# save the fitted model, stamped with the training date
end_date <- format(end, "%Y_%m_%d")
file_name <- paste0("xgbTree_model_", end_date, ".RData")
save(xgb_model, file = file_name)
# load(file_name)  # uncomment to reload a previously saved model
# get feature importance from the final fitted booster
importance <- xgb.importance(feature_names = colnames(X_train),
                             model = xgb_model$finalModel)
# plot feature importance
xgb.plot.importance(importance)
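# the plot can get crowded with many features; xgb.plot.importance() also
# accepts top_n to show only the strongest predictors, e.g.:
# xgb.plot.importance(importance, top_n = 20)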
# make class predictions on the test set using the tuned final model
xgb_predictions <- predict(xgb_model, newdata = X_test)
# evaluate performance
confusionMatrix(xgb_predictions, y_test)
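# note: confusionMatrix() treats the first factor level as the positive class;
# pass positive = levels(y_test)[2] to report metrics for the other class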
library(ROCR)
# predicted class probabilities on the test set
PredBoosting <- predict(xgb_model, X_test, type = "prob")
# use the probability of the second class as the score for ROC analysis
pred_rocr <- prediction(PredBoosting[, 2], y_test)
perf_rocr <- performance(pred_rocr, "tpr", "fpr")
# plot ROC curve with the chance diagonal for reference
plot(perf_rocr, main = "ROC Curve", col = 2, lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# area under the ROC curve
aucXGBoost <- performance(pred_rocr, measure = "auc")
aucXGBoost <- aucXGBoost@y.values[[1]]
aucXGBoost
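# for context, compare the held-out AUC with the best cross-validated ROC:
# max(xgb_model$results$ROC, na.rm = TRUE)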