-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathnested_cv_boosting.R
94 lines (76 loc) · 3.2 KB
/
nested_cv_boosting.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
library(gbm)
library(caret)
# Load data
# NOTE(review): assumes class_data.Rdata provides a predictor matrix/data
# frame `x` and a binary response `y` (both used throughout this script) —
# confirm against the data file.
load("class_data.Rdata")
# Candidate boosting hyperparameters: every combination of tree depth and
# number of trees, with a fixed learning rate (shrinkage) and a fixed
# minimum number of observations per terminal node.
depth_values <- c(5, 9, 15, 25)
tree_counts <- c(500, 1000)
hyper_grid <- expand.grid(
  interaction.depth = depth_values,
  n.trees = tree_counts,
  shrinkage = 0.0001,
  n.minobsinnode = 10
)
# --- Nested cross-validation: 5 outer folds, 10 inner folds ---------------
# Outer folds estimate generalization AUC; inner folds select the best row
# of hyper_grid for each outer fold.

# gbm's "bernoulli" distribution requires a 0/1 numeric response.
# NOTE(review): assumes y is a two-level factor or already coded 0/1 —
# confirm against class_data.Rdata.
if (is.factor(y)) {
  y <- as.integer(y == levels(y)[2])
}

# Rank-based (Wilcoxon) AUC so no extra package is needed.
# (The original called caret::roc(), which does not exist — roc() lives in
# the pROC package, which was never loaded.)
compute_auc <- function(labels, scores) {
  pos <- scores[labels == 1]
  neg <- scores[labels == 0]
  r <- rank(c(pos, neg))
  (sum(r[seq_along(pos)]) - length(pos) * (length(pos) + 1) / 2) /
    (length(pos) * length(neg))
}

# With returnTrain = TRUE each fold element IS the training index set;
# the held-out test set is its complement. (The original treated elements
# as test folds, yielding ~80% test sets.)
outer_folds <- createFolds(y, k = 5, returnTrain = TRUE)

# One result row per outer fold (preallocated; the original never filled
# result_list and overwrote `auc` each iteration).
result_list <- vector("list", length(outer_folds))

for (i in seq_along(outer_folds)) {
  train_indices <- outer_folds[[i]]
  test_indices <- setdiff(seq_along(y), train_indices)
  x_train <- x[train_indices, , drop = FALSE]
  y_train <- y[train_indices]
  x_test <- x[test_indices, , drop = FALSE]
  y_test <- y[test_indices]

  # Inner CV: evaluate EVERY row of hyper_grid. (The original passed whole
  # grid columns to a single gbm() call, so the grid was never searched.)
  inner_folds <- createFolds(y_train, k = 10, returnTrain = TRUE)
  inner_auc <- matrix(NA_real_,
                      nrow = nrow(hyper_grid),
                      ncol = length(inner_folds))

  for (j in seq_along(inner_folds)) {
    # Distinct names so the outer fold's train_indices is not clobbered.
    inner_train <- inner_folds[[j]]
    inner_val <- setdiff(seq_along(y_train), inner_train)
    x_tr <- x_train[inner_train, , drop = FALSE]
    y_tr <- y_train[inner_train]
    x_val <- x_train[inner_val, , drop = FALSE]
    y_val <- y_train[inner_val]

    for (g in seq_len(nrow(hyper_grid))) {
      # Response is the data frame's `y` column; the original's
      # `y_train_inner ~ .` pulled the response from the environment and
      # let the `y` column leak in as a predictor.
      model <- gbm(
        y ~ .,
        data = data.frame(x_tr, y = y_tr),
        distribution = "bernoulli",
        interaction.depth = hyper_grid$interaction.depth[g],
        n.trees = hyper_grid$n.trees[g],
        shrinkage = hyper_grid$shrinkage[g],
        n.minobsinnode = hyper_grid$n.minobsinnode[g],
        bag.fraction = 0.5,
        verbose = FALSE
      )
      # predict.gbm requires an explicit n.trees.
      pred_probs <- predict(model,
                            newdata = data.frame(x_val),
                            n.trees = hyper_grid$n.trees[g],
                            type = "response")
      inner_auc[g, j] <- compute_auc(y_val, pred_probs)
    }
  }

  # Pick the grid row with the highest mean AUC across the inner folds.
  best <- hyper_grid[which.max(rowMeans(inner_auc)), ]

  # Refit on this outer fold's full training set with the chosen settings.
  final_model <- gbm(
    y ~ .,
    data = data.frame(x_train, y = y_train),
    distribution = "bernoulli",
    interaction.depth = best$interaction.depth,
    n.trees = best$n.trees,
    shrinkage = best$shrinkage,
    n.minobsinnode = best$n.minobsinnode,
    bag.fraction = 0.5,
    verbose = FALSE
  )

  # Evaluate on the held-out outer test set and store the result.
  y_pred <- predict(final_model,
                    newdata = data.frame(x_test),
                    n.trees = best$n.trees,
                    type = "response")
  result_list[[i]] <- data.frame(fold = i,
                                 best,
                                 test_auc = compute_auc(y_test, y_pred),
                                 row.names = NULL)
}

# Aggregate and report the outer-fold results.
results <- do.call(rbind, result_list)
print(results)
cat("Mean outer-fold test AUC:", mean(results$test_auc), "\n")