From 631fe3c277bbeb601e90954358cc62fd62c6d6aa Mon Sep 17 00:00:00 2001 From: Max Kuhn Date: Tue, 16 Jul 2024 08:49:43 -0400 Subject: [PATCH] Two layer neural networks (#80) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * new function for two layer models * two layer unit tests * avoid overflow issues * fix doc error * test when loss cannot be computed * tests for tunable values * update GHA * update spelling * use current CRAN torch * add missing snapshot and remove print snapshots (due to OS differences) * don't test on R 4.1 * test overflow on M1 mac --------- Co-authored-by: ‘topepo’ <‘mxkuhn@gmail.com’> --- .github/workflows/R-CMD-check.yaml | 16 +- .github/workflows/test-coverage.yaml | 23 +- DESCRIPTION | 4 +- NAMESPACE | 7 + NEWS.md | 2 + R/mlp-fit.R | 1318 +++++++++++++++---------- R/tunable.R | 33 +- inst/WORDLIST | 2 +- man/brulee-internal.Rd | 3 + man/brulee_mlp.Rd | 121 ++- tests/testthat/_snaps/overflow.md | 22 + tests/testthat/_snaps/tunable.md | 429 ++++++++ tests/testthat/test-mlp-multinomial.R | 2 +- tests/testthat/test-mlp-regression.R | 107 +- tests/testthat/test-overflow.R | 20 + tests/testthat/test-tunable.R | 8 + 16 files changed, 1562 insertions(+), 555 deletions(-) create mode 100644 tests/testthat/_snaps/overflow.md create mode 100644 tests/testthat/_snaps/tunable.md create mode 100644 tests/testthat/test-overflow.R create mode 100644 tests/testthat/test-tunable.R diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index fb81baa..67dc39b 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -12,6 +12,8 @@ on: name: R-CMD-check +permissions: read-all + jobs: R-CMD-check: runs-on: ${{ matrix.config.os }} @@ -26,10 +28,12 @@ jobs: - {os: windows-latest, r: 'release'} - - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} - - {os: ubuntu-latest, r: 'release'} - - {os: ubuntu-latest, r: 'oldrel-1'} - - {os: ubuntu-latest, r: 'oldrel-2'} + - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} + - {os: ubuntu-latest, r: 'release'} + - {os: ubuntu-latest, r: 'oldrel-1'} + - {os: ubuntu-latest, r: 'oldrel-2'} + - {os: ubuntu-latest, r: 'oldrel-3'} + - {os: ubuntu-latest, r: 'oldrel-4'} env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} @@ -37,7 +41,7 @@ jobs: TORCH_INSTALL: 1 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-pandoc@v2 @@ -55,4 +59,4 @@ jobs: - uses: r-lib/actions/check-r-package@v2 with: upload-snapshots: true - args: 'c("--no-multiarch", "--no-manual")' + build_args: 'c("--no-multiarch", "--no-manual","--compact-vignettes=gs+qpdf")' diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 04d5631..f2532ef 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -8,6 +8,8 @@ on: name: test-coverage +permissions: read-all + jobs: test-coverage: runs-on: ubuntu-latest @@ -16,7 +18,7 @@ jobs: TORCH_INSTALL: 1 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-r@v2 with: @@ -24,28 +26,37 @@ jobs: - uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: any::covr + extra-packages: any::covr, any::xml2 needs: coverage - name: Test coverage run: | - covr::codecov( + cov <- covr::package_coverage( quiet = FALSE, clean = FALSE, - install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package") + install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash 
= "/"), "package") ) + covr::to_cobertura(cov) shell: Rscript {0} + - uses: codecov/codecov-action@v4 + with: + fail_ci_if_error: ${{ github.event_name != 'pull_request' && true || false }} + file: ./cobertura.xml + plugin: noop + disable_search: true + token: ${{ secrets.CODECOV_TOKEN }} + - name: Show testthat output if: always() run: | ## -------------------------------------------------------------------- - find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true + find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true shell: bash - name: Upload test results if: failure() - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: coverage-test-failures path: ${{ runner.temp }}/package diff --git a/DESCRIPTION b/DESCRIPTION index 7b8de96..d63d655 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -25,7 +25,7 @@ Imports: rlang (>= 1.1.1), stats, tibble, - torch (>= 0.11.0), + torch (>= 0.13.0), utils Suggests: covr, @@ -40,4 +40,4 @@ Config/testthat/edition: 3 Encoding: UTF-8 Language: en-US Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 diff --git a/NAMESPACE b/NAMESPACE index b575d01..4f4a7fd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -19,6 +19,11 @@ S3method(brulee_mlp,default) S3method(brulee_mlp,formula) S3method(brulee_mlp,matrix) S3method(brulee_mlp,recipe) +S3method(brulee_mlp_two_layer,data.frame) +S3method(brulee_mlp_two_layer,default) +S3method(brulee_mlp_two_layer,formula) +S3method(brulee_mlp_two_layer,matrix) +S3method(brulee_mlp_two_layer,recipe) S3method(brulee_multinomial_reg,data.frame) S3method(brulee_multinomial_reg,default) S3method(brulee_multinomial_reg,formula) @@ -39,6 +44,7 @@ S3method(print,brulee_multinomial_reg) S3method(tunable,brulee_linear_reg) S3method(tunable,brulee_logistic_reg) S3method(tunable,brulee_mlp) +S3method(tunable,brulee_mlp_two_layer) S3method(tunable,brulee_multinomial_reg) export("%>%") export(autoplot) @@ -46,6 +52,7 @@ export(brulee_activations) export(brulee_linear_reg) export(brulee_logistic_reg) export(brulee_mlp) +export(brulee_mlp_two_layer) export(brulee_multinomial_reg) export(coef) export(matrix_to_dataset) diff --git a/NEWS.md b/NEWS.md index c605dab..db7a9b8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # brulee (development version) +* Added a convenience function, `brulee_mlp_two_layer()`, to more easily fit two-layer networks with parsnip. + # brulee 0.3.0 * Fixed bug where `coef()` didn't would error if used on a `brulee_logistic_reg()` that was trained with a recipe. (#66) diff --git a/R/mlp-fit.R b/R/mlp-fit.R index 77b0d58..537424b 100644 --- a/R/mlp-fit.R +++ b/R/mlp-fit.R @@ -1,7 +1,9 @@ #' Fit neural networks #' #' `brulee_mlp()` fits neural network models using stochastic gradient -#' descent. Multiple layers can be used. +#' descent. Multiple layers can be used. For working with two-layer networks in +#' tidymodels, `brulee_mlp_two_layer()` can be helpful for specifying tuning +#' parameters as scalars. #' #' @param x Depending on the context: #' @@ -33,11 +35,13 @@ #' @param hidden_units An integer for the number of hidden units, or a vector #' of integers. If a vector of integers, the model will have `length(hidden_units)` #' layers each with `hidden_units[i]` hidden units. +#' @param hidden_units_2 An integer for the number of hidden units for a second layer. #' @param activation A character vector for the activation function (such as #' "relu", "tanh", "sigmoid", and so on). 
See [brulee_activations()] for #' a list of possible values. If `hidden_units` is a vector, `activation` #' can be a character vector with length equals to `length(hidden_units)` #' specifying the activation for each hidden layer. +#' @param activation_2 A character vector for the activation function for a second layer. #' @param optimizer The method used in the optimization procedure. Possible choices #' are 'LBFGS' and 'SGD'. Default is 'LBFGS'. #' @param learn_rate A positive number that controls the initial rapidity that @@ -228,13 +232,13 @@ #' } #' @export brulee_mlp <- function(x, ...) { - UseMethod("brulee_mlp") + UseMethod("brulee_mlp") } #' @export #' @rdname brulee_mlp brulee_mlp.default <- function(x, ...) { - stop("`brulee_mlp()` is not defined for a '", class(x)[1], "'.", call. = FALSE) + stop("`brulee_mlp()` is not defined for a '", class(x)[1], "'.", call. = FALSE) } # XY method - data frame @@ -242,46 +246,46 @@ brulee_mlp.default <- function(x, ...) { #' @export #' @rdname brulee_mlp brulee_mlp.data.frame <- - function(x, - y, - epochs = 100L, - hidden_units = 3L, - activation = "relu", - penalty = 0.001, - mixture = 0, - dropout = 0, - validation = 0.1, - optimizer = "LBFGS", - learn_rate = 0.01, - rate_schedule = "none", - momentum = 0.0, - batch_size = NULL, - class_weights = NULL, - stop_iter = 5, - verbose = FALSE, - ...) { - processed <- hardhat::mold(x, y) - - brulee_mlp_bridge( - processed, - epochs = epochs, - hidden_units = hidden_units, - activation = activation, - learn_rate = learn_rate, - rate_schedule = rate_schedule, - penalty = penalty, - mixture = mixture, - dropout = dropout, - validation = validation, - optimizer = optimizer, - momentum = momentum, - batch_size = batch_size, - class_weights = class_weights, - stop_iter = stop_iter, - verbose = verbose, - ... - ) - } + function(x, + y, + epochs = 100L, + hidden_units = 3L, + activation = "relu", + penalty = 0.001, + mixture = 0, + dropout = 0, + validation = 0.1, + optimizer = "LBFGS", + learn_rate = 0.01, + rate_schedule = "none", + momentum = 0.0, + batch_size = NULL, + class_weights = NULL, + stop_iter = 5, + verbose = FALSE, + ...) { + processed <- hardhat::mold(x, y) + + brulee_mlp_bridge( + processed, + epochs = epochs, + hidden_units = hidden_units, + activation = activation, + learn_rate = learn_rate, + rate_schedule = rate_schedule, + penalty = penalty, + mixture = mixture, + dropout = dropout, + validation = validation, + optimizer = optimizer, + momentum = momentum, + batch_size = batch_size, + class_weights = class_weights, + stop_iter = stop_iter, + verbose = verbose, + ... + ) + } # XY method - matrix @@ -305,27 +309,27 @@ brulee_mlp.matrix <- function(x, stop_iter = 5, verbose = FALSE, ...) { - processed <- hardhat::mold(x, y) - - brulee_mlp_bridge( - processed, - epochs = epochs, - hidden_units = hidden_units, - activation = activation, - learn_rate = learn_rate, - rate_schedule = rate_schedule, - momentum = momentum, - penalty = penalty, - mixture = mixture, - dropout = dropout, - validation = validation, - optimizer = optimizer, - batch_size = batch_size, - class_weights = class_weights, - stop_iter = stop_iter, - verbose = verbose, - ... 
- ) + processed <- hardhat::mold(x, y) + + brulee_mlp_bridge( + processed, + epochs = epochs, + hidden_units = hidden_units, + activation = activation, + learn_rate = learn_rate, + rate_schedule = rate_schedule, + momentum = momentum, + penalty = penalty, + mixture = mixture, + dropout = dropout, + validation = validation, + optimizer = optimizer, + batch_size = batch_size, + class_weights = class_weights, + stop_iter = stop_iter, + verbose = verbose, + ... + ) } # Formula method @@ -333,92 +337,92 @@ brulee_mlp.matrix <- function(x, #' @export #' @rdname brulee_mlp brulee_mlp.formula <- - function(formula, - data, - epochs = 100L, - hidden_units = 3L, - activation = "relu", - penalty = 0.001, - mixture = 0, - dropout = 0, - validation = 0.1, - optimizer = "LBFGS", - learn_rate = 0.01, - rate_schedule = "none", - momentum = 0.0, - batch_size = NULL, - class_weights = NULL, - stop_iter = 5, - verbose = FALSE, - ...) { - processed <- hardhat::mold(formula, data) - - brulee_mlp_bridge( - processed, - epochs = epochs, - hidden_units = hidden_units, - activation = activation, - learn_rate = learn_rate, - rate_schedule = rate_schedule, - momentum = momentum, - penalty = penalty, - mixture = mixture, - dropout = dropout, - validation = validation, - optimizer = optimizer, - batch_size = batch_size, - class_weights = class_weights, - stop_iter = stop_iter, - verbose = verbose, - ... - ) - } + function(formula, + data, + epochs = 100L, + hidden_units = 3L, + activation = "relu", + penalty = 0.001, + mixture = 0, + dropout = 0, + validation = 0.1, + optimizer = "LBFGS", + learn_rate = 0.01, + rate_schedule = "none", + momentum = 0.0, + batch_size = NULL, + class_weights = NULL, + stop_iter = 5, + verbose = FALSE, + ...) { + processed <- hardhat::mold(formula, data) + + brulee_mlp_bridge( + processed, + epochs = epochs, + hidden_units = hidden_units, + activation = activation, + learn_rate = learn_rate, + rate_schedule = rate_schedule, + momentum = momentum, + penalty = penalty, + mixture = mixture, + dropout = dropout, + validation = validation, + optimizer = optimizer, + batch_size = batch_size, + class_weights = class_weights, + stop_iter = stop_iter, + verbose = verbose, + ... + ) + } # Recipe method #' @export #' @rdname brulee_mlp brulee_mlp.recipe <- - function(x, - data, - epochs = 100L, - hidden_units = 3L, - activation = "relu", - penalty = 0.001, - mixture = 0, - dropout = 0, - validation = 0.1, - optimizer = "LBFGS", - learn_rate = 0.01, - rate_schedule = "none", - momentum = 0.0, - batch_size = NULL, - class_weights = NULL, - stop_iter = 5, - verbose = FALSE, - ...) { - processed <- hardhat::mold(x, data) - - brulee_mlp_bridge( - processed, - epochs = epochs, - hidden_units = hidden_units, - activation = activation, - learn_rate = learn_rate, - rate_schedule = rate_schedule, - momentum = momentum, - penalty = penalty, - mixture = mixture, - dropout = dropout, - validation = validation, - optimizer = optimizer, - batch_size = batch_size, - class_weights = class_weights, - stop_iter = stop_iter, - verbose = verbose, - ... - ) - } + function(x, + data, + epochs = 100L, + hidden_units = 3L, + activation = "relu", + penalty = 0.001, + mixture = 0, + dropout = 0, + validation = 0.1, + optimizer = "LBFGS", + learn_rate = 0.01, + rate_schedule = "none", + momentum = 0.0, + batch_size = NULL, + class_weights = NULL, + stop_iter = 5, + verbose = FALSE, + ...) 
{ + processed <- hardhat::mold(x, data) + + brulee_mlp_bridge( + processed, + epochs = epochs, + hidden_units = hidden_units, + activation = activation, + learn_rate = learn_rate, + rate_schedule = rate_schedule, + momentum = momentum, + penalty = penalty, + mixture = mixture, + dropout = dropout, + validation = validation, + optimizer = optimizer, + batch_size = batch_size, + class_weights = class_weights, + stop_iter = stop_iter, + verbose = verbose, + ... + ) + } # ------------------------------------------------------------------------------ # Bridge @@ -427,445 +431,456 @@ brulee_mlp_bridge <- function(processed, epochs, hidden_units, activation, learn_rate, rate_schedule, momentum, penalty, mixture, dropout, class_weights, validation, optimizer, batch_size, stop_iter, verbose, ...) { - if(!torch::torch_is_installed()) { - cli::cli_abort("The torch backend has not been installed; use `torch::install_torch()`.") - } - - f_nm <- "brulee_mlp" - # check values of various argument values - if (is.numeric(epochs) & !is.integer(epochs)) { - epochs <- as.integer(epochs) - } - if (is.numeric(hidden_units) & !is.integer(hidden_units)) { - hidden_units <- as.integer(hidden_units) - } - if (length(hidden_units) > 1 && length(activation) == 1) { - activation <- rep(activation, length(hidden_units)) - } - if (length(hidden_units) != length(activation)) { - cli::cli_abort("'activation' must be a single value or a vector with the same length as 'hidden_units'") - } - - allowed_activation <- brulee_activations() - good_activation <- activation %in% allowed_activation - if (!all(good_activation)) { - cli::cli_abort(paste("'activation' should be one of: ", paste0(allowed_activation, collapse = ", "))) - } - - if (optimizer == "LBFGS" & !is.null(batch_size)) { - cli::cli_warn("'batch_size' is only used for the SGD optimizer.") - batch_size <- NULL - } + if(!torch::torch_is_installed()) { + cli::cli_abort("The torch backend has not been installed; use `torch::install_torch()`.") + } - if (!is.null(batch_size) & optimizer == "SGD") { - if (is.numeric(batch_size) & !is.integer(batch_size)) { - batch_size <- as.integer(batch_size) - } - check_integer(batch_size, single = TRUE, 1, fn = f_nm) - } + f_nm <- "brulee_mlp" + # check values of various argument values + if (is.numeric(epochs) & !is.integer(epochs)) { + epochs <- as.integer(epochs) + } + if (is.numeric(hidden_units) & !is.integer(hidden_units)) { + hidden_units <- as.integer(hidden_units) + } + if (length(hidden_units) > 1 && length(activation) == 1) { + activation <- rep(activation, length(hidden_units)) + } + if (length(hidden_units) != length(activation)) { + cli::cli_abort("'activation' must be a single value or a vector with the same length as 'hidden_units'") + } - check_integer(epochs, single = TRUE, 1, fn = f_nm) - check_integer(hidden_units, single = FALSE, 1, fn = f_nm) - check_double(penalty, single = TRUE, 0, incl = c(TRUE, TRUE), fn = f_nm) - check_double(mixture, single = TRUE, 0, 1, incl = c(TRUE, TRUE), fn = f_nm) - check_double(dropout, single = TRUE, 0, 1, incl = c(TRUE, FALSE), fn = f_nm) - check_double(validation, single = TRUE, 0, 1, incl = c(TRUE, FALSE), fn = f_nm) - check_double(momentum, single = TRUE, 0, 1, incl = c(TRUE, TRUE), fn = f_nm) - check_double(learn_rate, single = TRUE, 0, incl = c(FALSE, TRUE), fn = f_nm) - check_logical(verbose, single = TRUE, fn = f_nm) - check_character(activation, single = FALSE, fn = f_nm) + allowed_activation <- brulee_activations() + good_activation <- activation %in% allowed_activation + 
if (!all(good_activation)) { + cli::cli_abort(paste("'activation' should be one of: ", paste0(allowed_activation, collapse = ", "))) + } - ## ----------------------------------------------------------------------------- + if (optimizer == "LBFGS" & !is.null(batch_size)) { + cli::cli_warn("'batch_size' is only used for the SGD optimizer.") + batch_size <- NULL + } - predictors <- processed$predictors - - if (!is.matrix(predictors)) { - predictors <- as.matrix(predictors) - if (is.character(predictors)) { - cli::cli_abort( - paste( - "There were some non-numeric columns in the predictors.", - "Please use a formula or recipe to encode all of the predictors as numeric." - ) - ) - } + if (!is.null(batch_size) & optimizer == "SGD") { + if (is.numeric(batch_size) & !is.integer(batch_size)) { + batch_size <- as.integer(batch_size) } + check_integer(batch_size, single = TRUE, 1, fn = f_nm) + } - ## ----------------------------------------------------------------------------- - - outcome <- processed$outcomes[[1]] - - # ------------------------------------------------------------------------------ - - lvls <- levels(outcome) - xtab <- table(outcome) - class_weights <- check_class_weights(class_weights, lvls, xtab, f_nm) - - ## ----------------------------------------------------------------------------- - - fit <- - mlp_fit_imp( - x = predictors, - y = outcome, - epochs = epochs, - hidden_units = hidden_units, - activation = activation, - learn_rate = learn_rate, - rate_schedule = rate_schedule, - momentum = momentum, - penalty = penalty, - mixture = mixture, - dropout = dropout, - validation = validation, - optimizer = optimizer, - batch_size = batch_size, - class_weights = class_weights, - stop_iter = stop_iter, - verbose = verbose, - ... + check_integer(epochs, single = TRUE, 1, fn = f_nm) + check_integer(hidden_units, single = FALSE, 1, fn = f_nm) + check_double(penalty, single = TRUE, 0, incl = c(TRUE, TRUE), fn = f_nm) + check_double(mixture, single = TRUE, 0, 1, incl = c(TRUE, TRUE), fn = f_nm) + check_double(dropout, single = TRUE, 0, 1, incl = c(TRUE, FALSE), fn = f_nm) + check_double(validation, single = TRUE, 0, 1, incl = c(TRUE, FALSE), fn = f_nm) + check_double(momentum, single = TRUE, 0, 1, incl = c(TRUE, TRUE), fn = f_nm) + check_double(learn_rate, single = TRUE, 0, incl = c(FALSE, TRUE), fn = f_nm) + check_logical(verbose, single = TRUE, fn = f_nm) + check_character(activation, single = FALSE, fn = f_nm) + + ## ----------------------------------------------------------------------------- + + predictors <- processed$predictors + + if (!is.matrix(predictors)) { + predictors <- as.matrix(predictors) + if (is.character(predictors)) { + cli::cli_abort( + paste( + "There were some non-numeric columns in the predictors.", + "Please use a formula or recipe to encode all of the predictors as numeric." 
) + ) + } + } - new_brulee_mlp( - model_obj = fit$model_obj, - estimates = fit$estimates, - best_epoch = fit$best_epoch, - loss = fit$loss, - dims = fit$dims, - y_stats = fit$y_stats, - parameters = fit$parameters, - blueprint = processed$blueprint + ## ----------------------------------------------------------------------------- + + outcome <- processed$outcomes[[1]] + + # ------------------------------------------------------------------------------ + + lvls <- levels(outcome) + xtab <- table(outcome) + class_weights <- check_class_weights(class_weights, lvls, xtab, f_nm) + + ## ----------------------------------------------------------------------------- + + fit <- + mlp_fit_imp( + x = predictors, + y = outcome, + epochs = epochs, + hidden_units = hidden_units, + activation = activation, + learn_rate = learn_rate, + rate_schedule = rate_schedule, + momentum = momentum, + penalty = penalty, + mixture = mixture, + dropout = dropout, + validation = validation, + optimizer = optimizer, + batch_size = batch_size, + class_weights = class_weights, + stop_iter = stop_iter, + verbose = verbose, + ... ) + + new_brulee_mlp( + model_obj = fit$model_obj, + estimates = fit$estimates, + best_epoch = fit$best_epoch, + loss = fit$loss, + dims = fit$dims, + y_stats = fit$y_stats, + parameters = fit$parameters, + blueprint = processed$blueprint + ) } new_brulee_mlp <- function( model_obj, estimates, best_epoch, loss, dims, y_stats, parameters, blueprint) { - if (!inherits(model_obj, "raw")) { - cli::cli_abort("'model_obj' should be a raw vector.") - } - if (!is.list(estimates)) { - cli::cli_abort("'parameters' should be a list") - } - if (!is.vector(best_epoch) || !is.integer(best_epoch)) { - cli::cli_abort("'best_epoch' should be an integer") - } - if (!is.vector(loss) || !is.numeric(loss)) { - cli::cli_abort("'loss' should be a numeric vector") - } - if (!is.list(dims)) { - cli::cli_abort("'dims' should be a list") - } - if (!is.list(y_stats)) { - cli::cli_abort("'y_stats' should be a list") - } - if (!is.list(parameters)) { - cli::cli_abort("'parameters' should be a list") - } - if (!inherits(blueprint, "hardhat_blueprint")) { - cli::cli_abort("'blueprint' should be a hardhat blueprint") - } - hardhat::new_model(model_obj = model_obj, - estimates = estimates, - best_epoch = best_epoch, - loss = loss, - dims = dims, - y_stats = y_stats, - parameters = parameters, - blueprint = blueprint, - class = "brulee_mlp") + if (!inherits(model_obj, "raw")) { + cli::cli_abort("'model_obj' should be a raw vector.") + } + if (!is.list(estimates)) { + cli::cli_abort("'parameters' should be a list") + } + if (!is.vector(best_epoch) || !is.integer(best_epoch)) { + cli::cli_abort("'best_epoch' should be an integer") + } + if (!is.vector(loss) || !is.numeric(loss)) { + cli::cli_abort("'loss' should be a numeric vector") + } + if (!is.list(dims)) { + cli::cli_abort("'dims' should be a list") + } + if (!is.list(y_stats)) { + cli::cli_abort("'y_stats' should be a list") + } + if (!is.list(parameters)) { + cli::cli_abort("'parameters' should be a list") + } + if (!inherits(blueprint, "hardhat_blueprint")) { + cli::cli_abort("'blueprint' should be a hardhat blueprint") + } + hardhat::new_model(model_obj = model_obj, + estimates = estimates, + best_epoch = best_epoch, + loss = loss, + dims = dims, + y_stats = y_stats, + parameters = parameters, + blueprint = blueprint, + class = "brulee_mlp") } ## ----------------------------------------------------------------------------- # Fit code mlp_fit_imp <- - function(x, y, - epochs = 
100L, - batch_size = 32, - hidden_units = 3L, - penalty = 0.001, - mixture = 0, - dropout = 0, - validation = 0.1, - optimizer = "LBFGS", - learn_rate = 0.01, - rate_schedule = "none", - momentum = 0.0, - activation = "relu", - class_weights = NULL, - stop_iter = 5, - verbose = FALSE, - ...) { - - torch::torch_manual_seed(sample.int(10^5, 1)) - - ## --------------------------------------------------------------------------- - # General data checks: - - check_data_att(x, y) - - # Check missing values - compl_data <- check_missing_data(x, y, "brulee_mlp", verbose) - x <- compl_data$x - y <- compl_data$y - n <- length(y) - p <- ncol(x) - - if (is.factor(y)) { - lvls <- levels(y) - y_dim <- length(lvls) - # the model will output softmax values. - # so we need to use negative likelihood loss and - # pass the log of softmax. - loss_fn <- function(input, target, wts = NULL) { - nnf_nll_loss( - weight = wts, - input = torch::torch_log(input), - target = target - ) - } - } else { - y_dim <- 1 - lvls <- NULL - loss_fn <- function(input, target, wts = NULL) { - nnf_mse_loss(input, target$view(c(-1,1))) - } - } + function(x, y, + epochs = 100L, + batch_size = 32, + hidden_units = 3L, + penalty = 0.001, + mixture = 0, + dropout = 0, + validation = 0.1, + optimizer = "LBFGS", + learn_rate = 0.01, + rate_schedule = "none", + momentum = 0.0, + activation = "relu", + class_weights = NULL, + stop_iter = 5, + verbose = FALSE, + ...) { + + torch::torch_manual_seed(sample.int(10^5, 1)) + + ## --------------------------------------------------------------------------- + # General data checks: + + check_data_att(x, y) + + # Check missing values + compl_data <- check_missing_data(x, y, "brulee_mlp", verbose) + x <- compl_data$x + y <- compl_data$y + n <- length(y) + p <- ncol(x) + + if (is.factor(y)) { + lvls <- levels(y) + y_dim <- length(lvls) + # the model will output softmax values. + # so we need to use negative likelihood loss and + # pass the log of softmax. 
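+      # Note: nnf_nll_loss() expects log-probabilities, so we pass the
+      # log of the network's softmax output (the log-softmax), making
+      # this loss equivalent to cross-entropy on the raw logits.
+      # Numerical overflow in the forward pass can turn the loss into
+      # NaN, which the training loop detects and stops on (see the new
+      # test-overflow.R).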
+ loss_fn <- function(input, target, wts = NULL) { + nnf_nll_loss( + weight = wts, + input = torch::torch_log(input), + target = target + ) + } + } else { + y_dim <- 1 + lvls <- NULL + loss_fn <- function(input, target, wts = NULL) { + nnf_mse_loss(input, target$view(c(-1,1))) + } + } - if (validation > 0) { - in_val <- sample(seq_along(y), floor(n * validation)) - x_val <- x[in_val,, drop = FALSE] - y_val <- y[in_val] - x <- x[-in_val,, drop = FALSE] - y <- y[-in_val] - } + if (validation > 0) { + in_val <- sample(seq_along(y), floor(n * validation)) + x_val <- x[in_val,, drop = FALSE] + y_val <- y[in_val] + x <- x[-in_val,, drop = FALSE] + y <- y[-in_val] + } - if (!is.factor(y)) { - y_stats <- scale_stats(y) - y <- scale_y(y, y_stats) - if (validation > 0) { - y_val <- scale_y(y_val, y_stats) - } - loss_label <- "\tLoss (scaled):" - } else { - y_stats <- list(mean = NA_real_, sd = NA_real_) - loss_label <- "\tLoss:" - } + if (!is.factor(y)) { + y_stats <- scale_stats(y) + y <- scale_y(y, y_stats) + if (validation > 0) { + y_val <- scale_y(y_val, y_stats) + } + loss_label <- "\tLoss (scaled):" + } else { + y_stats <- list(mean = NA_real_, sd = NA_real_) + loss_label <- "\tLoss:" + } - if (is.null(batch_size) & optimizer == "SGD") { - batch_size <- nrow(x) - } else { - batch_size <- min(batch_size, nrow(x)) - } + if (is.null(batch_size) & optimizer == "SGD") { + batch_size <- nrow(x) + } else { + batch_size <- min(batch_size, nrow(x)) + } - ## --------------------------------------------------------------------------- - # Convert to index sampler and data loader - ds <- brulee::matrix_to_dataset(x, y) - dl <- torch::dataloader(ds, batch_size = batch_size) + ## --------------------------------------------------------------------------- + # Convert to index sampler and data loader + ds <- brulee::matrix_to_dataset(x, y) + dl <- torch::dataloader(ds, batch_size = batch_size) - if (validation > 0) { - ds_val <- brulee::matrix_to_dataset(x_val, y_val) - dl_val <- torch::dataloader(ds_val) - } + if (validation > 0) { + ds_val <- brulee::matrix_to_dataset(x_val, y_val) + dl_val <- torch::dataloader(ds_val) + } - ## --------------------------------------------------------------------------- - # Initialize model and optimizer - model <- mlp_module(ncol(x), hidden_units, activation, dropout, y_dim) - loss_fn <- make_penalized_loss(loss_fn, model, penalty, mixture) + ## --------------------------------------------------------------------------- + # Initialize model and optimizer + model <- mlp_module(ncol(x), hidden_units, activation, dropout, y_dim) + loss_fn <- make_penalized_loss(loss_fn, model, penalty, mixture) - # Set the optimizer (will be set again below) - optimizer_obj <- set_optimizer(optimizer, model, learn_rate, momentum) + # Set the optimizer (will be set again below) + optimizer_obj <- set_optimizer(optimizer, model, learn_rate, momentum) - ## --------------------------------------------------------------------------- + ## --------------------------------------------------------------------------- - loss_prev <- 10^38 - loss_min <- loss_prev - poor_epoch <- 0 - best_epoch <- 1 - loss_vec <- rep(NA_real_, epochs) - if (verbose) { - epoch_chr <- format(1:epochs) - } + loss_prev <- 10^38 + loss_min <- loss_prev + poor_epoch <- 0 + best_epoch <- 1 + loss_vec <- rep(NA_real_, epochs) + if (verbose) { + epoch_chr <- format(1:epochs) + } - ## ----------------------------------------------------------------------------- - - param_per_epoch <- list() - - # Optimize parameters - for (epoch in 
1:epochs) { - - # For future work with other optimizers, see - # https://github.com/tidymodels/brulee/pull/56#discussion_r972049108 - # "Creating a new optimizer every epoch will reset the optimizer state. - # For example, SGD with momentum keeps track of the latest update for each - # parameter, so it might be OK to just restart. - # But other optimizers like Adam, will keep a moving average of updates and - # resetting them can interfere in training." - - learn_rate <- set_learn_rate(epoch - 1, learn_rate, type = rate_schedule, ...) - optimizer_obj <- set_optimizer(optimizer, model, learn_rate, momentum) - - # training loop - coro::loop( - for (batch in dl) { - cl <- function() { - optimizer_obj$zero_grad() - pred <- model(batch$x) - loss <- loss_fn(pred, batch$y, class_weights) - loss$backward() - loss - } - optimizer_obj$step(cl) - } - ) - - # calculate loss on the full datasets - if (validation > 0) { - pred <- model(dl_val$dataset$tensors$x) - loss <- loss_fn(pred, dl_val$dataset$tensors$y, class_weights) - } else { - pred <- model(dl$dataset$tensors$x) - loss <- loss_fn(pred, dl$dataset$tensors$y, class_weights) - } - - # calculate losses - loss_curr <- loss$item() - loss_vec[epoch] <- loss_curr - - if (is.nan(loss_curr)) { - cli::cli_warn("Current loss in NaN. Training wil be stopped.") - break() - } - - if (loss_curr >= loss_min) { - poor_epoch <- poor_epoch + 1 - loss_note <- paste0(" ", cli::symbol$cross, " ") - } else { - loss_min <- loss_curr - loss_note <- NULL - poor_epoch <- 0 - best_epoch <- epoch - } - loss_prev <- loss_curr - - # persists models and coefficients - param_per_epoch[[epoch]] <- - lapply(model$state_dict(), function(x) torch::as_array(x$cpu())) - - if (verbose) { - msg <- paste("epoch:", epoch_chr[epoch], "learn rate", signif(learn_rate, 3), - loss_label, signif(loss_curr, 3), loss_note) - - cli::cli_inform(msg) - } - - if (poor_epoch == stop_iter) { - break() - } + ## ----------------------------------------------------------------------------- + param_per_epoch <- list() + + # Optimize parameters + for (epoch in 1:epochs) { + + # For future work with other optimizers, see + # https://github.com/tidymodels/brulee/pull/56#discussion_r972049108 + # "Creating a new optimizer every epoch will reset the optimizer state. + # For example, SGD with momentum keeps track of the latest update for each + # parameter, so it might be OK to just restart. + # But other optimizers like Adam, will keep a moving average of updates and + # resetting them can interfere in training." + + learn_rate <- set_learn_rate(epoch - 1, learn_rate, type = rate_schedule, ...) + optimizer_obj <- set_optimizer(optimizer, model, learn_rate, momentum) + + # training loop + coro::loop( + for (batch in dl) { + cl <- function() { + optimizer_obj$zero_grad() + pred <- model(batch$x) + loss <- loss_fn(pred, batch$y, class_weights) + loss$backward() + loss + } + optimizer_obj$step(cl) } + ) + + # calculate loss on the full datasets + if (validation > 0) { + pred <- model(dl_val$dataset$tensors$x) + loss <- loss_fn(pred, dl_val$dataset$tensors$y, class_weights) + } else { + pred <- model(dl$dataset$tensors$x) + loss <- loss_fn(pred, dl$dataset$tensors$y, class_weights) + } + + # calculate losses + loss_curr <- loss$item() + loss_vec[epoch] <- loss_curr + + if (is.nan(loss_curr)) { + cli::cli_warn("Loss is NaN at epoch {epoch}. 
Training is stopped.") + break() + } + + if (loss_curr >= loss_min) { + poor_epoch <- poor_epoch + 1 + loss_note <- paste0(" ", cli::symbol$cross, " ") + } else { + loss_min <- loss_curr + loss_note <- NULL + poor_epoch <- 0 + best_epoch <- epoch + } + loss_prev <- loss_curr + + # persists models and coefficients + param_per_epoch[[epoch]] <- + lapply(model$state_dict(), function(x) torch::as_array(x$cpu())) + + if (verbose) { + msg <- paste("epoch:", epoch_chr[epoch], "learn rate", signif(learn_rate, 3), + loss_label, signif(loss_curr, 3), loss_note) + + cli::cli_inform(msg) + } + + if (poor_epoch == stop_iter) { + break() + } - # ------------------------------------------------------------------------------ - - class_weights <- as.numeric(class_weights) - names(class_weights) <- lvls - - ## --------------------------------------------------------------------------- - - list( - model_obj = model_to_raw(model), - estimates = param_per_epoch, - loss = loss_vec[1:length(param_per_epoch)], - best_epoch = best_epoch, - dims = list(p = p, n = n, h = hidden_units, y = y_dim, levels = lvls, features = colnames(x)), - y_stats = y_stats, - parameters = list( - activation = activation, - hidden_units = hidden_units, - learn_rate = learn_rate, - class_weights = class_weights, - penalty = penalty, - mixture = mixture, - dropout = dropout, - validation = validation, - optimizer = optimizer, - batch_size = batch_size, - momentum = momentum, - sched = rate_schedule, - sched_opt = list(...) - ) - ) } + # ------------------------------------------------------------------------------ + + class_weights <- as.numeric(class_weights) + names(class_weights) <- lvls + + ## --------------------------------------------------------------------------- + + list( + model_obj = model_to_raw(model), + estimates = param_per_epoch, + loss = loss_vec[1:length(param_per_epoch)], + best_epoch = best_epoch, + dims = list(p = p, n = n, h = hidden_units, y = y_dim, levels = lvls, + features = colnames(x)), + y_stats = y_stats, + parameters = list( + activation = activation, + hidden_units = hidden_units, + learn_rate = learn_rate, + class_weights = class_weights, + penalty = penalty, + mixture = mixture, + dropout = dropout, + validation = validation, + optimizer = optimizer, + batch_size = batch_size, + momentum = momentum, + sched = rate_schedule, + sched_opt = list(...) + ) + ) + } + mlp_module <- - torch::nn_module( - "mlp_module", - initialize = function(num_pred, hidden_units, act_type, dropout, y_dim) { - - layers <- list() - - # input layer - layers[[1]] <- torch::nn_linear(num_pred, hidden_units[1]) - layers[[2]] <- get_activation_fn(act_type[1]) - - # if hidden units is a vector then we add those layers - if (length(hidden_units) > 1) { - for (i in 2:length(hidden_units)) { - layers[[length(layers) + 1]] <- - torch::nn_linear(hidden_units[i-1], hidden_units[i]) - - layers[[length(layers) + 1]] <- get_activation_fn(act_type[i]) - } - } - - # we only add dropout between the last layer and the output layer - if (dropout > 0) { - layers[[length(layers) + 1]] <- torch::nn_dropout(p = dropout) - } - - # output layer - layers[[length(layers) + 1]] <- - torch::nn_linear(hidden_units[length(hidden_units)], y_dim) - - # conditionally add the softmax layer - if (y_dim > 1) { - layers[[length(layers) + 1]] <- torch::nn_softmax(dim = 2) - } - - # create a sequential module that calls the layers in the same order. 
- self$model <- torch::nn_sequential(!!!layers) - - }, - forward = function(x) { - self$model(x) + torch::nn_module( + "mlp_module", + initialize = function(num_pred, hidden_units, act_type, dropout, y_dim) { + + layers <- list() + + # input layer + layers[[1]] <- torch::nn_linear(num_pred, hidden_units[1]) + layers[[2]] <- get_activation_fn(act_type[1]) + + # if hidden units is a vector then we add those layers + if (length(hidden_units) > 1) { + for (i in 2:length(hidden_units)) { + layers[[length(layers) + 1]] <- + torch::nn_linear(hidden_units[i-1], hidden_units[i]) + layers[[length(layers) + 1]] <- get_activation_fn(act_type[i]) } - ) + } + + # we only add dropout between the last layer and the output layer + if (dropout > 0) { + layers[[length(layers) + 1]] <- torch::nn_dropout(p = dropout) + } + + # output layer + layers[[length(layers) + 1]] <- + torch::nn_linear(hidden_units[length(hidden_units)], y_dim) + + # conditionally add the softmax layer + if (y_dim > 1) { + layers[[length(layers) + 1]] <- torch::nn_softmax(dim = 2) + } + + # create a sequential module that calls the layers in the same order. + self$model <- torch::nn_sequential(!!!layers) + + }, + forward = function(x) { + self$model(x) + } + ) ## ----------------------------------------------------------------------------- get_num_mlp_coef <- function(x) { - length(unlist(x$estimates[[1]])) + length(unlist(x$estimates[[1]])) } get_units<- function(x) { - if (length(x$dims$h) > 1) { - res <- paste0("c(", paste(x$dims$h, collapse = ","), ") hidden units,") - } else { - res <- paste(format(x$dims$h, big.mark = ","), "hidden units,") - } - res + if (length(x$dims$h) > 1) { + res <- paste0("c(", paste(x$dims$h, collapse = ","), ") hidden units,") + } else { + res <- paste(format(x$dims$h, big.mark = ","), "hidden units,") + } + res +} + +get_acts <- function(x) { + if (length(x$dims$h) > 1) { + res <- paste0("c(", paste(x$parameters$activation, collapse = ","), ") activation,") + } else { + res <- paste(x$parameters$activation, "activation,") + } + res } + #' @export print.brulee_mlp <- function(x, ...) { - cat("Multilayer perceptron\n\n") - cat(x$param$activation, "activation\n") - cat( - get_units(x), "", - format(get_num_mlp_coef(x), big.mark = ","), "model parameters\n" - ) - brulee_print(x, ...) + cat("Multilayer perceptron\n\n") + cat( + get_acts(x), "\n", + get_units(x), "\n", + format(get_num_mlp_coef(x), big.mark = ","), " model parameters\n", + sep = "" + ) + brulee_print(x, ...) } ## ----------------------------------------------------------------------------- @@ -880,3 +895,238 @@ set_optimizer <- function(optimizer, model, learn_rate, momentum) { } res } + +init_layer <- function(layer, act) { + gain_for_rng <- torch::nn_init_calculate_gain(act) + torch::nn_init_xavier_normal_(layer, gain_for_rng) +} + +# ------------------------------------------------------------------------------ + +#' @export +#' @rdname brulee_mlp +brulee_mlp_two_layer <- function(x, ...) { + UseMethod("brulee_mlp_two_layer") +} + +#' @export +#' @rdname brulee_mlp +brulee_mlp_two_layer.default <- function(x, ...) { + stop("`brulee_mlp_two_layer()` is not defined for a '", class(x)[1], "'.", call. 
= FALSE) +} + +# XY method - data frame + +#' @export +#' @rdname brulee_mlp +brulee_mlp_two_layer.data.frame <- + function(x, + y, + epochs = 100L, + hidden_units = 3L, + hidden_units_2 = 3L, + activation = "relu", + activation_2 = "relu", + penalty = 0.001, + mixture = 0, + dropout = 0, + validation = 0.1, + optimizer = "LBFGS", + learn_rate = 0.01, + rate_schedule = "none", + momentum = 0.0, + batch_size = NULL, + class_weights = NULL, + stop_iter = 5, + verbose = FALSE, + ...) { + processed <- hardhat::mold(x, y) + + hidden_units_all <- c(hidden_units, hidden_units_2) + activation_all <- c(activation, activation_2) + + res <- + brulee_mlp_bridge( + processed, + epochs = epochs, + hidden_units = hidden_units_all, + activation = activation_all, + learn_rate = learn_rate, + rate_schedule = rate_schedule, + penalty = penalty, + mixture = mixture, + dropout = dropout, + validation = validation, + optimizer = optimizer, + momentum = momentum, + batch_size = batch_size, + class_weights = class_weights, + stop_iter = stop_iter, + verbose = verbose, + ... + ) + class(res) <- c("brulee_mlp_two_layer", class(res)) + res + } + +# XY method - matrix + +#' @export +#' @rdname brulee_mlp +brulee_mlp_two_layer.matrix <- function(x, + y, + epochs = 100L, + hidden_units = 3L, + hidden_units_2 = 3L, + activation = "relu", + activation_2 = "relu", + penalty = 0.001, + mixture = 0, + dropout = 0, + validation = 0.1, + optimizer = "LBFGS", + learn_rate = 0.01, + rate_schedule = "none", + momentum = 0.0, + batch_size = NULL, + class_weights = NULL, + stop_iter = 5, + verbose = FALSE, + ...) { + processed <- hardhat::mold(x, y) + + hidden_units_all <- c(hidden_units, hidden_units_2) + activation_all <- c(activation, activation_2) + + res <- + brulee_mlp_bridge( + processed, + epochs = epochs, + hidden_units = hidden_units_all, + activation = activation_all, + learn_rate = learn_rate, + rate_schedule = rate_schedule, + momentum = momentum, + penalty = penalty, + mixture = mixture, + dropout = dropout, + validation = validation, + optimizer = optimizer, + batch_size = batch_size, + class_weights = class_weights, + stop_iter = stop_iter, + verbose = verbose, + ... + ) + class(res) <- c("brulee_mlp_two_layer", class(res)) + res +} + +# Formula method + +#' @export +#' @rdname brulee_mlp +brulee_mlp_two_layer.formula <- + function(formula, + data, + epochs = 100L, + hidden_units = 3L, + hidden_units_2 = 3L, + activation = "relu", + activation_2 = "relu", + penalty = 0.001, + mixture = 0, + dropout = 0, + validation = 0.1, + optimizer = "LBFGS", + learn_rate = 0.01, + rate_schedule = "none", + momentum = 0.0, + batch_size = NULL, + class_weights = NULL, + stop_iter = 5, + verbose = FALSE, + ...) { + processed <- hardhat::mold(formula, data) + + hidden_units_all <- c(hidden_units, hidden_units_2) + activation_all <- c(activation, activation_2) + + res <- + brulee_mlp_bridge( + processed, + epochs = epochs, + hidden_units = hidden_units_all, + activation = activation_all, + learn_rate = learn_rate, + rate_schedule = rate_schedule, + momentum = momentum, + penalty = penalty, + mixture = mixture, + dropout = dropout, + validation = validation, + optimizer = optimizer, + batch_size = batch_size, + class_weights = class_weights, + stop_iter = stop_iter, + verbose = verbose, + ... 
+ ) + class(res) <- c("brulee_mlp_two_layer", class(res)) + res + } + +# Recipe method + +#' @export +#' @rdname brulee_mlp +brulee_mlp_two_layer.recipe <- + function(x, + data, + epochs = 100L, + hidden_units = 3L, + hidden_units_2 = 3L, + activation = "relu", + activation_2 = "relu", + penalty = 0.001, + mixture = 0, + dropout = 0, + validation = 0.1, + optimizer = "LBFGS", + learn_rate = 0.01, + rate_schedule = "none", + momentum = 0.0, + batch_size = NULL, + class_weights = NULL, + stop_iter = 5, + verbose = FALSE, + ...) { + processed <- hardhat::mold(x, data) + + hidden_units_all <- c(hidden_units, hidden_units_2) + activation_all <- c(activation, activation_2) + + res <- + brulee_mlp_bridge( + processed, + epochs = epochs, + hidden_units = hidden_units_all, + activation = activation_all, + learn_rate = learn_rate, + rate_schedule = rate_schedule, + momentum = momentum, + penalty = penalty, + mixture = mixture, + dropout = dropout, + validation = validation, + optimizer = optimizer, + batch_size = batch_size, + class_weights = class_weights, + stop_iter = stop_iter, + verbose = verbose, + ... + ) + class(res) <- c("brulee_mlp_two_layer", class(res)) + res + } + diff --git a/R/tunable.R b/R/tunable.R index 78eb3b8..367f2ce 100644 --- a/R/tunable.R +++ b/R/tunable.R @@ -1,3 +1,5 @@ +tune_activations <- c("relu", "tanh", "elu", "log_sigmoid", "tanhshrink") + #' Internal functions and methods #' @export #' @keywords internal @@ -9,7 +11,7 @@ tunable.brulee_mlp <- function(x, ...) { call_info = list( list(pkg = "dials", fun = "epochs", range = c(5L, 500L)), list(pkg = "dials", fun = "hidden_units", range = c(2L, 50L)), - list(pkg = "dials", fun = "activation"), + list(pkg = "dials", fun = "activation", values = tune_activations), list(pkg = "dials", fun = "penalty"), list(pkg = "dials", fun = "dropout"), list(pkg = "dials", fun = "learn_rate", range = c(-3, -1/5)), @@ -24,6 +26,35 @@ tunable.brulee_mlp <- function(x, ...) { ) } +#' @export +#' @keywords internal +#' @name brulee-internal +tunable.brulee_mlp_two_layer <- function(x, ...) 
{ + tibble::tibble( + name = c('epochs', 'hidden_units', 'hidden_units_2', 'activation', 'activation_2', + 'penalty', 'dropout', 'learn_rate', 'momentum', 'batch_size', + 'class_weights', 'stop_iter'), + call_info = list( + list(pkg = "dials", fun = "epochs", range = c(5L, 500L)), + list(pkg = "dials", fun = "hidden_units", range = c(2L, 50L)), + list(pkg = "dials", fun = "hidden_units", range = c(2L, 50L)), + list(pkg = "dials", fun = "activation", values = tune_activations), + list(pkg = "dials", fun = "activation", values = tune_activations), + list(pkg = "dials", fun = "penalty"), + list(pkg = "dials", fun = "dropout"), + list(pkg = "dials", fun = "learn_rate", range = c(-3, -1/5)), + list(pkg = "dials", fun = "momentum", range = c(0.50, 0.95)), + list(pkg = "dials", fun = "batch_size"), + list(pkg = "dials", fun = "stop_iter"), + list(pkg = "dials", fun = "class_weights") + ), + source = "model_spec", + component = class(x)[class(x) != "model_spec"][1], + component_id = "main" + ) +} + + #' @export #' @keywords internal #' @rdname brulee-internal diff --git a/inst/WORDLIST b/inst/WORDLIST index 56eb8d0..c5851ad 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -5,7 +5,6 @@ Lifecycle ORCID PBC SGD -elu extensibility funder mlp @@ -15,3 +14,4 @@ relu sigmoid tanh tibble +tidymodels diff --git a/man/brulee-internal.Rd b/man/brulee-internal.Rd index 489345e..b2b831d 100644 --- a/man/brulee-internal.Rd +++ b/man/brulee-internal.Rd @@ -3,6 +3,7 @@ \name{brulee-internal} \alias{brulee-internal} \alias{tunable.brulee_mlp} +\alias{tunable.brulee_mlp_two_layer} \alias{tunable.brulee_logistic_reg} \alias{tunable.brulee_multinomial_reg} \alias{tunable.brulee_linear_reg} @@ -10,6 +11,8 @@ \usage{ \method{tunable}{brulee_mlp}(x, ...) +\method{tunable}{brulee_mlp_two_layer}(x, ...) + \method{tunable}{brulee_logistic_reg}(x, ...) \method{tunable}{brulee_multinomial_reg}(x, ...) diff --git a/man/brulee_mlp.Rd b/man/brulee_mlp.Rd index 83b3440..573a68d 100644 --- a/man/brulee_mlp.Rd +++ b/man/brulee_mlp.Rd @@ -7,6 +7,12 @@ \alias{brulee_mlp.matrix} \alias{brulee_mlp.formula} \alias{brulee_mlp.recipe} +\alias{brulee_mlp_two_layer} +\alias{brulee_mlp_two_layer.default} +\alias{brulee_mlp_two_layer.data.frame} +\alias{brulee_mlp_two_layer.matrix} +\alias{brulee_mlp_two_layer.formula} +\alias{brulee_mlp_two_layer.recipe} \title{Fit neural networks} \usage{ brulee_mlp(x, ...) @@ -96,6 +102,102 @@ brulee_mlp(x, ...) verbose = FALSE, ... ) + +brulee_mlp_two_layer(x, ...) + +\method{brulee_mlp_two_layer}{default}(x, ...) + +\method{brulee_mlp_two_layer}{data.frame}( + x, + y, + epochs = 100L, + hidden_units = 3L, + hidden_units_2 = 3L, + activation = "relu", + activation_2 = "relu", + penalty = 0.001, + mixture = 0, + dropout = 0, + validation = 0.1, + optimizer = "LBFGS", + learn_rate = 0.01, + rate_schedule = "none", + momentum = 0, + batch_size = NULL, + class_weights = NULL, + stop_iter = 5, + verbose = FALSE, + ... +) + +\method{brulee_mlp_two_layer}{matrix}( + x, + y, + epochs = 100L, + hidden_units = 3L, + hidden_units_2 = 3L, + activation = "relu", + activation_2 = "relu", + penalty = 0.001, + mixture = 0, + dropout = 0, + validation = 0.1, + optimizer = "LBFGS", + learn_rate = 0.01, + rate_schedule = "none", + momentum = 0, + batch_size = NULL, + class_weights = NULL, + stop_iter = 5, + verbose = FALSE, + ... 
+) + +\method{brulee_mlp_two_layer}{formula}( + formula, + data, + epochs = 100L, + hidden_units = 3L, + hidden_units_2 = 3L, + activation = "relu", + activation_2 = "relu", + penalty = 0.001, + mixture = 0, + dropout = 0, + validation = 0.1, + optimizer = "LBFGS", + learn_rate = 0.01, + rate_schedule = "none", + momentum = 0, + batch_size = NULL, + class_weights = NULL, + stop_iter = 5, + verbose = FALSE, + ... +) + +\method{brulee_mlp_two_layer}{recipe}( + x, + data, + epochs = 100L, + hidden_units = 3L, + hidden_units_2 = 3L, + activation = "relu", + activation_2 = "relu", + penalty = 0.001, + mixture = 0, + dropout = 0, + validation = 0.1, + optimizer = "LBFGS", + learn_rate = 0.01, + rate_schedule = "none", + momentum = 0, + batch_size = NULL, + class_weights = NULL, + stop_iter = 5, + verbose = FALSE, + ... +) } \arguments{ \item{x}{Depending on the context: @@ -126,7 +228,7 @@ specified as: of integers. If a vector of integers, the model will have \code{length(hidden_units)} layers each with \code{hidden_units[i]} hidden units.} -\item{activation}{A character vector for the activation function )such as +\item{activation}{A character vector for the activation function (such as "relu", "tanh", "sigmoid", and so on). See \code{\link[=brulee_activations]{brulee_activations()}} for a list of possible values. If \code{hidden_units} is a vector, \code{activation} can be a character vector with length equals to \code{length(hidden_units)} @@ -184,6 +286,10 @@ and the predictor term(s) on the right-hand side.} \itemize{ \item A \strong{data frame} containing both the predictors and the outcome. }} + +\item{hidden_units_2}{An integer for the number of hidden units for a second layer.} + +\item{activation_2}{A character vector for the activation function for a second layer.} } \value{ A \code{brulee_mlp} object with elements: @@ -202,7 +308,9 @@ likelihood for classification) at each epoch. } \description{ \code{brulee_mlp()} fits neural network models using stochastic gradient -descent. Multiple layers can be used. +descent. Multiple layers can be used. For working with two-layer networks in +tidymodels, \code{brulee_mlp_two_layer()} can be helpful for specifying tuning +parameters as scalars. } \details{ This function fits feed-forward neural network models for regression (when @@ -303,6 +411,15 @@ if (torch::torch_is_installed()) { bind_cols(ames_test) \%>\% rmse(Sale_Price, .pred) + # Using multiple hidden layers and activation functions + set.seed(2) + hidden_fit <- brulee_mlp(ames_rec, data = ames_train, + hidden_units = c(15L, 17L), activation = c("relu", "elu"), + dropout = 0.05, rate_schedule = "cyclic", step_size = 4) + + predict(hidden_fit, ames_test) \%>\% + bind_cols(ames_test) \%>\% + rmse(Sale_Price, .pred) # ------------------------------------------------------------------------------ # classification diff --git a/tests/testthat/_snaps/overflow.md b/tests/testthat/_snaps/overflow.md new file mode 100644 index 0000000..2e6b895 --- /dev/null +++ b/tests/testthat/_snaps/overflow.md @@ -0,0 +1,22 @@ +# NaN loss due to overflow + + Loss is NaN at epoch 10. Training is stopped. 
+ +--- + + Code + print(mlp_fit) + Output + Multilayer perceptron + + relu activation, + 10 hidden units, + 52 model parameters + 200 samples, 2 features, 2 classes + class weights one=1, two=1 + weight decay: 0.001 + dropout proportion: 0 + batch size: 180 + learn rate: 0.01 + validation loss after 9 epochs: 0.469 + diff --git a/tests/testthat/_snaps/tunable.md b/tests/testthat/_snaps/tunable.md new file mode 100644 index 0000000..0f3d5ad --- /dev/null +++ b/tests/testthat/_snaps/tunable.md @@ -0,0 +1,429 @@ +# tunable values + + Code + brulee:::tunable.brulee_linear_reg(1)$call_info + Output + [[1]] + [[1]]$pkg + [1] "dials" + + [[1]]$fun + [1] "epochs" + + [[1]]$range + [1] 5 100 + + + [[2]] + [[2]]$pkg + [1] "dials" + + [[2]]$fun + [1] "penalty" + + + [[3]] + [[3]]$pkg + [1] "dials" + + [[3]]$fun + [1] "learn_rate" + + [[3]]$range + [1] -3.0 -0.2 + + + [[4]] + [[4]]$pkg + [1] "dials" + + [[4]]$fun + [1] "momentum" + + [[4]]$range + [1] 0.50 0.95 + + + [[5]] + [[5]]$pkg + [1] "dials" + + [[5]]$fun + [1] "batch_size" + + + [[6]] + [[6]]$pkg + [1] "dials" + + [[6]]$fun + [1] "stop_iter" + + + +--- + + Code + brulee:::tunable.brulee_logistic_reg(1)$call_info + Output + [[1]] + [[1]]$pkg + [1] "dials" + + [[1]]$fun + [1] "epochs" + + [[1]]$range + [1] 5 100 + + + [[2]] + [[2]]$pkg + [1] "dials" + + [[2]]$fun + [1] "penalty" + + + [[3]] + [[3]]$pkg + [1] "dials" + + [[3]]$fun + [1] "learn_rate" + + [[3]]$range + [1] -3.0 -0.2 + + + [[4]] + [[4]]$pkg + [1] "dials" + + [[4]]$fun + [1] "momentum" + + [[4]]$range + [1] 0.50 0.95 + + + [[5]] + [[5]]$pkg + [1] "dials" + + [[5]]$fun + [1] "batch_size" + + + [[6]] + [[6]]$pkg + [1] "dials" + + [[6]]$fun + [1] "stop_iter" + + + [[7]] + [[7]]$pkg + [1] "dials" + + [[7]]$fun + [1] "class_weights" + + + +--- + + Code + brulee:::tunable.brulee_multinomial_reg(1)$call_info + Output + [[1]] + [[1]]$pkg + [1] "dials" + + [[1]]$fun + [1] "epochs" + + [[1]]$range + [1] 5 100 + + + [[2]] + [[2]]$pkg + [1] "dials" + + [[2]]$fun + [1] "penalty" + + + [[3]] + [[3]]$pkg + [1] "dials" + + [[3]]$fun + [1] "learn_rate" + + [[3]]$range + [1] -3.0 -0.2 + + + [[4]] + [[4]]$pkg + [1] "dials" + + [[4]]$fun + [1] "momentum" + + [[4]]$range + [1] 0.50 0.95 + + + [[5]] + [[5]]$pkg + [1] "dials" + + [[5]]$fun + [1] "batch_size" + + + [[6]] + [[6]]$pkg + [1] "dials" + + [[6]]$fun + [1] "stop_iter" + + + [[7]] + [[7]]$pkg + [1] "dials" + + [[7]]$fun + [1] "class_weights" + + + +--- + + Code + brulee:::tunable.brulee_mlp(1)$call_info + Output + [[1]] + [[1]]$pkg + [1] "dials" + + [[1]]$fun + [1] "epochs" + + [[1]]$range + [1] 5 500 + + + [[2]] + [[2]]$pkg + [1] "dials" + + [[2]]$fun + [1] "hidden_units" + + [[2]]$range + [1] 2 50 + + + [[3]] + [[3]]$pkg + [1] "dials" + + [[3]]$fun + [1] "activation" + + [[3]]$values + [1] "relu" "tanh" "elu" "log_sigmoid" "tanhshrink" + + + [[4]] + [[4]]$pkg + [1] "dials" + + [[4]]$fun + [1] "penalty" + + + [[5]] + [[5]]$pkg + [1] "dials" + + [[5]]$fun + [1] "dropout" + + + [[6]] + [[6]]$pkg + [1] "dials" + + [[6]]$fun + [1] "learn_rate" + + [[6]]$range + [1] -3.0 -0.2 + + + [[7]] + [[7]]$pkg + [1] "dials" + + [[7]]$fun + [1] "momentum" + + [[7]]$range + [1] 0.50 0.95 + + + [[8]] + [[8]]$pkg + [1] "dials" + + [[8]]$fun + [1] "batch_size" + + + [[9]] + [[9]]$pkg + [1] "dials" + + [[9]]$fun + [1] "stop_iter" + + + [[10]] + [[10]]$pkg + [1] "dials" + + [[10]]$fun + [1] "class_weights" + + + +--- + + Code + brulee:::tunable.brulee_mlp_two_layer(1)$call_info + Output + [[1]] + [[1]]$pkg + [1] "dials" + + [[1]]$fun + [1] "epochs" + + [[1]]$range + [1] 
5 500 + + + [[2]] + [[2]]$pkg + [1] "dials" + + [[2]]$fun + [1] "hidden_units" + + [[2]]$range + [1] 2 50 + + + [[3]] + [[3]]$pkg + [1] "dials" + + [[3]]$fun + [1] "hidden_units" + + [[3]]$range + [1] 2 50 + + + [[4]] + [[4]]$pkg + [1] "dials" + + [[4]]$fun + [1] "activation" + + [[4]]$values + [1] "relu" "tanh" "elu" "log_sigmoid" "tanhshrink" + + + [[5]] + [[5]]$pkg + [1] "dials" + + [[5]]$fun + [1] "activation" + + [[5]]$values + [1] "relu" "tanh" "elu" "log_sigmoid" "tanhshrink" + + + [[6]] + [[6]]$pkg + [1] "dials" + + [[6]]$fun + [1] "penalty" + + + [[7]] + [[7]]$pkg + [1] "dials" + + [[7]]$fun + [1] "dropout" + + + [[8]] + [[8]]$pkg + [1] "dials" + + [[8]]$fun + [1] "learn_rate" + + [[8]]$range + [1] -3.0 -0.2 + + + [[9]] + [[9]]$pkg + [1] "dials" + + [[9]]$fun + [1] "momentum" + + [[9]]$range + [1] 0.50 0.95 + + + [[10]] + [[10]]$pkg + [1] "dials" + + [[10]]$fun + [1] "batch_size" + + + [[11]] + [[11]]$pkg + [1] "dials" + + [[11]]$fun + [1] "stop_iter" + + + [[12]] + [[12]]$pkg + [1] "dials" + + [[12]]$fun + [1] "class_weights" + + + diff --git a/tests/testthat/test-mlp-multinomial.R b/tests/testthat/test-mlp-multinomial.R index 745fbab..99e13f1 100644 --- a/tests/testthat/test-mlp-multinomial.R +++ b/tests/testthat/test-mlp-multinomial.R @@ -30,7 +30,7 @@ test_that("basic multinomial mlp LBFGS", { mnl_fit_lbfgs <- brulee_mlp(class ~ ., mnl_tr, - epochs = 200, + epochs = 10, hidden_units = 5, rate_schedule = "cyclic", learn_rate = 0.1)}, diff --git a/tests/testthat/test-mlp-regression.R b/tests/testthat/test-mlp-regression.R index 18b1eef..47da8d8 100644 --- a/tests/testthat/test-mlp-regression.R +++ b/tests/testthat/test-mlp-regression.R @@ -327,7 +327,6 @@ test_that('bad args', { test_that("mlp learns something", { skip_if(!torch::torch_is_installed()) - skip_on_os("mac", arch = "aarch64") # ------------------------------------------------------------------------------ @@ -350,7 +349,6 @@ test_that("mlp learns something", { }) test_that("variable hidden_units length", { skip_if(!torch::torch_is_installed()) - skip_on_os("mac", arch = "aarch64") x <- data.frame(x = rnorm(1000)) y <- 2 * x$x @@ -376,3 +374,108 @@ test_that("variable hidden_units length", { ) }) + + +test_that('two-layer networks', { + skip_if(!torch::torch_is_installed()) + + skip_if_not_installed("modeldata") + skip_if_not_installed("yardstick") + skip_if_not_installed("recipes") + + suppressPackageStartupMessages(library(dplyr)) + suppressPackageStartupMessages(library(recipes)) + + # ------------------------------------------------------------------------------ + + set.seed(585) + reg_tr <- modeldata::sim_regression(5000) + reg_te <- modeldata::sim_regression(1000) + + reg_tr_x_df <- reg_tr[, -1] + reg_tr_x_mat <- as.matrix(reg_tr_x_df) + reg_tr_y <- reg_tr$outcome + + reg_rec <- + recipe(outcome ~ ., data = reg_tr) %>% + step_normalize(all_predictors()) + + # ------------------------------------------------------------------------------ + + # matrix x + expect_error({ + set.seed(1) + mlp_reg_mat_two_fit <- + brulee_mlp_two_layer( + reg_tr_x_mat, + reg_tr_y, + mixture = 0, + learn_rate = .1, + hidden_units = 5, + hidden_units_2 = 10, + activation = "relu", + activation_2 = "elu" + ) + }, + regex = NA) + + expect_error({ + set.seed(1) + mlp_reg_mat_two_check_fit <- + brulee_mlp( + reg_tr_x_mat, + reg_tr_y, + mixture = 0, + learn_rate = .1, + hidden_units = c(5, 10), + activation = c("relu", "elu") + ) + }, + regex = NA) + + expect_equal(mlp_reg_mat_two_fit$loss, mlp_reg_mat_two_check_fit$loss) + + # data frame x 
(all numeric) + expect_error( + mlp_reg_df_two_fit <- + brulee_mlp_two_layer( + reg_tr_x_df, + reg_tr_y, + validation = .2, + hidden_units = 5, + hidden_units_2 = 10, + activation = "celu", + activation_2 = "gelu" + ), + regex = NA + ) + + # formula (mixed) + expect_error({ + set.seed(8373) + mlp_reg_f_two_fit <- brulee_mlp_two_layer( + outcome ~ ., + reg_tr, + hidden_units = 5, + hidden_units_2 = 10, + activation = "hardshrink", + activation_2 = "hardsigmoid" + ) + }, + regex = NA) + + # recipe + expect_error({ + set.seed(8373) + mlp_reg_rec_two_fit <- brulee_mlp_two_layer( + reg_rec, + reg_tr, + hidden_units = 5, + hidden_units_2 = 10, + activation = "hardtanh", + activation_2 = "sigmoid" + ) + }, + regex = NA) + +}) diff --git a/tests/testthat/test-overflow.R b/tests/testthat/test-overflow.R new file mode 100644 index 0000000..4dca35a --- /dev/null +++ b/tests/testthat/test-overflow.R @@ -0,0 +1,20 @@ + + +test_that("NaN loss due to overflow", { + skip_if_not_installed("modeldata") + skip_on_os(c("windows", "linux", "solaris")) + skip_on_os("mac", arch = "x86_64") + + i <- 81872 + set.seed(i) + data_tr <- modeldata::sim_logistic(200, ~ .1 + 2 * A - 3 * B + 1 * A *B, corr = .7) + + expect_snapshot_warning({ + set.seed(i+1) + mlp_fit <- brulee_mlp(class ~ ., data = data_tr, hidden_units = 10, + stop_iter = Inf) + }) + expect_snapshot(print(mlp_fit)) + expect_equal(length(mlp_fit$estimates), 9) + +}) diff --git a/tests/testthat/test-tunable.R b/tests/testthat/test-tunable.R new file mode 100644 index 0000000..6dba141 --- /dev/null +++ b/tests/testthat/test-tunable.R @@ -0,0 +1,8 @@ + +test_that("tunable values", { + expect_snapshot(brulee:::tunable.brulee_linear_reg(1)$call_info) + expect_snapshot(brulee:::tunable.brulee_logistic_reg(1)$call_info) + expect_snapshot(brulee:::tunable.brulee_multinomial_reg(1)$call_info) + expect_snapshot(brulee:::tunable.brulee_mlp(1)$call_info) + expect_snapshot(brulee:::tunable.brulee_mlp_two_layer(1)$call_info) +})
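
------------------------------------------------------------------------------

A minimal usage sketch of the new interface (outside the patch itself; it
assumes torch is installed and mirrors the simulated-data setup in the new
regression tests). As those tests verify, the scalar arguments are combined
into the same vectors that `brulee_mlp()` already accepts:

    library(brulee)

    set.seed(585)
    reg_tr <- modeldata::sim_regression(1000)

    set.seed(1)
    fit <- brulee_mlp_two_layer(
      outcome ~ ., data = reg_tr,
      hidden_units   = 5,  activation   = "relu",
      hidden_units_2 = 10, activation_2 = "elu",
      learn_rate = 0.1
    )

    # the tests check that this matches the existing vector interface:
    # brulee_mlp(outcome ~ ., data = reg_tr, hidden_units = c(5, 10),
    #            activation = c("relu", "elu"), learn_rate = 0.1)

    predict(fit, reg_tr)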