options(repr.plot.width = 5, repr.plot.height = 5) ## resizing plots options(scipen = 999) ## has scientific notation ever annoyed you? library(here) library(usethis) library(tidyverse) library(data.table, quietly = TRUE) library(earth) library(glmnet) library(randomForest) library(hal9001) library(sl3) set.seed(385971) n = 1000 p = 3 x <- cbind(matrix(rnorm(n * p), n, p), rbinom(n, 1, 0.61)) y <- sin(x[, 1]) * sin(x[, 2]) + x[, 1]^2 * (x[, 1] > x[, 2]) + x[, 3] * x[,4] + rnorm(n, mean = 0, sd = 0.3) data <- as.data.table(cbind(x, y)) setnames(data, c(paste0("X", seq_len(p + 1)), "Y")) head(data) GGally::ggpairs(data) # Intercept GLM mean_fit <- mean(y) mean_pred <- rep(mean_fit, length(y)) (mse_mean <- mean((y - mean_pred)^2)) # Simple GLM glm_fit <- glm(Y ~ ., data = data) glm_pred <- predict(glm_fit) (mse_glm <- mean((y - glm_pred)^2)) # Ridge (L2-penalized) regression ridge_fit <- glmnet(x = x, y = y, alpha = 0) ridge_pred <- predict(ridge_fit, newx = x) (mse_ridge <- mean((y - ridge_pred)^2)) # Lasso (L1-penalized) regression lasso_fit <- glmnet(x = x, y = y, alpha = 1) lasso_pred <- predict(lasso_fit, newx = x) (mse_lasso <- mean((y - lasso_pred)^2)) # Multivariate Adaptive Regression Splines (MARS) mars_fit <- earth(Y ~ ., data = data) mars_pred <- predict(mars_fit) (mse_mars <- mean((y - mars_pred)^2)) # Random Forest rf_fit <- randomForest(Y ~ ., data = data) rf_pred <- predict(rf_fit) (mse_rf <- mean((y - rf_pred)^2)) # Highly Adaptive LASSO (HAL) hal_fit <- fit_hal(X = x, Y = y) hal_pred <- predict(hal_fit, new_data = x) (mse_hal <- mean((y - hal_pred)^2)) task <- sl3_Task$new(data, covariates = c(paste0("X", seq_len(p + 1))), outcome = "Y") task lrnr_stack <- make_learner_stack("Lrnr_mean", "Lrnr_glm_fast", "Lrnr_randomForest") lrnr_stack sl <- Lrnr_sl$new(learners = list(lrnr_stack), metalearner = Lrnr_nnls$new()) sl_fit <- sl$train(task) preds_sl <- sl_fit$predict() (mse_sl <- mean((y - preds_sl)^2)) sl_fit lrnr_stack_hal <- make_learner_stack("Lrnr_mean", "Lrnr_glm_fast", "Lrnr_randomForest", "Lrnr_hal9001") sl_hal <- Lrnr_sl$new(learners = list(lrnr_stack_hal), metalearner = Lrnr_nnls$new()) sl_hal_fit <- sl_hal$train(task) preds_sl_hal <- sl_hal_fit$predict() (mse_sl_hal <- mean((y - preds_sl_hal)^2)) sl_hal_fit