modeltime.h2o
modeltime.h2o copied to clipboard
Develop H2O Regression Algorithms
Here's a minimal example based on Shafi's code. We can convert this into tidymodels
format once we agree on the process being shown.
# MVP EXAMPLE ----
# automl_reg() function
# Libraries ----
library(modeltime)
library(tidymodels)
library(h2o)
library(tidyverse)
library(timetk)
# Data ----
# - This is before modeltime
# Keep only the series identifier, the date index, and the target column.
data_tbl <- walmart_sales_weekly %>%
  select(id, Date, Weekly_Sales)

# Time-based train/test split with a 3-month assessment window.
splits <- timetk::time_series_split(
  data_tbl,
  assess     = "3 month",
  cumulative = TRUE
)

# Recipe: Weekly_Sales is the outcome, every other column is a predictor,
# and step_timeseries_signature() is applied to the Date column.
recipe_spec <- recipe(Weekly_Sales ~ ., data = rsample::training(splits)) %>%
  step_timeseries_signature(Date)

# Estimate the recipe ONCE and reuse the prepared object for both splits.
# (Previously prep() was called separately for the train and test bake,
# repeating the same estimation work.)
recipe_prepped <- prep(recipe_spec)

train_tbl <- bake(recipe_prepped, new_data = rsample::training(splits))
test_tbl  <- bake(recipe_prepped, new_data = rsample::testing(splits))
# H2O Initialization ----
# - User will set up H2O
# Connect R to the H2O cluster at localhost:54321.
# NOTE(review): in the session recorded below, R attached to an already
# running cluster (uptime 2 days) that reports 7.90 GB of total memory, so
# the "1000G" request was presumably not what sized that cluster - confirm
# the intended max_mem_size before relying on it.
h2o.init(
  ip           = "localhost",
  port         = 54321,
  nthreads     = -1,
  max_mem_size = "1000G"
)
#> Connection successful!
#>
#> R is connected to the H2O cluster:
#> H2O cluster uptime: 2 days 21 hours
#> H2O cluster timezone: America/New_York
#> H2O data parsing timezone: UTC
#> H2O cluster version: 3.32.0.1
#> H2O cluster version age: 4 months and 27 days !!!
#> H2O cluster name: H2O_started_from_R_mdancho_gyx565
#> H2O cluster total nodes: 1
#> H2O cluster total memory: 7.90 GB
#> H2O cluster total cores: 12
#> H2O cluster allowed cores: 12
#> H2O cluster healthy: TRUE
#> H2O Connection ip: localhost
#> H2O Connection port: 54321
#> H2O Connection proxy: NA
#> H2O Internal Security: FALSE
#> H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4
#> R Version: R version 4.0.2 (2020-06-22)
#> Warning in h2o.clusterInfo():
#> Your H2O cluster version is too old (4 months and 27 days)!
#> Please download and install the latest version from http://h2o.ai/download/
# MODELING WORKFLOW ----
# - This is where Modeltime H2O takes over
# Spec - The package API will handle this with:
# * automl_reg() %>% set_engine("h2o") ----
# - I doubt this function needs hyperparams
# - Users can use set_engine() to specify any args
# * fit() ----
# - Will handle preparing as H2O Frame, training the automl, storing either a leaderboard or a subset of models
# ** Prep data
train_tbl <- train_tbl %>%
  # H2O doesn't like ordered factors, so rebuild each ordered factor as a
  # plain (unordered) factor. across() + where() replaces the superseded
  # mutate_if() while applying the exact same conversion.
  mutate(across(where(is.ordered), function(x) factor(x, ordered = FALSE)))

# ** Convert to H2O Frame
train_h2o <- as.h2o(train_tbl)
#> | | | 0% | |======================================================================| 100%

# Target column name, and every other column of the frame as a predictor.
y <- "Weekly_Sales"
x <- setdiff(names(train_h2o), y)
# Train H2O AutoML for the target `y` using predictors `x`.
#
# Only a training frame is supplied, so model ranking relies on the 5-fold
# cross-validation requested via `nfolds`.
#
# The project name is made unique per run. With a fixed name ('project_01')
# on a long-lived cluster, each re-run appended its models to the existing
# leaderboard. The log recorded below comes from such runs: it reports
# "New models will be added to existing leaderboard ... with already N
# models", and every StackedEnsemble fails with "Failed to find the xval
# predictions frame". A fresh project name per run starts from an empty
# leaderboard.
aml_results <- h2o.automl(
  x = x,
  y = y,
  # Data Specifications -
  # - I recommend to only use a Training Frame
  # - This lets CV do the validation
  training_frame = train_h2o,
  # validation_frame = valid,
  # leaderboard_frame = test,
  # User Defined Args
  max_runtime_secs = 30,
  max_runtime_secs_per_model = 30,
  # Timestamp suffix keeps the name unique across runs on the same cluster.
  project_name = paste0("project_01_", format(Sys.time(), "%Y%m%d_%H%M%S")),
  nfolds = 5,
  max_models = 1000,
  exclude_algos = c("DeepLearning"),
  seed = 786
)
#> | | | 0%
#> 06:58:55.409: New models will be added to existing leaderboard project_01@@Weekly_Sales (leaderboard frame=null) with already 23 models.
#> 06:59:06.476: StackedEnsemble_BestOfFamily_AutoML_20210308_065855 [StackedEnsemble best (built using top model from each algorithm type)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . . Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.
#> 06:59:07.481: StackedEnsemble_AllModels_AutoML_20210308_065855 [StackedEnsemble all (built using all AutoML models)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . . Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.
#> 07:09:49.460: New models will be added to existing leaderboard project_01@@Weekly_Sales (leaderboard frame=null) with already 33 models.
#> 07:10:00.529: StackedEnsemble_BestOfFamily_AutoML_20210308_070949 [StackedEnsemble best (built using top model from each algorithm type)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . . Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.
#> 07:10:01.536: StackedEnsemble_AllModels_AutoML_20210308_070949 [StackedEnsemble all (built using all AutoML models)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . . Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.
#> 07:19:21.102: New models will be added to existing leaderboard project_01@@Weekly_Sales (leaderboard frame=null) with already 43 models.
#> 07:19:32.172: StackedEnsemble_BestOfFamily_AutoML_20210308_071921 [StackedEnsemble best (built using top model from each algorithm type)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . . Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.
#> 07:19:33.178: StackedEnsemble_AllModels_AutoML_20210308_071921 [StackedEnsemble all (built using all AutoML models)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . . Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.
#> 07:20:19.518: New models will be added to existing leaderboard project_01@@Weekly_Sales (leaderboard frame=null) with already 53 models.
#> 07:20:46.723: StackedEnsemble_BestOfFamily_AutoML_20210308_072019 [StackedEnsemble best (built using top model from each algorithm type)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . . Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.
#> 07:20:47.728: StackedEnsemble_AllModels_AutoML_20210308_072019 [StackedEnsemble all (built using all AutoML models)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . . Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.
#> 07:31:42.782: New models will be added to existing leaderboard project_01@@Weekly_Sales (leaderboard frame=null) with already 108 models.
#> 07:32:10.987: StackedEnsemble_BestOfFamily_AutoML_20210308_073142 [StackedEnsemble best (built using top model from each algorithm type)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . . Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.
#> 07:32:11.993: StackedEnsemble_AllModels_AutoML_20210308_073142 [StackedEnsemble all (built using all AutoML models)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . . Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.
#> 07:33:52.726: New models will be added to existing leaderboard project_01@@Weekly_Sales (leaderboard frame=null) with already 148 models. | |======= | 10% | |=========== | 16% | |=============== | 21% | |================== | 26% | |====================== | 32% | |========================== | 37% | |============================= | 42% | |================================= | 47% | |==================================== | 52% | |======================================== | 57% | |============================================ | 62% | |=============================================== | 67% | |=================================================== | 73% | |======================================================= | 78% | |=========================================================== | 84% | |=============================================================== | 90% | |================================================================== | 95%
#> 07:34:20.984: StackedEnsemble_BestOfFamily_AutoML_20210308_073352 [StackedEnsemble best (built using top model from each algorithm type)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . . Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.
#> 07:34:21.989: StackedEnsemble_AllModels_AutoML_20210308_073352 [StackedEnsemble all (built using all AutoML models)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . . Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted. | |======================================================================| 100%
# Returns many models organized by a Leaderboard.
# Evaluating the object at top level prints its summary: project name,
# leader model ID, training duration, and the first leaderboard rows
# (see the recorded output that follows).
aml_results
#> AutoML Details
#> ==============
#> Project Name: project_01
#> Leader Model ID: XGBoost_grid__1_AutoML_20210308_073352_model_17
#> Algorithm: xgboost
#>
#> Total Number of Models Trained: 190
#> Start Time: 2021-03-08 07:33:53 UTC
#> End Time: 2021-03-08 07:34:22 UTC
#> Duration: 29 s
#>
#> Leaderboard
#> ===========
#> model_id mean_residual_deviance
#> 1 XGBoost_grid__1_AutoML_20210308_073352_model_17 32430887
#> 2 XGBoost_grid__1_AutoML_20210308_073142_model_17 32430887
#> 3 XGBoost_grid__1_AutoML_20210308_073352_model_3 34331133
#> 4 XGBoost_grid__1_AutoML_20210308_073142_model_3 34331133
#> 5 XGBoost_grid__1_AutoML_20210308_073352_model_10 35345362
#> 6 XGBoost_grid__1_AutoML_20210308_073142_model_10 35345362
#> 7 GBM_1_AutoML_20210308_073352 35463702
#> 8 GBM_1_AutoML_20210308_073142 35463702
#> 9 XGBoost_grid__1_AutoML_20210308_073352_model_20 35975536
#> 10 XGBoost_3_AutoML_20210308_073352 36178176
#> rmse mse mae rmsle
#> 1 5694.812 32430887 3284.597 0.1272254
#> 2 5694.812 32430887 3284.597 0.1272254
#> 3 5859.278 34331133 3628.706 0.1437346
#> 4 5859.278 34331133 3628.706 0.1437346
#> 5 5945.197 35345362 3672.445 0.1501596
#> 6 5945.197 35345362 3672.445 0.1501596
#> 7 5955.141 35463702 3634.803 0.1482343
#> 8 5955.141 35463702 3634.803 0.1482343
#> 9 5997.961 35975536 3649.308 0.1425722
#> 10 6014.830 36178176 3731.129 0.1543888
#>
#> [190 rows x 6 columns]
# Pull the leaderboard out of the AutoML result object.
lb <- aml_results@leaderboard
# Model ID stored in the first leaderboard row (the top-ranked model in the
# printed leaderboard).
model_id_lb_1 <- head(as_tibble(lb)[["model_id"]], 1)
# * predict() ----
# ** Prep data
test_tbl <- test_tbl %>%
  # H2O doesn't like ordered factors, so rebuild each ordered factor as a
  # plain (unordered) factor. across() + where() replaces the superseded
  # mutate_if() while applying the exact same conversion.
  mutate(across(where(is.ordered), function(x) factor(x, ordered = FALSE)))

# Upload the prepared test data to the H2O cluster.
test_h2o <- as.h2o(test_tbl)
#> | | | 0% | |======================================================================| 100%
# Fetch the model object for the selected leaderboard ID.
model_lb_1 <- h2o.getModel(model_id_lb_1)
# Score the test frame, bring the result back into R, and keep only the
# `predict` column as a plain vector.
preds <- as_tibble(predict(model_lb_1, test_h2o))[["predict"]]
#> | | | 0% | |======================================================================| 100%
# Results ----
# Plot actual vs. predicted sales per series id.
test_tbl %>%
  # Attach the predictions next to the actuals.
  mutate(preds = preds) %>%
  # Stack actuals and predictions into one `value` column, labelled by `name`
  # (these are pivot_longer()'s default output column names, spelled out).
  pivot_longer(
    cols      = c(Weekly_Sales, preds),
    names_to  = "name",
    values_to = "value"
  ) %>%
  group_by(id) %>%
  plot_time_series(
    Date, value,
    .color_var   = name,
    .facet_ncol  = 2,
    .smooth      = FALSE,
    .interactive = FALSE
  )
Created on 2021-03-08 by the reprex package (v1.0.0)