modeltime.h2o icon indicating copy to clipboard operation
modeltime.h2o copied to clipboard

Develop H2O Regression Algorithms

Open mdancho84 opened this issue 3 years ago • 9 comments

Here's a minimal example based on Shafi's code. We can convert this into tidymodels format once we agree on the process being shown.

# MVP EXAMPLE ----
# automl_reg() function

# Libraries ----
library(modeltime)
library(tidymodels)
library(h2o)
library(tidyverse)
library(timetk)

# Data ----
# - This is before modeltime

# Keep only the series identifier, the date index, and the target column.
data_tbl <- walmart_sales_weekly %>%
    select(id, Date, Weekly_Sales)

# Time-based split: the assessment set covers a 3-month window.
splits <- timetk::time_series_split(data_tbl, assess = "3 month", cumulative = TRUE)

# Recipe is specified against the training portion only.
recipe_spec <- recipe(Weekly_Sales ~ ., data = training(splits)) %>%
    step_timeseries_signature(Date)

# Estimate the recipe once and reuse the prepped object for both sets,
# instead of calling prep() separately inside every bake().
recipe_prepped <- prep(recipe_spec)

train_tbl <- bake(recipe_prepped, new_data = rsample::training(splits))
test_tbl  <- bake(recipe_prepped, new_data = rsample::testing(splits))


# H2O Initialization ----
# - User will set up H2O 
# Connect to (or start) H2O on the local machine using all named settings below.
# NOTE(review): the session output reports 7.90 GB of total cluster memory, so
# the 1000G cap evidently did not apply to the cluster that was attached here -
# confirm the intended value.
h2o.init(
    ip           = "localhost",
    port         = 54321,
    nthreads     = -1,
    max_mem_size = "1000G"
)
#>  Connection successful!
#> 
#> R is connected to the H2O cluster: 
#>     H2O cluster uptime:         2 days 21 hours 
#>     H2O cluster timezone:       America/New_York 
#>     H2O data parsing timezone:  UTC 
#>     H2O cluster version:        3.32.0.1 
#>     H2O cluster version age:    4 months and 27 days !!! 
#>     H2O cluster name:           H2O_started_from_R_mdancho_gyx565 
#>     H2O cluster total nodes:    1 
#>     H2O cluster total memory:   7.90 GB 
#>     H2O cluster total cores:    12 
#>     H2O cluster allowed cores:  12 
#>     H2O cluster healthy:        TRUE 
#>     H2O Connection ip:          localhost 
#>     H2O Connection port:        54321 
#>     H2O Connection proxy:       NA 
#>     H2O Internal Security:      FALSE 
#>     H2O API Extensions:         Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4 
#>     R Version:                  R version 4.0.2 (2020-06-22)
#> Warning in h2o.clusterInfo(): 
#> Your H2O cluster version is too old (4 months and 27 days)!
#> Please download and install the latest version from http://h2o.ai/download/

# MODELING WORKFLOW ----
# - This is where Modeltime H2O takes over

# Spec - the package API will handle this with:

# * automl_reg() %>% set_engine("h2o") ----
# - I doubt this function needs hyperparams
# - Users can use set_engine() to specify any args


# * fit() ----
# - Will handle preparing as H2O Frame, training the automl, storing either a leaderboard or a subset of models

# ** Prep data 
# H2O doesn't like ordered factors, so re-encode each one as a plain factor.
# across() + where() replaces the superseded mutate_if().
train_tbl <- train_tbl %>%
    mutate(across(where(is.ordered), function(x) factor(x, ordered = FALSE)))

# ** Convert to H2O Frame
train_h2o <- as.h2o(train_tbl)
#>   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

# Target column, and predictors = every other column of the training frame.
y <- "Weekly_Sales"
x <- setdiff(names(train_h2o), y)

aml_results <- h2o.automl(
    x = x, y = y,

    # Data Specifications -
    # - I recommend to only use a Training Frame
    # - This lets CV do the validation
    training_frame = train_h2o,
    # validation_frame = valid,
    # leaderboard_frame = test,

    # User Defined Args
    max_runtime_secs = 30,
    max_runtime_secs_per_model = 30,

    # Use a fresh project name for every run. Re-using a fixed name makes H2O
    # append new models to the existing project's leaderboard (duplicate rows),
    # and the stacked ensembles then fail because the cross-validation
    # prediction frames of the earlier runs are no longer available.
    project_name = paste0("project_01_", format(Sys.time(), "%Y%m%d_%H%M%S")),

    nfolds        = 5,
    max_models    = 1000,
    exclude_algos = c("DeepLearning"),
    seed          =  786
)
#>   |                                                                              |                                                                      |   0%
#> 06:58:55.409: New models will be added to existing leaderboard project_01@@Weekly_Sales (leaderboard frame=null) with already 23 models.
#> 06:59:06.476: StackedEnsemble_BestOfFamily_AutoML_20210308_065855 [StackedEnsemble best (built using top model from each algorithm type)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . .  Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.
#> 06:59:07.481: StackedEnsemble_AllModels_AutoML_20210308_065855 [StackedEnsemble all (built using all AutoML models)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . .  Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.
#> 07:09:49.460: New models will be added to existing leaderboard project_01@@Weekly_Sales (leaderboard frame=null) with already 33 models.
#> 07:10:00.529: StackedEnsemble_BestOfFamily_AutoML_20210308_070949 [StackedEnsemble best (built using top model from each algorithm type)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . .  Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.
#> 07:10:01.536: StackedEnsemble_AllModels_AutoML_20210308_070949 [StackedEnsemble all (built using all AutoML models)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . .  Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.
#> 07:19:21.102: New models will be added to existing leaderboard project_01@@Weekly_Sales (leaderboard frame=null) with already 43 models.
#> 07:19:32.172: StackedEnsemble_BestOfFamily_AutoML_20210308_071921 [StackedEnsemble best (built using top model from each algorithm type)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . .  Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.
#> 07:19:33.178: StackedEnsemble_AllModels_AutoML_20210308_071921 [StackedEnsemble all (built using all AutoML models)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . .  Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.
#> 07:20:19.518: New models will be added to existing leaderboard project_01@@Weekly_Sales (leaderboard frame=null) with already 53 models.
#> 07:20:46.723: StackedEnsemble_BestOfFamily_AutoML_20210308_072019 [StackedEnsemble best (built using top model from each algorithm type)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . .  Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.
#> 07:20:47.728: StackedEnsemble_AllModels_AutoML_20210308_072019 [StackedEnsemble all (built using all AutoML models)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . .  Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.
#> 07:31:42.782: New models will be added to existing leaderboard project_01@@Weekly_Sales (leaderboard frame=null) with already 108 models.
#> 07:32:10.987: StackedEnsemble_BestOfFamily_AutoML_20210308_073142 [StackedEnsemble best (built using top model from each algorithm type)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . .  Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.
#> 07:32:11.993: StackedEnsemble_AllModels_AutoML_20210308_073142 [StackedEnsemble all (built using all AutoML models)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . .  Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.
#> 07:33:52.726: New models will be added to existing leaderboard project_01@@Weekly_Sales (leaderboard frame=null) with already 148 models.  |                                                                              |=======                                                               |  10%  |                                                                              |===========                                                           |  16%  |                                                                              |===============                                                       |  21%  |                                                                              |==================                                                    |  26%  |                                                                              |======================                                                |  32%  |                                                                              |==========================                                            |  37%  |                                                                              |=============================                                         |  42%  |                                                                              |=================================                                     |  47%  |                                                                              |====================================                                  |  52%  |                                                                              |========================================                              |  57%  |                                                                              |============================================                          |  62%  |                                                                              
|===============================================                       |  67%  |                                                                              |===================================================                   |  73%  |                                                                              |=======================================================               |  78%  |                                                                              |===========================================================           |  84%  |                                                                              |===============================================================       |  90%  |                                                                              |==================================================================    |  95%
#> 07:34:20.984: StackedEnsemble_BestOfFamily_AutoML_20210308_073352 [StackedEnsemble best (built using top model from each algorithm type)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . .  Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.
#> 07:34:21.989: StackedEnsemble_AllModels_AutoML_20210308_073352 [StackedEnsemble all (built using all AutoML models)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . .  Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.  |                                                                              |======================================================================| 100%

# Returns many models organized by a Leaderboard
aml_results
#> AutoML Details
#> ==============
#> Project Name: project_01 
#> Leader Model ID: XGBoost_grid__1_AutoML_20210308_073352_model_17 
#> Algorithm: xgboost 
#> 
#> Total Number of Models Trained: 190 
#> Start Time: 2021-03-08 07:33:53 UTC 
#> End Time: 2021-03-08 07:34:22 UTC 
#> Duration: 29 s
#> 
#> Leaderboard
#> ===========
#>                                           model_id mean_residual_deviance
#> 1  XGBoost_grid__1_AutoML_20210308_073352_model_17               32430887
#> 2  XGBoost_grid__1_AutoML_20210308_073142_model_17               32430887
#> 3   XGBoost_grid__1_AutoML_20210308_073352_model_3               34331133
#> 4   XGBoost_grid__1_AutoML_20210308_073142_model_3               34331133
#> 5  XGBoost_grid__1_AutoML_20210308_073352_model_10               35345362
#> 6  XGBoost_grid__1_AutoML_20210308_073142_model_10               35345362
#> 7                     GBM_1_AutoML_20210308_073352               35463702
#> 8                     GBM_1_AutoML_20210308_073142               35463702
#> 9  XGBoost_grid__1_AutoML_20210308_073352_model_20               35975536
#> 10                XGBoost_3_AutoML_20210308_073352               36178176
#>        rmse      mse      mae     rmsle
#> 1  5694.812 32430887 3284.597 0.1272254
#> 2  5694.812 32430887 3284.597 0.1272254
#> 3  5859.278 34331133 3628.706 0.1437346
#> 4  5859.278 34331133 3628.706 0.1437346
#> 5  5945.197 35345362 3672.445 0.1501596
#> 6  5945.197 35345362 3672.445 0.1501596
#> 7  5955.141 35463702 3634.803 0.1482343
#> 8  5955.141 35463702 3634.803 0.1482343
#> 9  5997.961 35975536 3649.308 0.1425722
#> 10 6014.830 36178176 3731.129 0.1543888
#> 
#> [190 rows x 6 columns]


# View the AutoML Leaderboard
# Leaderboard slot of the AutoML result
lb <- aml_results@leaderboard

# Model ID found in the first leaderboard row
model_id_lb_1 <- lb %>%
    as_tibble() %>%
    pull(model_id) %>%
    head(1)


# * predict() ----

# ** Prep data 
# H2O doesn't like ordered factors, so re-encode each one as a plain factor.
# across() + where() replaces the superseded mutate_if().
test_tbl <- test_tbl %>%
    mutate(across(where(is.ordered), function(x) factor(x, ordered = FALSE)))

# ** Convert to H2O Frame
test_h2o   <- as.h2o(test_tbl)
#>   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

# Fetch the model object for the ID taken from the first leaderboard row
model_lb_1 <- h2o.getModel(model_id_lb_1)

# Score the test frame and extract the `predict` column as a plain vector
preds <- model_lb_1 %>%
    predict(test_h2o) %>%
    as_tibble() %>%
    pull(predict)
#>   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%



# Results ----
# Attach the predictions, stack actuals and predictions into long format
# (columns `name` / `value`), then plot each `id` group as a static chart
# with the two series distinguished by colour.
test_tbl %>%
    mutate(preds = preds) %>%
    pivot_longer(
        cols      = c(Weekly_Sales, preds),
        names_to  = "name",
        values_to = "value"
    ) %>%
    group_by(id) %>%
    plot_time_series(
        .date_var    = Date,
        .value       = value,
        .color_var   = name,
        .facet_ncol  = 2,
        .smooth      = FALSE,
        .interactive = FALSE
    )

Created on 2021-03-08 by the reprex package (v1.0.0)

mdancho84 avatar Mar 08 '21 12:03 mdancho84