Feature Request: Additional documentation / examples on using `recipes`
I'm not sure if I'm doing something wrong, but when I attempt to follow the example on the page I get an error when using recipes. I suspect this is because `add_variables()` passes columns as-is to the model fitting function, whereas steps such as `step_dummy()` or `step_pca()` change the columns.
library('lme4')
#> Loading required package: Matrix
library('multilevelmod')
#> Loading required package: parsnip
library('workflowsets')
library('tidymodels')
tidymodels_prefer()
riesby
#> # A tibble: 250 × 7
#> subject depr_score week male endogenous imipramine desipramine
#> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 101 -8 0 0 0 4.04 4.20
#> 2 101 -19 1 0 0 3.93 4.81
#> 3 101 -22 2 0 0 4.33 4.96
#> 4 101 -23 3 0 0 4.37 4.96
#> 5 103 -18 0 1 0 2.77 5.24
#> 6 103 -9 1 1 0 3.47 5.21
#> 7 103 -18 2 1 0 3.53 5.34
#> 8 103 -20 3 1 0 3.58 5.36
#> 9 104 -11 0 1 1 5.34 4.75
#> 10 104 -16 1 1 1 5.75 5.06
#> # ℹ 240 more rows
splits <- group_initial_split(riesby, group = subject)
folds <- group_vfold_cv(training(splits), v = 5, group = subject)
rec <- recipe(depr_score ~ ., data = training(splits) %>% filter(week == 0)) %>%
add_role(subject, new_role = "exp_unit") %>%
update_role(week, new_role = "id variable") %>%
step_normalize(all_numeric_predictors(), -all_outcomes(), -has_role("exp_unit") ) %>%
step_pca(all_numeric_predictors(), -all_outcomes(), -has_role("exp_unit"), num_comp = tune())
lmer_spec <-
linear_reg() %>%
set_engine("lmer")
lmer_wflow <-
workflow() %>%
add_variables(outcomes = depr_score, predictors = c(male, endogenous, imipramine, desipramine, subject)) %>%
add_model(lmer_spec, formula = depr_score ~ male + endogenous + imipramine + desipramine + (1|subject))
grid_results <- tune_grid(
lmer_wflow,
resamples = folds,
grid = 10,
metrics = metric_set(rmse),
control = control_grid(save_pred = TRUE, save_workflow = TRUE)
)
#> Warning: No tuning parameters have been detected, performance will be evaluated
#> using the resamples with no tuning. Did you want to [tune()] parameters?
collect_predictions(grid_results)
#> # A tibble: 187 × 5
#> .pred id .row depr_score .config
#> <dbl> <chr> <int> <dbl> <chr>
#> 1 -6.14 Resample1 5 -6 Preprocessor1_Model1
#> 2 -6.69 Resample1 6 -6 Preprocessor1_Model1
#> 3 -5.40 Resample1 7 -9 Preprocessor1_Model1
#> 4 -7.04 Resample1 8 -13 Preprocessor1_Model1
#> 5 -7.36 Resample1 23 -6 Preprocessor1_Model1
#> 6 -8.39 Resample1 24 -7 Preprocessor1_Model1
#> 7 -9.80 Resample1 25 -12 Preprocessor1_Model1
#> 8 -10.3 Resample1 26 -13 Preprocessor1_Model1
#> 9 -6.23 Resample1 27 -8 Preprocessor1_Model1
#> 10 -7.15 Resample1 28 -8 Preprocessor1_Model1
#> # ℹ 177 more rows
collect_metrics(grid_results)
#> # A tibble: 1 × 6
#> .metric .estimator mean n std_err .config
#> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 rmse standard 7.35 5 0.687 Preprocessor1_Model1
final_model <- fit_best(grid_results)
final_model
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Variables
#> Model: linear_reg()
#>
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> Outcomes: depr_score
#> Predictors: c(male, endogenous, imipramine, desipramine, subject)
#>
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Linear mixed model fit by REML ['lmerMod']
#> Formula: depr_score ~ male + endogenous + imipramine + desipramine + (1 |
#> subject)
#> Data: data
#> REML criterion at convergence: 1177.133
#> Random effects:
#> Groups Name Std.Dev.
#> subject (Intercept) 5.577
#> Residual 4.585
#> Number of obs: 187, groups: subject, 50
#> Fixed Effects:
#> (Intercept) male endogenous imipramine desipramine
#> 13.7276 0.6072 1.6161 -0.8234 -4.2021
Now try with the recipe:
lmer_wflow_rec <- lmer_wflow %>%
remove_variables() %>%
add_recipe(rec)
grid_results_rec <- tune_grid(
lmer_wflow_rec,
resamples = folds,
grid = 10,
metrics = metric_set(rmse),
control = control_grid(save_pred = TRUE, save_workflow = TRUE)
)
#> → A | error: object 'male' not found
#> There were issues with some computations   A: x20
#> Warning: All models failed. Run `show_notes(.Last.tune.result)` for more
#> information.
I still can't figure out if it's possible to perform tuning.
library('lme4')
Loading required package: Matrix
library('multilevelmod')
Loading required package: parsnip
library('workflowsets')
library('tidymodels')
── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
✔ broom 1.0.7 ✔ recipes 1.1.1
✔ dials 1.3.0 ✔ rsample 1.2.1
✔ dplyr 1.1.4 ✔ tibble 3.2.1
✔ ggplot2 3.5.1 ✔ tidyr 1.3.1
✔ infer 1.0.7 ✔ tune 1.2.1
✔ modeldata 1.4.0 ✔ workflows 1.1.4
✔ purrr 1.0.4 ✔ yardstick 1.3.2
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ purrr::discard() masks scales::discard()
✖ tidyr::expand() masks Matrix::expand()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
✖ tidyr::pack() masks Matrix::pack()
✖ recipes::step() masks stats::step()
✖ tidyr::unpack() masks Matrix::unpack()
✖ recipes::update() masks Matrix::update(), stats::update()
• Search for functions across packages at https://www.tidymodels.org/find/
tidymodels_prefer()
riesby |> head()
# A tibble: 6 × 7
subject depr_score week male endogenous imipramine desipramine
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 101 -8 0 0 0 4.04 4.20
2 101 -19 1 0 0 3.93 4.81
3 101 -22 2 0 0 4.33 4.96
4 101 -23 3 0 0 4.37 4.96
5 103 -18 0 1 0 2.77 5.24
6 103 -9 1 1 0 3.47 5.21
splits <- group_initial_split(riesby, group = subject)
folds <- group_vfold_cv(training(splits), v = 5, group = subject)
rec <- recipe(depr_score ~ ., data = training(splits) %>% filter(week == 0)) %>%
add_role(subject, new_role = "exp_unit") %>%
update_role(week, new_role = "id variable") %>%
step_normalize(all_numeric_predictors(), -all_outcomes(), -has_role("exp_unit") ) %>%
step_pca(all_numeric_predictors(), -all_outcomes(), -has_role("exp_unit"), threshold = 1)
prep_rec <- prep(rec, training = training(splits) %>% filter(week == 0))
n_comp <- tidy(prep_rec, 2, type ='variance') %>%
select(component) %>%
filter(component == max(component)) %>%
pull()
tidy(prep_rec, 2, type ='variance') %>%
filter(terms == 'cumulative percent variance' | terms == 'percent variance') %>%
mutate(variance = factor(terms, levels = c('cumulative percent variance', 'percent variance'), labels = c("cumulative","percent"))) %>%
mutate(PC = component) %>%
mutate(percent = value/100) %>%
ggplot(aes(x=PC, y=percent, fill=variance, alpha = variance)) +
geom_bar(stat = "identity",
position = "identity") +
scale_fill_manual(values =
c("cumulative" = "#56B4E9",
"percent" = "black")) +
scale_x_continuous(breaks = 1:n_comp) +
scale_y_continuous(
breaks = seq(0, 1, 0.1),
labels = scales::percent_format()
) +
scale_alpha_manual(values = c(.75,1)) +
labs(
x = "Principal Component",
y = "Percent of Variance Explained",
title = "Variance Explained by Principal Component"
) +
geom_text(aes(label = scales::percent(percent)),
#position = position_stack(),
size = 3)
rec <- recipe(depr_score ~ ., data = training(splits) %>% filter(week == 0)) %>%
add_role(subject, new_role = "exp_unit") %>%
update_role(week, new_role = "id variable") %>%
step_normalize(all_numeric_predictors(), -all_outcomes(), -has_role("exp_unit") ) %>%
step_pca(all_numeric_predictors(), -all_outcomes(), -has_role("exp_unit"), num_comp = 3)
lmer_spec <- linear_reg() %>%
set_engine("lmer")
lmer_wflow <- workflow() %>%
add_variables(outcomes = depr_score, predictors = c(starts_with("PC"), c(subject))) %>%
add_model(lmer_spec, formula = depr_score ~ PC1 + PC2 + PC3 + (1|subject))
lmer_wflow2 <- lmer_wflow %>%
remove_variables() %>%
add_recipe(rec)
lmer_fits <- tune_grid(
lmer_wflow2,
resamples = folds,
grid = 10,
control = control_grid(save_pred = TRUE)
)
Warning: No tuning parameters have been detected, performance will be evaluated
using the resamples with no tuning. Did you want to [tune()] parameters?
collect_predictions(lmer_fits)
# A tibble: 188 × 5
.pred id .row depr_score .config
<dbl> <chr> <int> <dbl> <chr>
1 -5.56 Resample1 80 -10 Preprocessor1_Model1
2 -5.97 Resample1 81 -12 Preprocessor1_Model1
3 -6.55 Resample1 82 -21 Preprocessor1_Model1
4 -5.58 Resample1 83 -20 Preprocessor1_Model1
5 -12.3 Resample1 88 -1 Preprocessor1_Model1
6 -13.1 Resample1 89 -11 Preprocessor1_Model1
7 -13.6 Resample1 90 -23 Preprocessor1_Model1
8 -4.11 Resample1 99 8 Preprocessor1_Model1
9 -6.03 Resample1 100 -9 Preprocessor1_Model1
10 -4.99 Resample1 101 3 Preprocessor1_Model1
# ℹ 178 more rows
👋 Which parameter(s) would you like to tune? Generally speaking, you need to set a placeholder (`tune()`) for those parameters in the model spec and/or the recipe, like in this example: https://www.tidymodels.org/start/tuning/#tuning
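For reference, a minimal sketch of that placeholder pattern, using the riesby data from this thread (illustrative only, not the full solution):

library(tidymodels)
library(multilevelmod)  # provides the riesby data used in this thread

# Minimal sketch: tune() marks num_comp as a parameter to be filled in later;
# tune_grid() supplies candidate values for it from the grid.
rec_sketch <- recipe(depr_score ~ ., data = riesby) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_pca(all_numeric_predictors(), num_comp = tune())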
Hey, @hfrick, thanks for responding.
Ideally, in this example, the number of components; however, the formula in `add_model()` requires you to pass in explicit column names. This is different from other tidymodels workflows, where `tune_grid()` accounts for this and updates the formula.
In the examples provided, we need to have an explicit formula to specify the random effects for lmer.
This is my source of confusion. When you apply `step_dummy()` or `step_pca()`, the variables change, which breaks the references that the explicit formula passed to `add_model()` relies on. Dummy variable creation is one thing compared to the number of principal components: the dummy variables will be created the same way every time, so you can account for them within the formula, whereas the number of components changes based on the tuning parameters that `tune_grid()` creates or that you explicitly provide.
So, in an instance such as this, do I need to write out the various formulas in advance and essentially recreate what `tune_grid()` would normally do?
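To make that concrete, here is a small sketch (the roles and num_comp value are chosen only for illustration) showing that the columns coming out of the recipe depend on num_comp, so they can't be hard-coded in an `add_model()` formula ahead of tuning:

# Illustrative only: bake the recipe at a fixed num_comp and inspect the
# resulting column names. With num_comp = 2 you get PC1 and PC2; a larger
# num_comp adds PC3, PC4, ..., which is why a formula written before tuning
# can't reference the component columns reliably.
recipe(depr_score ~ ., data = riesby) %>%
  update_role(subject, week, new_role = "id") %>%
  step_normalize(all_numeric_predictors()) %>%
  step_pca(all_numeric_predictors(), num_comp = 2) %>%
  prep() %>%
  bake(new_data = NULL) %>%
  names()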
I am using the dataset you are using as a reference, but I am currently trying to implement it within my research, which has protected data, so unfortunately, I can't share my git.
This is why I'm saying that if there is something that I'm missing, additional documentation would be helpful.
Thank you for your thoughts on this.
Thanks for the clarification. You are very close!
The piece you're missing is to use the dot in the model formula to avoid having to specify the PCA columns directly. You combine the dot with the specification of the random effect. Since the dot essentially includes main effects of "everything else" other than the outcome, you'll also need to remove the fixed effect for the subjects, which I assume you don't want here. So what you land at is `formula = depr_score ~ . + (1 | subject) - subject`.
The other pieces you already had in place: a recipe so that you can set num_comp = tune() and specifying that model formula via add_model(formula).
library(tidymodels)
library(multilevelmod)
tidymodels_prefer()
set.seed(1)
riesby_split <- group_initial_split(riesby, group = subject)
riesby_train <- training(riesby_split)
folds <- group_vfold_cv(riesby_train, v = 5, group = subject)
rec <- recipe(depr_score ~ ., data = riesby_train) %>%
add_role(subject, new_role = "exp_unit") %>%
update_role(week, new_role = "id variable") %>%
step_normalize(
all_numeric_predictors(),
-has_role("exp_unit")
) %>%
step_pca(
all_numeric_predictors(),
-has_role("exp_unit"),
num_comp = tune()
)
lmer_spec <-
linear_reg() %>%
set_engine("lmer")
lmer_wflow <-
workflow() %>%
add_recipe(rec) %>%
add_model(
lmer_spec,
formula = depr_score ~ . + (1 | subject) - subject
)
grid_results <- tune_grid(
lmer_wflow,
resamples = folds,
grid = 10,
metrics = metric_set(rmse),
control = control_grid(save_pred = TRUE, save_workflow = TRUE)
)
show_best(grid_results, metric = "rmse")
#> # A tibble: 4 × 7
#> num_comp .metric .estimator mean n std_err .config
#> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 4 rmse standard 6.61 5 0.661 Preprocessor4_Model1
#> 2 2 rmse standard 6.75 5 0.597 Preprocessor2_Model1
#> 3 3 rmse standard 6.80 5 0.590 Preprocessor3_Model1
#> 4 1 rmse standard 6.80 5 0.596 Preprocessor1_Model1
Created on 2025-02-22 with reprex v2.1.1
Hey @hfrick, thanks for the explanation! I'm not sure why I didn't think to try this out.
Happy to help! I'll leave this issue open as a reminder to take a look at how we could improve the docs here, when we next pick up multilevelmod.
Thanks @hfrick, I also think I added lmerTest::lmer as an option in a pull request. I might be missing something, but it seemed easy enough to just follow what was done with lme4::lmer.