Feature Request: Additional documentation / examples on using `recipes`

Open jkylearmstrong opened this issue 10 months ago • 7 comments

I'm not sure if I'm doing something wrong, but when I attempt to follow the example on the page I get an error when using recipes. I suspect this has to do with add_variables() passing columns as-is to the model-fitting function; however, if we use step_dummy() or step_pca(), the columns will change.

library('lme4')
#> Loading required package: Matrix
library('multilevelmod')
#> Loading required package: parsnip
library('workflowsets')
library('tidymodels')
tidymodels_prefer()

riesby
#> # A tibble: 250 × 7
#>    subject depr_score  week  male endogenous imipramine desipramine
#>    <fct>        <dbl> <dbl> <dbl>      <dbl>      <dbl>       <dbl>
#>  1 101             -8     0     0          0       4.04        4.20
#>  2 101            -19     1     0          0       3.93        4.81
#>  3 101            -22     2     0          0       4.33        4.96
#>  4 101            -23     3     0          0       4.37        4.96
#>  5 103            -18     0     1          0       2.77        5.24
#>  6 103             -9     1     1          0       3.47        5.21
#>  7 103            -18     2     1          0       3.53        5.34
#>  8 103            -20     3     1          0       3.58        5.36
#>  9 104            -11     0     1          1       5.34        4.75
#> 10 104            -16     1     1          1       5.75        5.06
#> # ℹ 240 more rows

splits <- group_initial_split(riesby, group = subject)

folds <- group_vfold_cv(training(splits), v = 5, group = subject)

rec <- recipe(depr_score ~ ., data = training(splits) %>% filter(week == 0)) %>%
  add_role(subject, new_role = "exp_unit") %>%
  update_role(week, new_role = "id variable") %>%
  step_normalize(all_numeric_predictors(), -all_outcomes(), -has_role("exp_unit")) %>%
  step_pca(all_numeric_predictors(), -all_outcomes(), -has_role("exp_unit"), num_comp = tune())

lmer_spec <- 
  linear_reg() %>% 
  set_engine("lmer")

lmer_wflow <- 
  workflow() %>% 
  add_variables(outcomes = depr_score, predictors = c(male, endogenous, imipramine, desipramine, subject)) %>% 
  add_model(lmer_spec, formula = depr_score ~ male + endogenous + imipramine + desipramine + (1|subject))



grid_results <- tune_grid(
  lmer_wflow,
  resamples = folds,
  grid = 10,
  metrics = metric_set(rmse),
  control = control_grid(save_pred = TRUE, save_workflow = TRUE)
)
#> Warning: No tuning parameters have been detected, performance will be evaluated
#> using the resamples with no tuning. Did you want to [tune()] parameters?

collect_predictions(grid_results)
#> # A tibble: 187 × 5
#>     .pred id         .row depr_score .config             
#>     <dbl> <chr>     <int>      <dbl> <chr>               
#>  1  -6.14 Resample1     5         -6 Preprocessor1_Model1
#>  2  -6.69 Resample1     6         -6 Preprocessor1_Model1
#>  3  -5.40 Resample1     7         -9 Preprocessor1_Model1
#>  4  -7.04 Resample1     8        -13 Preprocessor1_Model1
#>  5  -7.36 Resample1    23         -6 Preprocessor1_Model1
#>  6  -8.39 Resample1    24         -7 Preprocessor1_Model1
#>  7  -9.80 Resample1    25        -12 Preprocessor1_Model1
#>  8 -10.3  Resample1    26        -13 Preprocessor1_Model1
#>  9  -6.23 Resample1    27         -8 Preprocessor1_Model1
#> 10  -7.15 Resample1    28         -8 Preprocessor1_Model1
#> # ℹ 177 more rows

collect_metrics(grid_results)
#> # A tibble: 1 × 6
#>   .metric .estimator  mean     n std_err .config             
#>   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
#> 1 rmse    standard    7.35     5   0.687 Preprocessor1_Model1

final_model <- fit_best(grid_results)

final_model
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Variables
#> Model: linear_reg()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> Outcomes: depr_score
#> Predictors: c(male, endogenous, imipramine, desipramine, subject)
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Linear mixed model fit by REML ['lmerMod']
#> Formula: depr_score ~ male + endogenous + imipramine + desipramine + (1 |  
#>     subject)
#>    Data: data
#> REML criterion at convergence: 1177.133
#> Random effects:
#>  Groups   Name        Std.Dev.
#>  subject  (Intercept) 5.577   
#>  Residual             4.585   
#> Number of obs: 187, groups:  subject, 50
#> Fixed Effects:
#> (Intercept)         male   endogenous   imipramine  desipramine  
#>     13.7276       0.6072       1.6161      -0.8234      -4.2021
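
As an aside (not in the original post), the underlying lmerMod object can be extracted from the fitted workflow so that lme4-native methods apply; a minimal sketch:

lmer_fit <- extract_fit_engine(final_model)
lme4::VarCorr(lmer_fit)  # random-effect standard deviations, as in the printout above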

Now try with the recipe:

lmer_wflow_rec <- lmer_wflow %>%
  remove_variables() %>%
  add_recipe(rec)

grid_results_rec <- tune_grid(
  lmer_wflow_rec,
  resamples = folds,
  grid = 10,
  metrics = metric_set(rmse),
  control = control_grid(save_pred = TRUE, save_workflow = TRUE)
)
#> → A | error:   object 'male' not found
#> There were issues with some computations   A: x20
#> Warning: All models failed. Run `show_notes(.Last.tune.result)` for more
#> information.

jkylearmstrong, Feb 11 '25

I still can't figure out if it's possible to perform tuning.

library('lme4')

Loading required package: Matrix
library('multilevelmod')

Loading required package: parsnip
library('workflowsets')
library('tidymodels')

── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
✔ broom     1.0.7     ✔ recipes   1.1.1
✔ dials     1.3.0     ✔ rsample   1.2.1
✔ dplyr     1.1.4     ✔ tibble    3.2.1
✔ ggplot2   3.5.1     ✔ tidyr     1.3.1
✔ infer     1.0.7     ✔ tune      1.2.1
✔ modeldata 1.4.0     ✔ workflows 1.1.4
✔ purrr     1.0.4     ✔ yardstick 1.3.2
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ purrr::discard()  masks scales::discard()
✖ tidyr::expand()   masks Matrix::expand()
✖ dplyr::filter()   masks stats::filter()
✖ dplyr::lag()      masks stats::lag()
✖ tidyr::pack()     masks Matrix::pack()
✖ recipes::step()   masks stats::step()
✖ tidyr::unpack()   masks Matrix::unpack()
✖ recipes::update() masks Matrix::update(), stats::update()
• Search for functions across packages at https://www.tidymodels.org/find/
tidymodels_prefer()

riesby |> head()

# A tibble: 6 × 7
  subject depr_score  week  male endogenous imipramine desipramine
  <fct>        <dbl> <dbl> <dbl>      <dbl>      <dbl>       <dbl>
1 101             -8     0     0          0       4.04        4.20
2 101            -19     1     0          0       3.93        4.81
3 101            -22     2     0          0       4.33        4.96
4 101            -23     3     0          0       4.37        4.96
5 103            -18     0     1          0       2.77        5.24
6 103             -9     1     1          0       3.47        5.21
splits <- group_initial_split(riesby, group = subject)

folds <- group_vfold_cv(training(splits), v = 5, group = subject)

rec <- recipe(depr_score ~ ., data = training(splits) %>% filter(week == 0)) %>%
  add_role(subject, new_role = "exp_unit") %>%
  update_role(week, new_role = "id variable") %>%
  step_normalize(all_numeric_predictors(), -all_outcomes(), -has_role("exp_unit")) %>%
  step_pca(all_numeric_predictors(), -all_outcomes(), -has_role("exp_unit"), threshold = 1)

prep_rec <- prep(rec, training = training(splits) %>% filter(week == 0))

n_comp <- tidy(prep_rec, 2, type = 'variance') %>%
  select(component) %>%
  filter(component == max(component)) %>%
  pull()

tidy(prep_rec, 2, type = 'variance') %>%
  filter(terms == 'cumulative percent variance' | terms == 'percent variance') %>%
  mutate(variance = factor(terms, levels = c('cumulative percent variance', 'percent variance'), labels = c("cumulative","percent"))) %>%
  mutate(PC = component) %>%
  mutate(percent = value/100) %>%
  ggplot(aes(x=PC, y=percent, fill=variance, alpha = variance)) +
  geom_bar(stat = "identity",
           position = "identity") +
  scale_fill_manual(values = 
                      c("cumulative" = "#56B4E9", 
                        "percent" = "black")) +
  scale_x_continuous(breaks = 1:n_comp) +
  scale_y_continuous(
    breaks = seq(0, 1, 0.1),
    labels = scales::percent_format()
  ) +
  scale_alpha_manual(values = c(.75,1)) +
  labs(
    x = "Principal Component",
    y = "Percent of Variance Explained",
    title = "Variance Explained by Principal Component"
  ) +
  geom_text(aes(label = scales::percent(percent)), 
            #position = position_stack(), 
            size = 3)

[Plot: "Variance Explained by Principal Component", showing percent and cumulative percent variance for each principal component]

rec <- recipe(depr_score ~ ., data = training(splits) %>% filter(week == 0)) %>%
  add_role(subject, new_role = "exp_unit") %>%
  update_role(week, new_role = "id variable") %>%
  step_normalize(all_numeric_predictors(), -all_outcomes(), -has_role("exp_unit")) %>%
  step_pca(all_numeric_predictors(), -all_outcomes(), -has_role("exp_unit"), num_comp = 3)

lmer_spec <- linear_reg() %>% 
  set_engine("lmer")

lmer_wflow <- workflow() %>% 
  add_variables(outcomes = depr_score, predictors = c(starts_with("PC"), subject)) %>%
  add_model(lmer_spec, formula = depr_score ~ PC1 + PC2 + PC3 + (1|subject))

lmer_wflow2 <- lmer_wflow %>%
  remove_variables() %>%
  add_recipe(rec) 

lmer_fits <- tune_grid(
  lmer_wflow2,
  resamples = folds,
  grid = 10,
  control = control_grid(save_pred = TRUE)
)
Warning: No tuning parameters have been detected, performance will be evaluated
using the resamples with no tuning. Did you want to [tune()] parameters?
collect_predictions(lmer_fits)
# A tibble: 188 × 5
    .pred id         .row depr_score .config             
    <dbl> <chr>     <int>      <dbl> <chr>               
 1  -5.56 Resample1    80        -10 Preprocessor1_Model1
 2  -5.97 Resample1    81        -12 Preprocessor1_Model1
 3  -6.55 Resample1    82        -21 Preprocessor1_Model1
 4  -5.58 Resample1    83        -20 Preprocessor1_Model1
 5 -12.3  Resample1    88         -1 Preprocessor1_Model1
 6 -13.1  Resample1    89        -11 Preprocessor1_Model1
 7 -13.6  Resample1    90        -23 Preprocessor1_Model1
 8  -4.11 Resample1    99          8 Preprocessor1_Model1
 9  -6.03 Resample1   100         -9 Preprocessor1_Model1
10  -4.99 Resample1   101          3 Preprocessor1_Model1
# ℹ 178 more rows

jkylearmstrong, Feb 14 '25

👋 Which parameter(s) would you like to tune? Generally speaking, you need to set a placeholder (tune()) for those parameters in the model spec and/or the recipe like in this example: https://www.tidymodels.org/start/tuning/#tuning
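
A minimal, self-contained sketch of that placeholder pattern (using mtcars rather than the thread's data; extract_parameter_set_dials() is one way to confirm that the recipe carries a tunable parameter):

library(tidymodels)
tidymodels_prefer()

# Mark num_comp for tuning instead of fixing it; tune_grid() will fill it in.
rec_tune <- recipe(mpg ~ ., data = mtcars) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_pca(all_numeric_predictors(), num_comp = tune())

extract_parameter_set_dials(rec_tune)
#> num_comp should be listed as a tunable parameter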

hfrick, Feb 17 '25

Hey, @hfrick, thanks for responding.

Ideally, in this example, the number of components; however, the formula in add_model() requires explicit column names. This is different from other tidymodels workflows, where tune_grid() accounts for this automatically.

In the examples provided, we need an explicit formula to specify the random effects for lmer.

This is my source of confusion. When you apply step_dummy() or step_pca(), the variable names change, which breaks the references in the explicit formula required by add_model(). Dummy-variable creation is one thing compared with the number of principal components: the dummy variables are always created the same way, so you can account for them within the formula, but the number of components changes with the tuning values that tune_grid() creates or that you provide explicitly.
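
A minimal sketch of the renaming, using the num_comp = 3 recipe (rec) from my previous comment (the exact column order may differ):

prep(rec) %>%
  bake(new_data = NULL) %>%
  names()
#> the baked data carries "PC1" "PC2" "PC3" in place of male, endogenous,
#> imipramine, and desipramine, so the explicit formula no longer matches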

So, in an instance such as this, do I need to enumerate the various formulas myself and essentially recreate what tune_grid() would normally do?

I am using the same dataset you use as a reference, but I am actually trying to implement this within my research, which involves protected data, so unfortunately I can't share my repository.

This is why I'm saying that, if there is something I'm missing, additional documentation would be helpful.

Thank you for your thoughts on this.

jkylearmstrong, Feb 21 '25

Thanks for the clarification. You are very close!

The piece you're missing is to use the dot in the model formula to avoid having to specify the PCA columns directly. You combine the dot with the specification of the random effect. Since the dot essentially includes main effects of "everything else" other than the outcome, you'll also need to remove the fixed effect for subject, which I assume you don't want here. So what you land on is formula = depr_score ~ . + (1 | subject) - subject.

The other pieces you already had in place: a recipe, so that you can set num_comp = tune(), and passing the model formula via add_model(formula = ...).

library(tidymodels)
library(multilevelmod)
tidymodels_prefer()

set.seed(1)
riesby_split <- group_initial_split(riesby, group = subject)
riesby_train <- training(riesby_split)

folds <- group_vfold_cv(riesby_train, v = 5, group = subject)

rec <- recipe(depr_score ~ ., data = riesby_train) %>%
  add_role(subject, new_role = "exp_unit") %>%
  update_role(week, new_role = "id variable") %>%
  step_normalize(
    all_numeric_predictors(),
    -has_role("exp_unit")
  ) %>%
  step_pca(
    all_numeric_predictors(),
    -has_role("exp_unit"),
    num_comp = tune()
  )

lmer_spec <-
  linear_reg() %>%
    set_engine("lmer")

lmer_wflow <-
  workflow() %>%
    add_recipe(rec) %>%
    add_model(
      lmer_spec,
      formula = depr_score ~ . + (1 | subject) - subject
    )

grid_results <- tune_grid(
  lmer_wflow,
  resamples = folds,
  grid = 10,
  metrics = metric_set(rmse),
  control = control_grid(save_pred = TRUE, save_workflow = TRUE)
)

show_best(grid_results, metric = "rmse") 
#> # A tibble: 4 × 7
#>   num_comp .metric .estimator  mean     n std_err .config             
#>      <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
#> 1        4 rmse    standard    6.61     5   0.661 Preprocessor4_Model1
#> 2        2 rmse    standard    6.75     5   0.597 Preprocessor2_Model1
#> 3        3 rmse    standard    6.80     5   0.590 Preprocessor3_Model1
#> 4        1 rmse    standard    6.80     5   0.596 Preprocessor1_Model1

Created on 2025-02-22 with reprex v2.1.1
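
One possible next step, not part of the reprex above: since control_grid(save_workflow = TRUE) was set, fit_best() can refit the workflow with the best num_comp on the full training set in one call.

final_fit <- fit_best(grid_results, metric = "rmse")
final_fit  # a trained workflow using the selected number of components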

hfrick, Feb 24 '25

Hey @hfrick, thanks for the explanation. I'm not sure why I didn't think to try this out.

jkylearmstrong, Feb 26 '25

Happy to help! I'll leave this issue open as a reminder to take a look at how we could improve the docs here, when we next pick up multilevelmod.

hfrick, Feb 27 '25

Thanks @hfrick. I also think I added lmerTest::lmer as an option in a pull request. I might be missing something, but it seemed easy enough to just follow what was done with lme4::lmer.

jkylearmstrong, Mar 01 '25