recipes icon indicating copy to clipboard operation
recipes copied to clipboard

`step_bs()` with fixed knots not matching `splines::bs()` output

Open andeek opened this issue 3 years ago • 1 comments
trafficstars

The problem

I'm having trouble with fitting a cubic spline with fixed knots. I would expect the number of terms in the fitted model to be equal to degree + 1 + (# knots), however, in the fitted model I am only seeing degree + 1 terms. This makes me think that the knots that are passed through the options list are not being used. This is confirmed by fitting a regression spline with deg_free = degree and getting the same fitted results.

Any insight would be greatly appreciated. I have not been able to find an example with step_bs and fixed knots.

Reproducible example

library(ISLR)
library(splines)
library(tidymodels)

lm_spec <- linear_reg()

## fixed knots
bs_fixed_rec <- recipe(wage ~ age, data = Wage) |>
  step_bs(age, degree = 4, options = list(knots = c(25, 40, 60)))

bs_fixed_wf <- workflow() |>
  add_model(lm_spec) |>
  add_recipe(bs_fixed_rec)

bs_fixed_fit <- bs_fixed_wf |>
  fit(data = Wage)

bs_fixed_fit |> tidy()
#> # A tibble: 5 × 5
#>   term        estimate std.error statistic  p.value
#>   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
#> 1 (Intercept)     51.9      5.30      9.80 2.39e-22
#> 2 age_bs_1       106.      17.0       6.21 6.19e-10
#> 3 age_bs_2        45.9     16.0       2.86 4.21e- 3
#> 4 age_bs_3        88.9     22.0       4.03 5.60e- 5
#> 5 age_bs_4        29.6     13.4       2.21 2.69e- 2

## matches df = degree
bs_rec <- recipe(wage ~ age, data = Wage) |>
  step_bs(age, degree = 4, deg_free = 4)

bs_wf <- workflow() |>
  add_model(lm_spec) |>
  add_recipe(bs_rec)

bs_fit <- bs_wf |>
  fit(data = Wage)

bs_fit |> tidy()
#> # A tibble: 5 × 5
#>   term        estimate std.error statistic  p.value
#>   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
#> 1 (Intercept)     51.9      5.30      9.80 2.39e-22
#> 2 age_bs_1       106.      17.0       6.21 6.19e-10
#> 3 age_bs_2        45.9     16.0       2.86 4.21e- 3
#> 4 age_bs_3        88.9     22.0       4.03 5.60e- 5
#> 5 age_bs_4        29.6     13.4       2.21 2.69e- 2

## vs lm call
lm(wage ~ splines::bs(age, degree = 3, knots = c(25, 40, 60)), data = Wage)
#> 
#> Call:
#> lm(formula = wage ~ splines::bs(age, degree = 3, knots = c(25, 
#>     40, 60)), data = Wage)
#> 
#> Coefficients:
#>                                          (Intercept)  
#>                                                60.49  
#> splines::bs(age, degree = 3, knots = c(25, 40, 60))1  
#>                                                 3.98  
#> splines::bs(age, degree = 3, knots = c(25, 40, 60))2  
#>                                                44.63  
#> splines::bs(age, degree = 3, knots = c(25, 40, 60))3  
#>                                                62.84  
#> splines::bs(age, degree = 3, knots = c(25, 40, 60))4  
#>                                                55.99  
#> splines::bs(age, degree = 3, knots = c(25, 40, 60))5  
#>                                                50.69  
#> splines::bs(age, degree = 3, knots = c(25, 40, 60))6  
#>                                                16.61

Created on 2022-10-26 with reprex v2.0.2

andeek avatar Oct 26 '22 21:10 andeek

Thank you, this appears to be a bug!

EmilHvitfeldt avatar Mar 30 '23 21:03 EmilHvitfeldt

This issue has been fixed in https://github.com/tidymodels/recipes/pull/1338

library(ISLR)
library(splines)
library(tidymodels)

lm_spec <- linear_reg()

## fixed knots
bs_fixed_rec <- recipe(wage ~ age, data = Wage) |>
  step_bs(age, degree = 4, options = list(knots = c(25, 40, 60)))

bs_fixed_wf <- workflow() |>
  add_model(lm_spec) |>
  add_recipe(bs_fixed_rec)

bs_fixed_fit <- bs_fixed_wf |>
  fit(data = Wage)

bs_fixed_fit |> tidy()
#> # A tibble: 8 × 5
#>   term        estimate std.error statistic      p.value
#>   <chr>          <dbl>     <dbl>     <dbl>        <dbl>
#> 1 (Intercept)    59.2       10.4     5.70  0.0000000134
#> 2 age_bs_1        7.45      15.0     0.497 0.619       
#> 3 age_bs_2       26.9       11.2     2.41  0.0162      
#> 4 age_bs_3       74.9       13.9     5.39  0.0000000754
#> 5 age_bs_4       42.8       13.8     3.10  0.00199     
#> 6 age_bs_5       78.5       16.7     4.71  0.00000253  
#> 7 age_bs_6       21.7       18.7     1.16  0.246       
#> 8 age_bs_7       30.5       21.4     1.43  0.154

lm(wage ~ splines::bs(age, degree = 4, knots = c(25, 40, 60)), data = Wage) %>%
  tidy()
#> # A tibble: 8 × 5
#>   term                                      estimate std.error statistic p.value
#>   <chr>                                        <dbl>     <dbl>     <dbl>   <dbl>
#> 1 (Intercept)                                  59.2       10.4     5.70  1.34e-8
#> 2 splines::bs(age, degree = 4, knots = c(2…     7.45      15.0     0.497 6.19e-1
#> 3 splines::bs(age, degree = 4, knots = c(2…    26.9       11.2     2.41  1.62e-2
#> 4 splines::bs(age, degree = 4, knots = c(2…    74.9       13.9     5.39  7.54e-8
#> 5 splines::bs(age, degree = 4, knots = c(2…    42.8       13.8     3.10  1.99e-3
#> 6 splines::bs(age, degree = 4, knots = c(2…    78.5       16.7     4.71  2.53e-6
#> 7 splines::bs(age, degree = 4, knots = c(2…    21.7       18.7     1.16  2.46e-1
#> 8 splines::bs(age, degree = 4, knots = c(2…    30.5       21.4     1.43  1.54e-1

Created on 2024-06-07 with reprex v2.1.0

EmilHvitfeldt avatar Jun 07 '24 22:06 EmilHvitfeldt

This issue has been automatically locked. If you believe you have found a related problem, please file a new issue (with a reprex https://reprex.tidyverse.org) and link to this issue.

github-actions[bot] avatar Jun 22 '24 00:06 github-actions[bot]