broom
broom copied to clipboard
P-value incorrect when using ddf argument in tidy.svyglm()
The problem
The p-value reported by tidy.svyglm()
does not match the confidence interval when the ddf
argument is provided (and is different from the default denominator degrees of freedom, which uses the df.residual
of the svyglm
object).
A quick search revealed that the ddf
argument was implemented for the confidence interval following issue #839 with commit https://github.com/tidymodels/broom/commit/b7f690a056867cda5625f813b7e7418b4b85a7e5, but the current development version still seems to need it implemented for the p-value here: https://github.com/tidymodels/broom/blob/5d77eb743b3be3eca2df3c1228664925c4a481ce/R/survey-tidiers.R#L65
Since summary.svyglm()
uses df.resid
for the argument name and confint.svyglm()
uses ddf
for the argument name, it may require a dedicated argument in the tidy.svyglm()
function.
Reproducible example
library(broom)
#> Warning: package 'broom' was built under R version 4.1.3
library(survey)
#> Loading required package: grid
#> Loading required package: Matrix
#> Loading required package: survival
#>
#> Attaching package: 'survey'
#> The following object is masked from 'package:graphics':
#>
#> dotchart
data(api)
dstrat <- svydesign(id = ~ 1,
strata = ~ stype,
weights = ~ pw,
data = apistrat,
fpc = ~fpc)
model <- svyglm(sch.wide ~ yr.rnd, design = dstrat, family = binomial)
#> Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Default ddf
## confint.svyglm(., ddf = NULL)
## summary.svyglm(., df.resid = NULL)
tidy(model,
conf.int = TRUE,
exponentiate = TRUE)
#> # A tibble: 2 x 7
#> term estimate std.error statistic p.value conf.low conf.high
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 4.42 0.179 8.28 1.89e-14 3.10 6.29
#> 2 yr.rndYes 2.19 0.702 1.12 2.64e- 1 0.550 8.75
## Same p-value in summary.svyglm
summary(model)
#>
#> Call:
#> svyglm(formula = sch.wide ~ yr.rnd, design = dstrat, family = binomial)
#>
#> Survey design:
#> svydesign(id = ~1, strata = ~stype, weights = ~pw, data = apistrat,
#> fpc = ~fpc)
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 1.4857 0.1794 8.280 1.89e-14 ***
#> yr.rndYes 0.7853 0.7017 1.119 0.264
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1.005025)
#>
#> Number of Fisher Scoring iterations: 4
## Changed ddf (but no change in p-value)
## confint.svyglm(., ddf = degf(dstrat))
## summary.svyglm(., df.resid = NULL)
tidy(model,
conf.int = TRUE,
exponentiate = TRUE,
ddf = 5) ## extreme ddf to exaggerate difference
#> # A tibble: 2 x 7
#> term estimate std.error statistic p.value conf.low conf.high
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 4.42 0.179 8.28 1.89e-14 2.79 7.01
#> 2 yr.rndYes 2.19 0.702 1.12 2.64e- 1 0.361 13.3
## Correct p-value for ddf = 5 (df.resid = 5)
summary(model, df.resid = 5)
#>
#> Call:
#> svyglm(formula = sch.wide ~ yr.rnd, design = dstrat, family = binomial)
#>
#> Survey design:
#> svydesign(id = ~1, strata = ~stype, weights = ~pw, data = apistrat,
#> fpc = ~fpc)
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 1.4857 0.1794 8.280 0.000419 ***
#> yr.rndYes 0.7853 0.7017 1.119 0.313952
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1.005025)
#>
#> Number of Fisher Scoring iterations: 4
Created on 2022-04-07 by the reprex package (v2.0.1)
Session info
sessioninfo::session_info()
#> - Session info ---------------------------------------------------------------
#> setting value
#> version R version 4.1.2 (2021-11-01)
#> os Windows 10 x64 (build 19044)
#> system x86_64, mingw32
#> ui RTerm
#> language (EN)
#> collate English_United States.1252
#> ctype English_United States.1252
#> tz America/Denver
#> date 2022-04-07
#> pandoc 2.17.1.1 @ C:/Program Files/RStudio/bin/quarto/bin/ (via rmarkdown)
#>
#> - Packages -------------------------------------------------------------------
#> package * version date (UTC) lib source
#> assertthat 0.2.1 2019-03-21 [2] CRAN (R 4.1.2)
#> backports 1.4.1 2021-12-13 [1] CRAN (R 4.1.2)
#> broom * 0.7.12 2022-01-28 [1] CRAN (R 4.1.3)
#> cli 3.1.0 2021-10-27 [1] CRAN (R 4.1.2)
#> crayon 1.4.2 2021-10-29 [1] CRAN (R 4.1.2)
#> DBI 1.1.1 2021-01-15 [2] CRAN (R 4.1.2)
#> digest 0.6.29 2021-12-01 [1] CRAN (R 4.1.2)
#> dplyr 1.0.7 2021-06-18 [1] CRAN (R 4.1.2)
#> ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.1.2)
#> evaluate 0.14 2019-05-28 [2] CRAN (R 4.1.2)
#> fansi 0.5.0 2021-05-25 [1] CRAN (R 4.1.2)
#> fastmap 1.1.0 2021-01-25 [2] CRAN (R 4.1.2)
#> fs 1.5.2 2021-12-08 [2] CRAN (R 4.1.2)
#> generics 0.1.1 2021-10-25 [1] CRAN (R 4.1.2)
#> glue 1.6.0 2021-12-17 [1] CRAN (R 4.1.2)
#> highr 0.9 2021-04-16 [2] CRAN (R 4.1.2)
#> htmltools 0.5.2 2021-08-25 [2] CRAN (R 4.1.2)
#> knitr 1.37 2021-12-16 [1] CRAN (R 4.1.2)
#> lattice 0.20-45 2021-09-22 [1] CRAN (R 4.1.2)
#> lifecycle 1.0.1 2021-09-24 [1] CRAN (R 4.1.2)
#> magrittr 2.0.1 2020-11-17 [1] CRAN (R 4.1.2)
#> Matrix * 1.4-0 2021-12-08 [1] CRAN (R 4.1.2)
#> mitools 2.4 2019-04-26 [1] CRAN (R 4.1.2)
#> pillar 1.6.4 2021-10-18 [1] CRAN (R 4.1.2)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.1.2)
#> purrr 0.3.4 2020-04-17 [1] CRAN (R 4.1.2)
#> R.cache 0.15.0 2021-04-30 [2] CRAN (R 4.1.3)
#> R.methodsS3 1.8.1 2020-08-26 [2] CRAN (R 4.1.1)
#> R.oo 1.24.0 2020-08-26 [2] CRAN (R 4.1.1)
#> R.utils 2.11.0 2021-09-26 [2] CRAN (R 4.1.3)
#> R6 2.5.1 2021-08-19 [1] CRAN (R 4.1.2)
#> reprex 2.0.1 2021-08-05 [2] CRAN (R 4.1.2)
#> rlang 0.4.12 2021-10-18 [1] CRAN (R 4.1.2)
#> rmarkdown 2.11 2021-09-14 [2] CRAN (R 4.1.2)
#> rstudioapi 0.13 2020-11-12 [2] CRAN (R 4.1.2)
#> sessioninfo 1.2.2 2021-12-06 [2] CRAN (R 4.1.2)
#> stringi 1.7.6 2021-11-29 [1] CRAN (R 4.1.2)
#> stringr 1.4.0 2019-02-10 [1] CRAN (R 4.1.2)
#> styler 1.7.0 2022-03-13 [2] CRAN (R 4.1.3)
#> survey * 4.1-1 2021-07-19 [1] CRAN (R 4.1.2)
#> survival * 3.2-13 2021-08-24 [2] CRAN (R 4.1.2)
#> tibble 3.1.6 2021-11-07 [1] CRAN (R 4.1.2)
#> tidyr 1.1.4 2021-09-27 [1] CRAN (R 4.1.2)
#> tidyselect 1.1.1 2021-04-30 [1] CRAN (R 4.1.2)
#> utf8 1.2.2 2021-07-24 [1] CRAN (R 4.1.2)
#> vctrs 0.3.8 2021-04-29 [1] CRAN (R 4.1.2)
#> withr 2.4.3 2021-11-30 [1] CRAN (R 4.1.2)
#> xfun 0.29 2021-12-14 [2] CRAN (R 4.1.2)
#> yaml 2.2.1 2020-02-01 [2] CRAN (R 4.1.1)
#>
#> [1] C:/..../renv/library/R-4.1/x86_64-w64-mingw32
#> [2] C:/.../R/R-4.1.2/library
#>
#> ------------------------------------------------------------------------------
Here is a potential fix to this issue. If you would like, I can open a pull request to add this in.
library(broom)
#> Warning: package 'broom' was built under R version 4.1.3
library(survey)
#> Loading required package: grid
#> Loading required package: Matrix
#> Loading required package: survival
#>
#> Attaching package: 'survey'
#> The following object is masked from 'package:graphics':
#>
#> dotchart
data(api)
dstrat <- svydesign(id = ~ 1,
strata = ~ stype,
weights = ~ pw,
data = apistrat,
fpc = ~fpc)
model <- svyglm(sch.wide ~ yr.rnd, design = dstrat, family = binomial)
#> Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Changed ddf (but no change in p-value)
tidy(model,
conf.int = TRUE,
exponentiate = TRUE,
ddf = 5)
#> # A tibble: 2 x 7
#> term estimate std.error statistic p.value conf.low conf.high
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 4.42 0.179 8.28 1.89e-14 2.79 7.01
#> 2 yr.rndYes 2.19 0.702 1.12 2.64e- 1 0.361 13.3
## Correct p-value for ddf = 5 (df.resid = 5)
summary(model, df.resid = 5)
#>
#> Call:
#> svyglm(formula = sch.wide ~ yr.rnd, design = dstrat, family = binomial)
#>
#> Survey design:
#> svydesign(id = ~1, strata = ~stype, weights = ~pw, data = apistrat,
#> fpc = ~fpc)
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 1.4857 0.1794 8.280 0.000419 ***
#> yr.rndYes 0.7853 0.7017 1.119 0.313952
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1.005025)
#>
#> Number of Fisher Scoring iterations: 4
## Updated function
tidy_svyglm <- function(x, conf.int = FALSE, conf.level = 0.95,
exponentiate = FALSE, ddf = NULL, ...) {
ret <- as_tibble(summary(x, df.resid = ddf)$coefficients, rownames = "term")
colnames(ret) <- c("term", "estimate", "std.error", "statistic", "p.value")
coefs <- tibble::enframe(stats::coef(x), name = "term", value = "estimate")
ret <- left_join(coefs, ret, by = c("term", "estimate"))
if (conf.int) {
ci <- broom_confint_terms(x, level = conf.level, ddf = ddf, ...)
ret <- left_join(ret, ci, by = "term")
}
if (exponentiate) {
ret <- exponentiate(ret)
}
ret
}
environment(tidy_svyglm) <- asNamespace("broom")
assignInNamespace("tidy.svyglm",
value = tidy_svyglm,
ns = "broom")
## Correct p-value for ddf = 5 (df.resid = 5)
tidy(model,
conf.int = TRUE,
exponentiate = TRUE,
ddf = 5)
#> # A tibble: 2 x 7
#> term estimate std.error statistic p.value conf.low conf.high
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 4.42 0.179 8.28 0.000419 2.79 7.01
#> 2 yr.rndYes 2.19 0.702 1.12 0.314 0.361 13.3
Created on 2022-05-03 by the reprex package (v2.0.1)
This issue has been automatically closed due to inactivity.
This issue has been automatically locked. If you believe you have found a related problem, please file a new issue (with a reprex: https://reprex.tidyverse.org) and link to this issue.