ciTools icon indicating copy to clipboard operation
ciTools copied to clipboard

Implement a parametric method for GLM PIs

Open jthaman opened this issue 6 years ago • 2 comments

Currently we simulate from the model using a parametric bootstrap. We showed in the GLM vignette that this is valid if the model is "true", and the simulation is still relatively fast, but a parametric method would be more pleasing to me.

I'm not sure if there is any literature available on this.

jthaman avatar Feb 23 '18 16:02 jthaman

We've been thinking about this recently and are considering using one of two approaches:

  1. Ignore uncertainty in the fitted parameters and use the quantiles of the fitted distribution to create a prediction interval.
  2. Calculate the confidence intervals and then use these with the fitted distribution to get upper and lower bounds of the prediction intervals (following the approach https://fromthebottomoftheheap.net/2017/05/01/glm-prediction-intervals-i/ and https://fromthebottomoftheheap.net/2017/05/01/glm-prediction-intervals-ii/)

I think these two approaches should, in the limit of the simulations, bound your simulation approach. I've implemented these roughly in the somewhat long winded example below and they seem to give reasonable results. It would be interesting to get your feedback and whether this approach would be useful for ciTools to incorporate:

library(MASS)
library(ggplot2)
library(ciTools)
#> ciTools version 0.5.2 (C) Institute for Defense Analyses
library(dplyr)
library(tibble)
library(patchwork)

#  ci function from ciTools (pretty much anyway)
add_ci_glm <- function(dat, fit, alpha){
  out <- predict(fit, dat, se.fit = TRUE, type = "link")

  if (fit$family$family %in% c("binomial", "poisson"))
    crit_val <- qnorm(p = 1 - alpha/2, mean = 0, sd = 1)
  else
    crit_val <- qt(p = 1 - alpha/2, df = fit$df.residual)

  inverselink <- fit$family$linkinv
  pred <- inverselink(out$fit)
  upper <- inverselink(out$fit + crit_val *  out$se.fit)
  lower <- inverselink(out$fit - crit_val * out$se.fit)

  if (fit$family$link %in% c("inverse", "1/mu^2")) {
      ## need to do something like this for any decreasing link
      ## function.
      upr <- lower
      lower <- upper
      upper <- upr
  }

  dat$pred <- pred
  dat$`upper-ci` <- upper
  dat$`lower-ci` <- lower
  as_tibble(dat)
}

add_intervals_glm <- function(dat, fit, alpha = 0.05, uncertain = TRUE) {

  # confidence intervals
  dat <- add_ci_glm(dat, fit, alpha)
  
  # prediction intervals
  fam <- family(fit)$family

  if (uncertain) {
    lower <- dat$`lower-ci`
    upper <- dat$`upper-ci`
  } else {
    lower <- dat$pred
    upper <- dat$pred
  }

  if (inherits(fit, "negbin")) {
    theta <- fit$theta
    setheta <- fit$SE.theta
    dat$`lower-pi` = qnbinom(alpha / 2, mu = lower, size = theta)
    dat$`upper-pi` <- qnbinom(1 - alpha / 2, mu = upper, size = theta)
  } else if (fam == "poisson") {
    dat$`lower-pi` <- qpois(alpha / 2, lambda = lower)
    dat$`upper-pi` <- qpois(1 - alpha / 2, lambda = upper)
  } else if (fam == "quasipoisson") {
    overdispersion <- summary(fit)$dispersion
    dat$`lower-pi` <- qnbinom(alpha / 2, mu = lower, size = lower / (overdispersion - 1))
    dat$`upper-pi` <- qnbinom(1 - alpha / 2, mu = upper, size = upper / (overdispersion - 1))
  } else if (fam == "gamma") {
    overdispersion <- summary(fit)$dispersion
    dat$`lower-pi` <- qgamma(alpha / 2, shape = 1 / overdispersion, rate = 1 / (lower * overdispersion))
    dat$`upper-pi` <- qgamma(1 - alpha / 2, shape = 1 / overdispersion, rate = 1 / (upper * overdispersion))
  } else if (fam == "binomial") {
    dat$`lower-pi` <- qbinom(alpha / 2, size = fit$prior.weights, prob = lower / fit$prior.weights)
    dat$`upper-pi` <- qbinom(1 - alpha / 2, size = fit$prior.weights, prob = upper / fit$prior.weights)
  } else if (fam == "gaussian") {
    sigma_sq <- summary(fit)$dispersion
    se_terms <- pred$se.fit
    t_quant <- qt(p = alpha / 2, df = fit$df.residual, lower.tail = FALSE)
    se_global <- sqrt(sigma_sq + se_terms^2)
    dat$`lower-pi` <- dat$fitted - t_quant * se_global
    dat$`upper-pi` <- dat$fitted + t_quant * se_global
  } else {
    stop("Unsupported glm family type")
  }
  as_tibble(dat)
}

# poisson -----------------------------------------------------------------
x <- rnorm(100, mean = 0)
y <- rpois(n = 100, lambda = exp(1.5 + 0.5*x))
dat <- data.frame(x = x, y = y)
fit <- glm(y ~ x , family = poisson(link = "log"))

dat1 <-
  dat %>%
  add_ci(fit, names = c("lwr", "upr"), alpha = 0.1) %>%
  add_pi(fit, names = c("lpb", "upb"), alpha = 0.1, nSims = 20000)
#> Warning in add_pi.glm(., fit, names = c("lpb", "upb"), alpha = 0.1, nSims =
#> 20000): The response is not continuous, so Prediction Intervals are approximate

dat2 <-
  dat %>%
  add_intervals_glm(fit, alpha = 0.1, uncertain = FALSE)

dat3 <-
  dat %>%
  add_intervals_glm(fit, alpha = 0.1, uncertain = TRUE)

p1 <- ggplot(dat1, aes(x, y)) +
  geom_point(size = 2) +
  geom_line(aes(y = pred), size = 1.2) +
  geom_ribbon(aes(ymin = lpb, ymax = upb), alpha = 0.2) +
  geom_ribbon(aes(ymin = `lower-pi`, ymax = `upper-pi`), data = dat2, alpha = 0.4) +
  ggtitle("Poisson Regression with prediction intervals and no uncertainty in parameters", 
          subtitle = "Model fit (black line), with bootstrap intervals (gray), parametric intervals (dark gray)") +
  coord_cartesian(ylim=c(0, 30))

p2 <- ggplot(dat1, aes(x, y)) +
  geom_point(size = 2) +
  geom_line(aes(y = pred), size = 1.2) +
  geom_ribbon(aes(ymin = lpb, ymax = upb), alpha = 0.4) +
  geom_ribbon(aes(ymin = `lower-pi`, ymax = `upper-pi`), data = dat3, alpha = 0.2) +
  ggtitle("Poisson Regression with prediction intervals and uncertainty in parameters", 
          subtitle = "Model fit (black line), with parametric intervals (gray), bootstrap intervals (dark gray)") +
  coord_cartesian(ylim=c(0, 30))

p1 / p2


# quasipoisson ------------------------------------------------------------
x <- runif(n = 100, min = 0, max = 2)
mu <- exp(1 + x)
y <- rnegbin(n = 100, mu = mu, theta = mu/(5 - 1))
dat <- data.frame(x = x, y = y)
fit <- glm(y ~ x, family = quasipoisson(link = "log"))

dat1 <-
  dat %>%
  add_ci(fit, names = c("lwr", "upr"), alpha = 0.1) %>%
  add_pi(fit, names = c("lpb", "upb"), alpha = 0.1, nSims = 20000)
#> Warning in add_pi.glm(., fit, names = c("lpb", "upb"), alpha = 0.1, nSims =
#> 20000): The response is not continuous, so Prediction Intervals are approximate

dat2 <-
  dat %>%
  add_intervals_glm(fit, alpha = 0.1, uncertain = FALSE)

dat3 <-
  dat %>%
  add_intervals_glm(fit, alpha = 0.1, uncertain = TRUE)

p1 <- ggplot(dat1, aes(x, y)) +
  geom_point(size = 2) +
  geom_line(aes(y = pred), size = 1.2) +
  geom_ribbon(aes(ymin = lpb, ymax = upb), alpha = 0.2) +
  geom_ribbon(aes(ymin = `lower-pi`, ymax = `upper-pi`), data = dat2, alpha = 0.4) +
  ggtitle("Quasipoisson Regression with prediction intervals and no uncertainty in parameters", 
          subtitle = "Model fit (black line), with bootstrap intervals (gray), parametric intervals (dark gray)") +
  coord_cartesian(ylim=c(0, 40))

p2 <- ggplot(dat1, aes(x, y)) +
  geom_point(size = 2) +
  geom_line(aes(y = pred), size = 1.2) +
  geom_ribbon(aes(ymin = lpb, ymax = upb), alpha = 0.4) +
  geom_ribbon(aes(ymin = `lower-pi`, ymax = `upper-pi`), data = dat3, alpha = 0.2) +
  ggtitle("Quasioisson Regression with prediction intervals and uncertainty in parameters", 
          subtitle = "Model fit (black line), with parametric intervals (gray), bootstrap intervals (dark gray)") +
  coord_cartesian(ylim=c(0, 40))

p1 / p2

Created on 2020-08-25 by the reprex package (v0.3.0)

TimTaylor avatar Aug 25 '20 19:08 TimTaylor

Tim, in our glm vignette, https://cran.r-project.org/web/packages/ciTools/vignettes/ciTools-glm-vignette.html, we showed in a simulation that our Poisson PIs are conservative (at least in the examples we tried). I would hesitate to add a method to ciTools that I know is more conservative than what's already available. For other GLMs, the current parametric bootstrap may be conservative or anticonservative depending on model/sample size.

I do think you are correct about your two methods bounding the current ciTools approach.

Personally, I find add_pi.glm to be pretty fast, at least compared to many other bootstrap methods in ciTools.

It would be interesting to put a sim together that demonstrates the coverage probs on these techniques compared to the one available (And also compared to a Bayesian prediction interval with uninformative priors).

jthaman avatar Sep 22 '20 15:09 jthaman