effectsize
(Adjusted) Cohen's d for contrasts in linear models
Some code I wrote a while back for computing SMDs from lm() output. It could be useful for reporting effect sizes in terms of standardized betas and Cohen's d. The most important bit is the comment at the end.
# Exact small-sample bias correction factor (Hedges' J) for an SMD with the given degrees of freedom
.cmicalc <- function(df) {
  cmi <- ifelse(df <= 1, NA,
                exp(lgamma(df / 2) - log(sqrt(df / 2)) - lgamma((df - 1) / 2))
  )
  return(cmi)
}
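For intuition (this check is my addition, not part of the original gist), the exact correction factor can be compared against the familiar approximation J ≈ 1 - 3/(4*df - 1); the two agree closely except at very small df:
# Exact factor vs. the usual approximation (printed values are approximate)
round(.cmicalc(c(5, 10, 30, 100)), 4)
#> ~ 0.8408 0.9227 0.9747 0.9925
round(1 - 3 / (4 * c(5, 10, 30, 100) - 1), 4)
#> ~ 0.8421 0.9231 0.9748 0.9925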
smd_stats <-
function(diff = NULL,
se_diff = NULL,
df = NULL,
m1 = NULL, m2 = NULL,
n1 = NULL, n2 = NULL,
sd1 = NULL, sd2 = NULL,
sigma = NULL,
mse = NULL,
t_ncp = NULL,
R2_cov = 0,
q = 0,
method = c("smd_pooledSD", "smd_controlSD", "smd_aveSD"),
unbiased = FALSE,
conf.int = TRUE,
conf.level = 0.95,
conf_method = c("t", "norm"),
sd_method = c("unbiased", "ml"),
std_method = c("unbiased", "ml"),
se_var_method = c("ls", "unbiased"),
...) {
method <- match.arg(method, choices = c("smd_pooledSD", "smd_controlSD", "smd_aveSD"))
# Compute degrees of freedom
if (is.null(df)) {
if (is.null(n1) || is.null(n2)) {
stop("One of these must be provided:\n (1) `df`\n (2) `n1`, `n2`, `q`")
} else {
df <- n1 + n2 - 2 - q
}
}
# TODO: Add Satterthwaite degrees of freedom
# Compute standardizer
if (all(is.null(mse),
is.null(sigma),
any(is.null(sd1),
is.null(sd2),
is.null(n1),
is.null(n2)))
) {
stop("One of these must be provided:\n (1) `sigma`\n (2)`mse`\n (3) `sd1`, `sd2`, `n1`, `n2`.")
} else if (is.null(mse)) {
if (!is.null(sigma)) {
mse <- sigma^2
} else {
sd_method <- match.arg(sd_method, choices = c("unbiased", "ml"))
std_method <- match.arg(std_method, choices = c("unbiased", "ml"))
if (method == "smd_pooledSD") {
if (sd_method == "unbiased") {
ssq1 <- (n1 - 1) * sd1^2
ssq2 <- (n2 - 1) * sd2^2
} else {
ssq1 <- n1 * sd1^2
ssq2 <- n2 * sd2^2
}
ssq <- ssq1 + ssq2
mse <-
ifelse(std_method == "unbiased",
ssq / df,
ssq / (n1 + n2)
)
} else if (method == "smd_controlSD") {
if (sd_method == "unbiased") {
ssq <- (n1 - 1) * sd1^2
} else {
ssq <- n1 * sd1^2
}
mse <-
ifelse(std_method == "unbiased",
ssq / df,
ssq / (n1 + n2)
)
} else {
if (sd_method == "unbiased") {
mse <-
ifelse(std_method == "unbiased",
(sd1^2 + sd2^2) / 2,
(sd1^2 * (n1 - 1) / n1 + sd2^2 * (n2 - 1) / n2) / 2
)
} else {
mse <-
ifelse(std_method == "unbiased",
(sd1^2 * n1 / (n1 - 1) + sd2^2 * n2 / (n2 - 1)) / 2,
(sd1^2 + sd2^2) / 2
)
}
}
}
}
# TODO: Implement large-sample vs unbiased sampling error variance
if (is.null(diff)) {
if (is.null(m1) || is.null(m2)) {
stop("One of these must be provided:\n (1) `diff`\n (2) `m1`, `m2`")
} else {
diff <- m2 - m1
}
}
# Adjust standardizer for covariates
A <- (1 - R2_cov)
mse <- mse / A
# Compute d
d <- diff / sqrt(mse)
# Compute variance of d
se_var <-
switch(method,
smd_pooledSD = A * (n1 + n2) / (n1 * n2) + d^2 / (2 * (n1 + n2)),
smd_controlSD = A * (1/n1 + (sd2^2 / sd1^2) / n2 + d^2 / (2 * n1)),
smd_aveSD = A * (d^2 / (8 * mse^2) * (sd1^4 / (n1 - 1) + sd2^4 / (n2 - 1)) + (sd1^2 / mse) / (n1 - 1) + (sd2^2 / mse) / (n2 - 1))
)
if (method != "smd_pooledSD" & R2_cov != 0) {
warning(paste0("Covariate-adjusted 'se_var' is only approximately accurate when `method` is ", method, "."))
}
# Small-sample bias correction
if (unbiased) {
J <- .cmicalc(df)
} else {
J <- 1
}
# Compute confidence interval
if (conf.int) {
conf_method <- match.arg(conf_method, choices = c("t", "norm"))
# TODO: Add alternate ci methods from Bonett 2008
if (method != "smd_pooledSD" & conf_method == "t") {
# TODO: Derive ncp-based CIs for glass and bonnett d
conf_method <- "norm"
message(paste0("`conf_method` == 't' not available when `method` == ", method, ".\n`conf_method` == 'norm' used instead."))
}
if (conf_method == "norm") {
limits <- d + c(-1, 1) * qnorm((1 - conf.level) / 2, lower.tail = FALSE) * sqrt(se_var)
} else {
  if (is.null(se_diff)) {
    # Effective sample size used to move between the t and d scales
    n_eff <- (n1 * n2) / ((n1 + n2) * A)
  }
  if (is.null(t_ncp)) {
    if (!is.null(se_diff)) {
      t_ncp <- diff / se_diff
    } else {
      # Approximate NCP if neither `t_ncp` nor `se_diff` is given
      t_ncp <- d * sqrt(n_eff)
    }
  }
  # TODO: Implement conf.limits.nct natively
  limits <- unlist(MBESS::conf.limits.nct(t_ncp, df, conf.level, ...)[c("Lower.Limit", "Upper.Limit")])
  if (!is.null(se_diff)) {
    limits <- limits * se_diff / sqrt(mse)
  } else {
    limits <- limits / sqrt(n_eff)
  }
}
names(limits) <- c("lower", "upper")
}
# Correct for small-sample bias
d <- d * J
se_var <- se_var * J^2
se <- sqrt(se_var)
if (conf.int) {
limits <- limits * J
out <- list(es = d, df = df, se = se, se_var = se_var, ci.lb = limits['lower'], ci.ub = limits['upper'])
} else {
out <- list(es = d, df = df, se = se, se_var = se_var)
}
out <- lapply(out, unname)
class(out) <- c("es_smd", "es", "list")
attr(out, "es") <- "SMD"
attr(out, "method") <- method
attr(out, "unbiased") <- unbiased
attr(out, "conf.level") <- conf.level
return(out)
}
print.es <- function(x, digits = 3, ...) {
if (attr(x, "es") == "SMD") {
if (attr(x, "method") == "smd_pooledSD") {
if (attr(x, "unbiased") == TRUE) {
cat("Standardized mean difference (Cohen's d)\n")
} else {
cat("Standardized mean difference (Hedges' g)\n")
}
cat("========================================")
} else if (attr(x, "method") == "smd_controlSD") {
if (attr(x, "unbiased") == TRUE) {
cat("Standardized mean difference (Glass' \u0394 [unbiased])\n")
cat("=======================================================")
} else {
cat("Standardized mean difference (Glass' \u0394)\n")
cat("============================================")
}
} else {
if (attr(x, "unbiased") == TRUE) {
cat("Standardized mean difference (Bonett's d [unbiased])\n")
cat("=======================================================")
} else {
cat("Standardized mean difference (Bonett's d)\n")
cat("============================================")
}
}
cat("\n\n")
df <- as.data.frame(x)
print.data.frame(df, digits = digits, row.names = FALSE)
}
invisible(x)
}
mod <- lm(mpg ~ factor(cyl, levels = c(4, 6, 8)) + hp, data = mtcars)
r2_cov <- summary(lm(mpg ~ hp, data = mtcars))$r.squared
print.es(
smd_stats(diff = summary(mod)$coef[2,1],
se_diff = summary(mod)$coef[2,2],
t_ncp = summary(mod)$coef[2,3],
df = mod$df.residual,
sigma = summary(mod)$sigma,
R2_cov = r2_cov,
n1 = table(model.frame(mod)[,2])[1],
n2 = table(model.frame(mod)[,2])[2],
conf.int = TRUE
)
)
# TODO: Add smd.lm() method to automatically extract factors and compute d values.
# Formula to use with lm output
# d_adj <- t * se_b / sigma * sqrt(1 - R2_cov)
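As a quick check (my addition, reusing `mod` and `r2_cov` defined above), the comment's formula can be applied directly to the lm summary; with the default unbiased = FALSE it should match the `es` value smd_stats() returns for the same coefficient:
# Apply d_adj <- t * se_b / sigma * sqrt(1 - R2_cov) to the cyl 6-vs-4 coefficient
b <- summary(mod)$coef[2, ]
d_adj <- unname(b["t value"] * b["Std. Error"]) / summary(mod)$sigma * sqrt(1 - r2_cov)
d_adj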
Ok, some Qs:
- Is this only relevant for fixed-effects-only models (i.e., basic OLS models)?
- Do you envision this taking a model as input and returning a Cohen's d for each coefficient? (I) Only for factors? (II) Or is this also applicable to covariates?
- It could generally apply to mixed effects models. Would just need to think about what variances to include in the denominator.
- Yes, the idea would be to take a model and return cohen's d values for the factor variables, potentially adjusted for other factors or continuous covariates. Ideally, an option for Cohen's d for a specific contrast would be nice, but by default computing d for each level versus the reference level seems good.
Compared to the current functions, which compute standardized regression coefficients for continuous covariates but leave factors alone (yielding a d-like metric for the factor predictors), this would yield proper Cohen's d values by removing the between-groups variance from the response SD used to standardize.
(Could it be loosely related to... *pulling it out from under the rug* this? As in, it could be used as some sort of standardized index of effect size 😅)
@bwiernik If I understand correctly, this is similar to what emmeans::eff_size() is trying to do, only for coefs instead of estimates / contrasts?
mod <- lm(mpg ~ factor(cyl, levels = c(4, 6, 8)) + hp, data = mtcars)
r2_cov <- summary(lm(mpg ~ hp, data = mtcars))$r.squared
print.es(
smd_stats(diff = summary(mod)$coef[2,1],
se_diff = summary(mod)$coef[2,2],
t_ncp = summary(mod)$coef[2,3],
df = mod$df.residual,
sigma = summary(mod)$sigma,
R2_cov = r2_cov,
n1 = table(model.frame(mod)[,2])[1],
n2 = table(model.frame(mod)[,2])[2],
conf.int = TRUE
)
)
#> Standardized mean difference (Cohen's d)
#> ========================================
#>
#> es df se se_var ci.lb ci.ub
#> -1.2 28 0.364 0.133 -1.9 -0.472
emmeans::emmeans(mod, ~cyl) |>
emmeans::eff_size(sigma = sigma(mod), edf = df.residual(mod))
#> contrast effect.size SE df lower.CL upper.CL
#> 4 - 6 1.897 0.579 28 0.710 3.08
#> 4 - 8 2.708 0.823 28 1.022 4.39
#> 6 - 8 0.812 0.638 28 -0.496 2.12
#>
#> sigma used for effect sizes: 3.146
#> Confidence level used: 0.95
Where would this go?
I think this can either be implemented in standardize_parameters(..., method = "d") ("smart", if you can figure out what Dom is talking about... ^^) or in cohens_d.lm().
What do you think?
It is similar to emmeans::eff_size(), but differs in the computing of the denominator.
There are three broad approaches to computing a d value from a linear model (a short sketch after this list illustrates each denominator):
- Standardize using the raw response SD. This is what standardize_parameters() currently does. It will tend to underestimate d.
- Standardize using the MSE (sigma). This is what emmeans::eff_size() does. This works when the contrasts are the only predictors in the model, but not when there are covariates: the response variance accounted for by the covariates should not be removed from the SD used to standardize. Otherwise, d will be overestimated, as shown above. I can pull some references from Larry Hedges about this.
- Standardize using the response SD with only the between-groups variance on the focal factor/contrast removed. That is what this function does. This allows for groups to be equated on their covariates while still creating an appropriate scale for standardizing the response.
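To make the three denominators concrete, here is a small illustrative sketch (my addition; it reuses `mod` and `r2_cov` from the reprex above):
# Three candidate standardizers for the cyl 6-vs-4 contrast in `mod`
diff_64 <- unname(coef(mod)[2])           # covariate-adjusted mean difference
sd_raw  <- sd(mtcars$mpg)                 # 1. raw response SD (d comes out too small)
sd_mse  <- sigma(mod)                     # 2. sqrt(MSE), as emmeans::eff_size() uses (d too large here)
sd_adj  <- sigma(mod) / sqrt(1 - r2_cov)  # 3. sqrt(MSE) with the covariate-explained variance restored
c(raw = diff_64 / sd_raw, mse = diff_64 / sd_mse, adj = diff_64 / sd_adj)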
We should write this function to handle arbitrary contrasts like emmeans, but standardize using the correct SD. This should work with arbitrary contrasts I think: d_adj <- t * se_b / sigma * sqrt(1 - R2_cov).
I think cohens_d.lm() makes the most sense. I think that is where people might look for it. We could also add standardize_parameters(..., method = "d") which redirects to this?
(Not sure I've ever seen opt1 called a d-value 👀)
If I understand correctly: (2) is a conditional d-value: it is the expected d within a subpopulation (say, for cars with hp = 3, what is the d?); (3) is a marginal d-value.
Correct?
> I think cohens_d.lm() makes the most sense. I think that is where people might look for it. We could also add standardize_parameters(..., method = "d") which redirects to this?
Sounds good, but think of expandability - if we add more supported models, we might need a cohens_d method for each? (contrast that with what standardize_parameters does, which has a single default method for most models).
Sure, conditional/marginal is a good way to describe it. But given that the choice of standardizer is primarily about interpretability, I'm not sure that the conditional SD is very useful in most cases. There is some discussion of this; see the formulas on pages 229 and 230 here: https://books.google.be/books?hl=en&lr=&id=LUGd6B9eyc4C&oi=fnd&pg=PA229#v=onepage&q&f=false
> Sounds good, but think of expandability - if we add more supported models, we might need a cohens_d method for each? (contrast that with what standardize_parameters does, which has a single default method for most models).
And so what do you think?
It's not pretty, but...
cohens_d <- function(x, ...) {
  if (insight::is_model(x) && inherits(x, <models that this function should work with>)) {
    .cohens_d_models(x, ...)
  } else {
    .cohens_d(x, ...)
  }
}
I'm open to any other ideas (:
I invite you to build the skeleton of the .cohens_d_models() func - we can figure names out later (:
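Here is a rough first pass, restricted to lm fits. Every name and the way R2_cov is computed are placeholders for discussion, not a settled API:
# Hypothetical skeleton for .cohens_d_models(); lm-only, names are placeholders
.cohens_d_models <- function(model, ...) {
  stopifnot(inherits(model, "lm"))
  mf <- model.frame(model)
  cls <- attr(terms(model), "dataClasses")[-1]  # predictor classes (response dropped)
  is_factor <- cls %in% c("factor", "ordered")
  if (!any(is_factor)) stop("No factor predictors found.")
  smry <- summary(model)$coefficients
  sigma_m <- summary(model)$sigma
  # R2 of the response on the continuous covariates only
  covars <- names(cls)[!is_factor]
  r2_cov <- if (length(covars)) {
    summary(lm(reformulate(covars, response = names(mf)[1]), data = mf))$r.squared
  } else {
    0
  }
  # d for each non-reference level of each factor, via
  # d_adj <- t * se_b / sigma * sqrt(1 - R2_cov)
  rows <- which(attr(model.matrix(model), "assign") %in% which(is_factor))
  smry[rows, "t value"] * smry[rows, "Std. Error"] / sigma_m * sqrt(1 - r2_cov)
}
For `mod` above, this would return the adjusted d for the 6-vs-4 and 8-vs-4 contrasts; CIs and the small-sample bias correction could then be delegated to smd_stats().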
Would it be possible to extend this brilliant idea to more generalized models, like the MMRM fit via GLS (two implementations in R: nlme::gls() and mmrm::mmrm()) and GEE (geepack::geeglm(), and maybe geeM or geesmv)?
MMRMs (rather than mixed models) are the core tool in clinical trials, so it would be great to have the effect sizes automatically use the appropriate part of the estimated variance-covariance matrix at the given contrasts.
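Possibly. Here is a rough illustration of the idea with nlme::gls() on the Orthodont data (my sketch; the choice of standardizer, the average marginal variance across occasions, is only one option and would need proper thought for a real MMRM):
# Rough illustration: d from a GLS fit, standardized by the marginal SD
library(nlme)
fit <- gls(distance ~ Sex + age, data = Orthodont,
           correlation = corCompSymm(form = ~ 1 | Subject))
V <- getVarCov(fit)              # estimated marginal covariance matrix for one subject
sd_marg <- sqrt(mean(diag(V)))   # pool the per-occasion variances into one standardizer
coef(fit)[["SexFemale"]] / sd_marg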