weighted roll_regres
Would you consider adding support for weights like `lm` does? I believe that `lm` uses weighted least squares, i.e. minimizing `sum(w * e^2)`, when given an input vector of weights.
This is a good idea. For the time being, you can:
- Scale the design matrix and the outcomes by the square root of the weights.
- Subsequently scale the predicted values back down.
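This works because weighted least squares is ordinary least squares on scaled data: `sum(w * (y - X %*% beta)^2)` is identical to `sum((w_sqrt * y - (w_sqrt * X) %*% beta)^2)` with `w_sqrt = sqrt(w)`, so an unweighted fit of `sqrt(w) * y` on `sqrt(w) * X` yields the weighted least squares coefficients.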
This will give you the same results (except for the R squared). Here is an example:
```r
library(rollRegres)

#####
# simulate data
set.seed(65731482)
n <- 100
p <- 2
X <- matrix(rnorm(p * n), n, p)
y <- drop(X %*% runif(p)) + rnorm(n)
w <- runif(n, 0, 2) # the weights

#####
# use package function
w_sqrt <- sqrt(w)
pck_out <- roll_regres.fit(
  w_sqrt * X, w_sqrt * y, width = 50L, do_downdates = FALSE,
  do_compute = c("sigmas", "r.squareds", "1_step_forecasts"))

# re-scale predicted values
pck_out$one_step_forecasts <- pck_out$one_step_forecasts / w_sqrt
#####
# assign R version
R_func <- function(X, y, w, width){
  n <- nrow(X)
  p <- ncol(X)
  out <- matrix(NA_real_, n, p)
  sigmas <- rep(NA_real_, n)
  r.squared <- rep(NA_real_, n)
  one_step_forecasts <- rep(NA_real_, n)

  for(i in width:n){
    idx <- 1:i
    fit <- lm(y[idx] ~ -1 + X[idx, , drop = FALSE], weights = w[idx])
    out[i, ] <- fit$coefficients

    su <- summary(fit)
    sigmas[i] <- su$sigma

    ss1 <- sum(w[idx] * (y[idx] - weighted.mean(y[idx], w[idx]))^2)
    ss2 <- sum(w[idx] * fit$residuals^2)
    r.squared[i] <- 1 - ss2 / ss1

    if(i < n){
      next_i <- i + 1L
      one_step_forecasts[next_i] <- fit$coefficients %*% X[next_i, ]
    }
  }

  list(coef = out, sigmas = sigmas, r.squared = r.squared,
       one_step_forecasts = one_step_forecasts)
}
R_out <- R_func(X, y, w, 50L)
#####
# gives the same
stopifnot(
  isTRUE(all.equal(R_out$coef              , pck_out$coefs)),
  isTRUE(all.equal(R_out$sigmas            , pck_out$sigmas)),
  isTRUE(all.equal(R_out$one_step_forecasts, pck_out$one_step_forecasts)))

# the R squared is off though
isTRUE(all.equal(R_out$r.squared, pck_out$r.squareds))
#R> [1] FALSE
```
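The R squared is off because the two fits use different total sums of squares: the package centers the scaled outcomes `w_sqrt * y` around their unweighted mean, while the weighted R squared in `R_func` uses `sum(w[idx] * (y[idx] - weighted.mean(y[idx], w[idx]))^2)`. The residual sums of squares agree, since `sum(w[idx] * fit$residuals^2)` equals the residual sum of squares of the scaled fit.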
Thank you @boennecd, that is a very clever approach. Since I generate my own design matrix already, this was very simple to implement. Unfortunately for my scenario, I rely very heavily on the r.squared value for subsequent processing. I looked at the output values of r.squared after scaling the matrix, but they aren't usable for my purposes, even as an approximation. I also couldn't think of a simple way to compute them without reproducing the residuals and the weighted mean, which is quite costly in R for expanding windows.

Thanks again for considering this.
Thank you for the reply. Here is a version that also corrects the R squared:
```r
library(rollRegres)

#####
# simulate data
set.seed(65731482)
n <- 100
p <- 2
X <- matrix(rnorm(p * n), n, p)
y <- drop(X %*% runif(p)) + rnorm(n)
w <- runif(n, 0, 2) # the weights
width <- 50L

#####
# use package function
w_sqrt <- sqrt(w)
w_sqrt_y <- w_sqrt * y
pck_out <- roll_regres.fit(
  w_sqrt * X, w_sqrt_y, width = width, do_downdates = FALSE,
  do_compute = c("sigmas", "r.squareds", "1_step_forecasts"))

# re-scale predicted values
pck_out$one_step_forecasts <- pck_out$one_step_forecasts / w_sqrt
# re-scale the R^2
mea_weighted   <- cumsum(w * y) / cumsum(w)
mea_unweighted <- cumsum(w_sqrt_y) / 1:n
fac <- sapply(width:n, function(i) {
  idx <- 1:i
  sum((w_sqrt_y[idx] - mea_unweighted[i])^2) /
    sum(w[idx] * (y[idx] - mea_weighted[i])^2)
})
old_fraction <- -(pck_out$r.squareds[width:n] - 1)
pck_out$r.squareds[width:n] <- 1 - old_fraction * fac
#####
# assign R version
R_func <- function(X, y, w, width){
  n <- nrow(X)
  p <- ncol(X)
  out <- matrix(NA_real_, n, p)
  sigmas <- rep(NA_real_, n)
  r.squared <- rep(NA_real_, n)
  one_step_forecasts <- rep(NA_real_, n)

  for(i in width:n){
    idx <- 1:i
    fit <- lm(y[idx] ~ -1 + X[idx, , drop = FALSE], weights = w[idx])
    out[i, ] <- fit$coefficients

    su <- summary(fit)
    sigmas[i] <- su$sigma

    ss1 <- sum(w[idx] * (y[idx] - weighted.mean(y[idx], w[idx]))^2)
    ss2 <- sum(w[idx] * fit$residuals^2)
    r.squared[i] <- 1 - ss2 / ss1

    if(i < n){
      next_i <- i + 1L
      one_step_forecasts[next_i] <- fit$coefficients %*% X[next_i, ]
    }
  }

  list(coef = out, sigmas = sigmas, r.squared = r.squared,
       one_step_forecasts = one_step_forecasts)
}
R_out <- R_func(X, y, w, width)
#####
# gives the same
stopifnot(
  isTRUE(all.equal(R_out$coef              , pck_out$coefs)),
  isTRUE(all.equal(R_out$sigmas            , pck_out$sigmas)),
  isTRUE(all.equal(R_out$one_step_forecasts, pck_out$one_step_forecasts)))

# the R squared is correct
isTRUE(all.equal(R_out$r.squared, pck_out$r.squareds))
#R> [1] TRUE
```
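For reference, the re-scaling works as follows. The residual sum of squares is identical in both fits, but the package divides it by the total sum of squares of the scaled outcomes, `sum((w_sqrt_y - mean(w_sqrt_y))^2)`, whereas the weighted R squared divides it by `sum(w * (y - weighted.mean(y, w))^2)`. Multiplying `1 - r.squareds` by the ratio of the two total sums of squares (the `fac` above, computed window by window) and subtracting from one therefore recovers the weighted R squared.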
The centered sums of squares in `fac` are recomputed from scratch for every window, which makes this somewhat slower than it needs to be. You can replace
```r
# re-scale the R^2
mea_weighted   <- cumsum(w * y) / cumsum(w)
mea_unweighted <- cumsum(w_sqrt_y) / 1:n
fac <- sapply(width:n, function(i) {
  idx <- 1:i
  sum((w_sqrt_y[idx] - mea_unweighted[i])^2) /
    sum(w[idx] * (y[idx] - mea_weighted[i])^2)
})
old_fraction <- -(pck_out$r.squareds[width:n] - 1)
pck_out$r.squareds[width:n] <- 1 - old_fraction * fac
```
with two applications of online variance algorithms (Welford's online algorithm and the weighted incremental algorithm):
```r
# re-scale the R^2 using online variance algorithms
ss1_old <- ss1_new <- numeric(n)
ss1_old[1] <- ss1_new[1] <- 0
mea_old <- w_sqrt_y[1]
mea_new <- y[1]
w_sum <- w[1]
for(i in 2:n){
  # Welford's online algorithm for the running sum of squared
  # deviations of the scaled outcomes
  mea_tmp <- mea_old
  y_i <- w_sqrt_y[i]
  mea_old <- mea_old + (y_i - mea_old) / i
  ss1_old[i] <-
    ss1_old[i - 1] + (y_i - mea_old) * (y_i - mea_tmp)

  # weighted incremental algorithm for the running weighted sum
  # of squared deviations
  mea_tmp <- mea_new
  w_i <- w[i]
  w_sum <- w_sum + w_i
  y_i <- y[i]
  mea_new <- mea_new + w_i / w_sum * (y_i - mea_new)
  ss1_new[i] <- ss1_new[i - 1] + w_i * (y_i - mea_tmp) * (y_i - mea_new)
}

fac <- ss1_old[width:n] / ss1_new[width:n]
old_fraction <- -(pck_out$r.squareds[width:n] - 1)
pck_out$r.squareds[width:n] <- 1 - old_fraction * fac
```
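As a quick check of the online recursions, one can compare the final running sums against direct computations. This is a minimal sketch, assuming the objects from the snippets above are still in scope:

```r
# sanity check: the online sums of squares match direct computations
stopifnot(
  isTRUE(all.equal(
    ss1_old[n], sum((w_sqrt_y - mean(w_sqrt_y))^2))),
  isTRUE(all.equal(
    ss1_new[n], sum(w * (y - weighted.mean(y, w))^2))))
```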
One can likely make this faster than the pure R implementation above (e.g. with Rcpp).
Thank you very much for the response @boennecd! I absolutely will try this, I just haven't had the time yet. Will let you know once I get it working.