I'm running a rolling regression very similar to the following code:
library(PerformanceAnalytics)
library(quantmod)
data(managers)
FL <- as.formula(Next(HAM1)~HAM1+HAM2+HAM3+HAM4)
MyRegression <- function(df,开发者_JAVA百科FL) {
df <- as.data.frame(df)
model <- lm(FL,data=df[1:30,])
predict(model,newdata=df[31,])
}
system.time(Result <- rollapply(managers, 31, FUN="MyRegression",FL,
by.column = FALSE, align = "right", na.pad = TRUE))
I've got some extra processors, so I'm trying to find a way to parallelize the rolling window. If this was a non-rolling regression I could easily parallelize it using the apply family of functions...
The obvious one is to use lm.fit()
instead of lm()
so you don't incur all the overhead in processing the formula etc.
Update: So when I said obvious what I meant to say was blindingly obvious but deceptively difficult to implement!
After a bit of fiddling around, I came up with this
library(PerformanceAnalytics)
library(quantmod)
data(managers)
The first stage is to realise that the model matrix can be prebuilt, so we do that and convert it back to a Zoo object for use with rollapply()
:
mmat2 <- model.frame(Next(HAM1) ~ HAM1 + HAM2 + HAM3 + HAM4, data = managers,
na.action = na.pass)
mmat2 <- cbind.data.frame(mmat2[,1], Intercept = 1, mmat2[,-1])
mmatZ <- as.zoo(mmat2)
Now we need a function that will employ lm.fit()
to do the heavy lifting without having to create design matrices at each iteration:
MyRegression2 <- function(Z) {
## store value we want to predict for
pred <- Z[31, -1, drop = FALSE]
## get rid of any rows with NA in training data
Z <- Z[1:30, ][!rowSums(is.na(Z[1:30,])) > 0, ]
## Next() would lag and leave NA in row 30 for response
## but we precomputed model matrix, so drop last row still in Z
Z <- Z[-nrow(Z),]
## fit the model
fit <- lm.fit(Z[, -1, drop = FALSE], Z[,1])
## get things we need to predict, in case pivoting turned on in lm.fit
p <- fit$rank
p1 <- seq_len(p)
piv <- fit$qr$pivot[p1]
## model coefficients
beta <- fit$coefficients
## this gives the predicted value for row 31 of data passed in
drop(pred[, piv, drop = FALSE] %*% beta[piv])
}
A comparison of timings:
> system.time(Result <- rollapply(managers, 31, FUN="MyRegression",FL,
+ by.column = FALSE, align = "right",
+ na.pad = TRUE))
user system elapsed
0.925 0.002 1.020
>
> system.time(Result2 <- rollapply(mmatZ, 31, FUN = MyRegression2,
+ by.column = FALSE, align = "right",
+ na.pad = TRUE))
user system elapsed
0.048 0.000 0.05
Which affords a pretty reasonable improvement over the original. And now check that the resulting objects are the same:
> all.equal(Result, Result2)
[1] TRUE
Enjoy!
New answer
I wrote a package, rollRegres
, that does this much faster. It is ~ 58 times faster than Gavin Simpson's answer. Here is an example
# simulate data
set.seed(101)
n <- 10000
wdth <- 50
X <- matrix(rnorm(10 * n), n, 10)
y <- drop(X %*% runif(10)) + rnorm(n)
Z <- cbind(y, X)
# assign other function
lm_version <- function(Z, width = wdth) {
pred <- Z[width + 1, -1, drop = FALSE]
## fit the model
Z <- Z[-nrow(Z), ]
fit <- lm.fit(Z[, -1, drop = FALSE], Z[,1])
## get things we need to predict, in case pivoting turned on in lm.fit
p <- fit$rank
p1 <- seq_len(p)
piv <- fit$qr$pivot[p1]
## model coefficients
beta <- fit$coefficients
## this gives the predicted value for next obs
drop(pred[, piv, drop = FALSE] %*% beta[piv])
}
# show that they yield the same
library(rollRegres) # the new package
library(zoo)
all.equal(
rollapply(Z, wdth + 1, FUN = lm_version,
by.column = FALSE, align = "right", fill = NA_real_),
roll_regres.fit(
x = X, y = y, width = wdth, do_compute = "1_step_forecasts"
)$one_step_forecasts)
#R [1] TRUE
# benchmark
library(compiler)
lm_version <- cmpfun(lm_version)
microbenchmark::microbenchmark(
newnew = roll_regres.fit(
x = X, y = y, width = wdth, do_compute = "1_step_forecasts"),
prev = rollapply(Z, wdth + 1, FUN = lm_version,
by.column = FALSE, align = "right", fill = NA_real_),
times = 10)
#R Unit: milliseconds
#R expr min lq mean median uq max neval
#R newnew 10.27279 10.48929 10.91631 11.04139 11.13877 11.87121 10
#R prev 555.45898 565.02067 582.60309 582.22285 602.73091 605.39481 10
Old answer
You can reduce the run time by updating a decomposition. This yields an cost at each iteration instead of where n is you window width. Below is a code to compare the two. It would likely be much faster doing it in C++ but the LINPACK dchud
and dchdd
are not included with R so you would have to write a package to do so. Further, I recall reading that you may do faster with other implementations than the LINPACK dchud
and dchdd
for the R update
library(SamplerCompare) # for LINPACK `chdd` and `chud`
roll_forcast <- function(X, y, width){
n <- nrow(X)
p <- ncol(X)
out <- rep(NA_real_, n)
is_first <- TRUE
i <- width
while(i < n){
if(is_first){
is_first <- FALSE
qr. <- qr(X[1:width, ])
R <- qr.R(qr.)
# Use X^T for the rest
X <- t(X)
XtY <- drop(tcrossprod(y[1:width], X[, 1:width]))
} else {
x_new <- X[, i]
x_old <- X[, i - width]
# update R
R <- .Fortran(
"dchud", R, p, p, x_new, 0., 0L, 0L,
0., 0., numeric(p), numeric(p),
PACKAGE = "SamplerCompare")[[1]]
# downdate R
R <- .Fortran(
"dchdd", R, p, p, x_old, 0., 0L, 0L,
0., 0., numeric(p), numeric(p), integer(1),
PACKAGE = "SamplerCompare")[[1]]
# update XtY
XtY <- XtY + y[i] * x_new - y[i - width] * x_old
}
coef. <- .Internal(backsolve(R, XtY, p, TRUE, TRUE))
coef. <- .Internal(backsolve(R, coef., p, TRUE, FALSE))
i <- i + 1
out[i] <- X[, i] %*% coef.
}
out
}
# simulate data
set.seed(101)
n <- 10000
wdth = 50
X <- matrix(rnorm(10 * n), n, 10)
y <- drop(X %*% runif(10)) + rnorm(n)
Z <- cbind(y, X)
# assign other function
lm_version <- function(Z, width = wdth) {
pred <- Z[width + 1, -1, drop = FALSE]
## fit the model
Z <- Z[-nrow(Z), ]
fit <- lm.fit(Z[, -1, drop = FALSE], Z[,1])
## get things we need to predict, in case pivoting turned on in lm.fit
p <- fit$rank
p1 <- seq_len(p)
piv <- fit$qr$pivot[p1]
## model coefficients
beta <- fit$coefficients
## this gives the predicted value for row 31 of data passed in
drop(pred[, piv, drop = FALSE] %*% beta[piv])
}
# show that they yield the same
library(zoo)
all.equal(
rollapply(Z, wdth + 1, FUN = lm_version,
by.column = FALSE, align = "right", fill = NA_real_),
roll_forcast(X, y, wdth))
#R> [1] TRUE
# benchmark
library(compiler)
roll_forcast <- cmpfun(roll_forcast)
lm_version <- cmpfun(lm_version)
microbenchmark::microbenchmark(
new = roll_forcast(X, y, wdth),
prev = rollapply(Z, wdth + 1, FUN = lm_version,
by.column = FALSE, align = "right", fill = NA_real_),
times = 10)
#R> Unit: milliseconds
#R> expr min lq mean median uq max neval cld
#R> new 113.7637 115.4498 129.6562 118.6540 122.4930 230.3414 10 a
#R> prev 639.6499 674.1677 682.1996 678.6195 686.8816 763.8034 10 b
精彩评论