问题描述
|
我正在运行与以下代码非常相似的滚动回归:
library(PerformanceAnalytics)
library(quantmod)
data(managers)
FL <- as.formula(Next(HAM1)~HAM1+HAM2+HAM3+HAM4)
MyRegression <- function(df,FL) {
df <- as.data.frame(df)
model <- lm(FL,data=df[1:30,])
predict(model,newdata=df[31,])
}
system.time(Result <- rollapply(managers,31,FUN=\"MyRegression\",FL,by.column = FALSE,align = \"right\",na.pad = TRUE))
我有一些额外的处理器,所以我试图找到一种方法来并行化滚动窗口。如果这是非滚动回归,那么我可以使用apply系列函数轻松并行化它。
解决方法
很明显的一个方法是使用
lm.fit()
而不是lm()
,这样就不会在处理公式等方面产生所有开销。
更新:所以当我说的很明显时,我的意思是说的很明显,但是看似难以实现!
经过一番摆弄之后,我想到了这个
library(PerformanceAnalytics)
library(quantmod)
data(managers)
第一步是认识到可以预先构建模型矩阵,因此我们将其转换回Zoo对象以用于rollapply()
:
mmat2 <- model.frame(Next(HAM1) ~ HAM1 + HAM2 + HAM3 + HAM4,data = managers,na.action = na.pass)
mmat2 <- cbind.data.frame(mmat2[,1],Intercept = 1,mmat2[,-1])
mmatZ <- as.zoo(mmat2)
现在,我们需要一个函数使用employ1进行繁重的工作,而不必在每次迭代时都创建设计矩阵:
MyRegression2 <- function(Z) {
## store value we want to predict for
pred <- Z[31,-1,drop = FALSE]
## get rid of any rows with NA in training data
Z <- Z[1:30,][!rowSums(is.na(Z[1:30,])) > 0,]
## Next() would lag and leave NA in row 30 for response
## but we precomputed model matrix,so drop last row still in Z
Z <- Z[-nrow(Z),]
## fit the model
fit <- lm.fit(Z[,drop = FALSE],Z[,1])
## get things we need to predict,in case pivoting turned on in lm.fit
p <- fit$rank
p1 <- seq_len(p)
piv <- fit$qr$pivot[p1]
## model coefficients
beta <- fit$coefficients
## this gives the predicted value for row 31 of data passed in
drop(pred[,piv,drop = FALSE] %*% beta[piv])
}
计时比较:
> system.time(Result <- rollapply(managers,31,FUN=\"MyRegression\",FL,+ by.column = FALSE,align = \"right\",+ na.pad = TRUE))
user system elapsed
0.925 0.002 1.020
>
> system.time(Result2 <- rollapply(mmatZ,FUN = MyRegression2,+ by.column = FALSE,+ na.pad = TRUE))
user system elapsed
0.048 0.000 0.05
与原始版本相比,这提供了相当合理的改进。现在检查结果对象是否相同:
> all.equal(Result,Result2)
[1] TRUE
请享用!
,新答案
我写了一个10英镑的包裹,这样做的速度要快得多。比盖文·辛普森的答案快58倍。这是一个例子
# simulate data
set.seed(101)
n <- 10000
wdth <- 50
X <- matrix(rnorm(10 * n),n,10)
y <- drop(X %*% runif(10)) + rnorm(n)
Z <- cbind(y,X)
# assign other function
lm_version <- function(Z,width = wdth) {
pred <- Z[width + 1,drop = FALSE]
## fit the model
Z <- Z[-nrow(Z),]
fit <- lm.fit(Z[,1])
## get things we need to predict,in case pivoting turned on in lm.fit
p <- fit$rank
p1 <- seq_len(p)
piv <- fit$qr$pivot[p1]
## model coefficients
beta <- fit$coefficients
## this gives the predicted value for next obs
drop(pred[,drop = FALSE] %*% beta[piv])
}
# show that they yield the same
library(rollRegres) # the new package
library(zoo)
all.equal(
rollapply(Z,wdth + 1,FUN = lm_version,by.column = FALSE,fill = NA_real_),roll_regres.fit(
x = X,y = y,width = wdth,do_compute = \"1_step_forecasts\"
)$one_step_forecasts)
#R [1] TRUE
# benchmark
library(compiler)
lm_version <- cmpfun(lm_version)
microbenchmark::microbenchmark(
newnew = roll_regres.fit(
x = X,do_compute = \"1_step_forecasts\"),prev = rollapply(Z,times = 10)
#R Unit: milliseconds
#R expr min lq mean median uq max neval
#R newnew 10.27279 10.48929 10.91631 11.04139 11.13877 11.87121 10
#R prev 555.45898 565.02067 582.60309 582.22285 602.73091 605.39481 10
旧答案
您可以通过更新分解来减少运行时间。这将在每次迭代中产生一个成本,而不是n是您的窗口宽度。下面是比较两者的代码。在C ++中这样做可能会快得多,但是R中不包含LINPACKdchud
和dchdd
,因此您必须编写一个程序包才能这样做。此外,我记得阅读过的文章,对于R更新,您可能会比LINPACKdchud
和dchdd
更快地使用其他实现
library(SamplerCompare) # for LINPACK `chdd` and `chud`
roll_forcast <- function(X,y,width){
n <- nrow(X)
p <- ncol(X)
out <- rep(NA_real_,n)
is_first <- TRUE
i <- width
while(i < n){
if(is_first){
is_first <- FALSE
qr. <- qr(X[1:width,])
R <- qr.R(qr.)
# Use X^T for the rest
X <- t(X)
XtY <- drop(tcrossprod(y[1:width],X[,1:width]))
} else {
x_new <- X[,i]
x_old <- X[,i - width]
# update R
R <- .Fortran(
\"dchud\",R,p,x_new,0.,0L,numeric(p),PACKAGE = \"SamplerCompare\")[[1]]
# downdate R
R <- .Fortran(
\"dchdd\",x_old,integer(1),PACKAGE = \"SamplerCompare\")[[1]]
# update XtY
XtY <- XtY + y[i] * x_new - y[i - width] * x_old
}
coef. <- .Internal(backsolve(R,XtY,TRUE,TRUE))
coef. <- .Internal(backsolve(R,coef.,FALSE))
i <- i + 1
out[i] <- X[,i] %*% coef.
}
out
}
# simulate data
set.seed(101)
n <- 10000
wdth = 50
X <- matrix(rnorm(10 * n),in case pivoting turned on in lm.fit
p <- fit$rank
p1 <- seq_len(p)
piv <- fit$qr$pivot[p1]
## model coefficients
beta <- fit$coefficients
## this gives the predicted value for row 31 of data passed in
drop(pred[,drop = FALSE] %*% beta[piv])
}
# show that they yield the same
library(zoo)
all.equal(
rollapply(Z,roll_forcast(X,wdth))
#R> [1] TRUE
# benchmark
library(compiler)
roll_forcast <- cmpfun(roll_forcast)
lm_version <- cmpfun(lm_version)
microbenchmark::microbenchmark(
new = roll_forcast(X,wdth),prev = rollapply(Z,times = 10)
#R> Unit: milliseconds
#R> expr min lq mean median uq max neval cld
#R> new 113.7637 115.4498 129.6562 118.6540 122.4930 230.3414 10 a
#R> prev 639.6499 674.1677 682.1996 678.6195 686.8816 763.8034 10 b