问题描述
我有以下数据表 `dt.train`、天数 `days` 以及函数 `varImportance`(用于获取线性模型的变量重要性):
library(data.table)
library(caret)
library(xgboost)
library(zoo)
# Width (in rows, i.e. days) of each rolling window used further below.
days <- 50
# Fixed seed so the simulated series are reproducible.
set.seed(123)
# Simulated daily data for 2020 (366 days): response DE plus four predictors.
# NOTE(review): sd = 1 for the four predictors is assumed from the DE column;
# confirm against the intended simulation setup.
dt.train <- data.table(
  date = seq(as.Date("2020-01-01"), by = "1 day", length.out = 366),
  "DE" = rnorm(366, 30, 1),
  "windDE" = rnorm(366, 10, 1),
  "consumptionDE" = rnorm(366, 35, 1),
  "nuclearDE" = rnorm(366, 8, 1),
  "solarDE" = rnorm(366, 1, 1),
  check.names = FALSE
)
## Variable Importance Function: ##
## LINEAR MODEL: ##
varImportance <- function(data){
  # Fits a linear model of DE on all other columns of `data` (no intercept)
  # and hands back the transposed caret variable-importance table.
  window_dt <- data.table(data)
  ## Model fitting: ##
  linear_fit <- stats::lm(DE ~ . - 1, data = window_dt)
  importance_tbl <- caret::varImp(linear_fit)
  # invisible(): the result is returned without being auto-printed.
  invisible(t(importance_tbl))
}
## Iterative Variable Importance for Linear Model: ##
# Apply varImportance to every window of `days` consecutive rows. The date
# column is dropped first; by.column = FALSE passes each window to FUN as a
# whole instead of one column at a time.
dt.importance <- as.data.frame(
  zoo::rollapply(
    dt.train[, !"date"],
    FUN = varImportance,
    width = days,
    by.column = FALSE,
    align = "left"
  )
)
## Adding date-column again: ##
# Windows are left-aligned, so result row i belongs to the i-th date.
# seq_len() (rather than 1:nrow()) selects no rows when the result is empty.
dt.importance <- cbind(
  dt.train[seq_len(nrow(dt.importance)), .(date)],
  dt.importance
)
这里一切正常,但我需要对梯度提升(gradient boosting)机器学习模型做同样的事情。我已经尝试用相同的方式实现,并把模型拟合的准备工作放在 `varImportance` 函数中:
## Variable Importance function: ##
## GRADIENT BOOSTING: ##
# Gradient-boosting variant of varImportance: fits an xgbTree model through
# caret on `data` and returns the transposed, unscaled variable importance.
# NOTE(review): the body uses `$` and data.table's `:=` on `data`, so it
# presumably expects a data.table with "date" and "DE" columns; a plain matrix
# (as passed by zoo::rollapply) does not support these -- confirm with caller.
varImportance <- function(data){
## Create response vector and predictor matrix: ##
# The response is extracted first, because DE is deleted from `data` below.
v.trainY <- data$DE
# Assigning NULL via `:=` removes "date" and "DE"; the remaining columns are
# converted to the predictor matrix.
m.trainData <- as.matrix(data[,c("date","DE") := list(NULL,NULL)])
## Hyper parameter tuning and grid search: ##
# 3-fold cross-validation with verbose per-iteration output.
xgb_trcontrol <- caret::trainControl(method = "cv",number = 3,allowParallel = TRUE,verboseIter = TRUE,returnData = FALSE
)
# Every hyper-parameter has a single value, so the grid holds one combination.
xgbgrid <- base::expand.grid(nrounds = c(150),# 15000
max_depth = c(2),eta = c(0.01),gamma = c(1),colsample_bytree = c(1),min_child_weight = c(2),subsample = c(0.6)
)
## Model fitting: ##
xgbModel <- caret::train(m.trainData,v.trainY,trControl = xgb_trcontrol,tuneGrid = xgbgrid,method = "xgbTree"
)
# scale = FALSE asks caret for the importance values without rescaling.
varimp <- caret::varImp(xgbModel,scale = FALSE)
# The assignment is the last expression, so its value is what the function
# returns (invisibly).
importance <- t(varimp$importance)
}
## Iterative Variable Importance for Gradient Boosting: ##
# Apply varImportance to every window of `days` consecutive rows of dt.train;
# by.column = FALSE passes each window to FUN as a whole.
dt.importance <- as.data.frame(
  zoo::rollapply(
    dt.train,
    FUN = varImportance,
    width = days,
    by.column = FALSE,
    align = "left"
  )
)
不幸的是,这无法对每 50 天的窗口进行迭代(抛出错误:`$ operator is invalid for atomic vectors`)。`varImportance` 函数中的 `varImp()` 在只运行一次时,对梯度提升模型是有效的。
编辑 1:
您的答案在使用梯度提升时会引发以下错误:
编辑 2:
当我注释掉 `trControl = xgb_trcontrol` 时,我收到以下错误:
解决方法
当您使用一些 data.table 函数时,您需要将输入转换回 data.table。 rollapply 将输入作为矩阵发送。您应该注意,您的第一列是日期,当 rollapply 将数据子集转换为矩阵时,所有内容都将转换为字符类。
由于您的函数中未使用日期,因此最好在 rollapply 函数中发送数据之前删除此列。但是,如果要发送完整的数据,则需要将所有内容从字符转换回数字。在下面的代码中,我只是在输入中删除日期列。
这是工作代码 -
library(data.table)
library(caret)
library(xgboost)
library(zoo)
# Width (in rows, i.e. days) of each rolling window used further below.
days <- 50
# Fixed seed so the simulated series are reproducible.
set.seed(123)
# Simulated daily data for 2020 (366 days): response DE plus four predictors.
# NOTE(review): sd = 1 for the four predictors is assumed from the DE column;
# confirm against the intended simulation setup.
dt.train <- data.table(
  date = seq(as.Date("2020-01-01"), by = "1 day", length.out = 366),
  "DE" = rnorm(366, 30, 1),
  "windDE" = rnorm(366, 10, 1),
  "consumptionDE" = rnorm(366, 35, 1),
  "nuclearDE" = rnorm(366, 8, 1),
  "solarDE" = rnorm(366, 1, 1),
  check.names = FALSE
)
## GRADIENT BOOSTING: ##
varImportance <- function(data){
  # Fits an xgbTree model (via caret) that predicts DE from all other columns
  # of `data` and returns the unscaled variable importance.
  #
  # data: one window of observations with a numeric "DE" column plus predictor
  #       columns; may be a matrix (as passed by zoo::rollapply), a data.frame
  #       or a data.table.
  # Returns a 1 x p matrix (row "Overall") whose columns follow the predictor
  # column order of `data`.

  # A data.frame gives `$` access whatever container type was passed in.
  window_df <- as.data.frame(data)
  stopifnot("DE" %in% names(window_df))

  ## Create response vector and predictor matrix: ##
  v.trainY <- window_df$DE
  predictor_names <- setdiff(names(window_df), "DE")
  m.trainData <- as.matrix(window_df[, predictor_names, drop = FALSE])

  ## Hyper parameter tuning and grid search: ##
  xgb_trcontrol <- caret::trainControl(
    method = "cv",
    number = 3,
    allowParallel = TRUE,
    verboseIter = TRUE,
    returnData = FALSE
  )
  xgbgrid <- base::expand.grid(
    nrounds = c(150), # 15000
    max_depth = c(2),
    eta = c(0.3),
    gamma = c(1),
    colsample_bytree = c(1),
    min_child_weight = c(2),
    subsample = c(0.6)
  )

  ## Model fitting: ##
  xgbModel <- caret::train(
    m.trainData,
    v.trainY,
    trControl = xgb_trcontrol,
    tuneGrid = xgbgrid,
    method = "xgbTree"
  )
  varimp <- caret::varImp(xgbModel, scale = FALSE)

  # Index the importance table by predictor name so every call yields the same
  # column order. Stacking per-window results is positional, so a
  # model-dependent row order would put values under the wrong predictor.
  t(varimp$importance[predictor_names, , drop = FALSE])
}
## Iterative Variable Importance for Gradient Boosting: ##
# Apply varImportance to every window of `days` consecutive rows. The date
# column is dropped by name, so each window passed to FUN is purely numeric;
# by.column = FALSE passes each window to FUN as a whole.
dt.importance1 <- as.data.frame(
  zoo::rollapply(
    dt.train[, !"date"],
    FUN = varImportance,
    width = days,
    by.column = FALSE,
    align = "left"
  )
)
## Adding date-column again: ##
# Windows are left-aligned, so result row i belongs to the i-th date.
# seq_len() (rather than 1:nrow()) selects no rows when the result is empty.
dt.importance <- cbind(
  dt.train[seq_len(nrow(dt.importance1)), .(date)],
  dt.importance1
)