用整形模型训练的PLS模型的预测变量重要性

问题描述

我正在使用tidymodels拟合PLS模型，但是我正在努力寻找PLS变量的重要度得分或系数。

这是我到目前为止尝试过的；示例数据来自AppliedPredictiveModeling包。

建模配件

data(ChemicalManufacturingProcess) 
split <- ChemicalManufacturingProcess %>% initial_split(prop = 0.7)
train <- training(split)
test <- testing(split)

tidy_rec <- recipe(Yield ~ .,data = train) %>% 
  step_knnimpute(all_predictors()) %>% 
  step_BoxCox(all_predictors()) %>% 
  step_normalize(all_predictors()) %>% 
  step_nzv(all_predictors()) %>% 
  step_corr(all_predictors())

boots <- bootstraps(time = 25,data = train)

tidy_model <- plsmod::pls(num_comp = tune()) %>% 
  set_mode("regression") %>% 
  set_engine("mixOmics")

tidy_grid <- expand.grid(num_comp = seq(from = 1,to = 48,by = 5))

tidy_tune <- tidy_model %>% tune_grid(
  preprocessor = tidy_rec,grid = tidy_grid,resamples = boots,metrics = metric_set(mae,rmse,rsq)
)

tidy_best <- tidy_tune %>% select_best("rsq")
Final_model <- tidy_model %>% finalize_model(tidy_best)

tidy_wf <- workflow() %>% 
  add_model(Final_model) %>% 
  add_recipe(tidy_rec) 

Fit_PLS <- tidy_wf %>% fit(data = train)

# check the most important predictors
tidy_info <- Fit_PLS %>% pull_workflow_fit()
loadings <- tidy_info$fit$loadings$X

PLS变量重要性

tidy_load <- loadings %>% as.data.frame() %>% rownames_to_column() %>% 
  select(rowname,comp1,comp2,comp3) %>% 
  pivot_longer(-rowname) %>% 
  rename(predictors = rowname)

tidy_load %>% mutate(Sing = if_else(value < 0,"neg","pos")) %>% 
  mutate(absvalue = abs(value)) %>% group_by(predictors) %>% summarise(Importance = sum(absvalue)) %>% 
  mutate(predictors = fct_reorder(predictors,Importance)) %>% 
  slice_head(n = 15) %>% 
  ggplot(aes(Importance,predictors,fill = predictors)) + geom_col(show.legend = F)

谢谢！ vip软件包中的vi()函数不适用于该型号。

解决方法

您可以直接 tidy() PLS模型的输出以获取系数：

library(tidymodels)
library(tidyverse)
library(plsmod)

data(ChemicalManufacturingProcess,package = "AppliedPredictiveModeling") 
split <- initial_split(ChemicalManufacturingProcess,prop = 0.7)
train <- training(split)
test <- testing(split)

chem_rec <- recipe(Yield ~ .,data = train) %>% 
  step_knnimpute(all_predictors()) %>% 
  step_BoxCox(all_predictors()) %>% 
  step_normalize(all_predictors()) %>% 
  step_nzv(all_predictors()) %>% 
  step_corr(all_predictors())

pls_spec <- pls(num_comp = 4) %>%    ## can tune instead to find the optimal number
  set_mode("regression") %>% 
  set_engine("mixOmics")

wf <- workflow() %>%
  add_recipe(chem_rec) %>%
  add_model(pls_spec)


pls_fit <- fit(wf,train)

## tidy the fitted model
tidy_pls <- pls_fit %>%
  pull_workflow_fit()
  tidy()

tidy_pls
#> # A tibble: 192 x 4
#>    term                    value type       component
#>    <chr>                   <dbl> <chr>          <dbl>
#>  1 BiologicalMaterial01  0.193   predictors         1
#>  2 BiologicalMaterial01 -0.247   predictors         2
#>  3 BiologicalMaterial01  0.00969 predictors         3
#>  4 BiologicalMaterial01  0.0228  predictors         4
#>  5 BiologicalMaterial03  0.249   predictors         1
#>  6 BiologicalMaterial03 -0.00118 predictors         2
#>  7 BiologicalMaterial03  0.0780  predictors         3
#>  8 BiologicalMaterial03 -0.0866  predictors         4
#>  9 BiologicalMaterial04  0.217   predictors         1
#> 10 BiologicalMaterial04 -0.192   predictors         2
#> # … with 182 more rows

tidy_pls %>%
  filter(term != "Y") %>%
  group_by(component) %>%
  slice_max(abs(value),n = 10) %>%
  ungroup() %>%
  ggplot(aes(value,fct_reorder(term,value),fill = factor(component))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~component,scales = "free_y") +
  labs(y = NULL)

^{由reprex package（v0.3.0.9001）于2020-10-19创建}

我在不调整组件数量的情况下显示了此信息，但是在调整时它的效果大致相同。

pls r r tidymodels