问题描述
我正在尝试在 R 中使用 tidymodels 预测房地产价格。我正在按照 this tutorial 操作。一切顺利,直到我尝试对测试数据进行预测。
我查看了两个类似的问题(here 和 here),但我似乎已经定义了变量角色,并且提供给工作流(workflow)的也是未经 prep 的配方(recipe)。
# libraries ---------------------------------------------------------------
library(tidymodels)
#> ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
#> ✓ broom 0.7.3 ✓ recipes 0.1.15
#> ✓ dials 0.0.9 ✓ rsample 0.0.8
#> ✓ dplyr 1.0.3 ✓ tibble 3.0.5
#> ✓ ggplot2 3.3.3 ✓ tidyr 1.1.2
#> ✓ infer 0.5.4 ✓ tune 0.1.2
#> ✓ modeldata 0.1.0 ✓ workflows 0.2.1
#> ✓ parsnip 0.1.5 ✓ yardstick 0.0.7
#> ✓ purrr 0.3.4
#> ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
#> x recipes::step() masks stats::step()
library(data.table)
library(purrr)
# data --------------------------------------------------------------------
# 're' means real estate
# I'm using data.table in general. Using tribble below for cleaner data definition.
# NOTE(review): the literal below contains 20 quoted ids but far fewer than the
# 80 cell values that 4 columns would need, so most rows appear to have lost
# fields (possibly during extraction) -- restore the full rows before running.
real_estate_data <- tibble::tribble(
~re_id,~price_per_sqm_huf_mil,~district,~num_room,"30876343",0.534722222222222,1,3,"31914489",0.476119402985075,"30972289",0.507352941176471,2,"31739730",0.472972972972973,"31783137",0.49875,"31809435",0.439705882352941,"31943408",0.469117647058824,"31944348",0.56231884057971,"31961146","24314388",0.649550561797753,"29840270",0.719178082191781,"29840429","30873484",0.822857142857143,4,"30969673",0.533802816901408,"31333120",0.741511627906977,"31788730",0.527142857142857,"31948441",0.734848484848485,5,"31962350",0.8,"31962779",0.670454545454545,"31979128",0.689054054054054,1
)
# Convert the tibble to a data.table and recode district as a factor.
real_estate_data <- as.data.table(real_estate_data) %>% .[,district := factor(district)]
# train/test split --------------------------------------------------------
# Seed the RNG first so the random split below is reproducible.
set.seed(123)
re_split <- real_estate_data %>% initial_split()
# Pull the two partitions out of the split object.
re_train <- re_split %>% training()
re_test <- re_split %>% testing()
# workflow (w/ recipe) ----------------------------------------------------
# Recipe with price_per_sqm_huf_mil as the outcome and every other column as a
# predictor (the `~ .` formula).
re_rec <- recipe(re_train,formula = price_per_sqm_huf_mil ~ .) %>%
# re_id gets the "ID" role, so it is no longer a predictor.
update_role(re_id,new_role = "ID") %>%
# NOTE(review): all_numeric() is not restricted to predictors, so these two
# steps also select the numeric outcome price_per_sqm_huf_mil. That column is
# not available at predict() time, which is what the error further down
# complains about.
step_center(all_numeric(),- district) %>%
step_scale(all_predictors(),all_numeric(),- district) %>%
# Turn the district factor into dummy columns, then drop zero-variance predictors.
step_dummy(district) %>%
step_zv(all_predictors())
# Print each variable's type and role as registered in the recipe.
summary(re_rec)
#> # A tibble: 4 x 4
#> variable type role source
#> <chr> <chr> <chr> <chr>
#> 1 re_id nominal ID original
#> 2 district nominal predictor original
#> 3 num_room numeric predictor original
#> 4 price_per_sqm_huf_mil numeric outcome original
# Linear regression specification using the "lm" engine.
lr_model <- set_engine(linear_reg(), "lm")

# Bundle the model specification and the recipe into a single workflow.
re_wflow <- workflow() %>%
  add_model(lr_model) %>%
  add_recipe(re_rec)
# model training and prediction -------------------------------------------
# Fit the whole workflow (recipe + model) on the training partition.
re_fit <- fit(re_wflow, data = re_train)
# Predicting on the held-out partition is the call that raises the error below.
re_pred <- re_fit %>% predict(re_test)
#> Error: Can't subset columns that don't exist.
#> x Column `price_per_sqm_huf_mil` doesn't exist.
由 reprex package (v0.3.0) 于 2021 年 1 月 25 日创建
非常感谢!
解决方法
这里的问题在于,您用 step_center() 对结果变量(price_per_sqm_huf_mil)做了变换,而在预测时结果变量并不存在。您可以改为只对 all_predictors() & all_numeric() 选中的列(即数值型预测变量)做中心化,如下所示:
library(tidymodels)
#> ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
#> ✓ broom 0.7.3 ✓ recipes 0.1.15
#> ✓ dials 0.0.9 ✓ rsample 0.0.8
#> ✓ dplyr 1.0.3 ✓ tibble 3.0.5
#> ✓ ggplot2 3.3.3 ✓ tidyr 1.1.2
#> ✓ infer 0.5.4 ✓ tune 0.1.2
#> ✓ modeldata 0.1.0 ✓ workflows 0.2.1
#> ✓ parsnip 0.1.5 ✓ yardstick 0.0.7
#> ✓ purrr 0.3.4
#> ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
#> x recipes::step() masks stats::step()
library(dplyr)
# Build the example data as a tibble and recode district as a factor.
# NOTE(review): the literal below contains 20 quoted ids but far fewer than the
# 80 cell values that 4 columns would need, so most rows appear to have lost
# fields (possibly during extraction) -- restore the full rows before running.
real_estate_data <- tibble::tribble(
~re_id,~price_per_sqm_huf_mil,~district,~num_room,"30876343",0.534722222222222,1,3,"31914489",0.476119402985075,"30972289",0.507352941176471,2,"31739730",0.472972972972973,"31783137",0.49875,"31809435",0.439705882352941,"31943408",0.469117647058824,"31944348",0.56231884057971,"31961146","24314388",0.649550561797753,"29840270",0.719178082191781,"29840429","30873484",0.822857142857143,4,"30969673",0.533802816901408,"31333120",0.741511627906977,"31788730",0.527142857142857,"31948441",0.734848484848485,5,"31962350",0.8,"31962779",0.670454545454545,"31979128",0.689054054054054,1
) %>%
mutate(district = factor(district))
# Seed the RNG first so the random train/test split is reproducible.
set.seed(123)
re_split <- real_estate_data %>% initial_split()
# Pull the two partitions out of the split object.
re_train <- re_split %>% training()
re_test <- re_split %>% testing()
# Recipe with price_per_sqm_huf_mil as the outcome and every other column as a
# predictor.
re_rec <- recipe(price_per_sqm_huf_mil ~ ., data = re_train) %>%
  # re_id gets the "ID" role, so it is no longer a predictor.
  update_role(re_id, new_role = "ID") %>%
  # Center and scale only columns that are both predictors and numeric, so the
  # outcome is left untouched.
  step_center(all_predictors() & all_numeric()) %>%
  step_scale(all_predictors() & all_numeric()) %>%
  # Turn the district factor into dummy columns, then drop zero-variance
  # predictors.
  step_dummy(district) %>%
  step_zv(all_predictors())

# Print each variable's type and role as registered in the recipe.
summary(re_rec)
#> # A tibble: 4 x 4
#> variable type role source
#> <chr> <chr> <chr> <chr>
#> 1 re_id nominal ID original
#> 2 district nominal predictor original
#> 3 num_room numeric predictor original
#> 4 price_per_sqm_huf_mil numeric outcome original
# Linear regression specification using the "lm" engine.
lr_model <- set_engine(linear_reg(), "lm")

# Bundle the model specification and the recipe into a single workflow.
re_wflow <- workflow() %>%
  add_model(lr_model) %>%
  add_recipe(re_rec)
# Fit the whole workflow (recipe + model) on the training partition.
re_fit <- fit(re_wflow, data = re_train)
# Predict on the held-out partition; the result is a one-column tibble (.pred).
re_fit %>% predict(new_data = re_test)
#> # A tibble: 5 x 1
#> .pred
#> <dbl>
#> 1 0.486
#> 2 0.611
#> 3 0.688
#> 4 0.688
#> 5 0.768
由 reprex package (v0.3.0) 于 2021 年 1 月 25 日创建
这个问题绊倒了不少人,因此我们正在添加一组新的选择器(new set of selectors),很快就会合并。如果您确实想对结果变量做变换,另一个可以考虑的做法是使用 skip = TRUE(look into using skip = TRUE)。