问题描述
此问题与“Tidymodels: What is the correct way to impute missing values in a Date column?”重复。
该问题被关闭后,我补充了一个 reprex
并在此重新提问。
我在处理 Date 列中的缺失值时遇到了一些困难。
在我的预处理流程(recipe 对象)中,我使用 step_impute_knn 函数来填充所有日期列中的缺失值。不幸的是,我收到以下错误:
分配的数据 `pred_vals` 必须与现有数据兼容。ℹ 列 `avg_begin_first_contract` 发生错误。x 无法将 <double> 转换为 <date>。
下面是一个 reprex,其中我对多个列(包括 Date 列)进行了插补。即使只对 Date 列进行插补也没有区别,结果是一样的。再往下还有一个 reprex,它不会抛出错误,因为其中没有使用 Date 列。
以前有人遇到过这个问题吗?
# Reprex: k-NN imputation in a recipe whose predictors include a Date column.
library(tidyverse)
library(tidymodels)

# Seed the RNG before drawing the random dates so that the data itself is
# reproducible, not only the train/test split further down.
set.seed(123)
iris <- iris %>%
  mutate(
    Plucked = sample(
      seq(as.Date("1999/01/01"), as.Date("2000/01/01"), by = "day"),
      size = 150
    )
  )

# Blank out one cell in each of columns 2-6; `Plucked` (Date) is column 6.
iris[45, 2] <- NA
iris[37, 3] <- NA
iris[78, 4] <- NA
iris[9, 5] <- NA
iris[15, 6] <- NA

set.seed(456)
iris_split <- iris %>%
  initial_split(strata = Sepal.Length)
iris_training <- training(iris_split)
iris_testing <- testing(iris_split)

# NOTE(review): mtry = 10 looks larger than the number of predictors the
# recipe below produces -- confirm the intended value.
iris_rf_model <- rand_forest(mtry = 10, min_n = 10, trees = 500) %>%
  set_engine("ranger") %>%
  set_mode("regression")

# `Plucked` is imputed together with the numeric and factor columns; the
# preprocessor error shown below is raised while this recipe is applied.
base_rec <- recipe(Sepal.Length ~ ., data = iris_training) %>%
  step_impute_knn(Sepal.Width, Petal.Length, Petal.Width, Species, Plucked) %>%
  step_date(Plucked) %>%
  step_dummy(Species)

iris_workflow <- workflow() %>%
  add_model(iris_rf_model) %>%
  add_recipe(base_rec)

# Fit on the training portion and evaluate on the testing portion.
iris_rf_wkfl_fit <- iris_workflow %>%
  last_fit(iris_split)
#> x train/test split: preprocessor 1/1: Error: Assigned data `pred_vals` must be compatible wi...
#> Warning: All models Failed. See the `.notes` column.
Created on 2021-06-15 by the reprex package (v2.0.0)
下面这个 reprex 不会抛出错误:
# Reprex without a Date column: the same pipeline runs without an error.
library(tidyverse)
library(tidymodels)

# Remove a single Species value (column 5) so there is something to impute.
iris[45, 5] <- NA

set.seed(123)
iris_split <- initial_split(iris, strata = Sepal.Length)
iris_training <- training(iris_split)
iris_testing <- testing(iris_split)

# Random-forest regression spec using the ranger engine.
iris_rf_model <- rand_forest(mtry = 5, min_n = 5, trees = 500) %>%
  set_engine("ranger") %>%
  set_mode("regression")

# Impute first, then dummy-encode the factor predictor.
base_rec <- recipe(Sepal.Length ~ ., data = iris_training) %>%
  step_impute_knn(Sepal.Width, Species) %>%
  step_dummy(Species)

iris_workflow <- workflow() %>%
  add_model(iris_rf_model) %>%
  add_recipe(base_rec)

# Fit on the training portion and evaluate on the testing portion.
iris_rf_wkfl_fit <- last_fit(iris_workflow, split = iris_split)
Created on 2021-06-15 by the reprex package (v2.0.0)
提前致谢! M.
解决方法
我怀疑 step_impute_knn 不适用于日期格式。您可能需要先将其转换为因子。您可以试试下面的代码吗?
# Workaround: turn the Date column into a factor before k-NN imputation.

# Seed the RNG before drawing the random dates so that the data itself is
# reproducible, not only the train/test split further down.
set.seed(123)
iris_n <- iris %>%
  mutate(
    Plucked = sample(
      seq(as.Date("1999/01/01"), as.Date("2000/01/01"), by = "day"),
      size = 150
    )
  ) %>%
  mutate(Plucked = as.factor(Plucked)) # convert date into factor

# Blank out one cell in each of columns 2-6; `Plucked` is column 6.
iris_n[45, 2] <- NA
iris_n[37, 3] <- NA
iris_n[78, 4] <- NA
iris_n[9, 5] <- NA
iris_n[15, 6] <- NA

set.seed(456)
iris_split <- iris_n %>%
  initial_split(strata = Sepal.Length)
iris_training <- training(iris_split)
iris_testing <- testing(iris_split)

# NOTE(review): mtry = 10 is larger than the six predictors left after the
# recipe below (three numeric, `Plucked`, two Species dummies) -- confirm
# the intended value.
iris_rf_model <- rand_forest(mtry = 10, min_n = 10, trees = 500) %>%
  set_engine("ranger") %>%
  set_mode("regression")

# step_date(Plucked) is left out here: `Plucked` is a factor now, not a Date,
# so that step might not be needed anymore.
base_rec <- recipe(Sepal.Length ~ ., data = iris_training) %>%
  step_impute_knn(Sepal.Width, Petal.Length, Petal.Width, Species, Plucked) %>%
  step_dummy(Species)

iris_workflow <- workflow() %>%
  add_model(iris_rf_model) %>%
  add_recipe(base_rec)

# Fit on the training portion and evaluate on the testing portion.
iris_rf_wkfl_fit <- iris_workflow %>%
  last_fit(iris_split)