问题描述
# Packages
library(dplyr)
library(recipes)
# Toy dataset in long format: item "A" is multicolored, so it spans three rows.
# Both columns need the same length (5); the printed tibble below shows the
# names A, A, A, B, C.
df <- tibble(
  name = c("A", "A", "A", "B", "C"),
  color = c("green", "yellow", "purple", "green", "blue")
)
#> # A tibble: 5 x 2
#> name color
#> <chr> <chr>
#> 1 A green
#> 2 A yellow
#> 3 A purple
#> 4 B green
#> 5 C blue
recipes 的步骤运行良好:
# One-hot encode `color` through a recipe, then train the recipe on `df` and
# extract the processed training data. Rows are not aggregated here.
dummified_df <- juice(
  prep(
    recipe(. ~ ., data = df) %>% step_dummy(color, one_hot = TRUE),
    training = df
  )
)
#> # A tibble: 5 x 5
#> name color_blue color_green color_purple color_yellow
#> <fct> <dbl> <dbl> <dbl> <dbl>
#> 1 A 0 1 0 0
#> 2 A 0 0 0 1
#> 3 A 0 0 1 0
#> 4 B 0 1 0 0
#> 5 C 1 0 0 0
但是我真正想要得到的是下面这个结果:现在每个观察值只占一行,因为多色的项目不再需要占用多行了。
# Collapse to one row per `name`: an indicator becomes 1 when any of the
# item's rows had it set, 0 otherwise. `across()` is used instead of the
# superseded `summarise_all()`; grouping columns are excluded automatically.
summarized_dummified_df <- dummified_df %>%
  group_by(name) %>%
  summarise(across(everything(), ~ ifelse(max(.) > 0, 1, 0))) %>%
  ungroup()
#> # A tibble: 3 x 5
#> name color_blue color_green color_purple color_yellow
#> <fct> <dbl> <dbl> <dbl> <dbl>
#> 1 A 0 1 1 1
#> 2 B 0 1 0 0
#> 3 C 1 0 0 0
很显然,我可以这样做。
但是,为了将我的recipe步骤完全集成到tidymodels
生态系统中(例如通过workflows),如果能直接在recipe内部把那些因为有了虚拟变量而不再需要重复的行合并起来,那就好得多了。
是否有tidymodels认可的方法可以得到这个结果?
我也尝试使用mlr3
来执行此操作,但无济于事,因为找不到合适的PipeOp
来聚合行。
library("mlr3")
library("mlr3pipelines")
# Same toy data as an mlr3 classification task with `price` as the target.
# Every column has five rows, matching the encoded output printed below.
task = TaskClassif$new(
  "task",
  data.table::data.table(
    name = c("A", "A", "A", "B", "C"),
    color = as.factor(c("green", "yellow", "purple", "green", "blue")),
    price = as.factor(c("low", "low", "low", "high", "low"))
  ),
  "price"
)
# Run the "encode" PipeOp on the task and print the resulting data.
poe = po("encode")
poe$train(list(task))[[1]]$data()
#> price name color.blue color.green color.purple color.yellow
#> 1: low A 0 1 0 0
#> 2: low A 0 0 0 1
#> 3: low A 0 0 1 0
#> 4: high B 0 1 0 0
#> 5: low C 1 0 0 0
我正在研究custom step_
函数或custom PipeOp
的创建,但是我仍然觉得自己遗漏了什么,因为在我看来,我这种类型的数据并不罕见。
解决方法
在我所见过的任何地方,虚拟变量或指标变量在概念上都是一对一的映射,而不是一对多的映射,我认为这就是您遇到此问题的原因。但是,像您一样,我想在现实世界中的某个时候一对多地映射它们。通常,在开始模型预处理工作流之前,我会在数据整理步骤中执行此操作,如下所示:
library(tidyverse)
# Toy dataset in long format: item "A" is multicolored, so it spans three rows.
# Both columns need the same length (5), otherwise tibble() fails.
df <- tibble(
  name = c("A", "A", "A", "B", "C"),
  color = c("green", "yellow", "purple", "green", "blue")
)
# Mark every (name, color) pair with 1, then spread the colors into indicator
# columns; combinations that never occur are filled with 0.
df %>%
  mutate(value = 1) %>%
  pivot_wider(
    names_from = "color",
    names_prefix = "color_",
    values_from = "value",
    values_fill = 0
  )
#> # A tibble: 3 x 5
#> name color_green color_yellow color_purple color_blue
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 A 1 1 1 0
#> 2 B 1 0 0 0
#> 3 C 0 0 0 1
由reprex package(v0.3.0.9001)于2020-08-18创建
我为recipes包编写了以下自定义步骤。
#' Add a summarize step to a recipe
#'
#' Captures the selectors supplied in `...`, builds a step object with
#' `step_summarize_new()` and appends it to `recipe` via `add_step()`.
#'
#' @param recipe Recipe the step is appended to.
#' @param ... Column selectors, passed through `ellipse_check()`.
#' @param role,trained,col_names,skip,id Handed unchanged to
#'   `step_summarize_new()`.
#' @return Whatever `add_step()` returns for `recipe` and the new step.
step_summarize <- function(
  recipe,
  ...,
  role = NA,
  trained = FALSE,
  col_names = NULL,
  skip = FALSE,
  id = rand_id("summarize")
) {
  selectors <- ellipse_check(...)
  new_step <- step_summarize_new(
    terms = selectors,
    role = role,
    trained = trained,
    col_names = col_names,
    skip = skip,
    id = id
  )
  add_step(recipe, new_step)
}
#' Low-level constructor for the summarize step
#'
#' Stores every argument on the step object. Previously only `terms` and `id`
#' were forwarded to `step()`, so `role`, `trained`, `col_names` and `skip`
#' were silently lost.
#'
#' @param terms Column selectors captured by `step_summarize()`.
#' @param role,trained,skip,id Step fields, stored as given.
#' @param col_names Names of the columns to summarize; `NULL` until the step
#'   has been prepped.
#' @return The object built by `step()` with subclass `"summarize"`.
step_summarize_new <- function(terms, role, trained, col_names, skip, id) {
  step(
    subclass = "summarize",
    terms = terms,
    role = role,
    trained = trained,
    col_names = col_names,
    skip = skip,
    id = id
  )
}

#' Train the summarize step
#'
#' Resolves the step's selectors to column names with `terms_select()` and
#' returns a copy of the step marked as trained.
#'
#' @param x The untrained step.
#' @param training Training data; not used by this step.
#' @param info Variable information forwarded to `terms_select()`.
#' @param ... Ignored.
#' @return A step with `trained = TRUE` and `col_names` set to the resolved
#'   columns.
prep.step_summarize <- function(x, training, info = NULL, ...) {
  col_names <- terms_select(terms = x$terms, info = info)
  step_summarize_new(
    terms = x$terms,
    role = x$role,
    trained = TRUE,
    # The resolved names must be stored here: bake.step_summarize() reads
    # them from `object$col_names`.
    col_names = col_names,
    skip = x$skip,
    id = x$id
  )
}
#' Apply the summarize step to new data
#'
#' Groups `new_data` by every column that is not listed in
#' `object$col_names`, then reduces each listed column to 1 when its group
#' maximum is positive and 0 otherwise.
#'
#' @param object The step; its `col_names` field names the columns to reduce.
#' @param new_data Data frame to process.
#' @param ... Ignored.
#' @return A tibble with one row per group.
bake.step_summarize <- function(object, new_data, ...) {
  indicator_cols <- object$col_names
  any_positive <- function(col) ifelse(max(col) > 0, 1, 0)
  grouped <- group_by(new_data, across(-any_of(indicator_cols)))
  collapsed <- summarise(grouped, across(any_of(indicator_cols), any_positive))
  # Steps always hand back a tibble
  tibble::as_tibble(collapsed)
}
它作为实际数据集上的预处理步骤可以正常工作,但是在后续使用tune
时会出错。
这可能与this issue有关。