问题描述
我有两个数据框,我想有条件地从一个数据框的一列中提取数据,然后将其放入另一个数据框的新列中。
数据帧1看起来像这样:
df1 <- data.frame(date.start = c("2019-06-10 11:52:00","2019-06-11 11:52:00","2019-06-12 11:51:00"),date.end =
c("2019-06-10 11:53:00","2019-06-11 11:53:00","2019-06-12 11:53:00"))
数据框2如下所示:
df2 <- data.frame(date.start = c("2019-06-11 11:50:00","2019-06-10 11:51:00","2019-06-12 11:50:00"),date.end =
c("2019-06-11 11:54:00","2019-06-11 08:59:00","2019-06-12 11:57:00"),day = c(1,15,64))
如果df.1的date.start
和date.end
落在date.start
任何行的date.end
或df2
内,我要提取变量{ day
中的{1}},并将其放入df2
的匹配行中。
预期结果如下:
df1
我目前有以下循环可以正常工作,但是当我在大型数据帧(行= 1135133)上运行它时,它的运行速度非常慢,我想知道是否有更快的方法可以实现这一点。
expected.out <- data.frame(date.start = c("2019-06-10 11:52:00",date.end = c("2019-06-10 11:53:00","2019-06-12 11:53:00"),day = c(15,1,64))
解决方法
使用library(fuzzyjoin)
library(tidyverse)
library(lubridate)
library(fuzzyjoin)
df1 <- data.frame(
date.start = c("2019-06-10 11:52:00","2019-06-11 11:52:00","2019-06-12 11:51:00"),date.end = c("2019-06-10 11:53:00","2019-06-11 11:53:00","2019-06-12 11:53:00"),stringsAsFactors = F)
df2 <- data.frame(date.start = c("2019-06-11 11:50:00","2019-06-10 11:51:00","2019-06-12 11:50:00"),date.end = c("2019-06-11 11:54:00","2019-06-11 08:59:00","2019-06-12 11:57:00"),day = c(1,15,64),stringsAsFactors = F)
df1 <- df1 %>%
mutate(across(where(is.character),ymd_hms)) %>%
as_tibble()
df2 <- df2 %>%
mutate(across(where(is.character),ymd_hms)) %>%
as_tibble()
fuzzy_left_join(df1,df2,by = c("date.start","date.end"),match_fun = list(`>=`,`<=`))
# A tibble: 3 x 5
date.start.x date.end.x date.start.y date.end.y day
<dttm> <dttm> <dttm> <dttm> <dbl>
1 2019-06-10 11:52:00 2019-06-10 11:53:00 2019-06-10 11:51:00 2019-06-11 08:59:00 15
2 2019-06-11 11:52:00 2019-06-11 11:53:00 2019-06-11 11:50:00 2019-06-11 11:54:00 1
3 2019-06-12 11:51:00 2019-06-12 11:53:00 2019-06-12 11:50:00 2019-06-12 11:57:00 64
由reprex package(v0.3.0)于2020-09-23创建
不确定该方法是否快速
,您可以在match
内使用sapply
来获取df2
的第一行,其中日期在给定时间范围内。
df1[] <- lapply(df1,as.POSIXct) #Convert character to POSIXct
df2[1:2] <- lapply(df2[1:2],as.POSIXct)
df1$day <- df2$day[sapply(asplit(df1,1),function(x) {match(TRUE,x[1] >= df2[,1] & x[2] <= df2[,2])})]
df1
# date.start date.end day
#1 2019-06-10 11:52:00 2019-06-10 11:53:00 15
#2 2019-06-11 11:52:00 2019-06-11 11:53:00 1
#3 2019-06-12 11:51:00 2019-06-12 11:53:00 64
,
将between
中的data.table
与outer
一起使用。 which.max
扫描匹配矩阵中的TRUE
值。
library(data.table)
FUN <- Vectorize(function(x,y) all(between(unlist(df1[x,]),df2[y,1],2])))
res <- transform(df1,day=df2[apply(outer(1:3,1:3,FUN),1,which.max),3])
res
# date.start date.end day
# 1 2019-06-10 11:52:00 2019-06-10 11:53:00 15
# 2 2019-06-11 11:52:00 2019-06-11 11:53:00 1
# 3 2019-06-12 11:51:00 2019-06-12 11:53:00 64
您可能需要预先转换为POSIXct
格式以应用解决方案。
df1[1:2] <- lapply(df1[1:2],as.POSIXct)
df2[1:2] <- lapply(df2[1:2],as.POSIXct)
数据:
df1 <- structure(list(date.start = structure(c(1560160320,1560246720,1560333060),class = c("POSIXct","POSIXt"),tzone = ""),date.end = structure(c(1560160380,1560246780,1560333180),tzone = "")),row.names = c(NA,-3L),class = "data.frame")
df2 <- structure(list(date.start = structure(c(1560246600,1560160260,1560333000),date.end = structure(c(1560246840,1560236340,1560333420),64)),class = "data.frame")