问题描述
我有两个数据框(df1 和 df2),想把 df2 中那些在 df1 里不存在的行,按照它们在 df2 中的连续顺序插入到 df1 的对应位置。是否插入由 Index 列决定,但该列的值并不唯一,同一个值可能在 df1 或 df2 中出现多次。我已经写了一段代码,但它在较大的数据框上运行得非常慢,所以想请教是否有更高效的做法。运行我的代码即可看到预期的结果。预先感谢您的帮助。
df1如下:
# Sample input: timestamps stored as character strings plus an Index key column.
# NOTE(review): `datetime` holds 29 values but `Index` only 14 (7 NA + 7 keys).
# data.frame() cannot recycle 14 values into 29 rows and stops with a
# "differing number of rows" error. The Index vector looks truncated; restore
# the missing entries from the original data before running this line.
df1 <- data.frame(datetime=c("2016-03-02 16:44:32 UTC","2016-03-02 16:51:32 UTC","2016-03-02 16:53:45 UTC","2016-03-02 19:12:15 UTC","2016-03-02 19:12:32 UTC","2016-03-02 19:12:36 UTC","2016-03-02 19:13:50 UTC","2016-03-03 05:44:54 UTC","2016-03-03 05:45:06 UTC","2016-03-03 05:45:11 UTC","2016-03-03 05:45:27 UTC","2016-03-03 05:45:42 UTC","2016-03-03 05:45:52 UTC","2016-03-03 05:45:57 UTC","2016-03-03 05:46:12 UTC","2016-03-03 05:46:23 UTC","2016-03-03 05:46:29 UTC","2016-03-03 05:46:45 UTC","2016-03-03 05:47:03 UTC","2016-03-03 05:47:19 UTC","2016-03-03 05:47:37 UTC","2016-03-03 05:47:51 UTC","2016-03-03 05:47:56 UTC","2016-03-03 05:48:03 UTC","2016-03-03 05:48:04 UTC","2016-03-03 05:48:10 UTC","2016-03-03 05:48:18 UTC","2016-03-03 05:48:27 UTC","2016-03-03 05:48:45 UTC"),Index=c(rep(NA,7),"68362","68364","68352","427292","255720","255721","255713"))
df2如下:
# Sample input: the sequence of Index keys to merge into df1.
df2 <- data.frame(
  Index = c(
    "68362", "68363", "68365", "68351",
    "68373", "68372", "68371", "255713"
  )
)
我的代码如下:
library(lubridate)
# Merge the Index sequence of df_2 into df_1, inserting the entries df_1 lacks.
#
# df_1 is walked once with a cursor, in step with the rows of df_2:
#   * rows of df_1 whose Index is NA are skipped;
#   * when the current df_1 Index equals the current df_2 Index, the whole run
#     of equal df_1 rows is consumed and nothing is inserted;
#   * otherwise the df_2 entry becomes a new row whose datetime is the
#     timestamp of the df_1 row just before the cursor plus 1, 2, ... seconds,
#     so consecutive insertions keep their df_2 order after sorting.
# Once df_1 is exhausted, every remaining df_2 entry is inserted after the
# last row of df_1.
#
# Args:
#   df_1: data frame with a `datetime` column (character in "ymd HMS" form,
#         parsed with lubridate, or an already-parsed POSIXct vector that is
#         used as is) and an `Index` column. Further columns are allowed.
#   df_2: data frame with an `Index` column giving the expected key sequence.
#         Columns sharing a name with extra df_1 columns are copied into the
#         inserted rows; other extra df_1 columns are filled with NA.
#
# Returns:
#   df_1 plus the inserted rows, ordered by `datetime`, with an `inserted`
#   column (1 for inserted rows, 0 for original rows).
#
# Raises:
#   An error if a df_2 entry would have to be placed before the first row of
#   df_1, because there is no earlier timestamp to offset from.
insertFromDF2 <- function(df_1, df_2) {
  stopifnot(is.data.frame(df_1), is.data.frame(df_2))

  # Character timestamps are parsed; a POSIXct column is left untouched so its
  # instants are not re-read through their printed representation.
  if (!inherits(df_1$datetime, "POSIXct")) {
    df_1$datetime <- lubridate::parse_date_time(df_1$datetime, orders = "ymd HMS")
  }

  n_1 <- nrow(df_1)
  n_2 <- nrow(df_2)
  # Compare keys as character so factor columns with different levels work.
  key_1 <- as.character(df_1$Index)
  key_2 <- as.character(df_2$Index)

  # For every df_2 row: the df_1 row it is anchored to (0 = not inserted) and
  # the number of seconds added to that row's timestamp.
  anchor <- integer(n_2)
  offset <- integer(n_2)
  cursor <- 1L
  next_offset <- 1L

  for (i in seq_len(n_2)) {
    # Skip df_1 rows without a key; the bound stops the scan at the end of
    # df_1 (indexing past the end yields NA and would otherwise loop forever).
    while (cursor <= n_1 && is.na(key_1[cursor])) {
      cursor <- cursor + 1L
    }

    if (cursor <= n_1 && isTRUE(key_1[cursor] == key_2[i])) {
      # Consume the full run of equal keys, including the last row of df_1,
      # so later insertions are anchored after it rather than before it.
      while (cursor <= n_1 && isTRUE(key_1[cursor] == key_2[i])) {
        cursor <- cursor + 1L
      }
      next_offset <- 1L
      next
    }

    if (cursor == 1L) {
      stop(
        "insertFromDF2: cannot place Index '", key_2[i],
        "' before the first row of df_1 (no earlier timestamp to offset from)",
        call. = FALSE
      )
    }

    anchor[i] <- cursor - 1L
    offset[i] <- next_offset
    next_offset <- next_offset + 1L
  }

  # Flag original rows; rep() keeps this valid for a zero-row df_1.
  df_1$inserted <- rep(0, n_1)

  picked <- anchor > 0L
  if (any(picked)) {
    temp <- data.frame(
      datetime = df_1$datetime[anchor[picked]] + offset[picked],
      Index = df_2$Index[picked],
      inserted = 1,
      stringsAsFactors = FALSE
    )
    # rbind() needs identical column sets: copy same-named columns from df_2,
    # otherwise fill with NA of the column's own type.
    for (col in setdiff(names(df_1), names(temp))) {
      temp[[col]] <- if (col %in% names(df_2)) {
        df_2[[col]][picked]
      } else {
        df_1[[col]][rep(NA_integer_, sum(picked))]
      }
    }
    # Inserted rows go first so that they win ties in the ordering below.
    df_1 <- rbind(temp, df_1)
  }

  df_1[order(df_1$datetime), ]
}
# Run the merge on the sample data: df is df1 plus the rows taken from df2,
# ordered by datetime, with an `inserted` flag column.
df <- insertFromDF2(df1,df2)
解决方法
我尝试运行您的代码,但 rbind 报错,提示列名不一致。不过我还是在这里给出一种解法:
# Append the df2 rows that df1 lacks, then give them synthetic timestamps
# derived from the neighbouring rows.
df2 %>%
  # keep only the indices df1 does not already contain
  # (drop this step to keep every df2 row)
  filter(!(Index %in% df1$Index)) %>%
  # stack the remaining df2 rows underneath df1
  bind_rows(df1, .) %>%
  # cast both key columns to their real types
  mutate(
    datetime = ymd_hms(datetime),
    Index = as.numeric(Index)
  ) %>%
  # sort by numeric Index, then by timestamp
  arrange(Index, datetime) %>%
  # remember which rows arrived without a timestamp
  mutate(nas = is.na(datetime)) %>%
  # borrow the timestamp of the row above (falling back to the row below)
  fill(datetime, .direction = "downup") %>%
  group_by(datetime) %>%
  # shift every borrowed timestamp by its running count within the group,
  # which mimics adding n seconds, then discard the helper flag
  mutate(datetime = datetime + cumsum(nas), nas = NULL) %>%
  ungroup() %>%
  arrange(datetime)
# A tibble: 35 x 5
datetime Index someColumn1 someColumn2 nas
<dttm> <dbl> <chr> <chr> <lgl>
1 2016-03-02 16:44:32 NA Anything Anything FALSE
2 2016-03-02 16:51:32 NA Anything Anything FALSE
3 2016-03-02 16:53:45 NA Anything Anything FALSE
4 2016-03-02 19:12:15 NA Anything Anything FALSE
5 2016-03-02 19:12:32 NA Anything Anything FALSE
6 2016-03-02 19:12:36 NA Anything Anything FALSE
7 2016-03-02 19:13:50 NA Anything Anything FALSE
8 2016-03-03 05:44:54 68362 Anything Anything FALSE
9 2016-03-03 05:44:55 68363 Anything Anything TRUE
10 2016-03-03 05:45:06 68364 Anything Anything FALSE
# … with 25 more rows
数据
问题中提供的相同数据,但设置为stringsAsFactors = FALSE
# Reference key sequence with two payload columns; strings stay character.
# Every column has 13 values so the data frame is rectangular.
df2 <- data.frame(
  Index = c(
    "68362", "68363", "68364", "68365", "68352", "68351", "68373",
    "68372", "68371", "427292", "255720", "255721", "255713"
  ),
  someColumn1 = rep("Anything", 13),
  someColumn2 = rep("Anything", 13),
  stringsAsFactors = FALSE
)
# Sample df1 for the answer: character timestamps plus an Index key column.
# NOTE(review): `Index` has only 9 elements here (7 NA, "68362", and the number
# 29, which c() coerces to the string "29") against 29 datetime values, so
# data.frame() stops with a "differing number of rows" error. The expression
# looks garbled; restore the original 29 Index values before using this line.
df1 <- data.frame(datetime=c("2016-03-02 16:44:32 UTC","2016-03-02 16:51:32 UTC","2016-03-02 16:53:45 UTC","2016-03-02 19:12:15 UTC","2016-03-02 19:12:32 UTC","2016-03-02 19:12:36 UTC","2016-03-02 19:13:50 UTC","2016-03-03 05:44:54 UTC","2016-03-03 05:45:06 UTC","2016-03-03 05:45:11 UTC","2016-03-03 05:45:27 UTC","2016-03-03 05:45:42 UTC","2016-03-03 05:45:52 UTC","2016-03-03 05:45:57 UTC","2016-03-03 05:46:12 UTC","2016-03-03 05:46:23 UTC","2016-03-03 05:46:29 UTC","2016-03-03 05:46:45 UTC","2016-03-03 05:47:03 UTC","2016-03-03 05:47:19 UTC","2016-03-03 05:47:37 UTC","2016-03-03 05:47:51 UTC","2016-03-03 05:47:56 UTC","2016-03-03 05:48:03 UTC","2016-03-03 05:48:04 UTC","2016-03-03 05:48:10 UTC","2016-03-03 05:48:18 UTC","2016-03-03 05:48:27 UTC","2016-03-03 05:48:45 UTC"),Index=c(rep(NA,7),"68362",29),stringsAsFactors = F)
,
我找到了一个解决方案,它基于@Abdessabour Mtk的解决方案,并进行了一些改进。我的代码考虑到索引列不是唯一的,因此它可能在df1中出现多次,并且插入的行具有正确的日期时间。
# Stack df2 underneath df1 and cast both key columns to their real types.
result_df <- df2 %>%
  bind_rows(df1, .) %>%
  mutate(datetime = ymd_hms(datetime), Index = as.numeric(Index))

# Reorder the rows to follow the Index sequence of df2, then build timestamps
# for the rows that came from df2.
result_df <- result_df[order(match(result_df$Index, df2$Index)), ] %>%
  # within each Index, mark rows after the first that have no timestamp
  group_by(Index) %>%
  mutate(drop = ifelse(row_number() > 1 & is.na(datetime), 1, 0)) %>%
  # ... and remove those marked rows
  filter(drop != 1) %>%
  ungroup() %>%
  # start a new group at every row that carries a timestamp
  group_by(grp = cumsum(!is.na(datetime))) %>%
  # number the timestamp-less rows of each group; timestamped rows get 0
  mutate(
    NAs = ifelse(is.na(datetime), lag(cumsum(is.na(datetime))) + 1, 0)
  ) %>%
  ungroup() %>%
  select(-grp) %>%
  # carry the last known timestamp down into the empty rows
  fill(datetime, .direction = "down") %>%
  # add the row number as seconds and discard the helper columns
  mutate(datetime = datetime + NAs, NAs = NULL, grp = NULL, drop = NULL)

# Finally sort by the completed timestamps.
result_df <- result_df[order(result_df$datetime), ]