如果行在当前位置尚不存在,则按正确的顺序将其插入

问题描述

我有两个数据框(df1 和 df2),我想把 df2 中的所有连续行按正确的顺序插入到 df1 中(前提是这些行在当前位置尚不存在)。Index 列决定是否需要插入,但该列的值并不唯一,因此同一个值可能在 df1 或 df2 中出现多次。我已经写了一些代码,但对于较大的数据框来说它非常慢,所以想请教是否有更高效的实现方式。运行我的代码即可看到预期的结果。预先感谢您的帮助。

df1如下:

# Example target frame: 29 timestamps (as character) plus the Index key column.
# NOTE(review): Index supplies only 14 values (7 NA + 7 keys) for 29 datetime
# values, so data.frame() will reject this call as written. The per-key repeat
# counts appear to have been lost when the post was extracted -- confirm
# against the original question before relying on this data.
df1 <- data.frame(datetime=c("2016-03-02 16:44:32 UTC","2016-03-02 16:51:32 UTC","2016-03-02 16:53:45 UTC","2016-03-02 19:12:15 UTC","2016-03-02 19:12:32 UTC","2016-03-02 19:12:36 UTC","2016-03-02 19:13:50 UTC","2016-03-03 05:44:54 UTC","2016-03-03 05:45:06 UTC","2016-03-03 05:45:11 UTC","2016-03-03 05:45:27 UTC","2016-03-03 05:45:42 UTC","2016-03-03 05:45:52 UTC","2016-03-03 05:45:57 UTC","2016-03-03 05:46:12 UTC","2016-03-03 05:46:23 UTC","2016-03-03 05:46:29 UTC","2016-03-03 05:46:45 UTC","2016-03-03 05:47:03 UTC","2016-03-03 05:47:19 UTC","2016-03-03 05:47:37 UTC","2016-03-03 05:47:51 UTC","2016-03-03 05:47:56 UTC","2016-03-03 05:48:03 UTC","2016-03-03 05:48:04 UTC","2016-03-03 05:48:10 UTC","2016-03-03 05:48:18 UTC","2016-03-03 05:48:27 UTC","2016-03-03 05:48:45 UTC"),Index=c(rep(NA,7),"68362","68364","68352","427292","255720","255721","255713"))

df2如下:

 # Keys to merge into df1, listed in the order they are expected to appear.
 df2 <- data.frame(
   Index = c(
     "68362",
     "68363",
     "68365",
     "68351",
     "68373",
     "68372",
     "68371",
     "255713"
   )
 )

我的代码如下:

library(lubridate)

#' Insert the keys of `df_2` that are missing at the current position of `df_1`
#'
#' Walks `df_2$Index` in order while a cursor moves forward through
#' `df_1$Index`. Rows of `df_1` whose `Index` is `NA` are skipped. When the row
#' under the cursor carries the current key, the cursor moves past the whole
#' run of that key and nothing is inserted. Otherwise a new row is created for
#' the key and time-stamped one second (two, three, ... for consecutive
#' inserts) after the `df_1` row that precedes the cursor, so that ordering by
#' `datetime` puts it in the right place.
#'
#' Once the cursor has moved past the last row of `df_1`, remaining keys are
#' anchored on that last row.
#'
#' @param df_1 Data frame with a `datetime` column (text parseable with the
#'   lubridate order `"ymd HMS"`) and an `Index` column.
#' @param df_2 Data frame with an `Index` column holding the keys to merge in.
#' @return `df_1` with `datetime` parsed, the new rows added, and a numeric
#'   `inserted` column (1 for inserted rows, 0 for original rows), ordered by
#'   `datetime`.
insertFromDF2 <- function(df_1, df_2) {
  # datetime arrives as text; parse it so it can be offset and ordered
  df_1$datetime <- lubridate::parse_date_time(df_1$datetime, orders = "ymd HMS")
  df_1$inserted <- 0

  row_keys <- df_1$Index
  new_keys <- df_2$Index
  n_rows <- nrow(df_1)
  n_keys <- nrow(df_2)

  # Per df_2 key: the df_1 row it is anchored on, its offset in seconds, and
  # whether it has to be inserted at all. Preallocated instead of growing a
  # list of one-row data frames inside the loop.
  anchor <- integer(n_keys)
  offset <- integer(n_keys)
  is_new <- logical(n_keys)

  cursor <- 1L # current position in df_1
  step <- 1L   # seconds to add for the next insert behind the same anchor

  for (i in seq_len(n_keys)) {
    key <- new_keys[i]

    # skip rows without an Index, but never run past the end of df_1
    while (cursor <= n_rows && is.na(row_keys[cursor])) {
      cursor <- cursor + 1L
    }

    # isTRUE() keeps an NA comparison from aborting the condition
    if (cursor <= n_rows && isTRUE(row_keys[cursor] == key)) {
      # key already present here: move past the whole run of this key
      while (cursor <= n_rows && isTRUE(row_keys[cursor] == key)) {
        cursor <- cursor + 1L
      }
      step <- 1L
      next
    }

    if (cursor == 1L) {
      stop(
        "cannot insert Index '", key,
        "': there is no preceding row in df_1 to take a datetime from",
        call. = FALSE
      )
    }

    # use the previous row's time as the ordering helper, shifted by `step` s
    anchor[i] <- cursor - 1L
    offset[i] <- step
    is_new[i] <- TRUE
    step <- step + 1L
  }

  # nothing to insert: only the parsed, flagged and ordered df_1 is returned
  if (!any(is_new)) {
    return(df_1[order(df_1$datetime), ])
  }

  inserted_rows <- data.frame(
    datetime = df_1$datetime[anchor[is_new]] +
      lubridate::seconds(offset[is_new]),
    Index = new_keys[is_new],
    inserted = 1,
    stringsAsFactors = FALSE
  )

  # inserted rows go first so they keep precedence on tied timestamps
  combined <- rbind(inserted_rows, df_1)
  combined[order(combined$datetime), ]
}

# Run the insertion on the example data; `df` receives the value returned by
# insertFromDF2() for df1 and df2.
df <- insertFromDF2(df1,df2)

解决方法

我尝试运行您的代码,但 rbind 因列名不一致而报错;不过我还是在这里给出一个解决方案:

df2 %>%
  # keep only the df2 rows whose Index does not occur anywhere in df1
  # (drop this filter to keep every df2 row)
  filter(!Index %in% df1$Index) %>%
  # append the remaining df2 rows below df1
  bind_rows(df1, .) %>%
  # restore the real column types
  mutate(datetime = ymd_hms(datetime), Index = as.numeric(Index)) %>%
  # sort by Index first, then by datetime
  arrange(Index, datetime) %>%
  # flag the rows that have no datetime yet
  mutate(nas = is.na(datetime)) %>%
  # fill missing datetimes from the row above, then from the row below
  fill(datetime, .direction = "downup") %>%
  group_by(datetime) %>%
  # within each shared datetime, add the running count of flagged rows (as
  # seconds) so filled rows sort after the row they were filled from; then
  # drop the helper flag
  mutate(datetime = datetime + cumsum(nas), nas = NULL) %>%
  ungroup() %>%
  arrange(datetime)
# A tibble: 35 x 5
   datetime            Index someColumn1 someColumn2 nas  
   <dttm>              <dbl> <chr>       <chr>       <lgl>
 1 2016-03-02 16:44:32    NA Anything    Anything    FALSE
 2 2016-03-02 16:51:32    NA Anything    Anything    FALSE
 3 2016-03-02 16:53:45    NA Anything    Anything    FALSE
 4 2016-03-02 19:12:15    NA Anything    Anything    FALSE
 5 2016-03-02 19:12:32    NA Anything    Anything    FALSE
 6 2016-03-02 19:12:36    NA Anything    Anything    FALSE
 7 2016-03-02 19:13:50    NA Anything    Anything    FALSE
 8 2016-03-03 05:44:54 68362 Anything    Anything    FALSE
 9 2016-03-03 05:44:55 68363 Anything    Anything    TRUE 
10 2016-03-03 05:45:06 68364 Anything    Anything    FALSE
# … with 25 more rows

数据

问题中提供的相同数据,但设置为stringsAsFactors = FALSE


# Keys to merge into df1, plus two payload columns. One payload value per key:
# 13 rows in total. Strings stay character (no factor conversion).
df2 <- data.frame(
  Index = c(
    "68362", "68363", "68364", "68365", "68352", "68351", "68373",
    "68372", "68371", "427292", "255720", "255721", "255713"
  ),
  someColumn1 = rep("Anything", 13),
  someColumn2 = rep("Anything", 13),
  stringsAsFactors = FALSE
)

# Same 29 timestamps as in the question, with strings kept as character.
# NOTE(review): Index is written as c(rep(NA,7),"68362",29), i.e. 9 values for
# 29 datetime values, so data.frame() will reject this call as written. The
# remaining keys and their repeat counts appear to have been lost when the
# post was extracted -- confirm against the original answer.
df1 <- data.frame(datetime=c("2016-03-02 16:44:32 UTC","2016-03-02 16:51:32 UTC","2016-03-02 16:53:45 UTC","2016-03-02 19:12:15 UTC","2016-03-02 19:12:32 UTC","2016-03-02 19:12:36 UTC","2016-03-02 19:13:50 UTC","2016-03-03 05:44:54 UTC","2016-03-03 05:45:06 UTC","2016-03-03 05:45:11 UTC","2016-03-03 05:45:27 UTC","2016-03-03 05:45:42 UTC","2016-03-03 05:45:52 UTC","2016-03-03 05:45:57 UTC","2016-03-03 05:46:12 UTC","2016-03-03 05:46:23 UTC","2016-03-03 05:46:29 UTC","2016-03-03 05:46:45 UTC","2016-03-03 05:47:03 UTC","2016-03-03 05:47:19 UTC","2016-03-03 05:47:37 UTC","2016-03-03 05:47:51 UTC","2016-03-03 05:47:56 UTC","2016-03-03 05:48:03 UTC","2016-03-03 05:48:04 UTC","2016-03-03 05:48:10 UTC","2016-03-03 05:48:18 UTC","2016-03-03 05:48:27 UTC","2016-03-03 05:48:45 UTC"),Index=c(rep(NA,7),"68362",29),stringsAsFactors = F)
,

我找到了一个解决方案,它基于@Abdessabour Mtk的解决方案,并进行了一些改进。我的代码考虑到索引列不是唯一的,因此它可能在df1中出现多次,并且插入的行具有正确的日期时间。

# Stack df2 below df1 and restore the real column types.
result_df <- df2 %>%
  bind_rows(df1, .) %>%
  mutate(datetime = ymd_hms(datetime), Index = as.numeric(Index))

# Reorder the rows by the position of their Index in df2, then give every
# row that has no datetime a timestamp derived from the row above it.
result_df <- result_df[order(match(result_df$Index, df2$Index)), ] %>%
  # within one Index, mark every row after the first that has no datetime
  group_by(Index) %>%
  mutate(drop = ifelse(row_number() > 1 & is.na(datetime), 1, 0)) %>%
  filter(drop != 1) %>%
  ungroup() %>%
  # a new group starts at every row that has a datetime; inside a group,
  # number the rows without a datetime 1, 2, 3, ... (0 for rows that have one)
  group_by(grp = cumsum(!is.na(datetime))) %>%
  mutate(NAs = ifelse(is.na(datetime), lag(cumsum(is.na(datetime))) + 1, 0)) %>%
  ungroup() %>%
  select(-grp) %>%
  # copy the datetime of the row above into the rows that have none
  fill(datetime, .direction = "down") %>%
  # shift each filled row by its running number (in seconds) and remove the
  # helper columns
  mutate(datetime = datetime + NAs, NAs = NULL, grp = NULL, drop = NULL)

# finally order by datetime
result_df <- result_df[order(result_df$datetime), ]