根据data.table中的长格式日期列,减少宽格式data.table中的cols

问题描述

我有一个宽幅的data.table,如下所示:

library(data.table)
dt_wide <- data.table(
  "id" = seq(1:10),"yw_1001" = trunc( runif(10,100) ),"yw_1002" = trunc( runif(10,"yw_1003" = trunc( runif(10,"yw_1004" = trunc( runif(10,"yw_1005" = trunc( runif(10,"yw_1006" = trunc( runif(10,"yw_1007" = trunc( runif(10,"yw_1008" = trunc( runif(10,"yw_1009" = trunc( runif(10,"yw_1010" = trunc( runif(10,"yw_1011" = trunc( runif(10,"yw_1012" = trunc( runif(10,"yw_1013" = trunc( runif(10,"yw_1014" = trunc( runif(10,"yw_1015" = trunc( runif(10,"yw_1016" = trunc( runif(10,"yw_1017" = trunc( runif(10,"yw_1018" = trunc( runif(10,"yw_1019" = trunc( runif(10,"yw_1020" = trunc( runif(10,"yw_1021" = trunc( runif(10,"yw_1022" = trunc( runif(10,"yw_1023" = trunc( runif(10,"yw_1024" = trunc( runif(10,"yw_1025" = trunc( runif(10,"yw_1026" = trunc( runif(10,"yw_1027" = trunc( runif(10,"yw_1028" = trunc( runif(10,"yw_1029" = trunc( runif(10,"yw_1030" = trunc( runif(10,"yw_1031" = trunc( runif(10,"yw_1032" = trunc( runif(10,"yw_1033" = trunc( runif(10,"yw_1034" = trunc( runif(10,"yw_1035" = trunc( runif(10,"yw_1036" = trunc( runif(10,"yw_1037" = trunc( runif(10,"yw_1038" = trunc( runif(10,"yw_1039" = trunc( runif(10,"yw_1040" = trunc( runif(10,"yw_1041" = trunc( runif(10,"yw_1042" = trunc( runif(10,"yw_1043" = trunc( runif(10,"yw_1044" = trunc( runif(10,"yw_1045" = trunc( runif(10,"yw_1046" = trunc( runif(10,"yw_1047" = trunc( runif(10,"yw_1048" = trunc( runif(10,"yw_1049" = trunc( runif(10,"yw_1050" = trunc( runif(10,"yw_1051" = trunc( runif(10,"yw_1052" = trunc( runif(10,100) )
  )

cols对应于年份(前两位数字)和星期数(最后一位数字)。

在我的实际数据集中(nrow = 5,500,000,ncol = 1400),我不能data.table::melt,因为它会创建超出行数限制的data.table。

实际上,我只需要某些星期数的值。取得以下数据。表

dt2 <- data.table(
  "id" = seq(1:10),"date" = sample(seq(as.Date('2010/01/01'),as.Date('2010/12/31'),by="day"),10)
)

对于每个唯一的dt2,我需要将dt_wide中的值保留在id中的日期之后的5、10和15周。理想情况下,要减小dt_wide的col大小,以便将其转换为长格式。

有什么建议吗?

解决方法

这是一种简单的方法,您可以利用 dplyr,tidyr,stringer和lubridate软件包进行某些突变:

# Calling required libraries
library(data.table)
library(dplyr)

# Creating dataframe
dt_wide <- data.table(
  "id" = seq(1:10),"yw_1001" = trunc( runif(10,100) ),"yw_1002" = trunc( runif(10,"yw_1003" = trunc( runif(10,"yw_1004" = trunc( runif(10,"yw_1005" = trunc( runif(10,"yw_1006" = trunc( runif(10,"yw_1007" = trunc( runif(10,"yw_1008" = trunc( runif(10,"yw_1009" = trunc( runif(10,"yw_1010" = trunc( runif(10,"yw_1011" = trunc( runif(10,"yw_1012" = trunc( runif(10,"yw_1013" = trunc( runif(10,"yw_1014" = trunc( runif(10,"yw_1015" = trunc( runif(10,"yw_1016" = trunc( runif(10,"yw_1017" = trunc( runif(10,"yw_1018" = trunc( runif(10,"yw_1019" = trunc( runif(10,"yw_1020" = trunc( runif(10,"yw_1021" = trunc( runif(10,"yw_1022" = trunc( runif(10,"yw_1023" = trunc( runif(10,"yw_1024" = trunc( runif(10,"yw_1025" = trunc( runif(10,"yw_1026" = trunc( runif(10,"yw_1027" = trunc( runif(10,"yw_1028" = trunc( runif(10,"yw_1029" = trunc( runif(10,"yw_1030" = trunc( runif(10,"yw_1031" = trunc( runif(10,"yw_1032" = trunc( runif(10,"yw_1033" = trunc( runif(10,"yw_1034" = trunc( runif(10,"yw_1035" = trunc( runif(10,"yw_1036" = trunc( runif(10,"yw_1037" = trunc( runif(10,"yw_1038" = trunc( runif(10,"yw_1039" = trunc( runif(10,"yw_1040" = trunc( runif(10,"yw_1041" = trunc( runif(10,"yw_1042" = trunc( runif(10,"yw_1043" = trunc( runif(10,"yw_1044" = trunc( runif(10,"yw_1045" = trunc( runif(10,"yw_1046" = trunc( runif(10,"yw_1047" = trunc( runif(10,"yw_1048" = trunc( runif(10,"yw_1049" = trunc( runif(10,"yw_1050" = trunc( runif(10,"yw_1051" = trunc( runif(10,"yw_1052" = trunc( runif(10,100) )
)

# Creating dataframe with point of interest
dt2 <- data.table(
  "id" = seq(1:10),"date" = sample(seq(as.Date('2010/01/01'),as.Date('2010/12/31'),by="day"),10)
)

# Mutating data to get only required columns
columns_to_select <-
  dt2 %>%
  # Getting dates after 5/10/15 weeks
  mutate(after5 = date + (7 * 5),after10 = date + (7 * 10),after15 = date + (7 * 15)) %>%
  # Converting dates from wide format to long format
  tidyr::gather(key = "key",value = req_date,-c(id,date)) %>%
  # Converting date into respective column name in dt_wide dataframe
  mutate(year = format(as.Date(req_date),"%y"),week = stringr::str_pad(lubridate::week(req_date),2,"left","0"),select_date = paste0("yw_",year,week)) %>%
  # Selecting only required column into a vector
  select(select_date) %>%
  pull()

# Choosing from the wide dataframe only required columns
dt_wide %>%
  select(id,contains(columns_to_select))

# id yw_1024 yw_1044 yw_1017 yw_1014 yw_1045 yw_1031 yw_1035 yw_1029 yw_1049 yw_1022 yw_1019 yw_1050 yw_1036 yw_1040 yw_1034 yw_1027 yw_1041
# 1:  1      59       7      11       7      93      19      83      48      75      94      19       9      93      41       6      26      18
# 2:  2      84      22      18      70      29      53      63      26      23      12      93      84      17      57      96      93      98
# 3:  3       4      72      56      35      65      73      58      91      27      65      58       5      62      13      36      79      26
# 4:  4      36       5      26      56      34      27      60      64      79      27      40      64      32       0      96      56      19
# 5:  5      44      82      78      23      71      78      36      43      63      95      91      37      21      87      63      73      25
# 6:  6      46      45      81      89      59       0      85       3      68      23      90      82      93      42      28      67      32
# 7:  7      56      32       7      26      49      31      79      93      14      45      25      79      39      64      64      86      91
# 8:  8      82      99      46      79      81      56      39      10      20      27      83      29      30      30      35      96      24
# 9:  9      10      87      28      40      51      41      95      43      62      59      44      19      72      76      27      65      36
# 10: 10      81      19      44      55      22      53      98      54      16      29      30      28      20       2       5      39      23
,

这是使用data.table(和日期的lubridate包)的简单解决方案。由于您的问题似乎是由内存限制引起的,因此只需使用必要的列为(小)ID表中的每一行调用melt()。同样要注意,由于某些日期不存在,我们还必须检查dt_wide中是否存在给定日期。

rbindlist(lapply(1:nrow(dt2),function(x) {
  rowid <- dt2[x,id]
  rowdate <- dt2[x,date]
  
  dates <- rowdate + lubridate::weeks(c(5,10,15))
  cols <- paste0("yw_",strftime(dates,"%y%W"))
  
  # Because some dates aren't in the table
  cols <- intersect(cols,colnames(dt_wide))
  if (!length(cols)) return(NULL)
  
  melt(dt_wide[id == rowid,c("id",cols),with = F],id.vars = "id")
}))