问题描述
library(data.table)
dt_wide <- data.table(
"id" = seq(1:10),"yw_1001" = trunc( runif(10,100) ),"yw_1002" = trunc( runif(10,"yw_1003" = trunc( runif(10,"yw_1004" = trunc( runif(10,"yw_1005" = trunc( runif(10,"yw_1006" = trunc( runif(10,"yw_1007" = trunc( runif(10,"yw_1008" = trunc( runif(10,"yw_1009" = trunc( runif(10,"yw_1010" = trunc( runif(10,"yw_1011" = trunc( runif(10,"yw_1012" = trunc( runif(10,"yw_1013" = trunc( runif(10,"yw_1014" = trunc( runif(10,"yw_1015" = trunc( runif(10,"yw_1016" = trunc( runif(10,"yw_1017" = trunc( runif(10,"yw_1018" = trunc( runif(10,"yw_1019" = trunc( runif(10,"yw_1020" = trunc( runif(10,"yw_1021" = trunc( runif(10,"yw_1022" = trunc( runif(10,"yw_1023" = trunc( runif(10,"yw_1024" = trunc( runif(10,"yw_1025" = trunc( runif(10,"yw_1026" = trunc( runif(10,"yw_1027" = trunc( runif(10,"yw_1028" = trunc( runif(10,"yw_1029" = trunc( runif(10,"yw_1030" = trunc( runif(10,"yw_1031" = trunc( runif(10,"yw_1032" = trunc( runif(10,"yw_1033" = trunc( runif(10,"yw_1034" = trunc( runif(10,"yw_1035" = trunc( runif(10,"yw_1036" = trunc( runif(10,"yw_1037" = trunc( runif(10,"yw_1038" = trunc( runif(10,"yw_1039" = trunc( runif(10,"yw_1040" = trunc( runif(10,"yw_1041" = trunc( runif(10,"yw_1042" = trunc( runif(10,"yw_1043" = trunc( runif(10,"yw_1044" = trunc( runif(10,"yw_1045" = trunc( runif(10,"yw_1046" = trunc( runif(10,"yw_1047" = trunc( runif(10,"yw_1048" = trunc( runif(10,"yw_1049" = trunc( runif(10,"yw_1050" = trunc( runif(10,"yw_1051" = trunc( runif(10,"yw_1052" = trunc( runif(10,100) )
)
cols对应于年份(前两位数字)和星期数(最后一位数字)。
在我的实际数据集中(nrow = 5,500,000,ncol = 1400),我不能data.table::melt
,因为它会创建超出行数限制的data.table。
实际上,我只需要某些星期数的值。取得以下数据。表
dt2 <- data.table(
"id" = seq(1:10),"date" = sample(seq(as.Date('2010/01/01'),as.Date('2010/12/31'),by="day"),10)
)
对于每个唯一的dt2
,我需要将dt_wide中的值保留在id
中的日期之后的5、10和15周。理想情况下,要减小dt_wide的col大小,以便将其转换为长格式。
有什么建议吗?
解决方法
这是一种简单的方法,您可以利用 dplyr,tidyr,stringer和lubridate软件包进行某些突变:
# Calling required libraries
library(data.table)
library(dplyr)
# Creating dataframe
dt_wide <- data.table(
"id" = seq(1:10),"yw_1001" = trunc( runif(10,100) ),"yw_1002" = trunc( runif(10,"yw_1003" = trunc( runif(10,"yw_1004" = trunc( runif(10,"yw_1005" = trunc( runif(10,"yw_1006" = trunc( runif(10,"yw_1007" = trunc( runif(10,"yw_1008" = trunc( runif(10,"yw_1009" = trunc( runif(10,"yw_1010" = trunc( runif(10,"yw_1011" = trunc( runif(10,"yw_1012" = trunc( runif(10,"yw_1013" = trunc( runif(10,"yw_1014" = trunc( runif(10,"yw_1015" = trunc( runif(10,"yw_1016" = trunc( runif(10,"yw_1017" = trunc( runif(10,"yw_1018" = trunc( runif(10,"yw_1019" = trunc( runif(10,"yw_1020" = trunc( runif(10,"yw_1021" = trunc( runif(10,"yw_1022" = trunc( runif(10,"yw_1023" = trunc( runif(10,"yw_1024" = trunc( runif(10,"yw_1025" = trunc( runif(10,"yw_1026" = trunc( runif(10,"yw_1027" = trunc( runif(10,"yw_1028" = trunc( runif(10,"yw_1029" = trunc( runif(10,"yw_1030" = trunc( runif(10,"yw_1031" = trunc( runif(10,"yw_1032" = trunc( runif(10,"yw_1033" = trunc( runif(10,"yw_1034" = trunc( runif(10,"yw_1035" = trunc( runif(10,"yw_1036" = trunc( runif(10,"yw_1037" = trunc( runif(10,"yw_1038" = trunc( runif(10,"yw_1039" = trunc( runif(10,"yw_1040" = trunc( runif(10,"yw_1041" = trunc( runif(10,"yw_1042" = trunc( runif(10,"yw_1043" = trunc( runif(10,"yw_1044" = trunc( runif(10,"yw_1045" = trunc( runif(10,"yw_1046" = trunc( runif(10,"yw_1047" = trunc( runif(10,"yw_1048" = trunc( runif(10,"yw_1049" = trunc( runif(10,"yw_1050" = trunc( runif(10,"yw_1051" = trunc( runif(10,"yw_1052" = trunc( runif(10,100) )
)
# Creating dataframe with point of interest
dt2 <- data.table(
"id" = seq(1:10),"date" = sample(seq(as.Date('2010/01/01'),as.Date('2010/12/31'),by="day"),10)
)
# Mutating data to get only required columns
columns_to_select <-
dt2 %>%
# Getting dates after 5/10/15 weeks
mutate(after5 = date + (7 * 5),after10 = date + (7 * 10),after15 = date + (7 * 15)) %>%
# Converting dates from wide format to long format
tidyr::gather(key = "key",value = req_date,-c(id,date)) %>%
# Converting date into respective column name in dt_wide dataframe
mutate(year = format(as.Date(req_date),"%y"),week = stringr::str_pad(lubridate::week(req_date),2,"left","0"),select_date = paste0("yw_",year,week)) %>%
# Selecting only required column into a vector
select(select_date) %>%
pull()
# Choosing from the wide dataframe only required columns
dt_wide %>%
select(id,contains(columns_to_select))
# id yw_1024 yw_1044 yw_1017 yw_1014 yw_1045 yw_1031 yw_1035 yw_1029 yw_1049 yw_1022 yw_1019 yw_1050 yw_1036 yw_1040 yw_1034 yw_1027 yw_1041
# 1: 1 59 7 11 7 93 19 83 48 75 94 19 9 93 41 6 26 18
# 2: 2 84 22 18 70 29 53 63 26 23 12 93 84 17 57 96 93 98
# 3: 3 4 72 56 35 65 73 58 91 27 65 58 5 62 13 36 79 26
# 4: 4 36 5 26 56 34 27 60 64 79 27 40 64 32 0 96 56 19
# 5: 5 44 82 78 23 71 78 36 43 63 95 91 37 21 87 63 73 25
# 6: 6 46 45 81 89 59 0 85 3 68 23 90 82 93 42 28 67 32
# 7: 7 56 32 7 26 49 31 79 93 14 45 25 79 39 64 64 86 91
# 8: 8 82 99 46 79 81 56 39 10 20 27 83 29 30 30 35 96 24
# 9: 9 10 87 28 40 51 41 95 43 62 59 44 19 72 76 27 65 36
# 10: 10 81 19 44 55 22 53 98 54 16 29 30 28 20 2 5 39 23
,
这是使用data.table
(和日期的lubridate
包)的简单解决方案。由于您的问题似乎是由内存限制引起的,因此只需使用必要的列为(小)ID表中的每一行调用melt()
。同样要注意,由于某些日期不存在,我们还必须检查dt_wide
中是否存在给定日期。
rbindlist(lapply(1:nrow(dt2),function(x) {
rowid <- dt2[x,id]
rowdate <- dt2[x,date]
dates <- rowdate + lubridate::weeks(c(5,10,15))
cols <- paste0("yw_",strftime(dates,"%y%W"))
# Because some dates aren't in the table
cols <- intersect(cols,colnames(dt_wide))
if (!length(cols)) return(NULL)
melt(dt_wide[id == rowid,c("id",cols),with = F],id.vars = "id")
}))