R - 在日期序列之间扩展一个值并作为列添加到 data.table 数据

问题描述

更新: akrun 提供的建议解决方案适用于我但是,我的问题是 value.var = rating 中定义的值结转到相应的日期列。请注意,所有定义为 rating_DATEVALID_THRU_DATE间的时间段的月份都没有填充。

到目前为止我尝试过但失败的方法 而不是像这样定义 dcast 操作

dt1 <- dcast(setDT(ratings.dt),ISSUE_ID + rating_TYPE ~ rating_DATE,value.var = 'rating')

我确实尝试过

  dt1 <- dcast(setDT(ratings.dt),ISSUE_ID + rating_TYPE ~ (VALID_THRU_DATE - rating_DATE),value.var = 'rating')      


 dt1 <- dcast(setDT(ratings.dt),ISSUE_ID + rating_TYPE ~ as.yearmon(seq(
             rating_DATE,VALID_THRU_DATE),frac = 1),value.var = 'rating')


dt1 <- dcast(setDT(ratings.dt),ISSUE_ID + rating_TYPE ~ (rating_DATE:VALID_THRU_DATE),value.var = 'rating')

我认为我可以只使用定义每个评级有效期的 2 列,因为这两个列都是 dcast() 函数调用中的日期列,但显然该任务背后的逻辑在概念化方面更加复杂。>

现在我通过首先构建一个“骨架数据表”来手动概念化这个任务,然后通过逐行循环遍历长格式的原始评级数据表并将定义的评级分布在两个日期之间骨架表。 (我将 rating 重命名rating_NUM 以区别于“原始”字母数字评级)

# (0) Filter only the most recent rating within a given month
ratings_num.dt <- ratings_num.dt[,.SD[.N],by = .(ISSUE_ID,rating_TYPE,rating_DATE)] 

# (1) Defining start and end date for the rating time series
start_date    <- as.Date("1990-01-01","%Y-%m-%d")
end_date      <- as.Date("2021-01-31","%Y-%m-%d")

# (2) Define the dates as new columns for a skeleton data.table
new_cols      <- seq(from = start_date,to = end_date,by = "month")
new_cols      <- date_ymd_to_m_end(new_cols)
new_col_names <- as.character(new_cols,"%Y-%m-%d")

# (3) Determine how many months the rating time series spans 
N_months <- elapsed_months_lubri(start_date,end_date) + 1 
            # some function to do just what the name implies

MONTH_ID <- c(1:N_months)

# (4) Define the layout of the new skeleton table
# Note: The new table should contain the 3 rows per issue ID,namely the rating time series of each issue ID for every considered rating ageny 

rating_type.vec <- c("FR","MR","SPR")    

df_skeleton <- data.frame(rep(issue_IDs.vec,each = 3),rating_type.vec)

someInitialValue <- 0

# Credit to Jonas
to_Add <- setNames(data.frame(matrix(rep(
            someInitialValue,nrow(df_skeleton)*length(new_col_names)),ncol = length(new_col_names),nrow = NROW(df))),new_col_names)

ratings_num_ts.df <- cbind(df_skeleton,to_Add)
ratings_num_ts.dt <- setDT(ratings_num_ts.df)

setnames(ratings_num_ts.dt,c("rep.issue_IDs.vec..each...3.","rating_type.vec"),c("ISSUE_ID","rating_TYPE"))

# (5) Create a data.table to join on ratings_num.dt to add month IDs to use for assigning ratings

seq_dates.dt <- setDT(data.frame(new_cols,MONTH_ID))
seq_dates.dt <- setnames(seq_dates.dt,c("new_cols"),c("rating_DATE"))

ratings_num.dt <- ratings_num.dt[seq_dates.dt,on = .(rating_DATE = rating_DATE)]

ratings_num.dt <- ratings_num.dt[seq_dates.dt,on = .(rating_VAL_THRU = rating_DATE)]

# (6) If for the joined MONTH_IDs there is no corresponding rating_DATE or rating_VAL_THRU entry,the join will write NA values for these values in the joined table and can be filtered out accordingly

ratings_num.dt <- ratings_num.dt[!is.na(ISSUE_ID)]

# (7) Rename column of second MONTH_ID
setnames(ratings_num.dt,c("MONTH_ID","i.MONTH_ID"),c("MONTH_ID_START","MONTH_ID_END"))

# (8) Sort table by setting keys 
setkey(ratings_num.dt,ISSUE_ID,rating_DATE)

# (9) Defining logic as loop 
tic()

i <- 1
j <- nrow(ratings_num.dt)
  
id.vec             <- ratings_num.dt[,ISSUE_ID] 
rating_type.vec    <- ratings_num.dt[,rating_TYPE]
month_ID_start.vec <- (ratings_num.dt[,MONTH_ID_START] + 2)  
month_ID_end.vec   <- (ratings_num.dt[,MONTH_ID_END] + 2)
rating_num.vec     <- ratings_num.dt[,rating_NUM]

total <- j
pb <- progress_bar$new(format = "[:bar] :current/:total 
                        (:percent) eta: :eta",total = total)

  
spread_ratings_to_ts <- function(dt_source,dt_ts) {
  pb$tick(0)
  for (i in 1:j) {
    id             <- id.vec[i]  # alternatively ROW_ID == i
    rating_type    <- rating_type.vec[i]
    month_ID_start <- month_ID_start.vec[i]  # change to right value
    month_ID_end   <- month_ID_end.vec[i]
    rating_num     <- rating_num.vec[i]
    
    dt_ts[ISSUE_ID == id & rating_TYPE == rating_type,(month_ID_start:month_ID_end) := rating_num]
    
    if (i %% 50 == 0) {
      pb$tick()
    }  
    
    i <- i + 1
  }
}

spread_ratings_to_ts(ratings_num.dt,ratings_num_ts.dt)

toc() 
## ~ 3,600 sec for ~ 250k rows to loop through ##


# (10) Compute rating means
# Substitute all pre-filled zeros in the table with NA as there is simply no 
# rating available at this point in time

ratings_num_ts.dt <- ratings_num_ts.dt %>% 
                       na_if(0)
ratings_num_ts.dt <- rbind(ratings_num_ts.dt,ratings_num_ts.dt[,c(.(rating_TYPE = 'Mean'),lapply(.SD,mean,na.rm=TRUE)),by = .(ISSUE_ID),.SDcols = -(1:2)])

setkey(ratings_num_ts.dt,rating_TYPE)

我尝试使用 foreach(...) %dopar% function(...) 并行化这个循环,就像你在下面看到的那样,但它现在不起作用。这主要是由上面非常低效的循环的运行时驱动的 - 尽管工作得很好并完成了我想要的。在处理 foreach 函数调用时,我特别不确定如何编写一个合适的组合函数,我可以将其放入 foreach 调用中,以便根据需要包装结果。

i <- 1
j <- nrow(ratings_num.dt)

id.vec             <- ratings_num.dt[,ISSUE_ID]
rating_type.vec    <- ratings_num.dt[,rating_TYPE]

# col 1+2 not rating but ISSUE_ID and rating_TYPE
month_ID_start.vec <- (ratings_num.dt[,MONTH_ID_START] + 2) 
month_ID_end.vec   <- (ratings_num.dt[,rating_NUM]

spread_ratings_to_ts <- function(dt_source,dt_ts) {
  id             <- id.vec[i]
  rating_type    <- rating_type.vec[i]
  month_ID_start <- month_ID_start.vec[i]
  month_ID_end   <- month_ID_end.vec[i]
  rating_num     <- rating_num.vec[i]
  
  dt_ts[ISSUE_ID == id & rating_TYPE == rating_type][,(month_ID_start:month_ID_end) := rating_num]
}   

myCluster <- makeCluster(((detectCores()/2) - 1),type = "PSOCK")
registerDoParallel(myCluster)

clusterEvalQ(cl = myCluster,{
  setMKLthreads(1)
})

foreach(i = 1:j,.combine = 'rbind') %dopar% 
    spread_ratings_to_ts(dt_source = ratings_num.dt,dt_ts = ratings_num_ts.dt)

stopCluster(myCluster)

背景/数据: 理论上这很容易,即使是 3 岁的孩子也可以手动完成这项任务,但即使在解决这个问题近一周后,我也没有进一步的解决方案。

问题: 我正在处理一个大型金融数据集。它包含由 ISSUE_ID 及其对应的 rating 标识的债券问题,该 rating_TYPE 由 3 家评级机构惠誉、穆迪和标准普尔提供,定义为 rating_DATE。我为每个评级确定了一个发布日期和一个有效通过日期,定义为 VALID_THRU_DATEDATE,两者都是 ISSUE_ID 类型。所有日期都由 yearmonth() 格式化为给定月份的最后一天,因为它们的评级用于确定索引包含,其规则在月底评估。

numeric 的类型为 rating

character 的类型为 rating_TYPE

character 的类型为 ratings.dt

我的数据设置为名为 ISSUE_ID rating_TYPE rating rating_DATE VALID_THRU_DATE rating_DATE_SEQ 123 FR 3.33 2000-01-31 2000-04-31 1 123 FR 4.00 2000-05-31 2000-02-28 2 123 FR 3.66 2001-03-31 2001-04-31 3 123 FR 2.00 2001-05-31 2001-04-30 4 123 FR 2.33 2001-04-30 2003-12-31 5 123 FR 3.00 2004-01-31 2004-06-30 6 123 MR 2.33 1999-04-31 1999-12-31 1 123 MR 2.66 2000-01-31 2000-04-31 2 123 MR 3.00 2001-03-31 2001-04-30 3 123 MR 3.33 2001-05-31 2003-01-31 4 123 MR 3.00 2003-02-28 2003-07-31 5 123 SP 3.33 1999-04-31 2002-03-31 1 123 SP 3.00 2002-04-31 2003-05-31 2 244 ... 的 data.table,我需要向其中添加开始日期和结束日期之间序列的列。然后,我的目标是为每个问题 ID 设置 3 行,每一行用于每个评级机构各自评级历史的时间序列。

data.table 的键设置为 ISSUE_ID、rating_TYPE 和 rating_DATE。

现在的数据如下所示:

rating

现在我想基本上将 ISSUE_ID rating_TYPE 1999-04-30 1999-05-31 ... 2000-01-31 2000-02-28 ... 2004-06-30 123 FR ... 3.33 2.33 ... 3.00 123 MR 2.33 2.33 ... 2.66 2.66 ... 123 SP 3.33 3.33 ... 3.33 2.66 ... 244 ... 中定义的评级分布在一系列日期中。 我想像这样:

 ISSUE_ID  rating_TYPE   1999-04-30  1999-05-31  ...   2000-01-31  2000-02-28    ...  2004-06-30 
   123       FR                                  ...      3.33         2.33      ...     3.00
   123       MR            2.33         2.33     ...      2.66         2.66      ...
   123       SP            3.33         3.33     ...      3.33         2.66      ...
   123      Mean           2.83         2.83     ...      3.11         2.55      ... 

这样我才能做到:

ratings.dt[,mean),.SDcols = x:y,# col indexes of added date sequence columns
           by = .(ISSUE_ID)]

然后我可以通过这样的 data.table 语法计算每个问题 ID 每月的评分平均值

final List<String> actions = ["Abrir","Alzar","Enseñar","Sentar","Mirar"];

使用我的映射表将字母数字评级(例如 AAA、B+、C- 等)转换为数值以允许基于数字的算术计算(例如平均值),我可以将数字评级平均值转换回字母数字评级.那就意味着任务完成了!

另外,我现在不确定这个问题是否可以更有效地概念化。希望得到任何指点!

解决方法

我们用 pivot_wider 转换宽格式,按 summarise 进行分组,通过将其他观察值与 mean 值连接来创建“均值”行。使用 dplyr version >=1.0summarise 可以为每组返回多于一行

library(dplyr)
library(tidyr)
ratings.dt %>%
     select(-VALID_THRU_DATE,-RATING_DATE_SEQ) %>% 
     pivot_wider(names_from = RATING_DATE,values_from = RATING) %>% 
     group_by(ISSUE_ID) %>% 
     summarise(RATING_TYPE = c(RATING_TYPE,"Mean"),across(where(is.numeric),~ c(.,mean(.,na.rm = TRUE))),.groups = 'drop')

-输出

# A tibble: 4 x 11
#  ISSUE_ID RATING_TYPE `2000-01-31` `2000-05-31` `2001-03-31` `2001-05-31` `2001-04-30` `2004-01-31` `1999-04-31`
#     <int> <chr>              <dbl>        <dbl>        <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
#1      123 FR                  3.33            4         3.66         2            2.33            3        NA   
#2      123 MR                  2.66           NA         3            3.33        NA              NA         2.33
#3      123 SP                 NA              NA        NA           NA           NA              NA         3.33
#4      123 Mean                3.00            4         3.33         2.66         2.33            3         2.83
# … with 2 more variables: `2003-02-28` <dbl>,`2002-04-31` <dbl>
 

或者使用 data.table

library(data.table)
dt1 <- dcast(setDT(ratings.dt),ISSUE_ID + RATING_TYPE ~ RATING_DATE,value.var = 'RATING')
rbind(dt1,dt1[,c(.(RATING_TYPE = 'Mean'),lapply(.SD,mean,na.rm = TRUE)),.(ISSUE_ID),.SDcols = -(1:2)])
#   ISSUE_ID RATING_TYPE 1999-04-31 2000-01-31 2000-05-31 2001-03-31 2001-04-30 2001-05-31 2002-04-31 2003-02-28
#1:      123          FR         NA      3.330          4       3.66       2.33      2.000         NA         NA
#2:      123          MR       2.33      2.660         NA       3.00         NA      3.330         NA          3
#3:      123          SP       3.33         NA         NA         NA         NA         NA          3         NA
#4:      123        Mean       2.83      2.995          4       3.33       2.33      2.665          3          3
#   2004-01-31
#1:          3
#2:         NA
#3:         NA
#4:          3

数据

ratings.dt <- structure(list(ISSUE_ID = c(123L,123L,123L),RATING_TYPE = c("FR","FR","MR","SP","SP"),RATING = c(3.33,4,3.66,2,2.33,3,2.66,3.33,3),RATING_DATE = c("2000-01-31","2000-05-31","2001-03-31","2001-05-31","2001-04-30","2004-01-31","1999-04-31","2000-01-31","2003-02-28","2002-04-31"
),VALID_THRU_DATE = c("2000-04-31","2000-02-28","2001-04-31","2003-12-31","2004-06-30","1999-12-31","2000-04-31","2003-01-31","2003-07-31","2002-03-31","2003-05-31"
),RATING_DATE_SEQ = c(1L,2L,3L,4L,5L,6L,1L,2L)),class = "data.frame",row.names = c(NA,-13L))