从R中的累积概率质量函数矩阵快速随机采样

问题描述

我有一个矩阵（mat_cdf），表示人口普查区i的居民在给定日期移动到人口普查区j的累积概率。给定一个由决定不“待在家”的代理（agent）组成的向量，我编写了下面的函数GetCTMove，从该矩阵中随机抽样，以确定他们将在哪个普查区度过时间。

GetCTMove

# Random generation
cts <- 500
i <- rgamma(cts,50,1)
prop <- 1:cts

# Matrix where rows correspond to probability mass of column integer
mat <- do.call(rbind,lapply(i,function(i){dpois(prop,i)}))

# Convert to cumulative probability mass
mat_cdf <- matrix(NA,cts,cts)
for(i in 1:cts){
  # Create cdf for row i
  mat_cdf[i,] <- sapply(1:cts,function(j) sum(mat[i,1:j]))
}

GetCTMove <- function(agent_cts,ct_mat_cdf){
  # Expand such that every agent has its own row corresponding to CDF of movement from their home ct i to j
  mat_expand <- ct_mat_cdf[agent_cts,]
  # Probabilistically sample column index for every row by generating random number and then determining corresponding closest column
  s <- runif(length(agent_cts))
  fin_col <- max.col(s < mat_expand,"first")
  return(fin_col)
}

# Sample of 500,000 agents' residence ct
agents <- sample(1:cts,size = 500000,replace = T)

# Run function
system.time(GetCTMove(agents,mat_cdf))
   user  system elapsed 
   3.09    1.19    4.30

在处理一百万个代理时，每次抽样大约需要10秒钟才能运行；再乘以许多时间步长，每次模拟就要花费数小时，而这个函数是目前为止模型的速率限制因素。我想知道是否有人对更快地实现这种随机抽样有建议。我已经使用dqrng包来加快随机数的生成，但是与矩阵扩展（mat_expand）和max.col调用（它们的运行时间最长）相比，这部分耗时相对很小。

解决方法

您可以优化的第一件事是以下代码：

max.col(s < mat_expand,"first")

由于s < mat_expand返回逻辑矩阵，因此应用max.col函数与获取每一行的第一个TRUE相同。在这种情况下，使用which会更有效率。另外，如下所示，您将所有CDF存储在一个矩阵中。

# Row k of `mat` is the Poisson probability mass over `prop` for rate i[k].
mat <- do.call(rbind, lapply(i, function(rate) dpois(prop, rate)))
# Row k of `mat_cdf` holds the running totals of row k of `mat`.
mat_cdf <- matrix(NA_real_, cts, cts)
# The loop index is deliberately not named `i`: reusing that name would
# overwrite the rate vector `i` with a single number once the loop ends.
for (ct in seq_len(cts)) {
  mat_cdf[ct, ] <- vapply(
    seq_len(cts),
    function(j) sum(mat[ct, 1:j]),
    numeric(1L)
  )
}

这种结构可能不是最优的。list结构更适合应用which之类的函数；而且由于不必经过do.call(rbind,...)，创建起来也更快。

# Keep one CDF vector per census tract in a list: each element is the
# running sum of the Poisson mass over `prop` for that tract's rate.
ls_cdf <- lapply(i, function(rate) {
  pmf <- dpois(prop, rate)
  cumsum(pmf)
})

下面是您的实现：

# Implementation 1
# Sample one destination column per agent from a matrix of row-wise CDFs.
#   agent_cts:  row indices into `ct_mat_cdf`, one per agent.
#   ct_mat_cdf: matrix whose rows are cumulative probabilities.
# Returns an integer vector: for each agent, the first column whose cumulative
# probability is strictly greater than that agent's uniform draw.
GetCTMove <- function(agent_cts, ct_mat_cdf) {
  # drop = FALSE keeps a one-row matrix when there is a single agent; without
  # it the row collapses to a plain vector and max.col() reads it as a column.
  mat_expand <- ct_mat_cdf[agent_cts, , drop = FALSE]
  # One draw per agent. `<` recycles `s` down the columns, so draw k is
  # compared against every entry of row k.
  s <- runif(length(agent_cts))
  # NOTE(review): a row with no entry above its draw is all FALSE and would
  # presumably come back as column 1 -- confirm every CDF row reaches 1.
  max.col(s < mat_expand, ties.method = "first")
}

在我的桌面上，大约需要运行2.68秒。

> system.time(GetCTMove(agents,mat_cdf))
   user  system elapsed 
   2.25    0.41    2.68

使用list结构和which函数，运行时间可以减少大约1s。

# Implementation 2
# Sample one destination index per agent from a list of CDF vectors.
#   agent_cts: indices into `ls_cdf`, one per agent.
#   ls_cdf:    list of cumulative-probability vectors.
# Returns an integer vector: for each agent, the position of the first CDF
# entry strictly greater than that agent's uniform draw.
GetCTMove2 <- function(agent_cts, ls_cdf) {
  draws <- runif(length(agent_cts))
  first_above <- function(k) {
    # `[[1L]]` takes the first TRUE position and errors when there is none.
    which(draws[[k]] < ls_cdf[[agent_cts[[k]]]])[[1L]]
  }
  vapply(seq_along(draws), first_above, integer(1L))
}

> system.time(GetCTMove2(agents,ls_cdf))
   user  system elapsed 
   1.59    0.02    1.64

据我所知，仅使用R已经没有其他办法可以进一步加快代码的速度。不过，您可以通过用C++重写关键函数GetCTMove来提高性能。使用Rcpp软件包，您可以执行以下操作：

# Implementation 3
# Compiles a C++ function `fast_GetCTMove(agents, s, cdfs)` via Rcpp.
#   agents: numeric vector of 1-based indices into `cdfs` (the C++ code
#           subtracts 1 before indexing).
#   s:      numeric vector of draws, read at the same position as `agents`.
#   cdfs:   list whose elements are converted to numeric vectors.
# For each agent the selected vector is scanned from the start, and the
# 1-based position of the first entry strictly greater than the draw is stored.
# The result is a numeric (not integer) vector of the same length as `agents`.
# NOTE(review): if no entry exceeds the draw, the slot keeps its initial value
# (presumably 0 for a size-constructed NumericVector) -- confirm callers are
# fine with that. No bounds check is made on the indices in `agents`.
Rcpp::cppFunction('NumericVector fast_GetCTMove(NumericVector agents,NumericVector s,List cdfs) {
  int n = agents.size(); 
  NumericVector out(n); 
  for (int i = 0; i < n; ++i) {
    NumericVector cdf = as<NumericVector>(cdfs[agents[i] - 1]); 
    int m = cdf.size(); 
    for (int j = 0; j < m; ++j) {
      if (s[i] < cdf[j]) {
        out[i] = j + 1;
        break;
      }
    }
  }
  return out;
}')
# Draw one uniform number per agent and hand the sampling to the compiled
# `fast_GetCTMove` routine.
#   agent_cts: indices into `ls_cdf`, one per agent.
#   ls_cdf:    list of cumulative-probability vectors.
GetCTMove3 <- function(agent_cts, ls_cdf) {
  n_agents <- length(agent_cts)
  uniform_draws <- runif(n_agents)
  fast_GetCTMove(agent_cts, uniform_draws, ls_cdf)
}

此实现快如闪电，应该可以满足您的所有需求。

> system.time(GetCTMove3(agents,ls_cdf))
   user  system elapsed 
   0.07    0.00    0.06

完整脚本如下：

# Random generation
# Number of census tracts.
cts <- 500
# One Poisson rate per tract, drawn from a gamma distribution.
i <- rgamma(cts, shape = 50, rate = 1)
# Support over which each tract's probability mass is evaluated.
prop <- seq_len(cts)
# Home tract of each of the 500,000 agents, sampled with replacement.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
agents <- sample(seq_len(cts), size = 500000, replace = TRUE)

# using a list structure to speed up the creation of cdfs
ls_cdf <- lapply(i, function(x) cumsum(dpois(prop, x)))
# below is your code
# Row k of `mat` is the Poisson probability mass over `prop` for rate i[k].
mat <- do.call(rbind, lapply(i, function(rate) dpois(prop, rate)))
# Row k of `mat_cdf` holds the running totals of row k of `mat`.
mat_cdf <- matrix(NA_real_, cts, cts)
# The loop index is deliberately not named `i`, so the rate vector `i` is
# still intact after the loop.
for (ct in seq_len(cts)) {
  mat_cdf[ct, ] <- vapply(
    seq_len(cts),
    function(j) sum(mat[ct, 1:j]),
    numeric(1L)
  )
}

# Implementation 1
# Sample one destination column per agent from a matrix of row-wise CDFs.
#   agent_cts:  row indices into `ct_mat_cdf`, one per agent.
#   ct_mat_cdf: matrix whose rows are cumulative probabilities.
# Returns an integer vector: for each agent, the first column whose cumulative
# probability is strictly greater than that agent's uniform draw.
GetCTMove <- function(agent_cts, ct_mat_cdf) {
  # drop = FALSE keeps a one-row matrix when there is a single agent; without
  # it the row collapses to a plain vector and max.col() reads it as a column.
  mat_expand <- ct_mat_cdf[agent_cts, , drop = FALSE]
  # One draw per agent. `<` recycles `s` down the columns, so draw k is
  # compared against every entry of row k.
  s <- runif(length(agent_cts))
  # NOTE(review): a row with no entry above its draw is all FALSE and would
  # presumably come back as column 1 -- confirm every CDF row reaches 1.
  max.col(s < mat_expand, ties.method = "first")
}


# Implementation 2
# Sample one destination index per agent from a list of CDF vectors.
#   agent_cts: indices into `ls_cdf`, one per agent.
#   ls_cdf:    list of cumulative-probability vectors.
# Returns an integer vector: for each agent, the position of the first CDF
# entry strictly greater than that agent's uniform draw.
GetCTMove2 <- function(agent_cts, ls_cdf) {
  draws <- runif(length(agent_cts))
  first_above <- function(k) {
    # `[[1L]]` takes the first TRUE position and errors when there is none.
    which(draws[[k]] < ls_cdf[[agent_cts[[k]]]])[[1L]]
  }
  vapply(seq_along(draws), first_above, integer(1L))
}


# Implementation 3
# Compiles a C++ function `fast_GetCTMove(agents, s, cdfs)` via Rcpp. For each
# agent it scans the CDF vector selected by the agent's 1-based index and
# stores the 1-based position of the first entry strictly greater than the
# agent's draw. The result is a numeric vector as long as `agents`.
Rcpp::cppFunction('NumericVector fast_GetCTMove(NumericVector agents,NumericVector s,List cdfs) {
  int n = agents.size();
  NumericVector out(n);
  for (int i = 0; i < n; ++i) {
    NumericVector cdf = as<NumericVector>(cdfs[agents[i] - 1]);
    int m = cdf.size();
    for (int j = 0; j < m; ++j) {
      if (s[i] < cdf[j]) {
        out[i] = j + 1;
        break;
      }
    }
  }
  return out;
}')
# R wrapper: draw one uniform number per agent, then delegate to the compiled
# routine.
GetCTMove3 <- function(agent_cts, ls_cdf) {
  s <- runif(length(agent_cts))
  fast_GetCTMove(agent_cts, s, ls_cdf)
}


# Time each implementation once on the same agent vector.
system.time(GetCTMove(agent_cts = agents, ct_mat_cdf = mat_cdf))
system.time(GetCTMove2(agent_cts = agents, ls_cdf = ls_cdf))
system.time(GetCTMove3(agent_cts = agents, ls_cdf = ls_cdf))