如何为模型预测获得偏差校正的自举置信区间

问题描述

我对模型预测的偏差校正自举置信区间感兴趣。我已经编写了一些代码来执行引导，如下所示：

library(tidyverse)
#> Warning: package 'ggplot2' was built under R version 4.0.2
#> Warning: package 'tibble' was built under R version 4.0.2
#> Warning: package 'tidyr' was built under R version 4.0.2
#> Warning: package 'dplyr' was built under R version 4.0.2
library(boot)


x = rnorm(25)
eta = -0.5 + 0.2*x
y = rpois(25,exp(eta))
d = tibble(x,y)



predictions<-function(data,ind){
  model = glm(y ~ x,data = data[ind,])
  newdata = tibble(x = seq(-2,2,0.01))
  
  eta = predict(model,newdata = newdata)
  exp(eta)
}


bt = boot(d,predictions,1000)

boot.ci(bt,type = 'bca')
#> BOOTSTRAP CONFIDENCE INTERVAL CALculaTIONS
#> Based on 1000 bootstrap replicates
#> 
#> CALL : 
#> boot.ci(boot.out = bt,type = "bca")
#> 
#> Intervals : 
#> Level       BCa          
#> 95%   ( 0.654,6.095 )  
#> Calculations and Intervals on Original Scale
#> Some BCa intervals may be unstable

引导程序按预期工作，但是boot.ci调用仅计算单个置信区间。

我想要的输出将是一个具有以下内容的数据框：

用于引导程序预测的一列（本质上是引导程序的平均值）
可信度下限100*(1-a)的一列，
可信度上限100*(1-a)的一列。

如何使用boot获得所需的输出？

解决方法

您可以使用replicate作为引导程序。

predictions <- function(X) {
  model <- glm(y ~ x,data=X)
  newdata <- data.frame(x=seq(-2,2,0.01))
  eta <- predict(model,newdata=newdata)
  exp(eta)
}

set.seed(42)
b <- replicate(200,predictions(d[sample(1:nrow(d),nrow(d),replace=T),]))
res <- cbind(estimate=colMeans(b),matrixStats::colQuantiles(b,probs=c(.025,.975)))
head(res)
#      estimate     2.5%    97.5%
# [1,] 2.913660 1.324121 5.363724
# [2,] 2.432935 1.327170 3.990526
# [3,] 2.608683 1.221961 4.715235
# [4,] 2.681802 1.395430 4.537477
# [5,] 2.592316 1.947494 3.357226
# [6,] 3.705157 1.606632 7.012669

数据：

d <- structure(list(x = c(-0.733089270649064,0.843270552215953,-0.860309462865639,1.57992224055388,0.606460750363077,-1.60987545170912,0.116723009723847,-1.35220761859743,-0.721654356711471,-0.614137997795221,-0.0900374904672829,1.20655915421524,1.46948958927721,-0.210496187862193,-1.21763978187878,-0.566774409165728,2.49263604822452,1.37169581528637,-0.407181458046919,-0.247944038851957,-0.703358091631417,-0.311797310571164,-0.990496073627748,0.515047211912022,0.355009786755435),y = c(0L,1L,0L,2L,3L,2L)),class = "data.frame",row.names = c(NA,-25L))

您可以传递index=参数以获得感兴趣的列：

res = lapply(1:ncol(bt$t),function(i){
ci = boot.ci(bt,index=i,type = 'bca')
data.frame(
var = i,mean = mean(bt$t[,i]),lower = ci$bca[4],upper = ci$bca[5]
)
})

res = do.call(rbind,res)
head(res)
  var     mean     lower    upper
1   1 2.736563 0.9131014 5.768826
2   2 2.729717 0.9155394 5.743819
3   3 2.722903 0.9174191 5.709087
4   4 2.716120 0.9215527 5.706216

在您的示例中不太容易看到，我们可以看到它与虹膜兼容：

bo = boot(iris,function(d,i)colMeans(d[i,-5]),1000)
data.frame(
var=colnames(iris)[1:4],mean = colMeans(bo$t),t(sapply(1:4,function(i)boot.ci(bo,type="bca")$bca[4:5]))
)

           var     mean       X1       X2
1 Sepal.Length 5.847009 5.712667 5.969333
2  Sepal.Width 3.055873 2.982667 3.128428
3 Petal.Length 3.767644 3.487473 4.029977
4  Petal.Width 1.203107 1.083937 1.313888