计算分布拟合函数 R

问题描述

我正在为不同的分布函数绘制曲线，我需要知道每条曲线的最高 y 值。稍后我将只绘制一条曲线，它被选为最佳拟合。

这是函数（它有点硬编码，我正在研究它）：

library(plyr)
library(dplyr)
library(fitdistrplus)
library(evd)
library(gamlss)
        
        
fdistr <- function(d) {
  
  #  Uncomment to try  run line by line
  # d <- data_to_plot
  
  TLT <- d$TLT
  if (sum(TLT<=0)) {TLT[TLT<=0] <- 0.001} # removing value < 0 for log clculation
  gev <- fgev(TLT,std.err=FALSE)
  distr <- c('norm','lnorm','weibull','gamma')
  fit <- lapply(X=distr,FUN=fitdist,data=TLT)
  fit[[5]] <- gev
  distr[5] <- 'gev'
  names(fit) <- distr
  Loglike <- sapply(X=fit,FUN=logLik)
  Loglike_Best <- which(Loglike == max(Loglike))
  
  #  Uncomment to try  run line by line
  # max <- which.max(density(d$TLT)$y)
  # max_density <- stats::density(d$TLT)$y[max]
  # max_y <- max_density
  
  x_data <- max(d$TLT)
  
  hist(TLT,prob=TRUE,breaks= x_data,main=paste(d$DLT_Code[1],'- best :',names(Loglike[Loglike_Best])),sub = 'Total Lead Times',col='lightgrey',border='white'
       # ylim=  c(0,max_y)
  )
  
  lines(density(TLT),col='darkgrey',lty=2,lwd=2)
  
  grid(nx = NA,ny = NULL,col = "gray",lty = "dotted",lwd = .5,equilogs = TRUE)
  
  curve(dnorm(x,mean=fit[['norm']]$estimate[1],sd=fit[['norm']]$estimate[2]),add=TRUE,col='blue',lwd=2)
  
  curve(dlnorm(x,meanlog=fit[['lnorm']]$estimate[1],sdlog=fit[['lnorm']]$estimate[2]),col='darkgreen',lwd=2)
  
  curve(dweibull(x,shape=fit[['weibull']]$estimate[1],scale=fit[['weibull']]$estimate[2]),col='purple',lwd=2)
  
  curve(dgamma(x,shape=fit[['gamma']]$estimate[1],rate=fit[['gamma']]$estimate[2]),col='Gold',lwd=2)
  
  
  curve(dgev(x,loc=fit[['gev']]$estimate[1],scale=fit[['gev']]$estimate[2],shape=fit[['gev']]$estimate[3]),col='red',lwd=2)
  
  
  legend_loglik <- paste(c('norm','Lognorm','Weibull','Gamma','GEV'),c(':'),round(Loglike,digits=2))
  
  legend("topright",legend=legend_loglik,col=c('blue','darkgreen','purple','gold','red'),lty=1,lwd=2,bty='o',bg='white',Box.lty=2,Box.lwd = 1,Box.col='white')  
  
  return(data.frame(DLT_Code = d$DLT_Code[1],n = length(d$TLT),Best = names(Loglike[Loglike_Best]),lnorm = Loglike[1],norm = Loglike[2],weibul = Loglike[3],gamma = Loglike[4],GEV = Loglike[5]))
  
}



#  Creating data set
TLT <- c(rep(0,32),rep(1,120),rep(2,10),rep(3,67),rep(4,14),rep(5,7),6)
DLT_Code <- c(rep('DLT_Code',251))

data_to_plot <- data.frame(cbind(DLT_Code,TLT))
data_to_plot$TLT <- as.numeric(as.character(data_to_plot$TLT ))


DLT_distr <- do.call(rbind,by(data = data_to_plot,INDICES = data_to_plot$DLT_Code,FUN=fdistr))

我尝试使用 max_y，然后在 ylim 中使用它。我只能为正常密度做，但不能为其余曲线做。

目前的情节是这样的（一些曲线被剪掉了）：

如果设置 ylim = c(0,2)，我们可以看到，对数正态分布和伽马分布超过 1：

我需要知道每条曲线的最大值，因此，当我选择要打印的曲线时，要设置正确的 ylim。

解决方法

您可以使用 purrr::map_dbl 将函数 optimize 映射到您的密度上，如果您稍微重新排列您的代码并且您知道要找到它们的最大值/密度存在的输入值。

您可以使用任何参数提前设置密度，这样您就可以使用 optimize 找到它们的峰值并将它们传递给 curve 函数。

作为一个可重现的小例子：

library(purrr)

# parameterize your densities
mynorm <- function(x) dnorm(x,mean = 0,sd = 1) 
mygamma <- function(x) dgamma(x,rate = .5,shape = 1) 

# get largest maximum over interval
ymax <- max(purrr::map_dbl(c(mynorm,mygamma),~ optimize(.,interval = c(0,3),maximum = T)$objective))

# 0.4999811

# plot data
curve(mynorm,col = "blue",lwd = 2,xlim = c(0,ylim = c(0,ymax * 1.1))
curve(mygamma,col = "red",add = T)

使用您的代码我已经实现了上述解决方案并调整了 x 函数的 curve 网格，以便在我们在评论中讨论后向您展示我的意思，使事情更清楚并向您展示您实际应该绘制的内容：

library(plyr)
library(dplyr)
library(fitdistrplus)
library(evd)
library(gamlss)
library(purrr) # <- add this library


fdistr <- function(d) {
  
  #  Uncomment to try  run line by line
  # d <- data_to_plot
  
  TLT <- d$TLT
  if (sum(TLT<=0)) {TLT[TLT<=0] <- 0.001} # removing value < 0 for log clculation
  gev <- fgev(TLT,std.err=FALSE)
  distr <- c('norm','lnorm','weibull','gamma')
  fit <- lapply(X=distr,FUN=fitdist,data=TLT)
  fit[[5]] <- gev
  distr[5] <- 'gev'
  names(fit) <- distr
  Loglike <- sapply(X=fit,FUN=logLik)
  Loglike_Best <- which(Loglike == max(Loglike))
  
  #  Uncomment to try  run line by line
  # max <- which.max(density(d$TLT)$y)
  # max_density <- stats::density(d$TLT)$y[max]
  # max_y <- max_density
  
  x_data <- max(d$TLT)
  
  # parameterize your densities before plotting
  mynorm <- function(x) {
    dnorm(x,mean=fit[['norm']]$estimate[1],sd=fit[['norm']]$estimate[2])
  }
  
  mylnorm <- function(x){
    dlnorm(x,meanlog=fit[['lnorm']]$estimate[1],sdlog=fit[['lnorm']]$estimate[2])
  }
  
  myweibull <- function(x) {
    dweibull(x,shape=fit[['weibull']]$estimate[1],scale=fit[['weibull']]$estimate[2])
  }
  
  mygamma <- function(x) {
    dgamma(x,shape=fit[['gamma']]$estimate[1],rate=fit[['gamma']]$estimate[2])
  }
  
  mygev <- function(x){
    dgev(x,loc=fit[['gev']]$estimate[1],scale=fit[['gev']]$estimate[2],shape=fit[['gev']]$estimate[3])
  }
  
  distributions <- c(mynorm,mylnorm,myweibull,mygamma,mygev)
  
  # get the max of each density
  y <- purrr::map_dbl(distributions,x_data),maximum = T)$objective)

  # find the max (excluding infinity)
  ymax <- max(y[abs(y) < Inf])
  
  
  hist(TLT,prob=TRUE,breaks= x_data,main=paste(d$DLT_Code[1],'- best :',names(Loglike[Loglike_Best])),sub = 'Total Lead Times',col='lightgrey',border='white',ylim=  c(0,ymax)
  )
  
  lines(density(TLT),col='darkgrey',lty=2,lwd=2)
  
  grid(nx = NA,ny = NULL,col = "gray",lty = "dotted",lwd = .5,equilogs = TRUE)
  
  curve(mynorm,add=TRUE,col='blue',lwd=2,n = 1E5) # <- increase x grid
  
  curve(mylnorm,col='darkgreen',n = 1E5) # <- increase x grid
  
  curve(myweibull,col='purple',n = 1E5) # <- increase x grid
  
  curve(mygamma,col='Gold',n = 1E5) # <- increase x grid
  
  
  curve(mygev,col='red',n = 1E5) # <- increase x grid
  
  
  legend_loglik <- paste(c('Norm','LogNorm','Weibull','Gamma','GEV'),c(':'),round(Loglike,digits=2))
  
  legend("topright",legend=legend_loglik,col=c('blue','darkgreen','purple','gold','red'),lty=1,bty='o',bg='white',box.lty=2,box.lwd = 1,box.col='white')  
  
  return(data.frame(DLT_Code = d$DLT_Code[1],n = length(d$TLT),Best = names(Loglike[Loglike_Best]),lnorm = Loglike[1],norm = Loglike[2],weibul = Loglike[3],gamma = Loglike[4],GEV = Loglike[5]))
  
}



#  Creating data set
TLT <- c(rep(0,32),rep(1,120),rep(2,10),rep(3,67),rep(4,14),rep(5,7),6)
DLT_Code <- c(rep('DLT_Code',251))

data_to_plot <- data.frame(cbind(DLT_Code,TLT))
data_to_plot$TLT <- as.numeric(as.character(data_to_plot$TLT ))


DLT_Distr <- do.call(rbind,by(data = data_to_plot,INDICES = data_to_plot$DLT_Code,FUN=fdistr))

为什么您的绘图高度与解决方案输出不匹配

为了进一步说明您的绘图发生了什么以及您可能遇到的一些困惑，您需要了解 curve 函数如何绘制您的数据。默认情况下，curve 接受 101 个 x 值并评估您的函数以获得它们的 y 值，然后将这些点绘制为一条线。由于某些密度的峰值非常尖锐，因此 curve 函数没有评估足够的 x 值来绘制密度峰值。为了表明你想要我的意思是我将专注于你的伽马密度。不要像输出一样担心代码。下面我有 n 的不同值的前几个 (x,y) 坐标。

library(purrr)

mygamma <- function(x) {
  dgamma(x,# 0.6225622
         rate=fit[['gamma']]$estimate[2]) # 0.3568242
}

number_of_x <- c(5,10,101,75000)
purrr::imap_dfr(number_of_x,~ curve(mygamma,6),n = .),.id = "n") %>% 
  dplyr::mutate_at(1,~ sprintf("n = %i",number_of_x[as.numeric(.)])) %>% 
  dplyr::mutate(n = factor(n,unique(n))) %>% 
  dplyr::filter(x > 0) %>% 
  dplyr::group_by(n) %>% 
  dplyr::slice_min(order_by = x,n = 5)

 n                 x       y
   <fct>         <dbl>   <dbl>
 1 n = 5     1.5        0.184 
 2 n = 5     3          0.0828
 3 n = 5     4.5        0.0416
 4 n = 5     6          0.0219
 5 n = 10    0.667      0.336 
 6 n = 10    1.33       0.204 
 7 n = 10    2          0.138 
 8 n = 10    2.67       0.0975
 9 n = 10    3.33       0.0707
10 n = 101   0.06       1.04  
11 n = 101   0.12       0.780 
12 n = 101   0.18       0.655 
13 n = 101   0.24       0.575 
14 n = 101   0.3        0.518 
15 n = 75000 0.0000800 12.9   
16 n = 75000 0.000160   9.90  
17 n = 75000 0.000240   8.50  
18 n = 75000 0.000320   7.62  
19 n = 75000 0.000400   7.01

请注意，当 n = 5 时，您绘制的值很少。随着 n 的增加，x 值之间的距离变小。由于这些函数是连续的，因此要绘制无限数量的点，但这无法通过计算完成，因此绘制了 x 值的子集以进行近似。 x 值越多，近似效果越好。通常，默认的 n = 101 工作正常，但由于伽玛和对数正态密度具有如此尖锐的峰值，绘图函数会越过最大值。下面是添加了点的 n = 5,75000 数据的完整图。

最后我使用了这个解决方案，找到了here：

mygamma <- function(x) dgamma(x,rate=fit[['gamma']]$estimate[2]) 
get_curve_values <- function(fn,x_data){
res <- curve(fn,from=0,to=x_data)
dev.off()
res
}
curve_val <- get_curve_values(mygamma,x_data)
ylim <- max(curve_val$y,na.rm = TRUE)

curve distribution distribution r r