问题描述
我正在为不同的分布函数绘制曲线,我需要知道每条曲线的最高 y 值。稍后我将只绘制一条曲线,它被选为最佳拟合。
这是函数(它有点硬编码,我正在研究它):
library(plyr)
library(dplyr)
library(fitdistrplus)
library(evd)
library(gamlss)
fdistr <- function(d) {
# Uncomment to try run line by line
# d <- data_to_plot
TLT <- d$TLT
if (sum(TLT<=0)) {TLT[TLT<=0] <- 0.001} # removing value < 0 for log clculation
gev <- fgev(TLT,std.err=FALSE)
distr <- c('norm','lnorm','weibull','gamma')
fit <- lapply(X=distr,FUN=fitdist,data=TLT)
fit[[5]] <- gev
distr[5] <- 'gev'
names(fit) <- distr
Loglike <- sapply(X=fit,FUN=logLik)
Loglike_Best <- which(Loglike == max(Loglike))
# Uncomment to try run line by line
# max <- which.max(density(d$TLT)$y)
# max_density <- stats::density(d$TLT)$y[max]
# max_y <- max_density
x_data <- max(d$TLT)
hist(TLT,prob=TRUE,breaks= x_data,main=paste(d$DLT_Code[1],'- best :',names(Loglike[Loglike_Best])),sub = 'Total Lead Times',col='lightgrey',border='white'
# ylim= c(0,max_y)
)
lines(density(TLT),col='darkgrey',lty=2,lwd=2)
grid(nx = NA,ny = NULL,col = "gray",lty = "dotted",lwd = .5,equilogs = TRUE)
curve(dnorm(x,mean=fit[['norm']]$estimate[1],sd=fit[['norm']]$estimate[2]),add=TRUE,col='blue',lwd=2)
curve(dlnorm(x,meanlog=fit[['lnorm']]$estimate[1],sdlog=fit[['lnorm']]$estimate[2]),col='darkgreen',lwd=2)
curve(dweibull(x,shape=fit[['weibull']]$estimate[1],scale=fit[['weibull']]$estimate[2]),col='purple',lwd=2)
curve(dgamma(x,shape=fit[['gamma']]$estimate[1],rate=fit[['gamma']]$estimate[2]),col='Gold',lwd=2)
curve(dgev(x,loc=fit[['gev']]$estimate[1],scale=fit[['gev']]$estimate[2],shape=fit[['gev']]$estimate[3]),col='red',lwd=2)
legend_loglik <- paste(c('norm','Lognorm','Weibull','Gamma','GEV'),c(':'),round(Loglike,digits=2))
legend("topright",legend=legend_loglik,col=c('blue','darkgreen','purple','gold','red'),lty=1,lwd=2,bty='o',bg='white',Box.lty=2,Box.lwd = 1,Box.col='white')
return(data.frame(DLT_Code = d$DLT_Code[1],n = length(d$TLT),Best = names(Loglike[Loglike_Best]),lnorm = Loglike[1],norm = Loglike[2],weibul = Loglike[3],gamma = Loglike[4],GEV = Loglike[5]))
}
# Creating data set
TLT <- c(rep(0,32),rep(1,120),rep(2,10),rep(3,67),rep(4,14),rep(5,7),6)
DLT_Code <- c(rep('DLT_Code',251))
data_to_plot <- data.frame(cbind(DLT_Code,TLT))
data_to_plot$TLT <- as.numeric(as.character(data_to_plot$TLT ))
DLT_distr <- do.call(rbind,by(data = data_to_plot,INDICES = data_to_plot$DLT_Code,FUN=fdistr))
我尝试使用 max_y
,然后在 ylim
中使用它。我只能为正常密度做,但不能为其余曲线做。
目前的情节是这样的(一些曲线被剪掉了):
如果设置 ylim = c(0,2)
,我们可以看到,对数正态分布和伽马分布超过 1:
我需要知道每条曲线的最大值,因此,当我选择要打印的曲线时,要设置正确的 ylim
。
解决方法
您可以使用 purrr::map_dbl
将函数 optimize
映射到您的密度上,如果您稍微重新排列您的代码并且您知道要找到它们的最大值/密度存在的输入值。
您可以使用任何参数提前设置密度,这样您就可以使用 optimize
找到它们的峰值并将它们传递给 curve
函数。
作为一个可重现的小例子:
library(purrr)
# parameterize your densities
mynorm <- function(x) dnorm(x,mean = 0,sd = 1)
mygamma <- function(x) dgamma(x,rate = .5,shape = 1)
# get largest maximum over interval
ymax <- max(purrr::map_dbl(c(mynorm,mygamma),~ optimize(.,interval = c(0,3),maximum = T)$objective))
# 0.4999811
# plot data
curve(mynorm,col = "blue",lwd = 2,xlim = c(0,ylim = c(0,ymax * 1.1))
curve(mygamma,col = "red",add = T)
使用您的代码我已经实现了上述解决方案并调整了 x
函数的 curve
网格,以便在我们在评论中讨论后向您展示我的意思,使事情更清楚并向您展示您实际应该绘制的内容:
library(plyr)
library(dplyr)
library(fitdistrplus)
library(evd)
library(gamlss)
library(purrr) # <- add this library
fdistr <- function(d) {
# Uncomment to try run line by line
# d <- data_to_plot
TLT <- d$TLT
if (sum(TLT<=0)) {TLT[TLT<=0] <- 0.001} # removing value < 0 for log clculation
gev <- fgev(TLT,std.err=FALSE)
distr <- c('norm','lnorm','weibull','gamma')
fit <- lapply(X=distr,FUN=fitdist,data=TLT)
fit[[5]] <- gev
distr[5] <- 'gev'
names(fit) <- distr
Loglike <- sapply(X=fit,FUN=logLik)
Loglike_Best <- which(Loglike == max(Loglike))
# Uncomment to try run line by line
# max <- which.max(density(d$TLT)$y)
# max_density <- stats::density(d$TLT)$y[max]
# max_y <- max_density
x_data <- max(d$TLT)
# parameterize your densities before plotting
mynorm <- function(x) {
dnorm(x,mean=fit[['norm']]$estimate[1],sd=fit[['norm']]$estimate[2])
}
mylnorm <- function(x){
dlnorm(x,meanlog=fit[['lnorm']]$estimate[1],sdlog=fit[['lnorm']]$estimate[2])
}
myweibull <- function(x) {
dweibull(x,shape=fit[['weibull']]$estimate[1],scale=fit[['weibull']]$estimate[2])
}
mygamma <- function(x) {
dgamma(x,shape=fit[['gamma']]$estimate[1],rate=fit[['gamma']]$estimate[2])
}
mygev <- function(x){
dgev(x,loc=fit[['gev']]$estimate[1],scale=fit[['gev']]$estimate[2],shape=fit[['gev']]$estimate[3])
}
distributions <- c(mynorm,mylnorm,myweibull,mygamma,mygev)
# get the max of each density
y <- purrr::map_dbl(distributions,x_data),maximum = T)$objective)
# find the max (excluding infinity)
ymax <- max(y[abs(y) < Inf])
hist(TLT,prob=TRUE,breaks= x_data,main=paste(d$DLT_Code[1],'- best :',names(Loglike[Loglike_Best])),sub = 'Total Lead Times',col='lightgrey',border='white',ylim= c(0,ymax)
)
lines(density(TLT),col='darkgrey',lty=2,lwd=2)
grid(nx = NA,ny = NULL,col = "gray",lty = "dotted",lwd = .5,equilogs = TRUE)
curve(mynorm,add=TRUE,col='blue',lwd=2,n = 1E5) # <- increase x grid
curve(mylnorm,col='darkgreen',n = 1E5) # <- increase x grid
curve(myweibull,col='purple',n = 1E5) # <- increase x grid
curve(mygamma,col='Gold',n = 1E5) # <- increase x grid
curve(mygev,col='red',n = 1E5) # <- increase x grid
legend_loglik <- paste(c('Norm','LogNorm','Weibull','Gamma','GEV'),c(':'),round(Loglike,digits=2))
legend("topright",legend=legend_loglik,col=c('blue','darkgreen','purple','gold','red'),lty=1,bty='o',bg='white',box.lty=2,box.lwd = 1,box.col='white')
return(data.frame(DLT_Code = d$DLT_Code[1],n = length(d$TLT),Best = names(Loglike[Loglike_Best]),lnorm = Loglike[1],norm = Loglike[2],weibul = Loglike[3],gamma = Loglike[4],GEV = Loglike[5]))
}
# Creating data set
TLT <- c(rep(0,32),rep(1,120),rep(2,10),rep(3,67),rep(4,14),rep(5,7),6)
DLT_Code <- c(rep('DLT_Code',251))
data_to_plot <- data.frame(cbind(DLT_Code,TLT))
data_to_plot$TLT <- as.numeric(as.character(data_to_plot$TLT ))
DLT_Distr <- do.call(rbind,by(data = data_to_plot,INDICES = data_to_plot$DLT_Code,FUN=fdistr))
为什么您的绘图高度与解决方案输出不匹配
为了进一步说明您的绘图发生了什么以及您可能遇到的一些困惑,您需要了解 curve
函数如何绘制您的数据。默认情况下,curve
接受 101 个 x 值并评估您的函数以获得它们的 y 值,然后将这些点绘制为一条线。由于某些密度的峰值非常尖锐,因此 curve
函数没有评估足够的 x 值来绘制密度峰值。为了表明你想要我的意思是我将专注于你的伽马密度。不要像输出一样担心代码。下面我有 n
的不同值的前几个 (x,y) 坐标。
library(purrr)
mygamma <- function(x) {
dgamma(x,# 0.6225622
rate=fit[['gamma']]$estimate[2]) # 0.3568242
}
number_of_x <- c(5,10,101,75000)
purrr::imap_dfr(number_of_x,~ curve(mygamma,6),n = .),.id = "n") %>%
dplyr::mutate_at(1,~ sprintf("n = %i",number_of_x[as.numeric(.)])) %>%
dplyr::mutate(n = factor(n,unique(n))) %>%
dplyr::filter(x > 0) %>%
dplyr::group_by(n) %>%
dplyr::slice_min(order_by = x,n = 5)
n x y
<fct> <dbl> <dbl>
1 n = 5 1.5 0.184
2 n = 5 3 0.0828
3 n = 5 4.5 0.0416
4 n = 5 6 0.0219
5 n = 10 0.667 0.336
6 n = 10 1.33 0.204
7 n = 10 2 0.138
8 n = 10 2.67 0.0975
9 n = 10 3.33 0.0707
10 n = 101 0.06 1.04
11 n = 101 0.12 0.780
12 n = 101 0.18 0.655
13 n = 101 0.24 0.575
14 n = 101 0.3 0.518
15 n = 75000 0.0000800 12.9
16 n = 75000 0.000160 9.90
17 n = 75000 0.000240 8.50
18 n = 75000 0.000320 7.62
19 n = 75000 0.000400 7.01
请注意,当 n = 5
时,您绘制的值很少。随着 n
的增加,x 值之间的距离变小。由于这些函数是连续的,因此要绘制无限数量的点,但这无法通过计算完成,因此绘制了 x 值的子集以进行近似。 x 值越多,近似效果越好。通常,默认的 n = 101
工作正常,但由于伽玛和对数正态密度具有如此尖锐的峰值,绘图函数会越过最大值。下面是添加了点的 n = 5,75000
数据的完整图。
最后我使用了这个解决方案,找到了here:
mygamma <- function(x) dgamma(x,rate=fit[['gamma']]$estimate[2])
get_curve_values <- function(fn,x_data){
res <- curve(fn,from=0,to=x_data)
dev.off()
res
}
curve_val <- get_curve_values(mygamma,x_data)
ylim <- max(curve_val$y,na.rm = TRUE)