分布拟合中的 PDF 行Python

问题描述

我正在尝试使用 python 中的分布拟合来拟合数据。

问题: 直方图上的概率密度函数 (PDF) 线不完整,如图所示。有没有办法获取所有数据的 PDF 行?我不确定我是否在设置正确的轴或比例时犯了错误。例如 'density=True' 参数正确还是 array bin_centers 为我的 x 轴?

我尝试使用现有的 answer 解决此问题,但无法解决问题。

测试数据:可用here

enter image description here

我正在使用的脚本:

import pandas as pd
import numpy as np
import scipy
import scipy.stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
from numpy import percentile

#%% Test dataset
y=pd.read_csv('Clean.csv',squeeze=True,na_filter=True,header=None,index_col=None) 

x = np.arange(len(y))
size = len(y)

df=pd.DataFrame(data=y)

fig = plt.figure(figsize=(9,3))
plt.hist(y)
plt.show()

# Create an index array (x) for data
#%%


y_std=round(y)

import warnings
warnings.filterwarnings("ignore")


dist_names = ['beta','expon','exponnorm','lognorm','pearson3'
               ]

# Set up empty lists to stroe results
chi_square = []
p_values = []
KS = []
# Set up 50 bins for chi-square test
# Observed data will be approximately evenly distrubuted aross all bins
percentile_bins = np.linspace(0,80,31)
percentile_cutoffs = np.percentile(y_std,percentile_bins)
observed_frequency,bins = (np.histogram(y_std,bins=percentile_cutoffs))
cum_observed_frequency = np.cumsum(observed_frequency)

# Loop through candidate distributions

for distribution in dist_names:
    s1 = time()
    # Set up distribution and get fitted distribution parameters
    dist = getattr(scipy.stats,distribution)
    param = dist.fit(y_std)   
    p = scipy.stats.kstest(y_std,distribution,args=param)[1]
    p = np.around(p,5)
    p_values.append(p) 
    
    
    ks = scipy.stats.kstest(y_std,args=param)
    ks = np.around(ks,5)
    KS.append((distribution,ks[0],ks[1]))
    
    
    # Get expected counts in percentile bins
    # This is based on a 'cumulative distrubution function' (cdf)
    cdf_fitted = dist.cdf(percentile_cutoffs,*param[:-2],loc=param[-2],scale=param[-1])
    expected_frequency = []
    for bin in range(len(percentile_bins)-1):
        expected_cdf_area = cdf_fitted[bin+1] - cdf_fitted[bin]
        expected_frequency.append(expected_cdf_area)
    
    # calculate chi-squared
    expected_frequency = np.array(expected_frequency) * size
    cum_expected_frequency = np.cumsum(expected_frequency)
    ss = sum (((cum_expected_frequency - cum_observed_frequency) ** 2) / cum_observed_frequency)
    chi_square.append(ss)
    print(f"chi_square {distribution} time: {time() - s1}")
        
# Collate results and sort by goodness of fit (best at top)

results = pd.DataFrame()
results['distribution'] = dist_names
results['chi_square'] = chi_square
results['p_value'] = p_values
results['KS_Test'] = KS
results.sort_values(['chi_square'],inplace=True)
    
# Report results

print ('\ndistributions sorted by goodness of fit:')
print ('----------------------------------------------------------------------------- ')
print (results)

#%%

# Divide the observed data into 100 bins for plotting (this can be changed)
number_of_bins = 20
bin_cutoffs = np.linspace(np.percentile(y,0),np.percentile(y,99),number_of_bins)

# Create the plot
fig = plt.figure(figsize=(7,5))
h = plt.hist(y,bins = bin_cutoffs,color='0.75')

# Get the top three distributions from the prevIoUs phase
number_distributions_to_plot = 5
dist_names = results['distribution'].iloc[0:number_distributions_to_plot]

# Create an empty list to stroe fitted distribution parameters
parameters = []

# Loop through the distributions ot get line fit and paraemters

for dist_name in dist_names:
    # Set up distribution and store distribution paraemters
    dist = getattr(scipy.stats,dist_name)
    param = dist.fit(y)
      # Separate parts of parameters
   
    parameters.append(param)
    
    # Get line for each distribution (and scale to match observed data)
    pdf_fitted = dist.pdf(x,scale=param[-1])
    scale_pdf = np.trapz (h[0],h[1][:-1]) / np.trapz (pdf_fitted,x)
    pdf_fitted *= scale_pdf
    
    # Add the line to the plot
    plt.plot(pdf_fitted,label=dist_name)
    
    # Set the plot x axis to contain 99% of the data
    # This can be removed,but sometimes outlier data makes the plot less clear
    plt.xlim(0,90))

# Add legend and display plot

plt.legend()
plt.show()

解决方法

暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!

如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。

小编邮箱:dio#foxmail.com (将#修改为@)