分布拟合中的 PDF 行Python

问题描述

我正在尝试使用 python 中的分布拟合来拟合数据。

问题： 直方图上的概率密度函数 (PDF) 线不完整，如图所示。有没有办法获取所有数据的 PDF 行？我不确定我是否在设置正确的轴或比例时犯了错误。例如 'density=True' 参数正确还是 array bin_centers 为我的 x 轴？

我尝试使用现有的 answer 解决此问题，但无法解决问题。

测试数据：可用here

我正在使用的脚本：

import pandas as pd
import numpy as np
import scipy
import scipy.stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
from numpy import percentile

#%% Test dataset
y=pd.read_csv('Clean.csv',squeeze=True,na_filter=True,header=None,index_col=None) 

x = np.arange(len(y))
size = len(y)

df=pd.DataFrame(data=y)

fig = plt.figure(figsize=(9,3))
plt.hist(y)
plt.show()

# Create an index array (x) for data
#%%


y_std=round(y)

import warnings
warnings.filterwarnings("ignore")


dist_names = ['beta','expon','exponnorm','lognorm','pearson3'
               ]

# Set up empty lists to stroe results
chi_square = []
p_values = []
KS = []
# Set up 50 bins for chi-square test
# Observed data will be approximately evenly distrubuted aross all bins
percentile_bins = np.linspace(0,80,31)
percentile_cutoffs = np.percentile(y_std,percentile_bins)
observed_frequency,bins = (np.histogram(y_std,bins=percentile_cutoffs))
cum_observed_frequency = np.cumsum(observed_frequency)

# Loop through candidate distributions

for distribution in dist_names:
    s1 = time()
    # Set up distribution and get fitted distribution parameters
    dist = getattr(scipy.stats,distribution)
    param = dist.fit(y_std)   
    p = scipy.stats.kstest(y_std,distribution,args=param)[1]
    p = np.around(p,5)
    p_values.append(p) 
    
    
    ks = scipy.stats.kstest(y_std,args=param)
    ks = np.around(ks,5)
    KS.append((distribution,ks[0],ks[1]))
    
    
    # Get expected counts in percentile bins
    # This is based on a 'cumulative distrubution function' (cdf)
    cdf_fitted = dist.cdf(percentile_cutoffs,*param[:-2],loc=param[-2],scale=param[-1])
    expected_frequency = []
    for bin in range(len(percentile_bins)-1):
        expected_cdf_area = cdf_fitted[bin+1] - cdf_fitted[bin]
        expected_frequency.append(expected_cdf_area)
    
    # calculate chi-squared
    expected_frequency = np.array(expected_frequency) * size
    cum_expected_frequency = np.cumsum(expected_frequency)
    ss = sum (((cum_expected_frequency - cum_observed_frequency) ** 2) / cum_observed_frequency)
    chi_square.append(ss)
    print(f"chi_square {distribution} time: {time() - s1}")
        
# Collate results and sort by goodness of fit (best at top)

results = pd.DataFrame()
results['distribution'] = dist_names
results['chi_square'] = chi_square
results['p_value'] = p_values
results['KS_Test'] = KS
results.sort_values(['chi_square'],inplace=True)
    
# Report results

print ('\ndistributions sorted by goodness of fit:')
print ('----------------------------------------------------------------------------- ')
print (results)

#%%

# Divide the observed data into 100 bins for plotting (this can be changed)
number_of_bins = 20
bin_cutoffs = np.linspace(np.percentile(y,0),np.percentile(y,99),number_of_bins)

# Create the plot
fig = plt.figure(figsize=(7,5))
h = plt.hist(y,bins = bin_cutoffs,color='0.75')

# Get the top three distributions from the prevIoUs phase
number_distributions_to_plot = 5
dist_names = results['distribution'].iloc[0:number_distributions_to_plot]

# Create an empty list to stroe fitted distribution parameters
parameters = []

# Loop through the distributions ot get line fit and paraemters

for dist_name in dist_names:
    # Set up distribution and store distribution paraemters
    dist = getattr(scipy.stats,dist_name)
    param = dist.fit(y)
      # Separate parts of parameters
   
    parameters.append(param)
    
    # Get line for each distribution (and scale to match observed data)
    pdf_fitted = dist.pdf(x,scale=param[-1])
    scale_pdf = np.trapz (h[0],h[1][:-1]) / np.trapz (pdf_fitted,x)
    pdf_fitted *= scale_pdf
    
    # Add the line to the plot
    plt.plot(pdf_fitted,label=dist_name)
    
    # Set the plot x axis to contain 99% of the data
    # This can be removed,but sometimes outlier data makes the plot less clear
    plt.xlim(0,90))

# Add legend and display plot

plt.legend()
plt.show()

解决方法

暂无找到可以解决该程序问题的有效方法，小编努力寻找整理中！

如果你已经找到好的解决方法，欢迎将解决方案带上本链接一起发送给小编。

小编邮箱:dio#foxmail.com (将#修改为@）

distribution distribution matplotlib python scipy scipy statistics