问题描述
我正在尝试使用 python 中的分布拟合来拟合数据。
问题: 直方图上的概率密度函数 (PDF) 线不完整,如图所示。有没有办法获取所有数据的 PDF 行?我不确定我是否在设置正确的轴或比例时犯了错误。例如 'density=True'
参数正确还是 array bin_centers
为我的 x 轴?
我尝试使用现有的 answer 解决此问题,但无法解决问题。
测试数据:可用here
我正在使用的脚本:
import pandas as pd
import numpy as np
import scipy
import scipy.stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
from numpy import percentile
#%% Test dataset
y=pd.read_csv('Clean.csv',squeeze=True,na_filter=True,header=None,index_col=None)
x = np.arange(len(y))
size = len(y)
df=pd.DataFrame(data=y)
fig = plt.figure(figsize=(9,3))
plt.hist(y)
plt.show()
# Create an index array (x) for data
#%%
y_std=round(y)
import warnings
warnings.filterwarnings("ignore")
dist_names = ['beta','expon','exponnorm','lognorm','pearson3'
]
# Set up empty lists to stroe results
chi_square = []
p_values = []
KS = []
# Set up 50 bins for chi-square test
# Observed data will be approximately evenly distrubuted aross all bins
percentile_bins = np.linspace(0,80,31)
percentile_cutoffs = np.percentile(y_std,percentile_bins)
observed_frequency,bins = (np.histogram(y_std,bins=percentile_cutoffs))
cum_observed_frequency = np.cumsum(observed_frequency)
# Loop through candidate distributions
for distribution in dist_names:
s1 = time()
# Set up distribution and get fitted distribution parameters
dist = getattr(scipy.stats,distribution)
param = dist.fit(y_std)
p = scipy.stats.kstest(y_std,distribution,args=param)[1]
p = np.around(p,5)
p_values.append(p)
ks = scipy.stats.kstest(y_std,args=param)
ks = np.around(ks,5)
KS.append((distribution,ks[0],ks[1]))
# Get expected counts in percentile bins
# This is based on a 'cumulative distrubution function' (cdf)
cdf_fitted = dist.cdf(percentile_cutoffs,*param[:-2],loc=param[-2],scale=param[-1])
expected_frequency = []
for bin in range(len(percentile_bins)-1):
expected_cdf_area = cdf_fitted[bin+1] - cdf_fitted[bin]
expected_frequency.append(expected_cdf_area)
# calculate chi-squared
expected_frequency = np.array(expected_frequency) * size
cum_expected_frequency = np.cumsum(expected_frequency)
ss = sum (((cum_expected_frequency - cum_observed_frequency) ** 2) / cum_observed_frequency)
chi_square.append(ss)
print(f"chi_square {distribution} time: {time() - s1}")
# Collate results and sort by goodness of fit (best at top)
results = pd.DataFrame()
results['distribution'] = dist_names
results['chi_square'] = chi_square
results['p_value'] = p_values
results['KS_Test'] = KS
results.sort_values(['chi_square'],inplace=True)
# Report results
print ('\ndistributions sorted by goodness of fit:')
print ('----------------------------------------------------------------------------- ')
print (results)
#%%
# Divide the observed data into 100 bins for plotting (this can be changed)
number_of_bins = 20
bin_cutoffs = np.linspace(np.percentile(y,0),np.percentile(y,99),number_of_bins)
# Create the plot
fig = plt.figure(figsize=(7,5))
h = plt.hist(y,bins = bin_cutoffs,color='0.75')
# Get the top three distributions from the prevIoUs phase
number_distributions_to_plot = 5
dist_names = results['distribution'].iloc[0:number_distributions_to_plot]
# Create an empty list to stroe fitted distribution parameters
parameters = []
# Loop through the distributions ot get line fit and paraemters
for dist_name in dist_names:
# Set up distribution and store distribution paraemters
dist = getattr(scipy.stats,dist_name)
param = dist.fit(y)
# Separate parts of parameters
parameters.append(param)
# Get line for each distribution (and scale to match observed data)
pdf_fitted = dist.pdf(x,scale=param[-1])
scale_pdf = np.trapz (h[0],h[1][:-1]) / np.trapz (pdf_fitted,x)
pdf_fitted *= scale_pdf
# Add the line to the plot
plt.plot(pdf_fitted,label=dist_name)
# Set the plot x axis to contain 99% of the data
# This can be removed,but sometimes outlier data makes the plot less clear
plt.xlim(0,90))
# Add legend and display plot
plt.legend()
plt.show()
解决方法
暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)