Python scipy.stats.stats module, pearsonr() example source code
We extracted the following 25 code examples from open-source Python projects to illustrate how to use scipy.stats.stats.pearsonr().
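Before the project snippets, here is a minimal, self-contained sketch of a typical pearsonr() call; the arrays below are made-up illustrative values, not data from any of the projects that follow. The function returns a (correlation coefficient, two-tailed p-value) tuple.
from scipy.stats.stats import pearsonr
import numpy as np

# Two small illustrative samples (hypothetical values).
x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y = np.array([1.1, 1.9, 3.2, 3.8, 5.3])

# pearsonr returns the Pearson correlation coefficient and the two-tailed p-value.
r, p_value = pearsonr(x, y)
print('r = {:.3f}, p = {:.3f}'.format(r, p_value))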
def determineCoefficientDifference(self, coefficients):
    targetList = []
    comparisionList = []
    for key in self.wantedCoefficients:
        targetList.append(self.wantedCoefficients[key])
        if key in coefficients:
            comparisionList.append(coefficients[key])
        else:
            comparisionList.append(0)
    for key in coefficients:
        if key in self.wantedCoefficients:
            continue
        else:
            targetList.append(0)
            comparisionList.append(coefficients[key])
    return pearsonr(targetList, comparisionList)
def plot_scatter_charts(data, file_name):
    scatters = []
    for lang, values in data.items():
        s = figure(plot_width=300, plot_height=300, title=lang)
        s.yaxis.formatter = NumeralTickFormatter(format="0.0a")
        s.circle(values[0], values[1], size=10, color="navy", alpha=0.5)
        x = np.linspace(1, 100, 10)
        # noinspection PyTupleAssignmentBalance
        m, b = np.polyfit(values[0], values[1], 1)
        y = m * x + b
        corr_coef = round(pearsonr(values[0], values[1])[0], 1)
        s.line(x, y, legend=f'PCC = {corr_coef}')
        scatters.append(s)
    split_scatters = split(scatters, 3)
    p = gridplot(split_scatters)
    output_file(file_name)
    show(p)
def correlation(self, x, y, show=True):
    '''
    Computes Pearson's correlation value of variables x and y.
    Diagonal values are removed.
    :param x: numpy array independent variable
    :param y: numpy array dependent variable
    :param show: if True then shows Pearson's correlation and p-value.
    :return:
    '''
    if not self.diagonal:
        xflatten = np.delete(x, [i*(x.shape[0]+1) for i in range(x.shape[0])])
        yflatten = np.delete(y, [i*(y.shape[0]+1) for i in range(y.shape[0])])
        pc = pearsonr(xflatten, yflatten)
    else:
        pc = pearsonr(x.flatten(), y.flatten())
    if show:
        utils.printf('Pearson Correlation: {}'.format(pc[0]))
        utils.printf('p-value: {}'.format(pc[1]))
    return pc
#####################################################################################
# Handlers
#####################################################################################
def get_best_two_params(self):
    param_names = self.jobs.get_param_names()
    if len(param_names) == 2:
        return param_names  # there can be only two.
    # how much does each parameter correlate with the achieved loss...
    param_losscorr = {}
    for name in self.param_names:
        corr_coef, pval = pearsonr(self.losses, self.param_values[name])
        logging.info('Correlation of {} with loss: {}'.format(name, corr_coef))
        param_losscorr[name] = abs(corr_coef)  # abs, since we don't care about the direction
    sorted_by_corr = sorted(param_losscorr.items(), key=lambda x: x[1], reverse=True)
    best_params = []
    for i in sorted_by_corr:
        if math.isnan(i[1]):
            continue
        best_params.append(i[0])
        if len(best_params) == 2:
            return best_params
    return best_params
    #return sorted_by_corr[0][0], sorted_by_corr[1][0]  # Todo: Could be made more general/robust
def correlations_ground_truth():
    print 'ground truth'
    #load network
    wikipedia = load_graph("output/weightedpagerank/wikipedianetwork_hyp_engineering.xml.gz")
    #read counts with zeros
    article_counts = pd.read_csv(TMP+'article_counts.tsv', sep='\t')
    cor = {}
    for damping in [0.8, 0.9]:
        page_rank = pagerank(wikipedia, damping=damping)
        wikipedia.vertex_properties['page_rank_'+str(damping)] = page_rank
        page_rank_values = list()
        counts = list()
        correlations_values = {}
        for index, row in article_counts.iterrows():
            counts.append(float(row['counts']))
            page_rank_values.append(page_rank[wikipedia.vertex(int(row['target_article_id']))])
        print 'pearson'
        p = pearsonr(page_rank_values, counts)
        print p
        correlations_values['pearson'] = p
        print 'spearmanr'
        s = spearmanr(page_rank_values, counts)
        print s
        correlations_values['spearmanr'] = s
        print 'kendalltau'
        k = kendalltau(page_rank_values, counts)
        print k
        correlations_values['kendalltau'] = k
        cor['page_rank_'+str(damping)] = correlations_values
    write_pickle(HOME+'output/correlations/correlations_pagerank.obj', cor)
def correlations_weighted_unweighted(labels):
    #load network
    print 'weighted vs unweighted'
    name = '_'.join(labels)
    wikipedia = load_graph("output/weightedpagerank/wikipedianetwork_hyp_engineering_"+name+".xml.gz")
    #read counts with zeros
    wikipedia_u = load_graph("output/weightedpagerank/wikipedianetwork_sem_sim_distinct_links.xml.gz")
    correlations_weighted_pagerank = {}
    for label in labels:
        for damping in [0.8, 0.85, 0.9]:
            correlations_values = {}
            key_weighted = label+"_page_rank_weighted_"+str(damping)
            pagerank_weighted = wikipedia.vertex_properties[key_weighted]
            key_unweighted = "page_rank"+str(damping)
            pagerank_unweighted = wikipedia_u.vertex_properties[key_unweighted]
            print 'pearson'
            p = pearsonr(pagerank_weighted.a, pagerank_unweighted.a)
            print p
            correlations_values['pearson'] = p
            print 'spearmanr'
            s = spearmanr(pagerank_weighted.a, pagerank_unweighted.a)
            print s
            correlations_values['spearmanr'] = s
            print 'kendalltau'
            k = kendalltau(pagerank_weighted.a, pagerank_unweighted.a)
            print k
            correlations_values['kendalltau'] = k
            correlations_weighted_pagerank[label+str(damping)] = correlations_values
    write_pickle(HOME+'output/correlations/correlations_pagerank_weightedvsunweighted'+name+'.obj', correlations_weighted_pagerank)
def cor_analysis(co_price, pcb_price):
    """
    Correlation analysis between copper price and PCB price.
    """
    cor_draw(co_price, pcb_price)
    print(pearsonr(co_price.values, pcb_price.values))
def compute_corr(self, ref_data, gen_data):
    corr_coef = pearsonr(ref_data, gen_data)
    return corr_coef[0]
def sum_corr(view1, view2, flag=''):
    print("test correlation")
    corr = 0
    for i, j in zip(view1, view2):
        corr += measures.pearsonr(i, j)[0]
    print('avg sum corr ::', flag, '::', corr/len(view1))
def cal_sim(model, ind1, ind2=1999):
    view1 = np.load("test_v1.npy")[0:ind1]
    view2 = np.load("test_v2.npy")[0:ind2]
    label1 = np.load('test_l.npy')
    x1 = project(model, [view1, np.zeros_like(view1)])
    x2 = project(model, [np.zeros_like(view2), view2])
    label2 = []
    count = 0
    MAP = 0
    for i, j in enumerate(x1):
        cor = []
        AP = 0
        for y in x2:
            temp1 = j.tolist()
            temp2 = y.tolist()
            cor.append(pearsonr(temp1, temp2))
        #if i == np.argmax(cor):
        #    count += 1
        #val = [(q, (i*ind1+p)) for p, q in enumerate(cor)]
        val = [(q, p) for p, q in enumerate(cor)]
        val.sort()
        val.reverse()
        label2.append(val[0:4])
        t = [w[1] for w in val[0:7]]
        #print t
        for x, y in enumerate(t):
            if y in range(i, i+5):
                AP += 1/(x+1)
        print(t)
        print(AP)
        MAP += AP
    #print 'accuracy :- ', float(count)*100/ind1, '%'
    print('MAP is : ', MAP/ind1)
def mospat_manip_calcstats(c_Variable, c_Model, f_ObsData, f_ModelData):
    # ELIMINATING ELEMENTS WITH NAN
    idx_ModData = np.where(~np.isnan(f_ModelData))
    idx_ObsData = np.where(~np.isnan(f_ObsData))
    f_ObsData_aux = f_ObsData[idx_ModData]
    f_ModelData_aux = f_ModelData[idx_ModData]
    # Model Mean
    f_ModMean = np.nanmean(f_ModelData_aux)
    # Obs Mean
    f_ObsMean = np.nanmean(f_ObsData_aux)
    # Mean Bias
    f_MeanBias = f_ModMean - f_ObsMean
    # Mean Normalized Bias
    f_mnb = (f_ModMean - f_ObsMean) / f_ObsMean
    # Root Mean Square Error
    f_rms = np.sqrt(((f_ModelData_aux - f_ObsData_aux)**2).mean())
    # Pearson Correlation Coefficient
    f_corr = pearsonr(f_ObsData_aux, f_ModelData_aux)[0]
    # Standard Deviation of Observations
    f_Stdobs = np.std(f_ObsData_aux)
    # Standard Deviation of Model Data
    f_StdMod = np.std(f_ModelData_aux)
    # Ratio of Standard Deviations
    f_Stdratio = f_StdMod / f_Stdobs
    f_Statistics = [f_ObsMean, f_ModMean, f_MeanBias, f_mnb, f_rms, f_corr, f_Stdobs, f_StdMod, f_Stdratio]
    return f_Statistics
def evaluate(DATA_SET):
    PREDS = [predict(u, m) for (u, m, r) in DATA_SET]
    REALS = [r for (u, m, r) in DATA_SET]
    mae = sum(abs(REALS[i] - PREDS[i]) for i in range(len(PREDS))) / len(PREDS)
    print 'MAE = ', round(mae, 3)
    r, p = pearsonr(PREDS, REALS)
    print 'cor = ', round(r, 3)
def adbPredictor(df):
    dataTrainX, dataTrainY, dataTestX, dataTestY = sample(df)
    # clf = linear_model.SGDRegressor()
    clf = ensemble.AdaBoostRegressor()
    clf.fit(dataTrainX, dataTrainY)
    predicted = clf.predict(dataTestX)
    fig, ax = plotter.subplots()
    ax.set_ylabel('Predicted KNN Weekly')
    ax.scatter(dataTestY, predicted)
    ax.set_xlabel('Measured')
    predicted = np.reshape(predicted, (predicted.size, 1))
    corrCoeff = pearsonr(dataTestY, predicted)
    print(corrCoeff[0])
    plotter.show()
    return predicted
def knnPredictor(df):
    dataTrainX, dataTrainY, dataTestX, dataTestY = sample(df)
    corelationCoefficiantDictionary = {}
    corelationCoefficiantArray = []
    for k in range(1, 200, 1):
        knnModel = KNeighborsRegressor(n_neighbors=k)
        knnModel.fit(dataTrainX, dataTrainY)
        knnpredicted = knnModel.predict(dataTestX)
        corelationCoefficient = pearsonr(dataTestY, knnpredicted)
        corelationCoefficiantDictionary[k] = corelationCoefficient[0]
        corelationCoefficiantArray.append(corelationCoefficient[0])
    # plotter.plot(corelationCoefficiantArray)
    bestK = max(corelationCoefficiantDictionary, key=corelationCoefficiantDictionary.get)
    knnModelBest = KNeighborsRegressor(n_neighbors=bestK)
    knnModelBest.fit(dataTrainX, dataTrainY)
    print("K = ")
    print(bestK)
    print("Corelation Coeff:")
    print(corelationCoefficiantDictionary[bestK])
    knnpredictedBest = knnModelBest.predict(dataTestX)
    fig, ax = plotter.subplots()
    corelationCoefficient = pearsonr(dataTestY, knnpredictedBest)
    print(corelationCoefficient[0])
    ax.set_ylabel('Predicted KNN Weekly')
    ax.scatter(dataTestY, knnpredictedBest)
    ax.set_xlabel('Measured')
    plotter.show()
def randomForestPredictor(df):
    # bbValTest, bbValTrain, ptChangeTest, ptChangeTrain = sample(df)
    dataTrainX, dataTrainY, dataTestX, dataTestY = sample(df)
    corelationCoefficiantDictionary = {}
    corelationCoefficiantArray = []
    for k in range(1, 200, 1):
        rfsModel = RandomForestRegressor(n_estimators=k)
        rfsModel.fit(dataTrainX, dataTrainY)
        rfspredicted = rfsModel.predict(dataTestX)
        rfspredicted = np.reshape(rfspredicted, (rfspredicted.size, 1))
        corelationCoefficient = pearsonr(dataTestY, rfspredicted)
        corelationCoefficiantDictionary[k] = corelationCoefficient[0]
        corelationCoefficiantArray.append(corelationCoefficient[0])
    plotter.plot(corelationCoefficiantArray)
    # plotter.show()
    bestK = max(corelationCoefficiantDictionary, key=corelationCoefficiantDictionary.get)
    rfsModelBest = RandomForestRegressor(n_estimators=bestK)
    rfsModelBest.fit(dataTrainX, dataTrainY)
    print("K = ")
    print(bestK)
    print("Correlation Coefficient =")
    print(corelationCoefficiantDictionary[bestK])
    rfsPredictedBest = rfsModelBest.predict(dataTestX)
    fig, ax = plotter.subplots()
    ax.set_ylabel('Predicted RandomForest Weekly')
    ax.scatter(dataTestY, rfsPredictedBest)
    ax.set_xlabel('Measured')
    plotter.show()
def predictKnn(data, pricetoPredict):
    corelationCoefficiantDictionary = {}
    corelationCoefficiantArray = []
    openingPriceTrain, openingPriceTest, closingPriceTrain, closingPriceTest = \
        data["openingPriceTrain"], data["openingPriceTest"], data["closingPriceTrain"], data["closingPriceTest"]
    for k in range(1, 100, 1):
        neigh = KNeighborsRegressor(n_neighbors=k)
        #n = 7 best fits
        neigh.fit(openingPriceTrain, closingPriceTrain)
        closingPriceTestArray = np.reshape(closingPriceTest, -1)
        knnpr = neigh.predict(openingPriceTest)
        predictedArray = np.reshape(knnpr, -1)
        corelationCoefficient = pearsonr(closingPriceTestArray, predictedArray)
        corelationCoefficiantDictionary[k] = corelationCoefficient[0]
        corelationCoefficiantArray.append(corelationCoefficient[0])
    plotter.plot(corelationCoefficiantArray)
    # plotter.show()
    bestK = max(corelationCoefficiantDictionary, key=corelationCoefficiantDictionary.get)
    neighBest = KNeighborsRegressor(n_neighbors=bestK)
    neighBest.fit(openingPriceTrain, closingPriceTrain)
    openingPricetoPredict = np.array([pricetoPredict])
    print("K = ")
    print(bestK)
    print(neighBest.predict(openingPricetoPredict))
def predict(data, pricetoPredict):
    openingPriceTrain, openingPriceTest, closingPriceTrain, closingPriceTest = \
        data["openingPriceTrain"], data["openingPriceTest"], data["closingPriceTrain"], data["closingPriceTest"]
    clf = svm.LinearSVR()
    clf.fit(openingPriceTrain, closingPriceTrain)
    predicted2 = clf.predict(openingPriceTest)
    score = clf.fit(openingPriceTrain, closingPriceTrain).score(openingPriceTest, closingPriceTest)
    # print(score)
    fig, ax = plotter.subplots()
    ax.scatter(openingPriceTrain, closingPriceTrain)
    ax.set_ylabel('Predicted SVM')
    ax.scatter(closingPriceTest, clf.predict(openingPriceTest))
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    # plotter.show()
    closingPriceTestArray = np.reshape(closingPriceTest, -1)
    clfpr = clf.predict(openingPriceTest)
    predictedArray = np.reshape(clfpr, -1)
    print(pearsonr(closingPriceTestArray, predictedArray))
    openingPricetoPredict = np.array([pricetoPredict])
    print(clf.predict(openingPricetoPredict))
    return clf.predict(np.array([openingPricetoPredict]))
def Compare_results(test_data, Desctree):
    predict = []
    predict = PrintClass(test_data, Desctree)
    test = []
    # i = 0
    # TruePositive = 0
    # TrueNegative = 0
    # FalsePositive = 0
    # FalseNegative = 0
    #print("check")
    for testrec in test_data:
        test.append(float(testrec[0]))
    #print("test", test)
    R2 = pearsonr(predict, test)
    # if testrec[0] == predict[i]:
    #     if predict[i] == 1:
    #         TruePositive += 1
    #     else:
    #         TrueNegative += 1
    # else:
    #     if predict[i] == 1:
    #         FalsePositive += 1
    #     else:
    #         FalseNegative += 1
    # # Falsecount += 1
    # i = i + 1
    # Accuracy = float((TruePositive + TrueNegative) / (TruePositive + FalsePositive + TrueNegative + FalseNegative))
    return(R2)
def calc_correlations(data_file, genre_column, network_metric_columns, output_path=None):
    dataframe = load_master_file(data_file)
    target_df = dataframe[genre_column]
    correlations = {}
    index = 0
    for column in network_metric_columns:
        try:
            trimmed_df = dataframe.filter(items=[genre_column, column])
            trimmed_df = trimmed_df[np.isfinite(trimmed_df[genre_column])]
            trimmed_df = trimmed_df[np.isfinite(trimmed_df[column])]
            trimmed_df.apply(lambda x: pd.to_numeric(x, errors='ignore'))
            correlations[genre_column + '_|_' + column] = pearsonr(trimmed_df[genre_column], trimmed_df[column])
        except:
            print "Error calculating correlation"
        index += 1
    # Option: Print Correlations to CSV
    if output_path:
        with open(output_path, 'wb') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(["Variable", "pearson", "p-value"])
            for key, value in correlations.items():
                writer.writerow([key, value[0], value[1]])
    return correlations
def solution6():
    lookup = getTweets()
    for i in tags:
        for j in tags:
            print(pearsonr(lookup[i], lookup[j]))
def correlations(network_name):
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()
    # wikipedia graph structural statistics
    results = None
    try:
        results = cursor.execute('select c.curr_id,sum(c.counts) as counts from clickstream_derived c where c.link_type_derived= %s group by c.curr_id;', ("internal-link",))
        results = cursor.fetchall()
    except MySQLdb.Error, e:
        print ('error retrieving xy coord for all links links %s (%d)' % (e.args[1], e.args[0]))
    print 'after sql load'
    print 'before load'
    wikipedia = load_graph("output/weightedpagerank/wikipedianetwork_"+network_name+".xml.gz")
    print 'after load'
    cor = {}
    #for kk in ['page_rank','page_rank_weighted']:
    for kk in ['page_rank_weighted']:
        correlations_sem_sim_weighted_pagerank = {}
        #for damping in [0.8,0.85,0.9,0.95]:
        for damping in [0.85]:
            correlations = {}
            print damping
            key = kk+str(damping)
            print key
            pagerank = wikipedia.vertex_properties[key]
            counts = []
            page_rank_values = []
            for row in results:
                counts.append(float(row[1]))
                page_rank_values.append(pagerank[wikipedia.vertex(int(row[0]))])
            #for index, row in df.iterrows():
            #    counts.append(float(row['counts']))
            #    page_rank_values.append(pagerank[wikipedia.vertex(int(row['target_article_id']))])
            print 'pearson'
            p = pearsonr(page_rank_values, counts)
            print p
            correlations['pearson'] = p
            print 'spearmanr'
            s = spearmanr(page_rank_values, counts)
            print s
            correlations['spearmanr'] = s
            print 'kendalltau'
            k = kendalltau(page_rank_values, counts)
            print k
            correlations['kendalltau'] = k
            correlations_sem_sim_weighted_pagerank[key] = correlations
        cor[kk] = correlations_sem_sim_weighted_pagerank
    write_pickle(HOME+'output/correlations/correlations_pagerank_without_zeros'+network_name+'.obj', cor)
def find_feature_transformation(feature_name, feature_value, scores):
    """
    Identify the best transformation based on the
    highest absolute Pearson correlation with human score.

    Parameters
    ----------
    feature_name : str
        Name of feature for which to find the transformation.
    feature_value : pandas Series
        Series containing feature values.
    scores : pandas Series
        Numeric human scores.

    Returns
    -------
    best_transformation : str
        The name of the transformation which gives the highest correlation
        between the feature values and the human scores. See
        :ref:`documentation <select_transformations_rsmtool>` for the
        full list of transformations.
    """
    # Do not use sqrt and ln for potential negative features.
    # Do not use inv for positive features.
    if any(feature_value < 0):
        applicable_transformations = ['org', 'inv']
    else:
        applicable_transformations = ['org', 'sqrt', 'addOneInv', 'addOneLn']
    correlations = []
    for trans in applicable_transformations:
        try:
            transformed_value = transform_feature(feature_name, feature_value, trans)
            correlations.append(abs(pearsonr(transformed_value, scores)[0]))
        except ValueError:
            # If the transformation returns an error, append 0.
            correlations.append(0)
    best = np.argmax(correlations)
    best_transformation = applicable_transformations[best]
    return best_transformation
def calc_median_angle_params(subject):
    """
    Calculates median angle parameters of a subject.

    Parameters
    ----------
    subject : string
        Path of a subject's nifti file.

    Returns
    -------
    mean_bold : float
        Mean bold amplitude of a subject.
    median_angle : float
        Median angle of a subject.
    """
    import numpy as np
    import nibabel as nb
    data = nb.load(subject).get_data().astype('float64')
    mask = (data != 0).sum(-1) != 0
    print 'Loaded ' + subject
    print 'Volume size', data.shape
    Y = data[mask].T
    print 'Data shape', Y.shape
    Yc = Y - np.tile(Y.mean(0), (Y.shape[0], 1))
    Yn = Yc/np.tile(np.sqrt((Yc*Yc).sum(0)), (Yc.shape[0], 1))
    U, S, Vh = np.linalg.svd(Yn, full_matrices=False)
    glb = (Yn/np.tile(Yn.std(0), (Yn.shape[0], 1))).mean(1)
    from scipy.stats.stats import pearsonr
    corr = pearsonr(U[:, 0], glb)
    print "PC1_glb r:", corr
    PC1 = U[:, 0] if corr[0] >= 0 else -U[:, 0]
    median_angle = np.median(np.arccos(np.dot(PC1.T, Yn)))
    median_angle *= 180.0/np.pi
    Yp = Yc
    #/np.tile(Y.mean(0), (Y.shape[0], 1))
    mean_bold = Yp.std(0).mean()
    print 'Median Angle', median_angle
    print 'Mean Bold', mean_bold
    return mean_bold, median_angle