# Python scipy.stats.stats module: spearmanr() example source code
# The following 8 code examples, extracted from open-source Python projects, illustrate how to use scipy.stats.stats.spearmanr().
def evaluate(representation, data):
    """Return the Spearman rank correlation between model similarity scores
    and gold-standard similarity scores.

    representation: object with a similarity(x, y) method.
    data: iterable of ((x, y), sim) items, where sim is the gold score.
    Returns the correlation coefficient only (spearmanr's first element).
    """
    results = []
    for (x, y), sim in data:
        # Pair each model score with its gold score.
        # (Reconstructed: the scraped line had dropped `y), sim`.)
        results.append((representation.similarity(x, y), sim))
    actual, expected = zip(*results)
    return spearmanr(actual, expected)[0]
def evaluate(representation, data):
    """Return the Spearman rank correlation between model similarity scores
    and gold-standard scores, scoring out-of-vocabulary pairs as 0.

    representation: object with oov(word) and similarity(x, y) methods.
    data: iterable of ((x, y), sim) items.
    Prints the number of OOV pairs encountered.
    """
    results = []
    oov = 0
    for (x, y), sim in data:
        if representation.oov(x) or representation.oov(y):
            oov += 1
            # OOV pairs are kept with a score of 0 rather than skipped
            # (the original had a commented-out `continue` here).
            results.append((0, sim))
        else:
            results.append((representation.similarity(x, y), sim))
    actual, expected = zip(*results)
    print("OOV: ", oov)
    return spearmanr(actual, expected)[0]
def correlations_ground_truth():
    """Correlate PageRank scores on the Wikipedia network against ground-truth
    article counts and pickle the results.

    For each damping factor (0.8, 0.9): computes PageRank, stores it as a
    vertex property, then computes Pearson, Spearman and Kendall-tau
    correlations against the per-article counts read from TSV.
    Results are written to HOME + 'output/correlations/correlations_pagerank.obj'.
    """
    # Python-2-only print statements converted to print() calls
    # (all single-argument, so output is unchanged).
    print('ground truth')
    # load network
    wikipedia = load_graph("output/weightedpagerank/wikipedianetwork_hyp_engineering.xml.gz")
    # read counts with zeros
    article_counts = pd.read_csv(TMP + 'article_counts.tsv', sep='\t')
    cor = {}
    for damping in [0.8, 0.9]:
        page_rank = pagerank(wikipedia, damping=damping)
        # store the scores on the graph so they persist with it
        wikipedia.vertex_properties['page_rank_' + str(damping)] = page_rank
        page_rank_values = []
        counts = []
        correlations_values = {}
        for index, row in article_counts.iterrows():
            counts.append(float(row['counts']))
            page_rank_values.append(page_rank[wikipedia.vertex(int(row['target_article_id']))])
        print('pearson')
        p = pearsonr(page_rank_values, counts)
        print(p)
        correlations_values['pearson'] = p
        print('spearmanr')
        s = spearmanr(page_rank_values, counts)
        print(s)
        correlations_values['spearmanr'] = s
        print('kendalltau')
        k = kendalltau(page_rank_values, counts)
        print(k)
        correlations_values['kendalltau'] = k
        cor['page_rank_' + str(damping)] = correlations_values
    write_pickle(HOME + 'output/correlations/correlations_pagerank.obj', cor)
def correlations_weighted_unweighted(labels):
    """Correlate weighted vs. unweighted PageRank vertex properties.

    labels: list of label strings; for each label and damping factor
    (0.8, 0.85, 0.9) the weighted PageRank property from the engineering
    network is correlated (Pearson / Spearman / Kendall-tau) against the
    unweighted PageRank property of the distinct-links network.
    Results are pickled per label+damping key.
    """
    # Python-2-only print statements converted to print() calls
    # (all single-argument, so output is unchanged).
    print('weighted vs unweighted')
    name = '_'.join(labels)
    # load both networks
    wikipedia = load_graph("output/weightedpagerank/wikipedianetwork_hyp_engineering_" + name + ".xml.gz")
    wikipedia_u = load_graph("output/weightedpagerank/wikipedianetwork_sem_sim_distinct_links.xml.gz")
    correlations_weighted_pagerank = {}
    for label in labels:
        for damping in [0.8, 0.85, 0.9]:
            correlations_values = {}
            key_weighted = label + "_page_rank_weighted_" + str(damping)
            pagerank_weighted = wikipedia.vertex_properties[key_weighted]
            # NOTE(review): other functions in this file store the property as
            # 'page_rank_' + damping (with a trailing underscore) — confirm the
            # unweighted graph really keys it without one.
            key_unweighted = "page_rank" + str(damping)
            pagerank_unweighted = wikipedia_u.vertex_properties[key_unweighted]
            print('pearson')
            p = pearsonr(pagerank_weighted.a, pagerank_unweighted.a)
            print(p)
            correlations_values['pearson'] = p
            print('spearmanr')
            s = spearmanr(pagerank_weighted.a, pagerank_unweighted.a)
            print(s)
            correlations_values['spearmanr'] = s
            print('kendalltau')
            k = kendalltau(pagerank_weighted.a, pagerank_unweighted.a)
            print(k)
            correlations_values['kendalltau'] = k
            correlations_weighted_pagerank[label + str(damping)] = correlations_values
    write_pickle(HOME + 'output/correlations/correlations_pagerank_weightedvsunweighted' + name + '.obj', correlations_weighted_pagerank)
def evaluate(m, data):
    """Return the Spearman rank correlation between row-dot-product scores
    and gold-standard similarity scores, skipping pairs with unknown words.

    m: model with has_word(word) and get_row(word) (vector with .dot()).
    data: iterable of ((x, y), sim) items.
    (Reconstructed: the scraped source fused the signature with the loop
    header and dropped the `results` initialization and final lines.)
    """
    results = []
    for (x, y), sim in data:
        if m.has_word(x) and m.has_word(y):
            results.append((m.get_row(x).dot(m.get_row(y)), sim))
        # pairs containing a word the model lacks are skipped entirely
    actual, expected = zip(*results)
    return spearmanr(actual, expected)[0]
def evaluate(representation, data):
    """Return the Spearman rank correlation between model similarity scores
    and gold-standard scores, skipping pairs the model cannot score.

    representation: object whose similarity(x, y) returns a score or None.
    data: iterable of ((x, y), sim) items.
    Prints the seen/total pair counts.
    """
    results = []
    seen_num = 0
    for (x, y), sim in data:
        # compute once: the original called similarity() twice per pair
        score = representation.similarity(x, y)
        if score is not None:
            seen_num += 1
            results.append((score, sim))
    actual, expected = zip(*results)
    print("seen/total: " + str(seen_num) + "/" + str(len(data)))
    return spearmanr(actual, expected)[0]
def correlations(network_name):
    """Correlate weighted PageRank values of a Wikipedia network against
    clickstream-derived internal-link counts and pickle the results.

    network_name: suffix of the graph file to load.
    Fixes applied: Python-2-only `except X, e:` syntax and print statements
    converted to Python 3 forms; dead store before fetchall() removed; the
    inner dict no longer shadows this function's name.
    """
    db = MysqLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()
    # wikipedia graph structural statistics
    results = None
    try:
        cursor.execute('select c.curr_id,sum(c.counts) as counts from clickstream_derived c where c.link_type_derived= %s group by c.curr_id;', ("internal-link",))
        results = cursor.fetchall()
    except MysqLdb.Error as e:
        # NOTE(review): if this fires, `results` stays None and the loop
        # below raises — same behavior as the original, flagged for review.
        print('error retrieving xy coord for all links links %s (%d)' % (e.args[1], e.args[0]))
    print('after sql load')
    print('before load')
    wikipedia = load_graph("output/weightedpagerank/wikipedianetwork_" + network_name + ".xml.gz")
    print('after load')
    cor = {}
    #for kk in ['page_rank','page_rank_weighted']:
    for kk in ['page_rank_weighted']:
        correlations_sem_sim_weighted_pagerank = {}
        #for damping in [0.8,0.85,0.9,0.95]:
        for damping in [0.85]:
            corr_values = {}
            print(damping)
            key = kk + str(damping)
            print(key)
            pagerank = wikipedia.vertex_properties[key]
            counts = []
            page_rank_values = []
            # pair each article's click count with its PageRank value
            for row in results:
                counts.append(float(row[1]))
                page_rank_values.append(pagerank[wikipedia.vertex(int(row[0]))])
            print('pearson')
            p = pearsonr(page_rank_values, counts)
            print(p)
            corr_values['pearson'] = p
            print('spearmanr')
            s = spearmanr(page_rank_values, counts)
            print(s)
            corr_values['spearmanr'] = s
            print('kendalltau')
            k = kendalltau(page_rank_values, counts)
            print(k)
            corr_values['kendalltau'] = k
            correlations_sem_sim_weighted_pagerank[key] = corr_values
        cor[kk] = correlations_sem_sim_weighted_pagerank
    write_pickle(HOME + 'output/correlations/correlations_pagerank_without_zeros' + network_name + '.obj', cor)
def evaluate_sim(model, testsets, testsetNames, getAbsentWords=False, vocab_dict=None, cutPoint=-1):
    """Evaluate *model* on several word-similarity test sets.

    For each (x, y, sim) triple: pairs present in the model contribute
    model.similarity(x, y) vs the gold score; absent words are bucketed into
    the returned dicts instead. A per-testset Spearman coefficient is printed
    and collected.

    model: supports `word in model` and similarity(x, y).
    testsets: list of test sets, each an iterable of (x, y, sim) triples.
    testsetNames: display names, parallel to testsets.
    getAbsentWords: if True, absent-from-model words that ARE in vocab_dict
        are recorded by their vocab id in absentModelID2Word.
    vocab_dict: optional vocabulary dict (usually bigger than model.vocab,
        loaded from a unigram file); used to find absent words. Values are
        sequences whose first element is the word id.
    cutPoint: if > 0, vocab words with id > cutPoint are recorded in
        cutVocabWords (they may or may not be in the model).
    Returns (spearmanCoeff, absentModelID2Word, absentVocabWords,
    cutVocabWords) — dicts returned directly for ease of merging.
    """
    # words in absentModelID2Word and words in absentVocabWords don't overlap
    absentModelID2Word = {}   # in the vocab but not in the model
    absentVocabWords = {}     # not in the vocab (hence not in the model)
    cutVocabWords = {}        # in the vocab but past the cutPoint
    spearmanCoeff = []        # one coefficient per test set, same order as testsets
    for i, testset in enumerate(testsets):
        modelResults = []
        groundtruth = []
        for x, y, sim in testset:
            if vocab_dict and x in vocab_dict:
                xid = vocab_dict[x][0]
                if cutPoint > 0 and xid > cutPoint:
                    cutVocabWords[x] = 1
            if vocab_dict and y in vocab_dict:
                yid = vocab_dict[y][0]
                if cutPoint > 0 and yid > cutPoint:
                    cutVocabWords[y] = 1
            if x not in model:
                if getAbsentWords and x in vocab_dict:
                    absentModelID2Word[xid] = x
                else:
                    absentVocabWords[x] = 1
            elif y not in model:
                if getAbsentWords and y in vocab_dict:
                    absentModelID2Word[yid] = y
                else:
                    absentVocabWords[y] = 1
            else:
                modelResults.append(model.similarity(x, y))
                groundtruth.append(sim)
                #print "%s %s: %.3f %.3f" %(x,y,modelResults[-1],sim)
        # was a Python 2 trailing-comma print; end=' ' preserves the
        # same-line, space-separated output
        print("%s: %d test pairs,%d valid" % (testsetNames[i], len(testset), len(modelResults)), end=' ')
        spearmanCoeff.append(spearmanr(modelResults, groundtruth)[0])
        print(",%.5f" % spearmanCoeff[-1])
    # return hashes directly, for ease of merge
    return spearmanCoeff, absentModelID2Word, absentVocabWords, cutVocabWords