Python sklearn.cluster module: SpectralClustering() example source code
The following 29 code examples, extracted from open-source Python projects, illustrate how to use sklearn.cluster.SpectralClustering().
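The snippets below were extracted without their import headers. A plausible (partial) common preamble, which is our own assumption since each project's real imports were not captured, is:

import collections
import numpy as np
from scipy import sparse
from scipy.sparse.linalg import svds
from sklearn import cluster, metrics
from sklearn.cluster import SpectralClustering, AffinityPropagation, KMeans
from sklearn.metrics import (adjusted_mutual_info_score, adjusted_rand_score,
                             silhouette_score, f1_score)
from sklearn.preprocessing import normalize, Normalizer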
def compare_clusters(X, Y, method='spectral', s=10000):
    A = (X / np.linalg.norm(X, axis=0)).T
    A[np.isnan(A)] = 0
    B = (Y / np.linalg.norm(Y, axis=0)).T
    B[np.isnan(B)] = 0
    # Subsample at most s rows so the affinity matrices stay tractable
    random_samples = np.zeros(A.shape[0], dtype=bool)
    random_samples[:min(s, A.shape[0])] = True
    np.random.shuffle(random_samples)
    A = A[random_samples]
    B = B[random_samples]
    # Cosine distance, then a Gaussian kernel to turn it into an affinity
    dA = 1 - A.dot(A.T)
    dA = np.exp(-dA**2 / 2.)
    dB = 1 - B.dot(B.T)
    dB = np.exp(-dB**2 / 2.)
    del A, B
    if method == 'spectral':
        n = max(5, min(30, X.shape[1] // 50))  # integer division: n_clusters must be an int
        lA = SpectralClustering(n_clusters=n, affinity='precomputed').fit_predict(dA)
        lB = SpectralClustering(n_clusters=n, affinity='precomputed').fit_predict(dB)
    elif method == 'ap':
        lA = AffinityPropagation(affinity='precomputed').fit_predict(dA)
        lB = AffinityPropagation(affinity='precomputed').fit_predict(dB)
    return adjusted_mutual_info_score(lA, lB)
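A minimal usage sketch for compare_clusters. The shapes and values here are our own toy assumptions: columns are the items being clustered, rows are features.

rng = np.random.RandomState(0)
X = rng.rand(50, 2000)            # 50 features x 2000 items
Y = X + 0.1 * rng.rand(50, 2000)  # a lightly perturbed copy of X
# High AMI is expected since Y is only a small perturbation of X
print(compare_clusters(X, Y, method='spectral', s=500))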
def spectral(data):
    spectral = SpectralClustering(
        eigen_solver='arpack',
        affinity='rbf',
        assign_labels='discretize'
    ).fit(data)
    print('Spectral')
    print(collections.Counter(spectral.labels_))
    print(metrics.silhouette_score(data, spectral.labels_))
    reduced_data = reduce_with_pca(data, 2)
    plot_2d_data(reduced_data, spectral.labels_)
def post_proC(C, K, d, alpha):
    # C: coefficient matrix, K: number of clusters, d: dimension of each subspace
    C = 0.5 * (C + C.T)
    r = d * K + 1
    U, S, _ = svds(C, r, v0=np.ones(C.shape[0]))
    U = U[:, ::-1]
    S = np.sqrt(S[::-1])
    S = np.diag(S)
    U = U.dot(S)
    U = normalize(U, norm='l2', axis=1)
    Z = U.dot(U.T)
    Z = Z * (Z > 0)
    L = np.abs(Z ** alpha)
    L = L / L.max()
    L = 0.5 * (L + L.T)
    spectral = cluster.SpectralClustering(n_clusters=K, eigen_solver='arpack',
                                          affinity='precomputed', assign_labels='discretize')
    grp = spectral.fit_predict(L) + 1
    return grp, L
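A hedged usage sketch for post_proC on a random symmetric coefficient matrix. The K, d, and alpha values are arbitrary choices for illustration, not values from the source project.

rng = np.random.RandomState(0)
C = rng.rand(100, 100)                    # toy coefficient matrix
grp, L = post_proC(C, K=2, d=4, alpha=8)  # r = d*K + 1 = 9 singular vectors
print(np.unique(grp))                     # labels are shifted to start at 1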
def post_proC(C, K, d, alpha):
    # C: coefficient matrix, K: number of clusters, d: dimension of each subspace
    # Variant of the function above: r is clipped so svds stays within bounds
    C = 0.5 * (C + C.T)
    r = min(d * K + 1, C.shape[0] - 1)
    U, S, _ = svds(C, r, v0=np.ones(C.shape[0]))
    U = U[:, ::-1]
    S = np.sqrt(S[::-1])
    S = np.diag(S)
    U = U.dot(S)
    U = normalize(U, norm='l2', axis=1)
    Z = U.dot(U.T)
    Z = Z * (Z > 0)
    L = np.abs(Z ** alpha)
    L = L / L.max()
    L = 0.5 * (L + L.T)
    spectral = cluster.SpectralClustering(n_clusters=K, eigen_solver='arpack',
                                          affinity='precomputed', assign_labels='discretize')
    grp = spectral.fit_predict(L) + 1
    return grp, L
def post_proC(C, K, d, alpha):
    # C: coefficient matrix, K: number of clusters, d: dimension of each subspace
    n = C.shape[0]
    C = 0.5 * (C + C.T)
    C = C - np.diag(np.diag(C)) + np.eye(n, n)  # for sparse C, this step makes the algorithm more numerically stable
    r = d * K + 1
    U, S, _ = svds(C, r, v0=np.ones(n))
    U = U[:, ::-1]
    S = np.sqrt(S[::-1])
    S = np.diag(S)
    U = U.dot(S)
    U = normalize(U, axis=1)
    Z = U.dot(U.T)
    Z = Z * (Z > 0)
    L = np.abs(Z ** alpha)
    L = L / L.max()
    L = 0.5 * (L + L.T)
    spectral = cluster.SpectralClustering(n_clusters=K, eigen_solver='arpack',
                                          affinity='precomputed', assign_labels='discretize')
    grp = spectral.fit_predict(L) + 1
    return grp, L
def compute_Meta_centroid_set(self, C):
    print("Intermediate clusters", C.shape)
    # By eye, it looks like the top 60%-80% of the
    # remaining clusters are stable...
    nc = int(self.subcluster_pcut * self.subcluster_kn)
    clf = SpectralClustering(n_clusters=nc, affinity="precomputed")
    S = cosine_affinity(C)
    labels = clf.fit_predict(S)
    Meta_clusters = []
    Meta_cluster_size = []
    for i in range(labels.max() + 1):
        idx = labels == i
        mu = C[idx].mean(axis=0)
        mu /= np.linalg.norm(mu)
        Meta_clusters.append(mu)
        Meta_cluster_size.append(idx.sum())
    return np.array(Meta_clusters)
def cluster(aff_matrix, records, n_clusters, medoid_indexes):
    Cluster = SpectralClustering(n_clusters=n_clusters, affinity='precomputed')
    labels = Cluster.fit_predict(aff_matrix)
    medoid_indexes = medoid_indexes.loc[records]
    t_records = []
    indexes = []
    for i in range(n_clusters):
        labels_i = np.where(labels == i)[0]
        sub_aff_matrix = aff_matrix[labels_i, :][:, labels_i]
        # The medoid maximizes the product of affinities to the rest of its cluster
        medoid_index = np.argmax(np.prod(sub_aff_matrix, axis=0))
        absolute_index = labels_i[medoid_index]
        r = medoid_indexes.index[absolute_index]
        t_records.append(r)
        i = medoid_indexes.iloc[absolute_index].values[0]
        indexes.append(i)
    return t_records, indexes
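A sketch of how cluster() might be called. It assumes medoid_indexes is a pandas DataFrame indexed by record id (our inference from how .loc and .iloc are used above); the data and record ids are hypothetical.

import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import rbf_kernel

X, _ = make_blobs(n_samples=30, centers=3, random_state=0)
aff = rbf_kernel(X)                          # precomputed affinity matrix
records = ['rec%d' % i for i in range(30)]   # hypothetical record ids
medoid_indexes = pd.DataFrame({'orig_idx': range(30)}, index=records)
t_records, indexes = cluster(aff, records, n_clusters=3, medoid_indexes=medoid_indexes)
print(t_records, indexes)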
def _clusteraffinity(aff, k, imdb, cls_idx):
    """ Cluster an error-correlation matrix into k clusters using spectral
    clustering, and show the class labels in each cluster.
    """
    # clustering model
    spectral = SpectralClustering(n_clusters=k,
                                  eigen_solver='arpack',
                                  affinity="precomputed")
    print('Performing clustering...')
    labels = spectral.fit_predict(aff)
    # print out all labels
    for i in range(k):
        find_idx = np.where(labels == i)[0]
        print('The list of classes in cluster {}'.format(i))
        print([imdb.classes[id] for id in find_idx])
        print('--------------------------------------------')
    return labels
if __name__ == '__main__':
    # Todo: debug code if necessary
    pass
def cluster_spectral(X_train, model_args=None, gridsearch=True):
    from sklearn.cluster import SpectralClustering
    print('SpectralClustering')
    if gridsearch is True:
        ## Todo:
        # Add hyperparameter searching. No scoring method is available for this
        # model, so we can't easily use grid searching.
        raise NotImplementedError('No hyperparameter optimization available yet for this model. Set gridsearch to False')
        # prune(param_grid, model_args)
    else:
        if 'n_clusters' not in model_args:
            raise KeyError('Need to define n_clusters for SpectralClustering')
        param_grid = None
    return ModelWrapper(SpectralClustering, X=X_train, model_args=model_args, param_grid=param_grid, unsupervised=True)
def clustering(points, k=2, name='kmeans'):
    '''
    points: N_samples * N_features
    k: number of clusters
    '''
    if name == 'kmeans':
        kmeans = KMeans(n_clusters=k, n_init=100).fit(points)
        ## print within_variance
        #cluster_distance = kmeans.transform(points)
        #within_variance = sum(np.min(cluster_distance, axis=1)) / float(points.shape[0])
        #print("AvgWithinSS:" + str(within_variance))
        if len(np.unique(kmeans.labels_)) > 1:
            si = silhouette_score(points, kmeans.labels_)
            #print("Silhouette:" + str(si))
        else:
            si = 0
        print("Silhouette:" + str(si))
        return kmeans.labels_, si
    if name == 'spec':
        spec = SpectralClustering(n_clusters=k, affinity='cosine').fit(points)
        si = silhouette_score(points, spec.labels_)
        print("Silhouette:" + str(si))
        return spec.labels_, si
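A quick usage sketch of clustering() on synthetic blobs. The data are our own choice; positive cluster centers keep the cosine affinities non-negative for the 'spec' branch.

from sklearn.datasets import make_blobs
pts, _ = make_blobs(n_samples=90, centers=[[5, 5], [10, 1], [1, 10]],
                    cluster_std=0.5, random_state=0)
labels_km, si_km = clustering(pts, k=3, name='kmeans')
labels_sp, si_sp = clustering(pts, k=3, name='spec')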
def find_spectral_alphas(self, n_alphas, max_log_alpha, n_alphas_to_return):
    self.create_affinity_matrix(max_log_alpha, n_alphas)
    affinity = self.affinity_matrix
    spectral = cluster.SpectralClustering(n_clusters=n_alphas_to_return, affinity='precomputed')
    alphas = np.concatenate(([0], np.logspace(-1, max_log_alpha, n_alphas)))
    spectral.fit(affinity)
    labels = spectral.labels_
    best_alphas = list()
    for i in range(n_alphas_to_return):
        idx = np.where(labels == i)[0]
        if not (0 in idx):  # exclude the cluster that contains alpha=0
            affinity_submatrix = affinity[idx][:, idx]
            sum_affinities = np.sum(affinity_submatrix, axis=0)
            exemplar_idx = idx[np.argmax(sum_affinities)]
            best_alphas.append(alphas[exemplar_idx])
    return np.sort(best_alphas), alphas, affinity[0, :], labels
def spectral(k, D, rs):
    """
    From clustering_on_transcript_compatibility_counts; see GitHub for MIT license.
    """
    if D[1, 1] < 1:
        D = 1 - D  # convert a distance matrix to a similarity matrix
    spectral = cluster.SpectralClustering(n_clusters=k, affinity='precomputed', random_state=rs)
    spectral.fit(D)
    labels = spectral.labels_
    return labels
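A usage sketch for spectral(k, D, rs). Here we pass an RBF similarity matrix directly; since S[1, 1] == 1, the distance-to-similarity branch is skipped. The data are toy assumptions.

from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import rbf_kernel

pts, _ = make_blobs(n_samples=40, centers=2, random_state=0)
S = rbf_kernel(pts)          # similarity matrix with unit diagonal
print(spectral(2, S, rs=0))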
# gets max weight matching of a bipartite graph with row_label x column_label
# (weights are given by weight_matrix)
def spectral_clustering(S, X, config):
    '''
    Computes spectral clustering from an input similarity matrix.
    Returns the labels associated with the clustering.
    '''
    from sklearn.cluster import SpectralClustering
    nk = int(config["n_clusters"])
    # Note: despite the docstring, the similarity matrix S is unused here;
    # the affinity is recomputed from X with a cosine kernel.
    clf = SpectralClustering(affinity='cosine', n_clusters=nk)
    return clf.fit_predict(X)
def compute_centroid_set(self):
    INPUT_ITR = subset_iterator(
        X=self.docv,
        m=self.subcluster_m,
        repeats=self.subcluster_repeats,
    )
    kn = self.subcluster_kn
    clf = SpectralClustering(
        n_clusters=kn,
        affinity="precomputed",
    )
    C = []
    for X in INPUT_ITR:
        # Remove any rows that have zero vectors
        bad_row_idx = ((X**2).sum(axis=1) == 0)
        X = X[~bad_row_idx]
        A = cosine_affinity(X)
        # "Force" symmetry due to rounding errors
        A = np.maximum(A, A.transpose())
        labels = clf.fit_predict(A)
        # Compute the centroids
        (N, dim) = X.shape
        centroids = np.zeros((kn, dim))
        for i in range(kn):
            idx = labels == i
            mu = X[idx].mean(axis=0)
            mu /= np.linalg.norm(mu)
            centroids[i] = mu
        C.append(centroids)
    return np.vstack(C)
def spectral_clustering_clusters(similarity_matrix):
    return SpectralClustering(n_clusters=10, affinity='precomputed').fit(similarity_matrix)
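spectral_clustering_clusters hard-codes n_clusters=10, so any demo needs at least 10 reasonably separable groups. A sketch with synthetic data of our own choosing:

from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import rbf_kernel

X, _ = make_blobs(n_samples=200, centers=10, random_state=0)
S = rbf_kernel(X, gamma=0.5)          # precomputed affinity
model = spectral_clustering_clusters(S)
print(np.bincount(model.labels_))     # cluster sizes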
def __init__(self, n_clusters=8, eigen_solver=None, random_state=None,
             n_init=10, gamma=1., affinity='rbf', n_neighbors=10,
             eigen_tol=0.0, assign_labels='kmeans', degree=3, coef0=1,
             kernel_params=None, norm_laplacian=True):
    super(SpectralClustering, self).__init__(
        n_clusters=n_clusters, eigen_solver=eigen_solver,
        random_state=random_state, n_init=n_init, gamma=gamma,
        affinity=affinity, n_neighbors=n_neighbors, eigen_tol=eigen_tol,
        assign_labels=assign_labels, degree=degree, coef0=coef0,
        kernel_params=kernel_params)
    self.norm_laplacian = norm_laplacian
def make_spectral_clustering(self, short_filenames, input_texts):
    output_dir = self.output_dir + 'spectral/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    if self.need_tf_idf:
        self.signals.PrintInfo.emit("Computing TF-IDF...")
        idf_filename = output_dir + 'tf_idf.csv'
        msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
        self.signals.PrintInfo.emit(msg)
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(input_texts)
    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)
    spectral = SpectralClustering(n_clusters=self.spectral_clusters_count)
    predict_result = spectral.fit_predict(X)
    self.signals.PrintInfo.emit('\nCluster breakdown:\n')
    clasters_output = ''
    for claster_index in range(max(predict_result) + 1):
        clasters_output += ('Cluster ' + str(claster_index) + ':\n')
        for predict, document in zip(predict_result, short_filenames):
            if predict == claster_index:
                clasters_output += ('  ' + str(document) + '\n')
        clasters_output += '\n'
    self.signals.PrintInfo.emit(clasters_output)
    self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
    writeStringToFile(clasters_output, output_dir + 'clusters.txt')
    self.draw_clusters_plot(X, predict_result, short_filenames)
def SpectralAccuracy():
    clusterer = SpectralClustering(n_clusters=2)
    tdm = pickle.load(open(DATASET_PATH + "BOW_TDIDF.p", "rb"))
    predictions = clusterer.fit_predict(tdm)
    true_labels = pickle.load(open(OUTFILE_STANCE, "rb"))[0]
    # Cluster labels are arbitrary, so score both label mappings and keep the best
    numerical_mapped_1 = [0 if i == "Israeli" else 1 for i in true_labels]
    numerical_mapped_2 = [1 if i == "Israeli" else 0 for i in true_labels]
    one = f1_score(numerical_mapped_1, predictions)
    two = f1_score(numerical_mapped_2, predictions)
    print("The F1 score of Spectral Clustering on BOW (w/Tdidf) is: " + str(max(one, two)))
def split(self, node):
    # Perform normalized cut
    try:
        ind = SpectralClustering(2, affinity='precomputed', assign_labels='discretize').fit_predict(node['affinity'])
    except KeyboardInterrupt:
        raise
    except Exception:
        return None, None, 0
    # Create left and right node
    mask1, mask2 = (ind == 0), (ind == 1)
    if not (np.any(mask1) and np.any(mask2)):
        return None, None, 0
    left = {'depth': node['depth'] + 1, 'height': 0, 'size': 0, 'leafs': 1, 'children': [], 'parent': node,
            'items': [f for i, f in enumerate(node['items']) if ind[i] == 0],
            'affinity': node['affinity'][np.ix_(mask1, mask1)]}
    right = {'depth': node['depth'] + 1, 'height': 0, 'size': 0, 'leafs': 1, 'children': [], 'parent': node,
             'items': [f for i, f in enumerate(node['items']) if ind[i] == 1],
             'affinity': node['affinity'][np.ix_(mask2, mask2)]}
    # Force the node with the lower minimum distance to the query to be the left node
    if ind[0] == 1:  # items are already sorted when passed to fit(), so we just need to look at the first item instead of re-computing all distances
        left, right = right, left
    # Modify parent
    node['children'] = [left, right]
    # Modify parent chain
    parent = node
    while parent is not None:
        parent['height'] += 1
        parent['size'] += 2
        parent['leafs'] += 1
        parent = parent['parent']
    return left, right, self.ncut_value(node['affinity'], ind)
def test_spectral_clustering():
    # Block-structured affinity: samples {0, 1} vs. {2, 3, 4}
    S = np.array([[1.0, 1.0, 0.2, 0.0, 0.0],
                  [1.0, 1.0, 0.2, 0.0, 0.0],
                  [0.2, 0.2, 1.0, 1.0, 1.0],
                  [0.0, 0.0, 1.0, 1.0, 1.0],
                  [0.0, 0.0, 1.0, 1.0, 1.0]])
    for eigen_solver in ('arpack', 'lobpcg'):
        for assign_labels in ('kmeans', 'discretize'):
            for mat in (S, sparse.csr_matrix(S)):
                model = SpectralClustering(random_state=0, n_clusters=2,
                                           affinity='precomputed',
                                           eigen_solver=eigen_solver,
                                           assign_labels=assign_labels
                                           ).fit(mat)
                labels = model.labels_
                if labels[0] == 0:
                    labels = 1 - labels
                assert_array_equal(labels, [1, 1, 0, 0, 0])
                model_copy = loads(dumps(model))
                assert_equal(model_copy.n_clusters, model.n_clusters)
                assert_equal(model_copy.eigen_solver, model.eigen_solver)
                assert_array_equal(model_copy.labels_, model.labels_)
def test_spectral_amg_mode():
    # Test the amg mode of SpectralClustering
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    D = pairwise_distances(X)  # distance matrix
    S = np.max(D) - D  # similarity matrix
    S = sparse.coo_matrix(S)
    try:
        from pyamg import smoothed_aggregation_solver
        amg_loaded = True
    except ImportError:
        amg_loaded = False
    if amg_loaded:
        labels = spectral_clustering(S, n_clusters=len(centers),
                                     random_state=0, eigen_solver="amg")
        # We don't care too much that it's good, just that it *worked*.
        # There does have to be some lower limit on the performance though.
        assert_greater(np.mean(labels == true_labels), .3)
    else:
        assert_raises(ValueError, spectral_embedding, S,
                      n_components=len(centers),
                      random_state=0, eigen_solver="amg")
def test_spectral_unknown_mode():
    # Test that SpectralClustering fails with an unknown mode set.
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    D = pairwise_distances(X)  # distance matrix
    S = np.max(D) - D  # similarity matrix
    S = sparse.coo_matrix(S)
    assert_raises(ValueError, spectral_clustering, S,
                  random_state=0, eigen_solver="<unknown>")
def test_spectral_clustering_sparse():
    X, y = make_blobs(n_samples=20, random_state=0,
                      centers=[[1, 1], [-1, -1]], cluster_std=0.01)
    S = rbf_kernel(X, gamma=1)
    S = np.maximum(S - 1e-4, 0)
    S = sparse.coo_matrix(S)
    labels = SpectralClustering(random_state=0, n_clusters=2,
                                affinity='precomputed').fit(S).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)
def makeSpectral(X=None, k=2):
    return cluster.SpectralClustering(n_clusters=k,
                                      eigen_solver='arpack',
                                      affinity="nearest_neighbors")
def makeClusterers(X, k=2):
    return [('MiniBatchKMeans', makeKMeans(X, k)),
            ('AffinityPropagation', makeAffinityProp()),
            ('MeanShift', makeMeanShift(X)),
            ('SpectralClustering', makeSpectral(X, k)),
            ('Ward', makeWard(X, k)),
            ('AgglomerativeAvg', makeAvgLinkage(X, k)),
            ('AgglomerativeMax', makeMaxLinkage(X, k)),
            ('AgglomerativeWard', makeWardLinkage(X, k)),
            ('DBSCAN', makeDBScan())]
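A sketch that fits every clusterer from makeClusterers on toy blobs. It assumes the other make* helpers, which are defined elsewhere in the source project, return unfitted scikit-learn estimators.

from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=100, centers=2, random_state=0)
for name, model in makeClusterers(X, k=2):
    labels = model.fit_predict(X)
    print(name, 'found', len(set(labels)), 'clusters')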