Python sklearn.cluster 模块,MiniBatchKMeans() 实例源码
我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用sklearn.cluster.MiniBatchKMeans()。
def palettise(data, n_entries=256):
    """Quantise an image to a k-means palette with Floyd-Steinberg dithering.

    A MiniBatchKMeans model is fitted on every pixel to obtain the palette.
    The image is then walked in raster order: each pixel is replaced by its
    palette index and the quantisation error is diffused to the neighbours
    that have not been visited yet.

    Parameters
    ----------
    data : list of rows, each row a list of colour sequences
        Modified in place; on return every pixel holds a palette index.
    n_entries : int
        Number of palette entries (k-means clusters).

    Returns
    -------
    tuple
        ``(data, pallete)`` where ``pallete`` is a list of integer colours.
    """
    height = len(data)
    width = len(data[0])
    # Flatten in linear time (sum(data, []) re-copies the list for every row).
    all_colours = [pixel for row in data for pixel in row]
    print("Calculating pallete...")
    kmeans = MiniBatchKMeans(n_clusters=n_entries, random_state=0).fit(all_colours)
    pallete = [list(map(int, rgb)) for rgb in kmeans.cluster_centers_]
    # Floyd-Steinberg kernel as (dx, dy, weight); the weights sum to 1.
    kernel = (
        (1, 0, 7/16),
        (-1, 1, 3/16),
        (0, 1, 5/16),
        (1, 1, 1/16),
    )
    print("Dithering...")  # Floyd–Steinberg dithering
    for y in range(height):
        print("\r{:.1f}%".format((y/height)*100), end="")
        for x in range(width):
            bucket = kmeans.predict([data[y][x]])[0]
            error = [a-b for a, b in zip(data[y][x], pallete[bucket])]
            data[y][x] = bucket
            for dx, dy, coef in kernel:
                xn = x + dx
                yn = y + dy
                # Only diffuse into pixels that exist; they are always unvisited.
                if 0 <= xn < width and 0 <= yn < height:
                    data[yn][xn] = [a+b*coef for a, b in zip(data[yn][xn], error)]
    print("\r100% ")
    return data, pallete
def k_means(self, n_clusters, batch_size=1000):
    """
    Run mini-batch K-means and hand the estimator to ``self._cluster_func``.

    Parameters
    ----------
    n_clusters : int
        number of clusters
    batch_size : int
        the batch size for the MiniBatchKMeans algorithm (also used as its
        ``init_size``)
    """
    from sklearn.cluster import MiniBatchKMeans

    settings = {
        "batch_size": batch_size,
        'is_hierarchical': False,
        "metric": self.metric,
    }
    estimator = MiniBatchKMeans(
        n_clusters=n_clusters,
        init='k-means++',
        n_init=10,
        init_size=batch_size,
        batch_size=batch_size,
    )
    return self._cluster_func(n_clusters, estimator, settings)
def kmeans_aic(model, X, **kwargs):
    '''AIC (Akaike information Criterion) for k-means for model selection

    Computed as ``inertia_ + 2 * m * k`` where ``(k, m)`` is the shape of the
    fitted ``cluster_centers_``.

    Parameters:
        :model:  An elm.pipeline.Pipeline with KMeans or MiniBatchKMeans as final step in Pipeline
        :X:      The X data that were just given to "fit", or "partial_fit" (not used by the formula)
        :kwargs: placeholder - ignored

    Returns:
        :AIC: float

    Side effect: the estimator's ``labels_`` attribute is removed when present.
    '''
    estimator = model._estimator
    k, m = estimator.cluster_centers_.shape
    aic = estimator.inertia_ + 2 * m * k
    # labels_ does not exist on every fitted estimator (e.g. compute_labels=False),
    # so only drop it when it is there instead of raising AttributeError.
    if hasattr(estimator, 'labels_'):
        delattr(estimator, 'labels_')
    return aic
def test_clusterer_enforcement(self):
    """
    Only clustering estimators may be wrapped by the cluster visualizer.
    """
    rejected = (
        SVC, SVR, Ridge, RidgeCV, LinearRegression, RandomForestClassifier,
    )
    # Non-clusterers must be refused at construction time.
    for estimator_cls in rejected:
        with self.assertRaises(YellowbrickTypeError):
            ClusteringscoreVisualizer(estimator_cls())

    accepted = (
        KMeans, MiniBatchKMeans, AffinityPropagation, MeanShift, DBSCAN, Birch,
    )
    # Clusterers must be accepted without a type error.
    for estimator_cls in accepted:
        try:
            ClusteringscoreVisualizer(estimator_cls())
        except YellowbrickTypeError:
            self.fail("Could not pass clustering estimator to visualizer")
def avg_within_ss(X, k):
    """
    Average within-cluster sum of squares of a MiniBatchKMeans fit.

    Params:
    --------
    X: numpy array with observations and features to be clustered
    k: number of clusters

    Returns:
    --------
    avgwithinss: mean squared euclidean distance of each observation to its
                 nearest centroid
    """
    estimator = MiniBatchKMeans(
        init='k-means++',
        n_clusters=k,
        batch_size=50,
        n_init=3,
        max_no_improvement=10,
        verbose=0,
    )
    estimator.fit(X)
    # Distance of every observation to every centroid, then keep the nearest.
    to_centroids = cdist(X, estimator.cluster_centers_, 'euclidean')
    nearest = np.min(to_centroids, axis=1)
    return sum(nearest**2)/X.shape[0]
def fit(self, descs, MiniBatchKMeans=True, batch_size=10000, preprocess=False):
    """Compute cluster centers for ``descs`` and store them on the instance.

    The centers come from ``self.MiniBatchKMeans(descs, batch_size)`` when the
    ``MiniBatchKMeans`` flag is truthy, otherwise from ``self.Kmeans(descs)``.
    After clustering, ``self.stdSlr`` is set to a StandardScaler fitted on
    ``descs`` when ``preprocess`` is truthy, else to ``None``. The descriptors
    themselves are never rescaled here.

    Returns the computed centers (also stored as ``self.centers``).
    """
    if MiniBatchKMeans:
        centers = self.MiniBatchKMeans(descs, batch_size)
    else:
        centers = self.Kmeans(descs)
    self.centers = centers

    self.stdSlr = StandardScaler().fit(descs) if preprocess else None
    return self.centers
def test_k_means_explicit_init_shape():
    """Explicit inits with a wrong shape must raise a meaningful ValueError.

    Covers both array and callable inits, for a wrong number of features and
    for a wrong number of clusters, on KMeans and MiniBatchKMeans.
    """
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(40, 3))
    for Class in [KMeans, MiniBatchKMeans]:
        # mismatch of number of features
        km = Class(n_init=1, init=X[:, :2], n_clusters=len(X))
        msg = "does not match the number of features of the data"
        assert_raises_regex(ValueError, msg, km.fit, X)
        # for callable init
        km = Class(n_init=1,
                   init=lambda X_, k, random_state: X_[:, :2],
                   n_clusters=len(X))
        assert_raises_regex(ValueError, msg, km.fit, X)
        # mismatch of number of clusters
        msg = "does not match the number of clusters"
        km = Class(n_init=1, init=X[:2, :], n_clusters=3)
        assert_raises_regex(ValueError, msg, km.fit, X)
        # for callable init
        km = Class(n_init=1,
                   init=lambda X_, k, random_state: X_[:2, :],
                   n_clusters=3)
        assert_raises_regex(ValueError, msg, km.fit, X)
def test_minibatch_sensible_reassign_fit():
    """Identical initial clusters must get reassigned during ``fit``.

    Half of the samples are zeroed so many random initial centers coincide.
    The second batch size (201 > n_samples) is a regression case for more
    desired reassignments than samples.
    """
    zeroed_X, true_labels = make_blobs(n_samples=100, centers=5,
                                       cluster_std=1., random_state=42)
    zeroed_X[::2, :] = 0

    for batch_size in (10, 201):
        mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=batch_size,
                                     random_state=42, init="random")
        mb_k_means.fit(zeroed_X)
        # there should not be too many exact zero cluster centers
        non_zero_centers = mb_k_means.cluster_centers_.any(axis=1).sum()
        assert_greater(non_zero_centers, 10)
def test_sparse_mb_k_means_callable_init():
    """A callable init works on sparse input and is validated against n_clusters."""

    # scikit-learn calls a callable init as init(X, n_clusters, random_state).
    def test_init(X, k, random_state):
        return centers

    # Small test to check that giving the wrong number of centers
    # raises a meaningful error
    msg = "does not match the number of clusters"
    assert_raises_regex(ValueError, msg,
                        MiniBatchKMeans(init=test_init, random_state=42).fit,
                        X_csr)

    # Now check that the fit actually works
    mb_k_means = MiniBatchKMeans(n_clusters=3, init=test_init,
                                 random_state=42).fit(X_csr)
    _check_fitted_model(mb_k_means)
def test_mini_batch_k_means_random_init_partial_fit():
    """Online learning through ``partial_fit`` must recover the true labels."""
    km = MiniBatchKMeans(n_clusters=n_clusters, init="random", random_state=42)

    # feed the data in ten chunks through the partial_fit API
    chunks = np.array_split(X, 10)
    for chunk in chunks:
        km.partial_fit(chunk)

    # label the complete dataset and compare with the ground truth
    predicted = km.predict(X)
    assert_equal(v_measure_score(true_labels, predicted), 1.0)
def test__ClusteringWithSupervision_clusters():
    """
    Check that we change the number of clusters properly.

    The public ``n_clusters`` stays ``None`` and the user-supplied
    ``cluster_instance`` keeps its own setting (8) before and after ``fit``;
    only the private ``_cluster_instance`` created by ``fit`` is expected to
    carry the adjusted value (4).
    """
    def check_public_view(estimator):
        assert estimator.n_clusters is None
        assert estimator.get_params()['n_clusters'] is None
        assert estimator.cluster_instance.n_clusters == 8

    train, classes = make_X_y()
    model = ClusteringWithSupervision(cluster_instance=MiniBatchKMeans())

    check_public_view(model)
    assert model._cluster_instance is None

    model.fit(train, classes)

    check_public_view(model)
    assert model._cluster_instance.n_clusters == 4
def bow_codebook(data, K=64):
    """Return the ``K`` MiniBatchKMeans cluster centers fitted on ``data``."""
    clusterer = MiniBatchKMeans(
        n_clusters=K,
        compute_labels=False,
        batch_size=1000,
        max_iter=150,
        max_no_improvement=30,
        verbose=False,
    )
    fitted = clusterer.fit(data)
    return fitted.cluster_centers_
def test_basic(self, single_chunk_blobs):
    """``PartialMiniBatchKMeans.fit`` must match ``MiniBatchKMeans.partial_fit``."""
    X, _ = single_chunk_blobs
    wrapped = cluster.PartialMiniBatchKMeans(n_clusters=3, random_state=0)
    reference = cluster_.MiniBatchKMeans(n_clusters=3, random_state=0)

    wrapped.fit(X)
    reference.partial_fit(X)

    assert_estimator_equal(wrapped, reference, exclude=['random_state_'])
def mini_batch(fig):
    """Cluster ``X_iris`` with mini-batch k-means and scatter-plot it in 3D.

    Adds a subplot at position ``geo + 2`` to ``fig``, colours each sample by
    its cluster label and returns the label array.
    """
    global X_iris, geo
    axes = fig.add_subplot(geo + 2, projection='3d', title='mini-batch')

    model = cluster.MiniBatchKMeans(init='random', n_clusters=3)
    model.fit(X_iris)
    res = model.labels_

    palette = 'bgrcmyk'
    for index, sample in enumerate(X_iris):
        # only the first three features are plotted
        axes.scatter(*sample[: 3], c=palette[res[index] % 7], marker='o')

    axes.set_xlabel('X Label')
    axes.set_ylabel('Y Label')
    axes.set_zlabel('Z Label')
    return res
def train(X, y, true_k=50, minibatch=False, showLable=True):
    """Cluster ``X`` with (mini-batch) k-means and dump the clusters to a file.

    Parameters
    ----------
    X : samples to cluster (passed to ``fit``/``predict``/``score``).
    y : per-sample terms; ``y[j]`` is written out for sample ``j``.
    true_k : int
        Number of clusters.
    minibatch : bool
        Use MiniBatchKMeans instead of KMeans.
    showLable : bool
        When true, print the cluster distribution and write the members of
        every non-empty cluster to ``pro1_cluster.txt``.

    Returns
    -------
    The negated ``km.score(X)``.
    """
    # The file is opened (and truncated) up front as before, but through a
    # context manager: the original close() sat after the return statement
    # and therefore never ran.
    with open('pro1_cluster.txt', 'w+') as fout:
        if minibatch:
            km = MiniBatchKMeans(n_clusters=true_k, n_init=1,
                                 init_size=1000, verbose=False)
        else:
            km = KMeans(n_clusters=true_k, max_iter=300,
                        verbose=False)
        km.fit(X)
        print(y.dtype)
        if showLable:
            print("Top terms per cluster:")
            terms = y
            result = list(km.predict(X))

            # Single-pass count, keyed in first-seen order.
            distribution = {}
            for label in result:
                distribution[label] = distribution.get(label, 0) + 1
            print('Cluster distribution:')
            print(distribution)

            # Group the terms of each sample under its predicted cluster.
            cluster_list = {i: [] for i in range(true_k)}
            for j, label in enumerate(result):
                cluster_list[label].append(terms[j])

            for members in cluster_list.values():
                if members:
                    for term in members:
                        fout.write(term + '\n')
                    fout.write('-------------------\n')
    return -km.score(X)
def minibatchs_k_means_clustering(self, out_path, pd_data, number_of_clusters):
    """Cluster repositories with MiniBatchKMeans and export the result.

    ``pd_data`` is split by ``self.__fetch_data`` into headers, repos and
    features. The features are clustered and the repos are grouped by their
    assigned label (one list per cluster, in cluster-number order). Everything
    is then handed to ``self.__export_k_means_results``.
    """
    headers, repos, features = self.__fetch_data(pd_data)

    mb_kmeans = MiniBatchKMeans(n_clusters=number_of_clusters)
    mb_kmeans.fit(features)

    # clusters[i] holds every repo whose label equals cluster number i
    clusters = [
        [repos[j] for j in range(0, len(mb_kmeans.labels_))
         if cluster_id == mb_kmeans.labels_[j]]
        for cluster_id in range(0, number_of_clusters)
    ]

    file_name = "mb_kmeans_noOfClusters" + str(number_of_clusters)  # avoid ".csv"
    out_file_path = os.path.join(out_path, file_name)
    self.__export_k_means_results(mb_kmeans, headers, clusters, out_file_path)
def __init__(self, n_codewords, normalization=3, inner_batch=128,
             dimension_ordering="tf"):
    """Store the encoder settings and create the (unfitted) codebook clusterer.

    ``n_codewords`` is the number of MiniBatchKMeans clusters;
    ``dimension_ordering`` is forwarded to the parent initialiser.
    """
    self.n_codewords = n_codewords
    self.inner_batch = inner_batch
    self.normalization = normalization

    clusterer_options = dict(
        n_clusters=self.n_codewords,
        n_init=1,
        compute_labels=False,
    )
    self._clusterer = cluster.MiniBatchKMeans(**clusterer_options)

    # NOTE(review): super(self.__class__, self) recurses forever if this class
    # is ever subclassed; the enclosing class name is not visible here, so the
    # call is kept as is.
    super(self.__class__, self).__init__(dimension_ordering)
def initial_centers(self, img_output):
    """Initialise the codebook centers with one k-means run per subspace.

    The columns of ``img_output`` are split into ``self.subspace_num``
    consecutive slices. Each slice is clustered into ``self.subcenter_num``
    centers, which are written into the matching rows and columns of a
    zero matrix of shape
    ``(subspace_num * subcenter_num, output_dim)``.

    Returns the filled matrix.
    """
    C_init = np.zeros([self.subspace_num * self.subcenter_num, self.output_dim])
    print("#DVSQ train# initilizing Centers")
    all_output = img_output
    for i in range(self.subspace_num):
        # Floor division keeps the slice bounds integral on Python 3 as well.
        col_lo = i * self.output_dim // self.subspace_num
        col_hi = (i + 1) * self.output_dim // self.subspace_num
        row_lo = i * self.subcenter_num
        row_hi = (i + 1) * self.subcenter_num
        kmeans = MiniBatchKMeans(n_clusters=self.subcenter_num).fit(all_output[:, col_lo:col_hi])
        C_init[row_lo:row_hi, col_lo:col_hi] = kmeans.cluster_centers_
        # Single-argument print gives the same text on Python 2 and 3.
        print("step:  {}  finish".format(i))
    return C_init
def KMEANS(data, k):
    """Cluster ``data`` into ``k`` groups, switching algorithm on sample count.

    Fewer than 20000 rows: full-batch ``k_means``. Otherwise
    ``MiniBatchKMeans`` with a batch size of ``n_samples // k``.

    Returns
    -------
    (centroids, cluster_IDs) : the cluster centers and the label of each row.
    """
    n_samples = data.shape[0]
    if n_samples < 20000:
        # k must be passed explicitly; the original call omitted it.
        # precompute_distances is dropped: 'auto' was its default and the
        # argument no longer exists in current scikit-learn releases.
        centroids, cluster_IDs, _ = k_means(data, k, init='k-means++',
                                            n_init=20, max_iter=200)
    else:
        # batch_size has to be an integer (true division yields a float on Python 3).
        mbkm = MiniBatchKMeans(n_clusters=k, init='k-means++', max_iter=100,
                               batch_size=max(1, n_samples // k), n_init=20)
        mbkm.fit(data)
        centroids = mbkm.cluster_centers_
        cluster_IDs = mbkm.labels_
    return centroids, cluster_IDs
def make_example_y_data(X, y=None, sample_weight=None, **kwargs):
    """Build example ``y`` data by clustering ``X.flat.values`` into 5 groups.

    Any ``y`` passed in is replaced by the predicted cluster labels.
    Returns ``(X, y, sample_weight)``; the original dropped the freshly
    computed ``y`` from the returned tuple.
    """
    values = X.flat.values
    fitted = MiniBatchKMeans(n_clusters=5).fit(values)
    y = fitted.predict(values)
    return (X, y, sample_weight)
def initial_centers(self, img_output, txt_output):
    """Initialise the codebook centers from stacked image and text outputs.

    ``img_output`` and ``txt_output`` are stacked row-wise. The columns are
    split into ``self.subspace_num`` consecutive slices and each slice is
    clustered into ``self.subcenter_num`` centers, which are written into the
    matching block of a zero matrix of shape
    ``(subspace_num * subcenter_num, output_dim)``.

    Returns the filled matrix.
    """
    C_init = np.zeros([self.subspace_num * self.subcenter_num, self.output_dim])
    print("#cdq train# initilizing Centers")
    all_output = np.vstack([img_output, txt_output])
    for i in range(self.subspace_num):
        # Floor division keeps the slice bounds integral on Python 3 as well.
        col_lo = i * self.output_dim // self.subspace_num
        col_hi = (i + 1) * self.output_dim // self.subspace_num
        row_lo = i * self.subcenter_num
        row_hi = (i + 1) * self.subcenter_num
        kmeans = MiniBatchKMeans(n_clusters=self.subcenter_num).fit(all_output[:, col_lo:col_hi])
        C_init[row_lo:row_hi, col_lo:col_hi] = kmeans.cluster_centers_
        # Single-argument print gives the same text on Python 2 and 3.
        print("step:  {}  finish".format(i))
    return C_init
def _kmeans_clustering(self, batch_size=128):
    """Fit MiniBatchKMeans on ``X`` and return the cluster centers.

    NOTE(review): ``X`` and ``n_clusters`` are not parameters of this method,
    so they must resolve from an enclosing scope — confirm against the
    original source.
    """
    estimator = MiniBatchKMeans(
        n_clusters=n_clusters,
        batch_size=batch_size,
        n_init=10,
        max_no_improvement=10,
    )
    estimator.fit(X)
    return estimator.cluster_centers_
def cluster(self):
    """Fit MiniBatchKMeans on the stored data and return the cluster centers.

    ``self.data`` maps a name to an array (or ``None``, which is skipped).
    When ``self.subsample`` is ``None`` every entry is used; otherwise a random
    fraction ``self.subsample`` of the names is drawn first.
    """
    mbk = MiniBatchKMeans(n_clusters=self.K, batch_size=self.K*2,
                          verbose=self.verbose, compute_labels=False)
    # random.sample needs a sequence; a dict key view is rejected on Python 3.
    fnames = list(self.data)
    if self.subsample is not None:  # sample number of files
        fnames = random.sample(fnames, int(self.subsample * len(fnames)))
    stacked = np.vstack([self.data[k] for k in fnames if self.data[k] is not None])
    mbk.fit(stacked)
    return mbk.cluster_centers_
def test_kmeans(self):
    """``utils.kmeans`` must return as many labels/centers as MiniBatchKMeans."""
    n_clusters = 5
    X, y = make_blobs(n_samples=1000, centers=n_clusters, random_state=0)
    centers, labels = utils.kmeans(X, n_clusters)

    reference = MiniBatchKMeans(n_clusters=n_clusters)
    reference_labels = reference.fit_predict(X)

    assert len(labels) == len(reference_labels)
    assert len(centers) == len(reference.cluster_centers_)
def kmeans(X, n_clusters, **kwargs):
    """Classify vectors in X using K-Means algorithm with n_clusters.

    ``n_clusters`` is an explicit parameter: the body always used it, and the
    caller in ``test_kmeans`` passes it positionally.
    Arguments in kwargs are passed to scikit-learn MiniBatchKMeans.

    Returns a tuple of cluster centers (cast to ``np.ubyte``) and predicted
    labels.
    """
    clf = MiniBatchKMeans(n_clusters=n_clusters, **kwargs)
    labels = clf.fit_predict(X)
    centers = clf.cluster_centers_.astype(np.ubyte)
    return centers, labels
def kmeans_classifier(prediction, ticket_predict_weights, ticket_target_list, tickets_to_weights_matrix):
    """Predict a target for a ticket with one k-means cluster per target.

    Fits MiniBatchKMeans on ``tickets_to_weights_matrix`` with
    ``len(ticket_target_list)`` clusters, predicts the cluster of the first
    row of ``ticket_predict_weights`` and uses it as an index into
    ``ticket_target_list``. The result is printed and, when ``prediction`` is
    not ``None``, appended to it as a one-element list.
    """
    kmeans = MiniBatchKMeans(n_clusters=len(ticket_target_list),
                             init_size=len(tickets_to_weights_matrix) + 1)
    kmeans.fit(tickets_to_weights_matrix)
    predicted_class = kmeans.predict(ticket_predict_weights)[0]
    predicted_target = ticket_target_list[predicted_class]
    # print() with one argument behaves the same on Python 2 and Python 3.
    print("kmeans prediction: {}".format(predicted_target))
    if prediction is not None:
        prediction.append([predicted_target])
def test_integrated_mini_batch_kmeans_elbow(self):
    """
    The k-elbow visualizer must run end to end with mini-batch kmeans

    See #182: cannot use occupancy dataset because of memory usage
    """
    # Synthetic blobs stand in for a real data set
    blob_options = dict(n_samples=1000, n_features=12, centers=6, shuffle=True)
    X, y = make_blobs(**blob_options)

    try:
        elbow = KElbowVisualizer(MiniBatchKMeans(), k=4)
        elbow.fit(X)
        elbow.poof()
    except Exception as e:
        self.fail("error during k-elbow: {}".format(e))
def test_integrated_mini_batch_kmeans_silhouette(self):
    """
    The silhouette visualizer must run end to end with mini-batch kmeans

    See #182: cannot use occupancy dataset because of memory usage
    """
    # Synthetic blobs stand in for a real data set
    blob_options = dict(n_samples=1000, centers=8, shuffle=True)
    X, y = make_blobs(**blob_options)

    try:
        silhouette = SilhouetteVisualizer(MiniBatchKMeans())
        silhouette.fit(X)
        silhouette.poof()
    except Exception as e:
        self.fail("error during silhouette: {}".format(e))
def perc_var_explained(X,k):
    """
    Compute the percentage of variance explained, defined as the between sum
    of squares divided by the total sum of squares.

    WARNING: It will take a while (all pairwise distances are computed).

    Params:
    --------
    X: numpy array with observations and features to be clustered
    k: number of clusters

    Returns:
    --------
    pve: percentage of variance explained
    """
    # Same estimator settings and nearest-centroid distances as avg_within_ss;
    # the original line here had lost the fit and the distance computation.
    model = MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=50,
                            n_init=3, max_no_improvement=10, verbose=0)
    model.fit(X)
    centroids = model.cluster_centers_
    dist_c = cdist(X, centroids, 'euclidean')
    dist = np.min(dist_c, axis=1)
    tot_withinss = sum(dist**2)
    totss = sum(pdist(X)**2)/X.shape[0]
    betweenss = totss - tot_withinss
    pve = (betweenss/totss *100)
    return pve
def bic(X, k):
    """
    Compute the BIC score of a MiniBatchKMeans fit with ``k`` clusters.

    Implementation from here:
    http://www.aladdin.cs.cmu.edu/papers/pdfs/y2000/xmeans.pdf
    with corrections from here:
    https://stats.stackexchange.com/questions/90769/using-bic-to-estimate-the-number-of-k-in-kmeans

    Params:
    --------
    X: numpy array with observations and features to be clustered
    k: number of clusters

    Returns:
    --------
    BIC: bic score
    """
    # The model must be fitted with k clusters: the loop below indexes
    # centers[c] for c in range(k).
    model = MiniBatchKMeans(init='k-means++', n_clusters=k, verbose=0)
    model.fit(X)
    centers = model.cluster_centers_
    # Give every center a leading axis so centers[c] is a 2-D (1, M) array for cdist.
    centers = np.expand_dims(centers, axis=1)
    labels = model.labels_
    N_C = np.bincount(labels)
    R, M = X.shape
    # Within-cluster sum of squared distances, accumulated cluster by cluster.
    wcss = sum([sum(cdist(X[np.where(labels == c)], centers[c], 'euclidean')**2) for c in range(k)])
    var = (1.0/(R-k)/M) * wcss
    const_term = 0.5 * k * np.log(R) * (M+1)
    BIC = np.sum([ ( Rn * np.log(Rn) ) -
                   ( Rn * np.log(R) ) -
                   ( ((Rn * M) / 2) * np.log(2*np.pi*var) ) -
                   ( (Rn - 1) * M/ 2 )
                   for Rn in N_C]) - const_term
    return BIC
def gen_cluster(keys = None, cluster_matrix = None):
    """Cluster the rows of ``cluster_matrix`` into 50 groups.

    The labels from ``MiniBatchKMeans.fit_predict`` on
    ``cluster_matrix.values`` are concatenated column-wise to ``keys``
    (column names are discarded by ``ignore_index=True``).

    Returns the combined DataFrame.
    """
    km = MiniBatchKMeans(n_clusters=50, batch_size=1000)
    # km = KMeans(n_jobs=-1,n_clusters=50)
    # print() with one argument behaves the same on Python 2 and Python 3.
    print("Clustering data...")
    labels = pd.DataFrame(km.fit_predict(cluster_matrix.values))
    res = pd.concat([keys, labels], axis = 1, ignore_index=True)
    return res
def gen_cluster(keys = None, cluster_matrix = None):
    """Cluster the rows of ``cluster_matrix`` into 50 groups.

    Both arguments are required despite their ``None`` defaults. The labels
    from ``MiniBatchKMeans.fit_predict`` on ``cluster_matrix.values`` are
    concatenated column-wise to ``keys`` (column names are discarded by
    ``ignore_index=True``).

    Returns the combined DataFrame.
    """
    # Compare against None explicitly: the truth value of a pandas object is
    # ambiguous and raises ValueError.
    assert cluster_matrix is not None and keys is not None
    km = MiniBatchKMeans(n_clusters=50, batch_size=1000)
    labels = pd.DataFrame(km.fit_predict(cluster_matrix.values))
    res = pd.concat([keys, labels], axis=1, ignore_index=True)
    return res
def fit(self, preprocess=True):
    """Cluster ``descs`` into ``self.num_clusters`` groups with MiniBatchKMeans.

    With ``preprocess`` the descriptors are standardised first and the fitted
    scaler is kept in ``self.stdSlr``; otherwise ``self.stdSlr`` is ``None``.
    Stores the centers in ``self.centers`` and the labels in ``self.clusters``.

    NOTE(review): ``descs`` is not a parameter of this method, so it must
    resolve from an enclosing scope — confirm against the original source.

    Returns the cluster centers.
    """
    if not preprocess:
        samples = descs
        self.stdSlr = None
    else:
        self.stdSlr = StandardScaler()
        self.stdSlr.fit(descs)
        samples = self.stdSlr.transform(descs)

    model = MiniBatchKMeans(init='k-means++', n_clusters=self.num_clusters,
                            batch_size=10000)
    model.fit(samples)
    self.centers = model.cluster_centers_
    self.clusters = model.labels_
    return self.centers
def MiniBatchKMeans(self, X, batch=10000):
    """Cluster ``X`` into ``self.k`` groups and return the cluster centers.

    ``X`` is an explicit parameter: the body always used it, and the sibling
    ``fit`` calls this method as ``self.MiniBatchKMeans(descs, batch_size)``.

    Inside the method the bare name ``MiniBatchKMeans`` refers to the
    module-level scikit-learn class, not to this method.
    """
    print("in fit method", X.shape, self.k)
    kmeans = MiniBatchKMeans(init='k-means++', n_clusters=self.k, batch_size=batch)
    kmeans.fit(X)
    centers = kmeans.cluster_centers_
    print("shape of centers is ", centers.shape)
    return centers
def initial_centers(self, img_output):
    """Initialise the codebook centers with one k-means run per subspace.

    The signature and loop body were garbled in this copy and have been
    rebuilt to mirror the sibling ``initial_centers`` implementations.
    The columns of ``img_output`` are split into ``self.subspace_num``
    consecutive slices and each slice is clustered into
    ``self.subcenter_num`` centers, which are written into the matching block
    of a zero matrix of shape ``(subspace_num * subcenter_num, output_dim)``.

    Returns the filled matrix.
    """
    C_init = np.zeros([self.subspace_num * self.subcenter_num, self.output_dim])
    print("#ZDQ train# initilizing Centers")
    all_output = img_output
    for i in range(self.subspace_num):
        # Floor division keeps the slice bounds integral on Python 3 as well.
        col_lo = i * self.output_dim // self.subspace_num
        col_hi = (i + 1) * self.output_dim // self.subspace_num
        row_lo = i * self.subcenter_num
        row_hi = (i + 1) * self.subcenter_num
        kmeans = MiniBatchKMeans(n_clusters=self.subcenter_num).fit(all_output[:, col_lo:col_hi])
        C_init[row_lo:row_hi, col_lo:col_hi] = kmeans.cluster_centers_
        # Single-argument print gives the same text on Python 2 and 3.
        print("step:  {}  finish".format(i))
    return C_init
def test_mb_k_means_plus_plus_init_dense_array():
    """k-means++ initialisation must give a valid fit on the dense data."""
    estimator = MiniBatchKMeans(
        init="k-means++",
        n_clusters=n_clusters,
        random_state=42,
    )
    estimator.fit(X)
    _check_fitted_model(estimator)
def test_mb_kmeans_verbose():
    """Fitting with ``verbose=1`` must not fail; its output is swallowed."""
    estimator = MiniBatchKMeans(init="k-means++", verbose=1)
    saved_stdout = sys.stdout
    # Redirect stdout so the verbose log does not pollute the test output.
    sys.stdout = StringIO()
    try:
        estimator.fit(X)
    finally:
        sys.stdout = saved_stdout
def test_mb_k_means_plus_plus_init_sparse_matrix():
    """k-means++ initialisation must give a valid fit on the sparse data."""
    # n_clusters is passed explicitly, exactly as in the dense twin of this
    # test, instead of silently falling back to the estimator default.
    mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters,
                                 random_state=42)
    mb_k_means.fit(X_csr)
    _check_fitted_model(mb_k_means)
def test_minibatch_init_with_large_k():
    """An ``init_size`` smaller than ``n_clusters`` must trigger a warning."""
    estimator = MiniBatchKMeans(n_clusters=20, init_size=10, init='k-means++')
    # 20 clusters cannot be seeded from only 10 samples, so fit has to warn.
    assert_warns(RuntimeWarning, estimator.fit, X)
def test_minibatch_k_means_random_init_sparse_csr():
    """Random initialisation must give a valid fit on the sparse data."""
    # increase n_init to make random init stable enough
    # n_clusters and random_state are set like in the sibling tests so the
    # fit is reproducible and does not rely on the estimator default.
    mb_k_means = MiniBatchKMeans(init="random", n_clusters=n_clusters,
                                 random_state=42, n_init=10).fit(X_csr)
    _check_fitted_model(mb_k_means)
def test_minibatch_k_means_perfect_init_dense_array():
    """Starting from the true centers must give a valid fit on dense data."""
    # An explicit init array must agree with n_clusters. Leaving n_clusters at
    # its default is the error case that
    # test_sparse_mb_k_means_callable_init checks for.
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 random_state=42, n_init=1).fit(X)
    _check_fitted_model(mb_k_means)
def test_minibatch_k_means_init_multiple_runs_with_explicit_centers():
    """``n_init > 1`` combined with explicit centers must raise a warning."""
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 random_state=42, n_init=10)
    # assert_warns needs the callable and its arguments; the original call
    # had lost ``mb_k_means.fit``.
    assert_warns(RuntimeWarning, mb_k_means.fit, X)
def test_minibatch_k_means_perfect_init_sparse_csr():
    """Starting from the true centers must give a valid fit on sparse data."""
    # An explicit init array must agree with n_clusters. Leaving n_clusters at
    # its default is the error case that
    # test_sparse_mb_k_means_callable_init checks for.
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 random_state=42, n_init=1).fit(X_csr)
    _check_fitted_model(mb_k_means)
def test_minibatch_with_many_reassignments():
    """``fit`` must cope with more clusters to reassign than ``batch_size``.

    550 clusters on 550 samples were found to make the number of clusters to
    reassign always exceed the batch size of 100.
    """
    n_samples = 550
    n_clusters = 550
    rng = np.random.RandomState(42)
    X = rng.uniform(size=(n_samples, 10))

    estimator = MiniBatchKMeans(
        n_clusters=n_clusters,
        batch_size=100,
        init_size=n_samples,
        random_state=42,
    )
    # The test passes as long as fitting completes without an error.
    estimator.fit(X)