返回检测到的簇的大小

问题描述

我在高维数据库上应用了二分 k-means(bisecting k-means)聚类,并想要显示得到的各个簇的大小,例如:簇1 = 2000个元素;簇2 = 3489个元素,依此类推。

我需要用哪些函数才能显示各个簇的大小?目前已经可以进行可视化,如下面的二维图所示:

Cluster size

函数定义如下所示:

def convert_to_2d_array(points):
    """
    Builds a new numpy array from `points`; a 1-D input is reshaped into a
    single-column 2-D array, input of any other rank is returned unchanged.
    """
    arr = np.array(points)
    if arr.ndim != 1:
        return arr
    # A flat sequence of n scalars becomes an (n, 1) column.
    return arr[:, np.newaxis]

def visualize_clusters(clusters):
    """
    Draws the clusters as a 2-D scatter plot of their first two coordinates,
    issuing one `plt.plot` call per cluster on a new figure.
    """
    plt.figure()
    for member_points in clusters:
        coords = convert_to_2d_array(member_points)
        n_dims = coords.shape[1]
        if n_dims < 2:
            # Pad with a zero-filled copy so a second coordinate exists.
            padding = np.zeros_like(coords)
            coords = np.hstack([coords, padding])
        xs = coords[:, 0]
        ys = coords[:, 1]
        plt.plot(xs, ys, 'o')
    plt.show()
    
def SSE(points):
    """
    Calculates the sum of squared errors for the given list of data points.

    The error of a point is its Euclidean distance to the centroid (the
    per-dimension mean) of `points`; the squares of those distances are
    summed.

    Args:
        points: Sequence or array of data points. It is passed through
            `convert_to_2d_array`, so a 1-D input is treated as a list of
            one-dimensional points.

    Returns:
        The sum of squared distances to the centroid, or 0.0 when `points`
        is empty.
    """
    points = convert_to_2d_array(points)
    if points.size == 0:
        # The mean of an empty set is undefined (numpy yields NaN plus a
        # RuntimeWarning); an empty set of points contributes no error.
        return 0.0
    centroid = np.mean(points,0)
    # Square the deviations directly: summing `np.linalg.norm` results would
    # add up plain distances, not squared errors.
    return np.sum((points-centroid)**2)

def kmeans(points,k=2,epochs=10,max_iter=100,verbose=False):
    """
    Clusters the list of points into `k` clusters using k-means clustering
    algorithm.

    The algorithm is restarted `epochs` times from randomly chosen initial
    centroids, and the partition with the lowest total SSE seen across all
    restarts and iterations is returned.

    Args:
        points: Sequence or array of data points. It is copied and converted
            with `convert_to_2d_array`, so the caller's data is not shuffled.
        k: Number of clusters.
        epochs: Number of random restarts.
        max_iter: Maximum number of assignment/update iterations per restart.
        verbose: When true, prints the SSE and its gain after every iteration.

    Returns:
        A list of `k` 2-D arrays, one per cluster, each holding the points
        assigned to that cluster (so `len(cluster)` is the cluster size). A
        cluster that attracted no points is an array with zero rows. Returns
        `None` if `epochs` or `max_iter` is 0, since no partition is computed.

    Raises:
        AssertionError: If there are fewer data points than `k`.
    """
    points = convert_to_2d_array(points)
    assert len(points) >= k,"Number of data points can't be less than k"

    best_clusters,best_sse = None,np.inf

    for ep in range(epochs):
        # Randomly initialize k centroids. A float copy is taken so centroid
        # updates never write into (or alias) the shuffled data.
        np.random.shuffle(points)
        centroids = points[0:k,:].astype(float)

        last_sse = np.inf

        for it in range(max_iter):
            # Cluster assignment: one vectorized distance pass per centroid
            # gives a (k, n_points) table; ties go to the lowest index.
            distances = np.array(
                [np.linalg.norm(points-c,ord=2,axis=1) for c in centroids])
            labels = np.argmin(distances,axis=0)
            # Boolean indexing copies, so stored clusters survive the
            # in-place shuffle of later epochs.
            clusters = [points[labels == i] for i in range(k)]

            # Centroid update; an empty cluster keeps its previous centroid
            # because the mean of no points is undefined.
            for i,c in enumerate(clusters):
                if len(c) > 0:
                    centroids[i] = np.mean(c,0)

            # SSE calculation (empty clusters contribute nothing)
            sse = np.sum([SSE(c) for c in clusters if len(c) > 0])
            gain = last_sse - sse
            if verbose:
                print((f'Epoch: {ep:3d},Iter: {it:4d},'
                       f'SSE: {sse:12.4f},Gain: {gain:12.4f}'))

            # Check for improvement
            if sse < best_sse:
                best_clusters,best_sse = clusters,sse

            # Epoch termination condition: stop once the SSE no longer
            # changes (gain is within `atol` of zero).
            if np.isclose(gain,0,atol=0.00001):
                break
            last_sse = sse

    # Return only after every epoch has run, so the best restart wins.
    return best_clusters

def bisecting_kmeans(points,verbose=False,*,k=2,epochs=10,max_iter=100):
    """
    Clusters the list of points into `k` clusters using bisecting k-means
    clustering algorithm. Internally,it uses the standard k-means with k=2 in
    each iteration.

    Starting from a single cluster holding all points, the cluster with the
    largest SSE is repeatedly removed and replaced by the two clusters that
    `kmeans` splits it into, until `k` clusters exist.

    Args:
        points: Sequence or array of data points (converted with
            `convert_to_2d_array`).
        verbose: Forwarded to `kmeans` to print its progress.
        k: Desired number of clusters (keyword-only).
        epochs: Number of random restarts for each internal `kmeans` call
            (keyword-only).
        max_iter: Maximum iterations per restart for each internal `kmeans`
            call (keyword-only).

    Returns:
        A list of clusters, each being the array of points in that cluster;
        the number of points in cluster `i` is `len(clusters[i])`.
    """
    points = convert_to_2d_array(points)
    clusters = [points]
    while len(clusters) < k:
        # Split the cluster that currently has the largest error.
        max_sse_i = np.argmax([SSE(c) for c in clusters])
        cluster = clusters.pop(max_sse_i)
        two_clusters = kmeans(
            cluster,epochs=epochs,max_iter=max_iter,verbose=verbose)
        clusters.extend(two_clusters)
    return clusters

在此先感谢您的帮助!

最诚挚的问候,

疲劳

解决方法

暂未找到可以解决该程序问题的有效方法,小编正在努力寻找整理中!

如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。

小编邮箱:dio#foxmail.com (将#修改为@)