Python sklearn.cluster module: FeatureAgglomeration() example source code
The following 6 code examples, extracted from open-source Python projects, illustrate how to use sklearn.cluster.FeatureAgglomeration().
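Before the extracted examples, here is a minimal self-contained sketch of the basic API (our own illustration, not one of the six examples): fit on an (n_samples, n_features) matrix, pool the feature clusters, then approximately reconstruct the original feature space.

import numpy as np
from sklearn.cluster import FeatureAgglomeration

X = np.random.RandomState(0).randn(20, 8)        # 20 samples, 8 features
agglo = FeatureAgglomeration(n_clusters=3).fit(X)
X_reduced = agglo.transform(X)                   # shape (20, 3): each column pools one feature cluster
X_restored = agglo.inverse_transform(X_reduced)  # shape (20, 8): cluster means broadcast back
print(X_reduced.shape, X_restored.shape)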
# Imports assumed from the original scikit-learn test module (older sklearn
# versions; newer releases moved linkage_tree and the testing helpers to
# private modules):
import numpy as np
from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration
from sklearn.cluster.hierarchical import linkage_tree
from sklearn.metrics.pairwise import cosine_distances, manhattan_distances
from sklearn.utils.testing import assert_raises, assert_array_equal

def test_linkage_misc():
    # Misc tests on linkage
    rng = np.random.RandomState(42)
    X = rng.normal(size=(5, 5))
    assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))
    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)
    # Test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])
    # Test hierarchical clustering on a callable affinity
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
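The precomputed-distance path also works through the public estimator API; a small sketch of our own, using 'complete' linkage since 'ward' requires raw feature vectors (note that scikit-learn 1.2+ renames the affinity parameter to metric):

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_distances

X = np.random.RandomState(42).normal(size=(5, 5))
D = cosine_distances(X)                          # pairwise distance matrix
model = AgglomerativeClustering(n_clusters=2, affinity='precomputed',
                                linkage='complete')
print(model.fit_predict(D))                      # cluster label per sample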
# Additional imports assumed from the same test module:
from sklearn.feature_extraction.image import grid_to_graph
from sklearn.utils.testing import assert_true, assert_array_almost_equal

def test_ward_agglomeration():
    # Check that we obtain the correct solution in a simplistic case
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)  # np.bool is deprecated in recent numpy
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)
    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)
    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
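For context, the connectivity constraint above comes from grid_to_graph, which links each pixel of an image grid to its neighbours so that only spatially adjacent features can be merged; a brief sketch:

from sklearn.feature_extraction.image import grid_to_graph

connectivity = grid_to_graph(10, 10)  # sparse adjacency of a 10x10 pixel grid
print(connectivity.shape)             # (100, 100), one node per pixel/feature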
import numpy as np
import pandas as pd
from sklearn.cluster import FeatureAgglomeration

def feature_agglomeration(df, number_of_clusters=None):
    df = df.copy()
    # Todo: find optimal number of clusters for the feature clustering
    # number_of_clusters = int(df.shape[1]/2)
    if number_of_clusters is None:
        # A default expression cannot reference df, so compute it here
        number_of_clusters = int(df.shape[1] / 1.2)
    agglomerated_features = FeatureAgglomeration(n_clusters=number_of_clusters)
    if any(tuple(df.columns == 'Call Outcome')):
        res = agglomerated_features.fit_transform(
            np.reshape(np.array(df.dropna().values), df.dropna().shape),
            y=df['Call Outcome'].values)
    else:
        res = agglomerated_features.fit_transform(
            np.reshape(np.array(df.values), df.shape))
    df = pd.DataFrame(data=res)
    return df
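A minimal usage sketch with synthetic data (the column names are illustrative and omit 'Call Outcome', so the unsupervised branch runs):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.randn(100, 12),
                  columns=['f%d' % i for i in range(12)])
reduced = feature_agglomeration(df)
print(reduced.shape)  # (100, 10): 12 features merged into int(12 / 1.2) clusters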
import matplotlib.pyplot as plt
import seaborn as sns

def dendrogram(df, number_of_clusters=None):
    if number_of_clusters is None:
        # A default expression cannot reference df, so compute it here
        number_of_clusters = int(df.shape[1] / 1.2)
    # Create Dendrogram (note: this estimator is built but not fitted here)
    agglomerated_features = FeatureAgglomeration(n_clusters=number_of_clusters)
    used_networks = np.arange(0, number_of_clusters, dtype=int)
    # Create a custom palette to identify the networks
    network_pal = sns.cubehelix_palette(len(used_networks),
                                        light=.9, dark=.1, reverse=True,
                                        start=1, rot=-2)
    network_lut = dict(zip(map(str, df.columns), network_pal))
    # Convert the palette to vectors that will be drawn on the side of the matrix
    networks = df.columns.get_level_values(None)
    network_colors = pd.Series(networks, index=df.columns).map(network_lut)
    sns.set(font="monospace")
    # Create custom colormap
    cmap = sns.diverging_palette(h_neg=210, h_pos=350, s=90, l=30, as_cmap=True)
    cg = sns.clustermap(df.astype(float).corr(), cmap=cmap, linewidths=.5,
                        row_colors=network_colors, col_colors=network_colors)
    plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
    plt.show()
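A hypothetical invocation with synthetic data; we pass number_of_clusters explicitly so that every column receives an entry in network_lut:

import numpy as np
import pandas as pd

rng = np.random.RandomState(1)
df = pd.DataFrame(rng.randn(50, 6), columns=list('abcdef'))
dendrogram(df, number_of_clusters=df.shape[1])  # clustermap of the 6x6 correlation matrix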
def feature_agglomeration(df):
    df = df.copy()
    # Todo: find optimal number of clusters for the feature clustering
    # number_of_clusters = int(df.shape[1]/2)
    number_of_clusters = int(df.shape[1] / 1.2)
    from sklearn.cluster import FeatureAgglomeration
    agglomerated_features = FeatureAgglomeration(n_clusters=number_of_clusters)
    # mask = ~df[features].isnull()
    # mask_index = mask[mask == 1].index
    if any(tuple(df.columns == 'SalePrice')):
        # res = agglomerated_features.fit_transform(np.reshape(np.array(
        #     df[HousePrices._feature_names_num.values][mask].values),
        #     df[HousePrices._feature_names_num.values][mask].shape),
        #     y=df.SalePrice.values).toarray()
        res = agglomerated_features.fit_transform(
            np.reshape(np.array(df.dropna().values), df.dropna().shape),
            y=df.SalePrice.values)
    else:
        # res = agglomerated_features.fit_transform(
        #     np.reshape(np.array(df.dropna().values), df.dropna().shape))
        res = agglomerated_features.fit_transform(
            np.reshape(np.array(df.values), df.shape))
    # Todo: in case of adding values using df.loc[], remember mask is only
    # possible for a single feature at a time.
    print(''.join(['labels:', str(agglomerated_features.labels_)]))
    print(''.join(['Children:', str(agglomerated_features.children_)]))
    print(''.join(['number of leaves in the hierarchical tree:',
                   str(agglomerated_features.n_leaves_)]))
    HousePrices.dendrogram(df, agglomerated_features.labels_)
    df = pd.DataFrame(data=res)
    return df
def data_compression(fmri_masked, mask_img, mask_np, output_size):
    """
    fmri_masked : array_like
        A matrix of shape (`N`, `V`) with `N` timepoints and `V` voxels:
        the functional dataset that needs to be reduced.
    mask_img : Nifti1Image
        The mask as a NIfTI image.
    mask_np : ndarray
        The mask as a boolean numpy array.
    output_size : integer
        The number of elements that the data should be reduced to.
    """
    import time
    from nilearn import input_data
    datacompressiontime = time.time()
    # Transform NIfTI files to a data matrix with the NiftiMasker
    # (note: this masker is built but not used further in this snippet)
    nifti_masker = input_data.NiftiMasker(mask_img=mask_img, memory='nilearn_cache',
                                          mask_strategy='background', memory_level=1,
                                          standardize=False)
    # Perform Ward clustering over the voxel grid: the connectivity graph
    # restricts merges to spatially adjacent voxels inside the mask
    from sklearn.feature_extraction import image
    shape = mask_np.shape
    connectivity = image.grid_to_graph(n_x=shape[0], n_y=shape[1],
                                       n_z=shape[2], mask=mask_np)
    from sklearn.cluster import FeatureAgglomeration
    start = time.time()
    ward = FeatureAgglomeration(n_clusters=output_size, connectivity=connectivity,
                                linkage='ward')
    ward.fit(fmri_masked)
    # print("Ward agglomeration compressing voxels into clusters: %.2fs" % (time.time() - start))
    labels = ward.labels_
    # print('Extracting reduced Dimension Data')
    data_reduced = ward.transform(fmri_masked)
    fmri_masked = []
    # print('Data compression took ', (time.time() - datacompressiontime), ' seconds')
    return {'data': data_reduced, 'labels': labels}
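A hypothetical invocation sketch, assuming a 4D functional image and a binary mask exist on disk (both file names are placeholders):

import nibabel as nib
from nilearn import input_data

mask_img = nib.load('mask.nii.gz')                  # placeholder path
mask_np = mask_img.get_fdata().astype(bool)
masker = input_data.NiftiMasker(mask_img=mask_img, standardize=False)
fmri_masked = masker.fit_transform('func.nii.gz')   # placeholder path, (timepoints, voxels)
compressed = data_compression(fmri_masked, mask_img, mask_np, output_size=800)
print(compressed['data'].shape)                     # (n_timepoints, 800)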