아래 링크를 통해 이 노트북을 주피터 노트북 뷰어(nbviewer.jupyter.org)로 보거나 구글 코랩(colab.research.google.com)에서 실행할 수 있습니다.
주피터 노트북 뷰어로 보기 | 구글 코랩(Colab)에서 실행하기 |
from IPython.display import Image
from sklearn.datasets import make_blobs

# Generate a toy dataset: 150 two-dimensional points drawn around
# three centers with small spread, shuffled, with a fixed seed.
X, y = make_blobs(n_samples=150,
                  n_features=2,
                  centers=3,
                  cluster_std=0.5,
                  shuffle=True,
                  random_state=0)

import matplotlib.pyplot as plt

# Show the raw, unlabeled samples.
plt.scatter(X[:, 0], X[:, 1],
            c='white', marker='o', edgecolor='black', s=50)
plt.grid()
plt.tight_layout()
# plt.savefig('images/11_01.png', dpi=300)
plt.show()
from sklearn.cluster import KMeans

# Fit k-means with 3 clusters using plain random initialization.
km = KMeans(n_clusters=3,
            init='random',
            n_init=10,
            max_iter=300,
            tol=1e-04,
            random_state=0)
y_km = km.fit_predict(X)

# One (color, marker, label) spec per predicted cluster index; the
# loop draws the same three scatter layers in the same order.
for idx, (color, marker, label) in enumerate(
        [('lightgreen', 's', 'Cluster 1'),
         ('orange', 'o', 'Cluster 2'),
         ('lightblue', 'v', 'Cluster 3')]):
    plt.scatter(X[y_km == idx, 0],
                X[y_km == idx, 1],
                s=50, c=color,
                marker=marker, edgecolor='black',
                label=label)

# Mark the learned cluster centers.
plt.scatter(km.cluster_centers_[:, 0],
            km.cluster_centers_[:, 1],
            s=250, marker='*',
            c='red', edgecolor='black',
            label='Centroids')
plt.legend(scatterpoints=1)
plt.grid()
plt.tight_layout()
# plt.savefig('images/11_02.png', dpi=300)
plt.show()
...
...
# Report the within-cluster SSE (inertia / "distortion") of the fitted
# k-means model; f-string replaces the dated %-formatting.
print(f'왜곡: {km.inertia_:.2f}')
왜곡: 72.48
# Elbow method: fit k-means for k = 1..10 and record each distortion
# (inertia) so the bend in the curve suggests a good cluster count.
distortions = []
for k in range(1, 11):
    km = KMeans(n_clusters=k,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=0)
    km.fit(X)
    distortions.append(km.inertia_)

plt.plot(range(1, 11), distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.tight_layout()
# plt.savefig('images/11_03.png', dpi=300)
plt.show()
import numpy as np
from matplotlib import cm
from sklearn.metrics import silhouette_samples

# Refit with k-means++ initialization and the (correct) k = 3.
km = KMeans(n_clusters=3,
            init='k-means++',
            n_init=10,
            max_iter=300,
            tol=1e-04,
            random_state=0)
y_km = km.fit_predict(X)

cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]
silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')

# One horizontal bar per sample, grouped by cluster and sorted by
# silhouette value within each group.
y_ax_lower, y_ax_upper = 0, 0
yticks = []
for idx, label in enumerate(cluster_labels):
    vals = np.sort(silhouette_vals[y_km == label])
    y_ax_upper += len(vals)
    plt.barh(range(y_ax_lower, y_ax_upper), vals, height=1.0,
             edgecolor='none',
             color=cm.jet(float(idx) / n_clusters))
    yticks.append((y_ax_lower + y_ax_upper) / 2.)
    y_ax_lower += len(vals)

# Reference line at the mean silhouette coefficient over all samples.
silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color="red", linestyle="--")
plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.tight_layout()
# plt.savefig('images/11_04.png', dpi=300)
plt.show()
잘못된 클러스터링:
# Deliberately mis-specified model: k = 2 on data with 3 true blobs,
# to show how the silhouette plot exposes a bad cluster count.
km = KMeans(n_clusters=2,
            init='k-means++',
            n_init=10,
            max_iter=300,
            tol=1e-04,
            random_state=0)
y_km = km.fit_predict(X)

for idx, (color, marker, label) in enumerate(
        [('lightgreen', 's', 'Cluster 1'),
         ('orange', 'o', 'Cluster 2')]):
    plt.scatter(X[y_km == idx, 0],
                X[y_km == idx, 1],
                s=50,
                c=color,
                edgecolor='black',
                marker=marker,
                label=label)

plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
            s=250, marker='*', c='red', label='Centroids')
plt.legend()
plt.grid()
plt.tight_layout()
# plt.savefig('images/11_05.png', dpi=300)
plt.show()
# Silhouette plot for the 2-cluster solution above; with the wrong k
# the per-cluster profiles come out uneven.
cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]
silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')

y_ax_lower = 0
yticks = []
for i, c in enumerate(cluster_labels):
    c_silhouette_vals = silhouette_vals[y_km == c]
    c_silhouette_vals.sort()
    # Running upper bound instead of a second accumulator.
    y_ax_upper = y_ax_lower + len(c_silhouette_vals)
    plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals,
             height=1.0, edgecolor='none',
             color=cm.jet(float(i) / n_clusters))
    yticks.append((y_ax_lower + y_ax_upper) / 2.)
    y_ax_lower = y_ax_upper

silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color="red", linestyle="--")
plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.tight_layout()
# plt.savefig('images/11_06.png', dpi=300)
plt.show()
# Render an externally hosted figure in the notebook (image content
# is not visible from here — presumably a chapter illustration).
Image(url='https://git.io/JLdsq', width=400)
import pandas as pd
import numpy as np

# Fixed seed so the toy 5x3 matrix below is reproducible.
np.random.seed(123)
variables = ['X', 'Y', 'Z']
labels = ['ID_%d' % k for k in range(5)]
X = 10 * np.random.random_sample([5, 3])
df = pd.DataFrame(X, index=labels, columns=variables)
df
X | Y | Z | |
---|---|---|---|
ID_0 | 6.964692 | 2.861393 | 2.268515 |
ID_1 | 5.513148 | 7.194690 | 4.231065 |
ID_2 | 9.807642 | 6.848297 | 4.809319 |
ID_3 | 3.921175 | 3.431780 | 7.290497 |
ID_4 | 4.385722 | 0.596779 | 3.980443 |
from scipy.spatial.distance import pdist, squareform

# Pairwise Euclidean distances in condensed form, expanded to a
# square matrix and labeled on both axes.
condensed = pdist(df, metric='euclidean')
row_dist = pd.DataFrame(squareform(condensed),
                        index=labels,
                        columns=labels)
row_dist
ID_0 | ID_1 | ID_2 | ID_3 | ID_4 | |
---|---|---|---|---|---|
ID_0 | 0.000000 | 4.973534 | 5.516653 | 5.899885 | 3.835396 |
ID_1 | 4.973534 | 0.000000 | 4.347073 | 5.104311 | 6.698233 |
ID_2 | 5.516653 | 4.347073 | 0.000000 | 7.244262 | 8.316594 |
ID_3 | 5.899885 | 5.104311 | 7.244262 | 0.000000 | 4.382864 |
ID_4 | 3.835396 | 6.698233 | 8.316594 | 4.382864 | 0.000000 |
함수 설명을 보면 pdist
함수에서 계산한 축약된 거리 행렬(상삼각행렬(upper triangular matrix))을 입력 값으로 사용할 수 있습니다. 아니면 linkage
함수에 초기 데이터 배열을 전달하고 metric='euclidean'
지표를 매개변수로 사용할 수 있습니다. 앞서 squareform
함수로 만든 정방 거리 행렬은 linkage
함수가 기대하는 값과 다르기 때문에 사용해서는 안 됩니다.
# 1. Incorrect approach: feeding the square-form distance matrix.
#    Kept on purpose — the text demonstrates the ClusterWarning this
#    raises and the wrong merge distances it produces.
from scipy.cluster.hierarchy import linkage

row_clusters = linkage(row_dist, method='complete', metric='euclidean')
result_columns = ['row label 1', 'row label 2',
                  'distance', 'no. of items in clust.']
pd.DataFrame(row_clusters,
             columns=result_columns,
             index=['cluster %d' % (i + 1)
                    for i in range(row_clusters.shape[0])])
<ipython-input-14-9dd0eba25b7e>:5: ClusterWarning: scipy.cluster: The symmetric non-negative hollow observation matrix looks suspiciously like an uncondensed distance matrix row_clusters = linkage(row_dist, method='complete', metric='euclidean')
row label 1 | row label 2 | distance | no. of items in clust. | |
---|---|---|---|---|
cluster 1 | 0.0 | 4.0 | 6.521973 | 2.0 |
cluster 2 | 1.0 | 2.0 | 6.729603 | 2.0 |
cluster 3 | 3.0 | 5.0 | 8.539247 | 3.0 |
cluster 4 | 6.0 | 7.0 | 12.444824 | 5.0 |
# 2. Correct approach: pass the condensed distance matrix from pdist.
row_clusters = linkage(pdist(df, metric='euclidean'), method='complete')
cluster_index = ['cluster %d' % (i + 1)
                 for i in range(row_clusters.shape[0])]
pd.DataFrame(row_clusters,
             columns=['row label 1', 'row label 2',
                      'distance', 'no. of items in clust.'],
             index=cluster_index)
row label 1 | row label 2 | distance | no. of items in clust. | |
---|---|---|---|---|
cluster 1 | 0.0 | 4.0 | 3.835396 | 2.0 |
cluster 2 | 1.0 | 2.0 | 4.347073 | 2.0 |
cluster 3 | 3.0 | 5.0 | 5.899885 | 3.0 |
cluster 4 | 6.0 | 7.0 | 8.316594 | 5.0 |
# 3. Also correct: pass the raw sample matrix with an explicit metric;
#    linkage then computes the condensed distances itself.
row_clusters = linkage(df.values, method='complete', metric='euclidean')
pd.DataFrame(
    row_clusters,
    columns=['row label 1', 'row label 2',
             'distance', 'no. of items in clust.'],
    index=['cluster %d' % (i + 1)
           for i in range(row_clusters.shape[0])])
row label 1 | row label 2 | distance | no. of items in clust. | |
---|---|---|---|---|
cluster 1 | 0.0 | 4.0 | 3.835396 | 2.0 |
cluster 2 | 1.0 | 2.0 | 4.347073 | 2.0 |
cluster 3 | 3.0 | 5.0 | 5.899885 | 3.0 |
cluster 4 | 6.0 | 7.0 | 8.316594 | 5.0 |
from scipy.cluster.hierarchy import dendrogram

# To render the dendrogram in black only, uncomment both parts:
# part 1/2:
# from scipy.cluster.hierarchy import set_link_color_palette
# set_link_color_palette(['black'])

row_dendr = dendrogram(
    row_clusters,
    labels=labels,
    # part 2/2: color_threshold=np.inf
)
plt.ylabel('Euclidean distance')
plt.tight_layout()
# plt.savefig('images/11_11.png', dpi=300, bbox_inches='tight')
plt.show()
# Attach a row dendrogram to a heat map of the clustered data.
fig = plt.figure(figsize=(8, 8), facecolor='white')
axd = fig.add_axes([0.09, 0.1, 0.2, 0.6])
# NOTE: for matplotlib < v1.5.1 use orientation='right'
row_dendr = dendrogram(row_clusters, orientation='left')

# Reorder the DataFrame rows to match the dendrogram leaf order.
df_rowclust = df.iloc[row_dendr['leaves'][::-1]]

# Hide the dendrogram's ticks and spines.
axd.set_xticks([])
axd.set_yticks([])
for spine in axd.spines.values():
    spine.set_visible(False)

# Draw the heat map next to the dendrogram.
axm = fig.add_axes([0.23, 0.1, 0.6, 0.6])  # x-pos, y-pos, width, height
cax = axm.matshow(df_rowclust, interpolation='nearest', cmap='hot_r')
fig.colorbar(cax)
# Set tick positions explicitly before labeling them. This replaces the
# old `[''] + list(...)` hack, which raised "FixedFormatter should only
# be used together with FixedLocator" (the warning shown in the output).
axm.set_xticks(range(df_rowclust.shape[1]))
axm.set_xticklabels(list(df_rowclust.columns))
axm.set_yticks(range(df_rowclust.shape[0]))
axm.set_yticklabels(list(df_rowclust.index))
# plt.savefig('images/11_12.png', dpi=300)
plt.show()
<ipython-input-18-eb84f823cd67>:21: UserWarning: FixedFormatter should only be used together with FixedLocator axm.set_xticklabels([''] + list(df_rowclust.columns)) <ipython-input-18-eb84f823cd67>:22: UserWarning: FixedFormatter should only be used together with FixedLocator axm.set_yticklabels([''] + list(df_rowclust.index))
from sklearn.cluster import AgglomerativeClustering

# `affinity` was deprecated in scikit-learn 1.2 and removed in 1.4;
# the distance metric is therefore passed via `metric`.
ac = AgglomerativeClustering(n_clusters=3,
                             metric='euclidean',
                             linkage='complete')
labels = ac.fit_predict(X)
print('클러스터 레이블: %s' % labels)
클러스터 레이블: [1 0 0 2 1]
# Same model, cluster count reduced to two. (`metric` replaces the
# `affinity` parameter deprecated in scikit-learn 1.2, removed in 1.4.)
ac = AgglomerativeClustering(n_clusters=2,
                             metric='euclidean',
                             linkage='complete')
labels = ac.fit_predict(X)
print('클러스터 레이블: %s' % labels)
클러스터 레이블: [0 1 1 0 0]
# Render an externally hosted figure in the notebook (image content
# is not visible from here — presumably a chapter illustration).
Image(url='https://git.io/JLdsY', width=500)
from sklearn.datasets import make_moons

# Two interleaving half-moons: non-convex clusters that defeat k-means.
X, y = make_moons(n_samples=200, noise=0.05, random_state=0)

plt.scatter(X[:, 0], X[:, 1])
plt.tight_layout()
# plt.savefig('images/11_14.png', dpi=300)
plt.show()
K-평균과 계층 군집:
# Side-by-side comparison of k-means and agglomerative clustering on
# the half-moon data; both fail on the non-convex shapes.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 3))

km = KMeans(n_clusters=2, n_init=10, random_state=0)
y_km = km.fit_predict(X)
for idx, (color, marker, label) in enumerate(
        [('lightblue', 'o', 'cluster 1'),
         ('red', 's', 'cluster 2')]):
    ax1.scatter(X[y_km == idx, 0], X[y_km == idx, 1],
                edgecolor='black',
                c=color, marker=marker, s=40, label=label)
ax1.set_title('K-means clustering')

# `metric` replaces the `affinity` parameter deprecated in
# scikit-learn 1.2 and removed in 1.4.
ac = AgglomerativeClustering(n_clusters=2,
                             metric='euclidean',
                             linkage='complete')
y_ac = ac.fit_predict(X)
for idx, (color, marker, label) in enumerate(
        [('lightblue', 'o', 'Cluster 1'),
         ('red', 's', 'Cluster 2')]):
    ax2.scatter(X[y_ac == idx, 0], X[y_ac == idx, 1],
                c=color, edgecolor='black',
                marker=marker, s=40, label=label)
ax2.set_title('Agglomerative clustering')

plt.legend()
plt.tight_layout()
# plt.savefig('images/11_15.png', dpi=300)
plt.show()
DBSCAN:
from sklearn.cluster import DBSCAN

# Density-based clustering separates the non-convex moons correctly.
db = DBSCAN(eps=0.2, min_samples=5, metric='euclidean')
y_db = db.fit_predict(X)

for idx, (color, marker, label) in enumerate(
        [('lightblue', 'o', 'Cluster 1'),
         ('red', 's', 'Cluster 2')]):
    plt.scatter(X[y_db == idx, 0], X[y_db == idx, 1],
                c=color, marker=marker, s=40,
                edgecolor='black',
                label=label)
plt.legend()
plt.tight_layout()
# plt.savefig('images/11_16.png', dpi=300)
plt.show()
...