# Beer dataset: 20 beers with calories, sodium, alcohol, and cost,
# loaded from a space-delimited text file on GitHub.
import pandas as pd
DATA_URL = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/beer.txt'
beer = pd.read_csv(DATA_URL, sep=' ')
beer
name | calories | sodium | alcohol | cost | |
---|---|---|---|---|---|
0 | Budweiser | 144 | 15 | 4.7 | 0.43 |
1 | Schlitz | 151 | 19 | 4.9 | 0.43 |
2 | Lowenbrau | 157 | 15 | 0.9 | 0.48 |
3 | Kronenbourg | 170 | 7 | 5.2 | 0.73 |
4 | Heineken | 152 | 11 | 5.0 | 0.77 |
5 | Old_Milwaukee | 145 | 23 | 4.6 | 0.28 |
6 | Augsberger | 175 | 24 | 5.5 | 0.40 |
7 | Srohs_Bohemian_Style | 149 | 27 | 4.7 | 0.42 |
8 | Miller_Lite | 99 | 10 | 4.3 | 0.43 |
9 | Budweiser_Light | 113 | 8 | 3.7 | 0.40 |
10 | Coors | 140 | 18 | 4.6 | 0.44 |
11 | Coors_Light | 102 | 15 | 4.1 | 0.46 |
12 | Michelob_Light | 135 | 11 | 4.2 | 0.50 |
13 | Becks | 150 | 19 | 4.7 | 0.76 |
14 | Kirin | 149 | 6 | 5.0 | 0.79 |
15 | Pabst_Extra_Light | 68 | 15 | 2.3 | 0.38 |
16 | Hamms | 139 | 19 | 4.4 | 0.43 |
17 | Heilemans_Old_Style | 144 | 24 | 4.9 | 0.43 |
18 | Olympia_Goled_Light | 72 | 6 | 2.9 | 0.46 |
19 | Schlitz_Light | 97 | 7 | 4.2 | 0.47 |
How would you cluster these beers?
# define X: the feature matrix — every column except the beer name
# (clustering is unsupervised, so there is no y)
X = beer.loc[:, beer.columns != 'name']
What happened to y?
# K-means with 3 clusters (random_state fixed so the centroid
# initialization — and therefore the labels — are reproducible)
from sklearn.cluster import KMeans
km = KMeans(n_clusters=3, random_state=1).fit(X)
km
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto', random_state=1, tol=0.0001, verbose=0)
# review the cluster labels: one integer label (0..2) per beer
km.labels_
array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 2, 0, 0, 2, 1])
# save the cluster labels and sort by cluster
# NOTE: DataFrame.sort() was deprecated in pandas 0.17 and removed in
# 0.20; sort_values() is the supported replacement.
beer['cluster'] = km.labels_
beer.sort_values('cluster')
name | calories | sodium | alcohol | cost | cluster | |
---|---|---|---|---|---|---|
0 | Budweiser | 144 | 15 | 4.7 | 0.43 | 0 |
1 | Schlitz | 151 | 19 | 4.9 | 0.43 | 0 |
2 | Lowenbrau | 157 | 15 | 0.9 | 0.48 | 0 |
3 | Kronenbourg | 170 | 7 | 5.2 | 0.73 | 0 |
4 | Heineken | 152 | 11 | 5.0 | 0.77 | 0 |
5 | Old_Milwaukee | 145 | 23 | 4.6 | 0.28 | 0 |
6 | Augsberger | 175 | 24 | 5.5 | 0.40 | 0 |
7 | Srohs_Bohemian_Style | 149 | 27 | 4.7 | 0.42 | 0 |
17 | Heilemans_Old_Style | 144 | 24 | 4.9 | 0.43 | 0 |
16 | Hamms | 139 | 19 | 4.4 | 0.43 | 0 |
10 | Coors | 140 | 18 | 4.6 | 0.44 | 0 |
14 | Kirin | 149 | 6 | 5.0 | 0.79 | 0 |
12 | Michelob_Light | 135 | 11 | 4.2 | 0.50 | 0 |
13 | Becks | 150 | 19 | 4.7 | 0.76 | 0 |
9 | Budweiser_Light | 113 | 8 | 3.7 | 0.40 | 1 |
8 | Miller_Lite | 99 | 10 | 4.3 | 0.43 | 1 |
11 | Coors_Light | 102 | 15 | 4.1 | 0.46 | 1 |
19 | Schlitz_Light | 97 | 7 | 4.2 | 0.47 | 1 |
15 | Pabst_Extra_Light | 68 | 15 | 2.3 | 0.38 | 2 |
18 | Olympia_Goled_Light | 72 | 6 | 2.9 | 0.46 | 2 |
What do the clusters seem to be based on? Why?
# review the cluster centers: one row per cluster, columns in feature
# order (calories, sodium, alcohol, cost)
km.cluster_centers_
array([[ 150. , 17. , 4.52142857, 0.52071429], [ 102.75 , 10. , 4.075 , 0.44 ], [ 70. , 10.5 , 2.6 , 0.42 ]])
# per-cluster mean of each feature (matches km.cluster_centers_)
grouped = beer.groupby('cluster')
grouped.mean()
calories | sodium | alcohol | cost | |
---|---|---|---|---|
cluster | ||||
0 | 150.00 | 17.0 | 4.521429 | 0.520714 |
1 | 102.75 | 10.0 | 4.075000 | 0.440000 |
2 | 70.00 | 10.5 | 2.600000 | 0.420000 |
# keep the per-cluster feature means as a DataFrame, for plotting the
# cluster centers later
by_cluster = beer.groupby('cluster')
centers = by_cluster.mean()
# allow plots to appear in the notebook
%matplotlib inline
import matplotlib.pyplot as plt
# bump the default font size for readability
plt.rcParams['font.size'] = 14
# color lookup table indexed by cluster label; the trailing 'yellow' is
# reached by label -1 (DBSCAN noise) through numpy's negative indexing
import numpy as np
colors = np.array('red green blue yellow'.split())
# scatter plot of calories versus alcohol, colored by cluster (0=red, 1=green, 2=blue)
point_colors = colors[beer.cluster]
plt.scatter(beer.calories, beer.alcohol, s=50, c=point_colors)
# mark each cluster center with a large black "+"
plt.scatter(centers.calories, centers.alcohol, marker='+', s=300, linewidths=3, c='black')
# label the axes
plt.xlabel('calories')
plt.ylabel('alcohol')
<matplotlib.text.Text at 0x18bce240>
# scatter plot matrix (0=red, 1=green, 2=blue)
# NOTE: pd.scatter_matrix() was deprecated in pandas 0.20 and later
# removed; pandas.plotting.scatter_matrix is the supported location.
pd.plotting.scatter_matrix(X, c=colors[beer.cluster], figsize=(10,10), s=100)
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000000018BCEFD0>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000018EE5908>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000018F06518>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000019023F28>], [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000019046A58>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000019168EB8>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000019185908>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000019326D68>], [<matplotlib.axes._subplots.AxesSubplot object at 0x00000000191755F8>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000194A8F60>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000194D0B70>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000195F3748>], [<matplotlib.axes._subplots.AxesSubplot object at 0x000000001979B278>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000197C06D8>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000198A2128>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000198C7668>]], dtype=object)
# standardize the features so each has mean 0 and variance 1 —
# otherwise calories (scale ~100) dominates the distance metric
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
# re-run K-means (3 clusters, same seed) on the standardized features
km = KMeans(n_clusters=3, random_state=1).fit(X_scaled)
km
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto', random_state=1, tol=0.0001, verbose=0)
# overwrite the labels with the scaled-data clustering and sort by cluster
# NOTE: DataFrame.sort() was removed in pandas 0.20 — use sort_values().
beer['cluster'] = km.labels_
beer.sort_values('cluster')
name | calories | sodium | alcohol | cost | cluster | |
---|---|---|---|---|---|---|
0 | Budweiser | 144 | 15 | 4.7 | 0.43 | 0 |
1 | Schlitz | 151 | 19 | 4.9 | 0.43 | 0 |
17 | Heilemans_Old_Style | 144 | 24 | 4.9 | 0.43 | 0 |
16 | Hamms | 139 | 19 | 4.4 | 0.43 | 0 |
5 | Old_Milwaukee | 145 | 23 | 4.6 | 0.28 | 0 |
6 | Augsberger | 175 | 24 | 5.5 | 0.40 | 0 |
7 | Srohs_Bohemian_Style | 149 | 27 | 4.7 | 0.42 | 0 |
10 | Coors | 140 | 18 | 4.6 | 0.44 | 0 |
15 | Pabst_Extra_Light | 68 | 15 | 2.3 | 0.38 | 1 |
12 | Michelob_Light | 135 | 11 | 4.2 | 0.50 | 1 |
11 | Coors_Light | 102 | 15 | 4.1 | 0.46 | 1 |
9 | Budweiser_Light | 113 | 8 | 3.7 | 0.40 | 1 |
8 | Miller_Lite | 99 | 10 | 4.3 | 0.43 | 1 |
2 | Lowenbrau | 157 | 15 | 0.9 | 0.48 | 1 |
18 | Olympia_Goled_Light | 72 | 6 | 2.9 | 0.46 | 1 |
19 | Schlitz_Light | 97 | 7 | 4.2 | 0.47 | 1 |
13 | Becks | 150 | 19 | 4.7 | 0.76 | 2 |
14 | Kirin | 149 | 6 | 5.0 | 0.79 | 2 |
4 | Heineken | 152 | 11 | 5.0 | 0.77 | 2 |
3 | Kronenbourg | 170 | 7 | 5.2 | 0.73 | 2 |
What are the "characteristics" of each cluster?
# per-cluster feature means, back in the original (unscaled) units
clusters = beer.groupby('cluster')
clusters.mean()
calories | sodium | alcohol | cost | |
---|---|---|---|---|
cluster | ||||
0 | 148.375 | 21.125 | 4.7875 | 0.4075 |
1 | 105.375 | 10.875 | 3.3250 | 0.4475 |
2 | 155.250 | 10.750 | 4.9750 | 0.7625 |
# scatter plot matrix of new cluster assignments (0=red, 1=green, 2=blue)
# NOTE: pd.scatter_matrix() was deprecated in pandas 0.20; use
# pandas.plotting.scatter_matrix instead.
pd.plotting.scatter_matrix(X, c=colors[beer.cluster], figsize=(10,10), s=100)
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000000001A2FA9E8>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001A540C50>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001A723710>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001A74B518>], [<matplotlib.axes._subplots.AxesSubplot object at 0x000000001A72F470>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001A8CF860>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001A9B96A0>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001A9D4EF0>], [<matplotlib.axes._subplots.AxesSubplot object at 0x000000001AB3BF60>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001AB4FCF8>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001B4248D0>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001B58BDA0>], [<matplotlib.axes._subplots.AxesSubplot object at 0x000000001B5A2FD0>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001B7508D0>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001B7794E0>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001B8586A0>]], dtype=object)
Do you notice any cluster assignments that seem a bit odd? How might we explain those?
The Silhouette Coefficient is a common metric for evaluating clustering "performance" in situations when the "true" cluster assignments are not known.
A Silhouette Coefficient is calculated for each observation:
$$SC = \frac{b - a}{\max(a, b)}$$
where $a$ is the mean distance from an observation to the other points in its own cluster, and $b$ is the mean distance from that observation to the points in the nearest other cluster. SC ranges from -1 (worst) to 1 (best). A global score is calculated by taking the mean score for all observations.
# Silhouette Coefficient for the K=3 clustering of the scaled data
from sklearn import metrics
sc = metrics.silhouette_score(X_scaled, km.labels_)
sc
0.4577741591090948
# sweep K from 2 through 19 and record the Silhouette Coefficient for each
k_range = range(2, 20)
scores = []
for k in k_range:
    km = KMeans(n_clusters=k, random_state=1).fit(X_scaled)
    scores.append(metrics.silhouette_score(X_scaled, km.labels_))
c:\Users\Kevin\Anaconda\lib\site-packages\numpy\core\_methods.py:59: RuntimeWarning: Mean of empty slice. warnings.warn("Mean of empty slice.", RuntimeWarning)
# plot the Silhouette Coefficient against K to pick the best cluster count
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')
plt.grid(True)
# K-means with 4 clusters on scaled data
km = KMeans(n_clusters=4, random_state=1)
km.fit(X_scaled)
# save the labels and sort by cluster
# NOTE: DataFrame.sort() was removed in pandas 0.20 — use sort_values().
beer['cluster'] = km.labels_
beer.sort_values('cluster')
name | calories | sodium | alcohol | cost | cluster | |
---|---|---|---|---|---|---|
0 | Budweiser | 144 | 15 | 4.7 | 0.43 | 0 |
1 | Schlitz | 151 | 19 | 4.9 | 0.43 | 0 |
17 | Heilemans_Old_Style | 144 | 24 | 4.9 | 0.43 | 0 |
16 | Hamms | 139 | 19 | 4.4 | 0.43 | 0 |
5 | Old_Milwaukee | 145 | 23 | 4.6 | 0.28 | 0 |
6 | Augsberger | 175 | 24 | 5.5 | 0.40 | 0 |
7 | Srohs_Bohemian_Style | 149 | 27 | 4.7 | 0.42 | 0 |
10 | Coors | 140 | 18 | 4.6 | 0.44 | 0 |
15 | Pabst_Extra_Light | 68 | 15 | 2.3 | 0.38 | 1 |
12 | Michelob_Light | 135 | 11 | 4.2 | 0.50 | 1 |
11 | Coors_Light | 102 | 15 | 4.1 | 0.46 | 1 |
9 | Budweiser_Light | 113 | 8 | 3.7 | 0.40 | 1 |
8 | Miller_Lite | 99 | 10 | 4.3 | 0.43 | 1 |
18 | Olympia_Goled_Light | 72 | 6 | 2.9 | 0.46 | 1 |
19 | Schlitz_Light | 97 | 7 | 4.2 | 0.47 | 1 |
13 | Becks | 150 | 19 | 4.7 | 0.76 | 2 |
14 | Kirin | 149 | 6 | 5.0 | 0.79 | 2 |
4 | Heineken | 152 | 11 | 5.0 | 0.77 | 2 |
3 | Kronenbourg | 170 | 7 | 5.2 | 0.73 | 2 |
2 | Lowenbrau | 157 | 15 | 0.9 | 0.48 | 3 |
# DBSCAN: density-based clustering with neighborhood radius eps=1 and
# a minimum of 3 samples per dense region
from sklearn.cluster import DBSCAN
db = DBSCAN(eps=1, min_samples=3).fit(X_scaled)
db
DBSCAN(algorithm='auto', eps=1, leaf_size=30, metric='euclidean', min_samples=3, p=None, random_state=None)
# review the cluster labels (-1 marks noise points DBSCAN assigned to no cluster)
db.labels_
array([ 0, 0, -1, 1, 1, -1, -1, 0, 2, 2, 0, 2, 0, -1, 1, -1, 0, 0, -1, 2], dtype=int64)
# save the DBSCAN labels and sort by cluster
# NOTE: DataFrame.sort() was removed in pandas 0.20 — use sort_values().
beer['cluster'] = db.labels_
beer.sort_values('cluster')
name | calories | sodium | alcohol | cost | cluster | |
---|---|---|---|---|---|---|
2 | Lowenbrau | 157 | 15 | 0.9 | 0.48 | -1 |
5 | Old_Milwaukee | 145 | 23 | 4.6 | 0.28 | -1 |
6 | Augsberger | 175 | 24 | 5.5 | 0.40 | -1 |
18 | Olympia_Goled_Light | 72 | 6 | 2.9 | 0.46 | -1 |
13 | Becks | 150 | 19 | 4.7 | 0.76 | -1 |
15 | Pabst_Extra_Light | 68 | 15 | 2.3 | 0.38 | -1 |
0 | Budweiser | 144 | 15 | 4.7 | 0.43 | 0 |
1 | Schlitz | 151 | 19 | 4.9 | 0.43 | 0 |
7 | Srohs_Bohemian_Style | 149 | 27 | 4.7 | 0.42 | 0 |
17 | Heilemans_Old_Style | 144 | 24 | 4.9 | 0.43 | 0 |
10 | Coors | 140 | 18 | 4.6 | 0.44 | 0 |
16 | Hamms | 139 | 19 | 4.4 | 0.43 | 0 |
12 | Michelob_Light | 135 | 11 | 4.2 | 0.50 | 0 |
3 | Kronenbourg | 170 | 7 | 5.2 | 0.73 | 1 |
4 | Heineken | 152 | 11 | 5.0 | 0.77 | 1 |
14 | Kirin | 149 | 6 | 5.0 | 0.79 | 1 |
9 | Budweiser_Light | 113 | 8 | 3.7 | 0.40 | 2 |
8 | Miller_Lite | 99 | 10 | 4.3 | 0.43 | 2 |
11 | Coors_Light | 102 | 15 | 4.1 | 0.46 | 2 |
19 | Schlitz_Light | 97 | 7 | 4.2 | 0.47 | 2 |
# per-cluster feature means (the "-1" row aggregates DBSCAN's noise points)
by_label = beer.groupby('cluster')
by_label.mean()
calories | sodium | alcohol | cost | |
---|---|---|---|---|
cluster | ||||
-1 | 127.833333 | 17 | 3.483333 | 0.460000 |
0 | 143.142857 | 19 | 4.628571 | 0.440000 |
1 | 157.000000 | 8 | 5.066667 | 0.763333 |
2 | 102.750000 | 10 | 4.075000 | 0.440000 |
# scatter plot matrix of DBSCAN cluster assignments (0=red, 1=green, 2=blue, -1=yellow)
# (label -1 selects 'yellow' via numpy negative indexing into the colors array)
# NOTE: pd.scatter_matrix() was deprecated in pandas 0.20; use
# pandas.plotting.scatter_matrix instead.
pd.plotting.scatter_matrix(X, c=colors[beer.cluster], figsize=(10,10), s=100)
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000000001C074320>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001C19C828>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001EDA2D30>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001EDC8E48>], [<matplotlib.axes._subplots.AxesSubplot object at 0x000000001EEEC208>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001EF0F978>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001EF1D748>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001F092F98>], [<matplotlib.axes._subplots.AxesSubplot object at 0x000000001F1FF4A8>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001F21E2E8>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001F303710>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001F46B6A0>], [<matplotlib.axes._subplots.AxesSubplot object at 0x000000001F48B668>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001F5F8780>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001F495208>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001F73D748>]], dtype=object)