Programming Exercise 7 - K-means Clustering and Principal Component Analysis

In [1]:
# %load ../../../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

from scipy.io import loadmat
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy import linalg

pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_seq_items', None)
 
#%config InlineBackend.figure_formats = {'pdf',}
%matplotlib inline

import seaborn as sns
sns.set_context('notebook')
sns.set_style('white')

K-means on example dataset

In [2]:
data1 = loadmat('data/ex7data2.mat')
data1.keys()
Out[2]:
dict_keys(['__header__', 'X', '__globals__', '__version__'])
In [3]:
X1 = data1['X']
print('X1:', X1.shape)
X1: (300, 2)
In [4]:
km1 = KMeans(n_clusters=3)
km1.fit(X1)
Out[4]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)
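
As the output above shows, scikit-learn runs k-means++ initialization with `n_init=10` restarts and keeps the best run, since K-means is sensitive to where the centroids start. For reproducible clusters you can fix the seed (the `random_state=0` below is an arbitrary choice, not part of the exercise); `inertia_` is the within-cluster sum of squares that the algorithm minimizes:

In [ ]:
km_seeded = KMeans(n_clusters=3, random_state=0)
km_seeded.fit(X1)
print('within-cluster sum of squares:', km_seeded.inertia_)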
In [5]:
plt.scatter(X1[:,0], X1[:,1], s=40, c=km1.labels_, cmap=plt.cm.prism) 
plt.title('K-Means Clustering Results with K=3')
plt.scatter(km1.cluster_centers_[:,0], km1.cluster_centers_[:,1], marker='+', s=100, c='k', linewidth=2);
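
Under the hood, each K-means iteration alternates the two steps implemented in the exercise: assign every point to its nearest centroid, then move each centroid to the mean of its assigned points. A minimal NumPy sketch of those two steps (the function names are illustrative, and empty clusters are not handled):

In [ ]:
def find_closest_centroids(X, centroids):
    # Squared Euclidean distance from every point to every centroid
    dists = ((X[:, None, :] - centroids[None, :, :])**2).sum(axis=2)
    return dists.argmin(axis=1)

def compute_centroids(X, idx, K):
    # Move each centroid to the mean of the points assigned to it
    return np.array([X[idx == k].mean(axis=0) for k in range(K)])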

Image compression with K-means

In [6]:
img = plt.imread('data/bird_small.png')
img_shape = img.shape
img_shape
Out[6]:
(128, 128, 3)
In [7]:
# plt.imread returns PNG data as floats already scaled to [0, 1],
# so no division by 255 is needed
A = img
In [8]:
AA = A.reshape(128*128,3)
AA.shape
Out[8]:
(16384, 3)
In [9]:
km2 = KMeans(n_clusters=16)
km2.fit(AA)
Out[9]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=16, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)
In [10]:
# Replace every pixel with its cluster's centroid color (NumPy fancy indexing),
# then reshape back to the original image dimensions
B = km2.cluster_centers_[km2.labels_].reshape(img_shape[0], img_shape[1], 3)
In [11]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(13,9))
ax1.imshow(img)
ax1.set_title('Original')
ax2.imshow(B)
ax2.set_title('Compressed, with 16 colors')

for ax in fig.axes:
    ax.axis('off')
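
The point of the 16-color version is compression: the original stores 24 bits per pixel, while the compressed image needs only a 4-bit centroid index per pixel plus a small 16-entry palette, roughly a factor-of-six saving:

In [ ]:
n_pixels = img_shape[0] * img_shape[1]
original_bits = n_pixels * 24                # 8 bits per RGB channel
compressed_bits = n_pixels * 4 + 16 * 24     # 4-bit index per pixel + palette
print('compression factor: %.1fx' % (original_bits / compressed_bits))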

PCA on example dataset

Using scipy instead of scikit-learn

In [12]:
data2 = loadmat('data/ex7data1.mat')
data2.keys()
Out[12]:
dict_keys(['__header__', 'X', '__globals__', '__version__'])
In [13]:
X2 = data2['X']
print('X2:', X2.shape)
X2: (50, 2)
In [14]:
# Standardizing the data.
scaler = StandardScaler()
scaler.fit(X2)
Out[14]:
StandardScaler(copy=True, with_mean=True, with_std=True)
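
StandardScaler implements the usual standardization, subtracting each feature's mean and dividing by its standard deviation. A quick check that the transform matches the formula:

In [ ]:
# (x - mean) / std, with the population std (ddof=0) that StandardScaler uses
X_check = (X2 - X2.mean(axis=0)) / X2.std(axis=0)
print(np.allclose(scaler.transform(X2), X_check))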
In [15]:
# SVD of the standardized, transposed data: the columns of U are the principal components
U, S, Vh = linalg.svd(scaler.transform(X2).T)
print(U)
print(S)
[[-0.70710678 -0.70710678]
 [-0.70710678  0.70710678]]
[ 9.3153915   3.63641048]
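
The columns of `U` are the principal directions, and the squared singular values measure how much variance each direction captures, so `S**2 / (S**2).sum()` gives the explained-variance ratio. As a cross-check (my addition, not part of the original exercise), sklearn's `PCA` should recover the same directions up to sign:

In [ ]:
from sklearn.decomposition import PCA

print('explained variance ratio:', S**2 / (S**2).sum())

pca = PCA(n_components=2)
pca.fit(scaler.transform(X2))
print(pca.components_)               # rows match the columns of U, up to sign
print(pca.explained_variance_ratio_)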
In [93]:
plt.scatter(X2[:,0], X2[:,1], s=30, edgecolors='b', facecolors='none', linewidth=1);
# Equal aspect ratio, to show the orthogonality of the principal components in the plot
plt.gca().set_aspect('equal')
# The principal components are the *columns* of U; a larger quiver `scale`
# draws a shorter arrow, so the first component ends up the longer of the two
plt.quiver(scaler.mean_[0], scaler.mean_[1], U[0,0], U[1,0], scale=S[1], color='r')
plt.quiver(scaler.mean_[0], scaler.mean_[1], U[0,1], U[1,1], scale=S[0], color='r');
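
The original exercise continues by projecting the data onto the first principal component and reconstructing an approximation from that one-dimensional representation. With the SVD above, both steps are single matrix products; a minimal sketch (the names `Z` and `X_rec` are mine):

In [ ]:
X_norm = scaler.transform(X2)
Z = X_norm.dot(U[:, :1])       # project onto the first principal component
X_rec = Z.dot(U[:, :1].T)      # reconstruct the 2-D approximation

plt.scatter(X_norm[:,0], X_norm[:,1], s=30, edgecolors='b', facecolors='none', label='original')
plt.scatter(X_rec[:,0], X_rec[:,1], s=30, edgecolors='r', facecolors='none', label='reconstructed')
plt.gca().set_aspect('equal')
plt.legend();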