In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

K-means Clustering

In [2]:
import scipy.io
In [3]:
# Read the K-means example dataset from its MATLAB file; the cells below use
# the 'X' entry of the returned mapping.
data1 = scipy.io.loadmat(file_name='ex7data2.mat')
In [4]:
# Scatter the first two columns of the raw points before any clustering.
plt.figure(figsize=(8, 6))
plt.scatter(x=data1['X'][..., 0], y=data1['X'][..., 1])
plt.show()
In [5]:
from sklearn.cluster import KMeans
In [6]:
# Fit K-means with three clusters on the example points.
# random_state pins the centroid initialisation so that Restart & Run All
# reproduces the same centres (and the same label numbering) every time.
clf = KMeans(n_clusters=3, random_state=0)
clf.fit(data1['X'])

# Last expression of the cell: display the fitted cluster centres.
clf.cluster_centers_
Out[6]:
array([[ 6.03366736,  3.00052511],
       [ 1.95399466,  5.02557006],
       [ 3.04367119,  1.01541041]])
In [7]:
# Show the points coloured by their assigned cluster, with the fitted centres
# drawn on top as large crosses.
plt.figure(figsize=(8, 6))
plt.scatter(data1['X'][:, 0], data1['X'][:, 1], c=clf.labels_, alpha=0.3)
# One colour value per centre, derived from the fitted model instead of a
# hard-coded [1, 2, 3], so the cell keeps working if n_clusters is changed and
# the centre colours use the same 0..k-1 numbering as clf.labels_.
plt.scatter(clf.cluster_centers_[:, 0], clf.cluster_centers_[:, 1],
            c=np.arange(len(clf.cluster_centers_)), marker='x', s=300)
plt.show()

Image compression with K-means

In [8]:
import matplotlib.image as mpimg
In [9]:
# Load the sample image as an array; its shape is unpacked into three axes
# further down.
bird = mpimg.imread(fname='bird_small.png')
In [10]:
# Display the image as loaded, before any colour quantisation.
plt.imshow(X=bird)
plt.show()
In [11]:
# Flatten the image so that each row is one pixel and each column one channel.
# NOTE(review): image arrays are usually laid out (rows, cols, channels), so
# `w` likely holds the height and `h` the width. This is harmless as long as
# the later reshape uses the same order; confirm before relying on the names.
w, h, d = bird.shape
data = bird.reshape((w * h, d))
In [12]:
# Quantise the image's colours by clustering the pixel rows into 16 groups.
# random_state fixes the initialisation so re-runs yield the same palette.
clf2 = KMeans(n_clusters=16, random_state=0)
clf2.fit(data)

# Function-call form of print: valid on both Python 2 and Python 3, whereas
# the bare `print x` statement is a SyntaxError on Python 3.
print(clf2.cluster_centers_)
[[ 0.0834935   0.09036529  0.08008345]
 [ 0.75141674  0.59885216  0.33388374]
 [ 0.97696686  0.9449037   0.81487768]
 [ 0.44822041  0.38334575  0.35380492]
 [ 0.82769043  0.74617228  0.73630365]
 [ 0.26959873  0.25164143  0.25290671]
 [ 0.39013966  0.46148964  0.65079703]
 [ 0.70019541  0.62881643  0.55957062]
 [ 0.86640783  0.71524649  0.45322981]
 [ 0.63555295  0.46031496  0.20289512]
 [ 0.15880958  0.16137279  0.15292723]
 [ 0.43953884  0.3145142   0.19034396]
 [ 0.91669935  0.57227943  0.253317  ]
 [ 0.96278682  0.85616545  0.62635042]
 [ 0.57213287  0.72428973  0.86969188]
 [ 0.58509721  0.49565052  0.44318836]]
In [13]:
# Replace every pixel by the centre of the cluster it was assigned to, then
# restore the three-axis layout taken from the image's shape.
compressed = np.take(clf2.cluster_centers_, clf2.labels_, axis=0).reshape((w, h, d))
In [14]:
# Display the image rebuilt from the cluster centres.
plt.imshow(X=compressed)
plt.show()

Principal Component Analysis

In [15]:
# Read the PCA example dataset from its MATLAB file; the cells below use the
# 'X' entry of the returned mapping.
data2 = scipy.io.loadmat(file_name='ex7data1.mat')
In [16]:
# Scatter the first two columns of the PCA example points.
plt.figure(figsize=(8, 6))
plt.title('Example Dataset 1')
plt.scatter(x=data2['X'][..., 0], y=data2['X'][..., 1])
plt.show()
In [17]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
In [18]:
# Two-stage model: z-score the features, then keep one principal component.
clf3 = Pipeline(steps=[
    ('zscore', StandardScaler()),
    ('pca', PCA(n_components=1)),
])
In [19]:
# Fit the scaler + PCA pipeline and project the points onto the retained
# component.
X = clf3.fit_transform(data2['X'])
# Function-call form of print: valid on both Python 2 and Python 3, whereas
# the bare `print x` statement is a SyntaxError on Python 3.
print(clf3.named_steps['pca'].components_)
[[ 0.70710678  0.70710678]]
In [20]:
# Map the projected points back to the original feature space by undoing the
# pipeline steps in reverse order (PCA first, then the z-score scaling).
# Using the pipeline's own inverse_transform instead of a hand-written
# `components_ * X` broadcast also restores the PCA mean and keeps working
# when more than one component is retained.
X_inverse = clf3.inverse_transform(X)
In [21]:
# Overlay the points recovered in X_inverse (red) on the original points.
plt.figure(figsize=(8, 6))
plt.title('Example Dataset 1')
plt.scatter(x=data2['X'][..., 0], y=data2['X'][..., 1])
plt.scatter(x=X_inverse[..., 0], y=X_inverse[..., 1], c='r', marker='o')
plt.show()
In [ ]: