# imports for the tutorial
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib notebook
K-means clustering is one of the simplest and most popular unsupervised machine learning algorithms.
The objective of K-means is to group similar data points together and discover underlying patterns. To achieve this, K-means looks for a fixed number (k) of clusters in the dataset.
A cluster refers to a collection of data points aggregated together because of certain similarities.
Explanations and examples adapted from "Understanding K-Means Clustering in Machine Learning" - towardsdatascience.com
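For reference, the objective can be written explicitly (a standard formulation, added here; not from the cited article): K-means seeks centers $\mu_1, \dots, \mu_k$ that minimize the within-cluster sum of squared distances, $$ \min_{\mu_1, \dots, \mu_k} \sum_{i=1}^{n} \min_{j \in \{1, \dots, k\}} \|x_i - \mu_j\|_2^2 $$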
# k-means example
from sklearn.cluster import KMeans
def generate_random_data():
    # generate two separated blobs of random points
    X = -2 * np.random.rand(100, 2)
    X[50:100, :] = 1 + 2 * np.random.rand(50, 2)
    return X
def plot_data(X):
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(X[:, 0], X[:, 1], s=50, c='b')
    ax.grid()
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.set_title("some random data")
X = generate_random_data()
plot_data(X)
def run_plot_kmeans(X):
    k_mean = KMeans(n_clusters=2)
    k_mean.fit(X)
    print(k_mean)
    # plot the data and the fitted centroids
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(X[:, 0], X[:, 1], s=50, c='b', label="data")
    ax.scatter(k_mean.cluster_centers_[0, 0], k_mean.cluster_centers_[0, 1], s=200, c='g', marker='s', label="center 1")
    ax.scatter(k_mean.cluster_centers_[1, 0], k_mean.cluster_centers_[1, 1], s=200, c='r', marker='s', label="center 2")
    ax.grid()
    ax.legend()
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.set_title("k-means with two clusters")
run_plot_kmeans(X)
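As a quick follow-up (not part of the original example), the fitted estimator exposes the learned assignments and the within-cluster sum of squares. Since k_mean was local to the plotting function, we refit one here:
# inspect a fitted k-means model
k_mean = KMeans(n_clusters=2).fit(X)
print(k_mean.labels_[:10])                      # cluster index per point
print(k_mean.inertia_)                          # within-cluster sum of squared distances
print(k_mean.predict(np.array([[3.0, 3.0]])))   # assign a new point to a cluster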
# Generate some data
from sklearn.datasets import make_blobs  # sklearn.datasets.samples_generator was removed in scikit-learn 0.24
from sklearn.mixture import GaussianMixture
def generate_blobs():
    X, y_true = make_blobs(n_samples=400, centers=4,
                           cluster_std=0.60, random_state=0)
    X = X[:, ::-1]  # flip axes for better plotting
    return X
def plot_blobs(X):
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(X[:, 0], X[:, 1], s=40)  # cmap has no effect without c=, so it is omitted here
    ax.grid()
X = generate_blobs()
plot_blobs(X)
# some helper functions for plotting
from matplotlib.patches import Ellipse
def draw_ellipse(position, covariance, ax=None, **kwargs):
    """Draw an ellipse with a given position and covariance"""
    ax = ax or plt.gca()
    # Convert covariance to principal axes
    if covariance.shape == (2, 2):
        U, s, Vt = np.linalg.svd(covariance)
        angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))
        width, height = 2 * np.sqrt(s)
    else:
        angle = 0
        width, height = 2 * np.sqrt(covariance)
    # Draw 1-, 2- and 3-sigma ellipses (angle is keyword-only in recent matplotlib)
    for nsig in range(1, 4):
        ax.add_patch(Ellipse(position, nsig * width, nsig * height,
                             angle=angle, **kwargs))
def plot_gmm(gmm, X, label=True, ax=None):
    ax = ax or plt.gca()
    labels = gmm.fit(X).predict(X)
    if label:
        ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis', zorder=2)
    else:
        ax.scatter(X[:, 0], X[:, 1], s=40, zorder=2)
    ax.axis('equal')
    # scale ellipse opacity by the mixture weight of each component
    w_factor = 0.2 / gmm.weights_.max()
    for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_):
        draw_ellipse(pos, covar, alpha=w * w_factor)
def run_plot_gmm(X):
    gmm = GaussianMixture(n_components=4, random_state=42)
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(1, 1, 1)
    ax.grid()
    plot_gmm(gmm, X, ax=ax)
run_plot_gmm(X)
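A side note (not in the original tutorial): GaussianMixture also exposes the soft cluster assignments (exactly the responsibilities $r_{ij}$ derived below) via its predict_proba method:
# soft assignments: each row is a probability distribution over the 4 components
gmm = GaussianMixture(n_components=4, random_state=42).fit(X)
print(gmm.predict_proba(X)[:5].round(3))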
From a computational viewpoint:
The EM Algorithm requires an initial guess $\Theta^0$ for the parameters.
Each iteration of the E-step and M-step is guaranteed not to decrease the log-likelihood of the observed data, $\log Pr(X|\Theta)$, until a local maximum is reached.
By Jensen's inequality, for any distributions $q_i(z)$ over the latent variables: $$ \mathcal{L}(X|\theta) \geq \sum_{i=1}^n \mathbb{E}_{z \sim q_i}\log\big[\frac{p(x_i, z|\theta)}{q_i(z)}\big] =\sum_{i=1}^n \mathbb{E}_{z \sim q_i}\big[ \log p(x_i, z | \theta) - \log q_i(z) \big] $$ $$ = \sum_{i=1}^n\big[ \mathbb{E}_{z \sim q_i}\log p(x_i, z | \theta) - \mathbb{E}_{z \sim q_i}\log q_i(z) \big] = \sum_{i=1}^n\big[ \mathbb{E}_{z \sim q_i}\log p(x_i, z | \theta) +\mathcal{H}(q_i(z)) \big] \geq Q(\Theta;\Theta^t)$$
* $\mathcal{H}(q_i(z))$ - the **entropy** of $q_i(z)$, which **does not depend on $\theta$**. Thus, taking the $\arg\max_{\theta}$: $$ \arg\max_{\theta} \sum_{i=1}^n\big[ \mathbb{E}_{z \sim q_i}\log p(x_i, z | \theta) +\mathcal{H}(q_i(z)) \big] = \arg\max_{\theta} Q(\Theta;\Theta^t) $$
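For completeness (a standard EM fact, added for clarity): the gap between the log-likelihood and this lower bound is a sum of KL divergences, $$ \mathcal{L}(X|\theta) - \sum_{i=1}^n \mathbb{E}_{z \sim q_i}\log\big[\frac{p(x_i, z|\theta)}{q_i(z)}\big] = \sum_{i=1}^n KL\big(q_i(z) \,\|\, Pr(z|x_i, \theta)\big) \geq 0, $$ so the bound is tight exactly when $q_i(z) = Pr(z|x_i, \theta)$ - the posterior computed in the E-step below.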
Estimate $Z$ given $X$ and current $\Theta$ (the posterior):
$$Pr(z_i=j|x_i, \Theta)= r_{ij} = \frac{Pr(x_i,z_i=j|\Theta)}{Pr(x_i|\Theta)} = \frac{Pr(x_i,z_i=j|\Theta)}{\sum_{j'}Pr(x_i, z_i=j'|\Theta)} = \frac{Pr(x_i|z_i=j, \Theta) \cdot Pr(z_i=j|\Theta)}{\sum_{j'}Pr(x_i|z_i=j', \Theta) \cdot Pr(z_i=j'|\Theta)} $$
The M-step then maximizes the expected complete-data log-likelihood: $$ Q(\Theta;\Theta^t) = \sum_i \sum_{j=1}^k r_{ij}[\log Pr(z_i=j|\Theta) + \log Pr(x_i|z_i = j, \Theta)]$$
For a Gaussian mixture, where $\Theta = \{\alpha_j, \mu_j, \Sigma_j\}_{j=1}^k$, the responsibilities become: $$ r_{ij} = \frac{Pr(x_i, z_i=j|\Theta)}{\sum_{j'}Pr(x_i, z_i=j'|\Theta)} = \frac{\alpha_j Pr(x_i|\mu_j, \Sigma_j)}{\sum_{j'} \alpha_{j'}Pr(x_i|\mu_{j'}, \Sigma_{j'})} = \frac{\alpha_j |\Sigma_j|^{-\frac{1}{2}} e^{-\frac{1}{2}(x_i-\mu_j)^T\Sigma_j^{-1}(x_i-\mu_j)}}{\sum_{j'} \alpha_{j'} |\Sigma_{j'}|^{-\frac{1}{2}} e^{-\frac{1}{2}(x_i-\mu_{j'})^T\Sigma_{j'}^{-1}(x_i-\mu_{j'})}} $$
The corresponding M-step objective is: $$ \sum_i \sum_{j=1}^k r_{ij}[\log Pr(z_i=j|\Theta) + \log Pr(x_i|z_i = j, \Theta)] = $$ $$ \sum_i \sum_{j=1}^k r_{ij}[ \log \alpha_j + \log Pr(x_i|\mu_j, \Sigma_j)] = $$ $$ \sum_i \sum_{j=1}^k r_{ij} \log \alpha_j -\frac{1}{2} \sum_{j=1}^k \log |\Sigma_j| \sum_i r_{ij} -\frac{1}{2} \sum_i \sum_{j=1}^k r_{ij}(x_i-\mu_j)^T \Sigma_j^{-1} (x_i-\mu_j) + Const $$
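To make the E-step and M-step concrete, here is a minimal numpy sketch of one EM iteration for a GMM. This is a hypothetical helper, not part of the original tutorial; the function name and argument layout are assumptions, and scipy.stats.multivariate_normal supplies the Gaussian density:
# one EM iteration for a GMM (illustrative sketch)
from scipy.stats import multivariate_normal

def em_step_gmm(X, alphas, mus, Sigmas):
    # X: (n, d) data; alphas: (k,) weights; mus: (k, d) means; Sigmas: (k, d, d) covariances
    n, k = X.shape[0], len(alphas)
    # E-step: r_ij proportional to alpha_j * N(x_i | mu_j, Sigma_j), normalized over j
    r = np.zeros((n, k))
    for j in range(k):
        r[:, j] = alphas[j] * multivariate_normal.pdf(X, mean=mus[j], cov=Sigmas[j])
    r /= r.sum(axis=1, keepdims=True)
    # M-step: weighted maximum-likelihood updates
    Nj = r.sum(axis=0)                      # effective number of points per component
    alphas = Nj / n
    mus = (r.T @ X) / Nj[:, None]
    Sigmas = np.stack([((X - mus[j]).T * r[:, j]) @ (X - mus[j]) / Nj[j] for j in range(k)])
    return alphas, mus, Sigmas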
For a mixture of Bernoullis, where $\Theta = \{\alpha_j, p_j\}_{j=1}^k$ and $x_i \in \{0,1\}$, the responsibilities become: $$ r_{ij} = \frac{Pr(x_i, z_i=j|\Theta)}{\sum_{j'}Pr(x_i, z_i=j'|\Theta)} = \frac{\alpha_j Pr(x_i|p_j)}{\sum_{j'} \alpha_{j'}Pr(x_i|p_{j'})} = \frac{\alpha_j p_j^{x_i}(1-p_j)^{1-x_i}}{\sum_{j'} \alpha_{j'}p_{j'}^{x_i} (1-p_{j'})^{1-x_i}} $$
$$ \sum_i \sum_{j=1}^k r_{ij}[\log Pr(z_i=j|\Theta) + \log Pr(x_i|z_i = j, \Theta)] = $$ $$ \sum_i \sum_{j=1}^k r_{ij}[ \log \alpha_j + \log Pr(x_i|p_j)] = $$ $$ \sum_i \sum_{j=1}^k r_{ij} \log \alpha_j +\sum_i \sum_{j=1}^k r_{ij} \log \big( p_j^{x_i} (1-p_j)^{1-x_i} \big) = $$ $$ \sum_i \sum_{j=1}^k r_{ij} \log \alpha_j + \sum_i \sum_{j=1}^k r_{ij}x_i\log(p_j) +r_{ij}(1-x_i)\log(1-p_j)$$
To maximize $Q(\Theta;\Theta^t)$ with respect to $p_j$, we set the gradient to zero.
Taking the derivative and setting it to zero (multiplying through by $p_j(1-p_j)$ gives $\sum_i r_{ij}(x_i - p_j) = 0$): $$ \frac{\partial}{\partial p_j} Q(\Theta; \Theta^t) = \sum_{i=1}^n r_{ij} \big( \frac{x_i}{p_j} - \frac{1-x_i}{1-p_j} \big) = 0 \;\rightarrow\; \hat{p}_j = \frac{\sum_{i=1}^n r_{ij}x_i}{\sum_{i=1}^n r_{ij}}$$
To maximize $Q(\Theta;\Theta^t)$ with respect to $\alpha_j$, we must respect the constraint $\sum_{j=1}^k \alpha_j = 1$. Adding a Lagrange multiplier term $\lambda(\sum_j \alpha_j - 1)$ and setting the derivative to zero gives $\alpha_j \propto \sum_{i=1}^n r_{ij}$; normalizing yields the update below.
To sum up:
$$ \hat{p}_j = \frac{\sum_{i=1}^n r_{ij}x_i}{\sum_{i=1}^n r_{ij}} $$
$$ \hat{\alpha}_j = \frac{\sum_{i=1}^n r_{ij}}{n} $$
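As a sanity check on these formulas, a minimal numpy sketch of one EM iteration for this Bernoulli mixture (the helper name and argument layout are assumptions for illustration; x is assumed to hold 0/1 observations):
# one EM iteration for a 1-D Bernoulli mixture (illustrative sketch)
def em_step_bernoulli(x, alphas, ps):
    # x: (n,) array of 0/1 observations; alphas, ps: (k,) arrays
    # E-step: r_ij proportional to alpha_j * p_j^x_i * (1 - p_j)^(1 - x_i)
    lik = ps[None, :] ** x[:, None] * (1 - ps[None, :]) ** (1 - x[:, None])
    r = alphas[None, :] * lik
    r /= r.sum(axis=1, keepdims=True)
    # M-step: the closed-form updates derived above
    Nj = r.sum(axis=0)
    return Nj / len(x), (r * x[:, None]).sum(axis=0) / Nj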
**Connection to K-means**: if all $r_{ij} \in \{0,1\}$, i.e., the assignments are deterministic (hard), these updates become plain averages over the points assigned to each cluster. To see how this arises from a GMM, suppose all components share the same covariance, so the $|\Sigma_j|^{-\frac{1}{2}}$ factors cancel in the responsibilities:
$$ r_{ij} = \frac{\alpha_j Pr(x_i|\mu_j, \Sigma_j)}{\sum_{j'} \alpha_{j'}Pr(x_i|\mu_{j'}, \Sigma_{j'})} = \frac{\alpha_j e^{-\frac{1}{2}(x_i-\mu_j)^T\Sigma_j^{-1}(x_i-\mu_j)}}{\sum_{j'} \alpha_{j'} e^{-\frac{1}{2}(x_i-\mu_{j'})^T\Sigma_{j'}^{-1}(x_i-\mu_{j'})}} $$
Taking isotropic covariance $\Sigma_j = \epsilon I$: $$ r_{ij} = \frac{\alpha_j e^{-\frac{1}{2 \epsilon}\|x_i-\mu_j\|_2^2}}{\sum_{j'} \alpha_{j'} e^{-\frac{1}{2 \epsilon}\|x_i-\mu_{j'}\|_2^2}} $$ As $\epsilon \to 0$, $r_{ij} \to 1$ for the component whose mean is closest to $x_i$ and $r_{ij} \to 0$ for all others: the responsibilities become hard assignments to the nearest centroid, which is exactly the K-means assignment step.
EM motivation and further viewing: "EM Lecture - K-Means + GMMs" (very recommended) - Stanford Machine Learning, Lecture 12, Andrew Ng.