#!/usr/bin/env python
# coding: utf-8

# ## Unsupervised Learning
# 
# Clustering is a class of unsupervised learning methods that associates observations according to some specified measure of similarity (e.g. Euclidean distance).

# ## K-means Algorithm
# 
# The K-means clustering algorithm assigns each point $x_i$ in a set of input points $\{x_1, x_2, \ldots, x_m\}$ to one of $K$ clusters. Each cluster is specified by a **centroid**, which is the average location of all the points in the cluster. The algorithm proceeds iteratively from arbitrary centroid locations, updating the membership of each point according to minimum distance, then updating the centroid location based on the new cluster membership.
# 
# In this sense, it is similar to the expectation maximization (EM) algorithm. Recall that in EM we iteratively assigned labels to observations, according to which mixture component they were most likely to have been derived from. K-means is simpler, in that we just use the minimum distance to assign membership.
# 
# The algorithm has converged when the assignment of points to centroids does not change from one iteration to the next.

# ### Algorithm
# 
# 1. Initialize cluster centroids:
# 
#    $$\mu^{(0)}_1, \ldots, \mu^{(0)}_k \in \mathbb{R}^n$$
# 
# 2. Iterate until converged:
# 
#    a. Set $c_i = \text{argmin}_j || x_i - \mu_j^{(s)} ||$
# 
#    b. Update centroids:
# 
#    $$\mu_j^{(s+1)} = \frac{\sum_{i=1}^m I[c_i = j] x_i}{\sum_{i=1}^m I[c_i = j]}$$

# The K-means algorithm is simply a Gaussian mixture model with two restrictions:
# 
# 1. the covariance matrix is spherical:
# 
#    $$\Sigma_k = \sigma I_D$$
# 
# 2. the mixture weights are fixed:
# 
#    $$\pi_k = \frac{1}{K}$$
# 
# Hence, we are only interested in locating the appropriate centroids of the clusters, which speeds computation.

# We can define the distortion function:
# 
# $$J(c,\mu) = \sum_{i=1}^m ||x_i - \mu_{c_i}||$$
# 
# which gets smaller at every iteration. So, k-means is coordinate descent on $J(c,\mu)$.

# ### Choosing $k$
# 
# To check whether a chosen $k$ is reasonable, one approach is to compare the distances between the centroids to the mean distance between each data point and its assigned centroid. A good fit involves relatively large inter-centroid distances.
# 
# The appropriate value for $k$ (the number of clusters) may depend on the goals of the analysis, or it may be chosen algorithmically, using an optimization procedure.

# ## Example: clustering random points

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import seaborn as sns; sns.set_context('notebook')
import numpy as np
import matplotlib.pyplot as plt

# Simulate 25 random points on [0, 10] x [0, 10]
x, y = np.random.uniform(0, 10, 50).reshape(2, 25)
plt.scatter(x, y)


# Let's start with $k=4$, arbitrarily assigned:

# In[2]:


centroids = (3, 3), (3, 7), (7, 3), (7, 7)


# In[3]:


np.transpose(centroids)


# In[4]:


plt.scatter(x, y)
plt.scatter(*np.transpose(centroids), c='r', marker='+', s=100)


# We can use the function `cdist` from SciPy to calculate the distances from each point to each centroid.

# In[5]:


from scipy.spatial.distance import cdist

distances = cdist(centroids, list(zip(x, y)))
distances.shape


# We can make the initial assignment to centroids by picking the minimum distance.

# In[6]:


labels = distances.argmin(axis=0)
labels


# In[7]:


plt.scatter(x, y, c=np.array(list('rgbc'))[labels])
plt.scatter(*np.transpose(centroids), c='r', marker='+', s=100)
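# Since we already have the point-to-centroid distances and an assignment for each point, we can evaluate the distortion $J(c,\mu)$ defined above for this initial configuration. (This quick check is an aside added here; it simply sums the distance from each point to its nearest centroid.)

# In[ ]:


# Distortion J(c, mu): sum of distances from each point to its assigned (nearest) centroid
J = distances.min(axis=0).sum()
J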
# Now we can re-assign the centroid locations based on the means of the current members' locations.

# In[8]:


new_centroids = [(x[labels == i].mean(), y[labels == i].mean())
                 for i in range(len(centroids))]


# In[9]:


plt.scatter(x, y, c=np.array(list('rgbc'))[labels])
plt.scatter(*np.transpose(new_centroids), c='r', marker='+', s=100)


# So, we simply iterate these steps until convergence.

# In[10]:


centroids = new_centroids
iterations = 20

for _ in range(iterations):
    # Assign each point to its nearest centroid ...
    distances = cdist(centroids, list(zip(x, y)))
    labels = distances.argmin(axis=0)
    # ... then move each centroid to the mean of its members
    centroids = [(x[labels == i].mean(), y[labels == i].mean())
                 for i in range(len(centroids))]


# In[11]:


plt.scatter(x, y, c=np.array(list('rgbc'))[labels])
plt.scatter(*np.transpose(centroids), c='r', marker='+', s=100)


# ## k-means using `scikit-learn`
# 
# The `scikit-learn` package includes a `KMeans` class for flexibly fitting K-means models. It includes additional features, such as initialization options and the ability to set the convergence tolerance.

# In[12]:


from sklearn.cluster import KMeans
from numpy.random import RandomState
rng = RandomState(1)

# Instantiate model
kmeans = KMeans(n_clusters=4, random_state=rng)

# Fit model
kmeans.fit(np.transpose((x, y)))


# After fitting, we can retrieve the labels and cluster centers.

# In[13]:


kmeans.labels_


# In[14]:


kmeans.cluster_centers_


# The resulting plot should look very similar to the one we fit by hand.

# In[15]:


plt.scatter(x, y, c=np.array(list('rgbc'))[kmeans.labels_])
plt.scatter(*kmeans.cluster_centers_.T, c='r', marker='+', s=100)


# ## Example: Microbiome data
# 
# The `microbiome.csv` dataset contains counts of various microbe taxa extracted from either tissue or stool samples of NICU infants. We might be interested in seeing whether the samples cluster into groups approximately corresponding to location (tissue or stool), based on the counts of each bacterial taxon.

# In[16]:


import pandas as pd

microbiome = pd.read_csv("../data/microbiome.csv")


# First, we need to reshape the data so that it can be used with `scikit-learn`'s interface. Fortunately, Pandas makes this relatively painless. The data are stored in *long* format:

# In[17]:


microbiome.head()


# For this analysis, we need the features (*i.e.* taxa) in columns, with a row for each sample. First we drop the `Group` column, then pivot the `Taxon` column into a column index.

# In[18]:


get_ipython().run_line_magic('pinfo', 'microbiome.pivot')


# In[19]:


microbiome_pivoted = (microbiome.drop('Group', axis=1)
                                .pivot(index='Patient', columns='Taxon')
                                .stack(level=0)
                                .reset_index())
microbiome_pivoted.columns.name = None

microbiome_pivoted.head()


# Then we drop the unused column and change the location variable from `str` type to `int`.

# In[20]:


microbiome_data = (microbiome_pivoted.drop('Patient', axis=1)
                                     .rename(columns={'level_1': 'Location'})
                                     .replace({'Tissue': 0, 'Stool': 1}))

y = microbiome_data.values[:, 0]
X = microbiome_data.values[:, 1:]


# In[21]:


microbiome_data.head()
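# Before reducing the dimensionality and clustering, it is worth asking how many clusters the taxon counts actually support. One rough heuristic, sketched below as an aside (it was not part of the original analysis), is to fit `KMeans` over a range of values of $k$ and plot the within-cluster sum of squares (the `inertia_` attribute), looking for an "elbow" where adding clusters stops paying off.

# In[ ]:


# Elbow heuristic: within-cluster sum of squares (inertia) for a range of k
ks = range(1, 8)
inertias = [KMeans(n_clusters=k, random_state=rng).fit(X).inertia_ for k in ks]

plt.plot(list(ks), inertias, '-o')
plt.xlabel('k')
plt.ylabel('Within-cluster sum of squares')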
# To simplify the analysis, and aid visualization, we will again perform a PCA to isolate the majority of the variation into two principal components.

# In[22]:


from sklearn.decomposition import PCA
from itertools import cycle

pca = PCA(n_components=2, whiten=True).fit(X)
X_pca = pca.transform(X)

def plot_2D(data, target, target_names, pca):
    colors = cycle('rgbcmykw')
    target_ids = range(len(target_names))
    plt.figure()
    for i, c, label in zip(target_ids, colors, target_names):
        plt.scatter(data[target == i, 0], data[target == i, 1],
                    c=c, label=label)
    var_explained = pca.explained_variance_ratio_ * 100
    plt.xlabel('First Component: {0:.1f}%'.format(var_explained[0]))
    plt.ylabel('Second Component: {0:.1f}%'.format(var_explained[1]))
    plt.legend()


# In[23]:


plot_2D(X_pca, y, ['Tissue', 'Stool'], pca)


# We can now create a `KMeans` object with `k=2`, and fit the data with it.

# In[24]:


km_microbiome = KMeans(n_clusters=2, random_state=rng)
km_microbiome.fit(X_pca)


# From this, we can extract the cluster centroids (in the `cluster_centers_` attribute) and the group labels (in `labels_`) in order to generate a plot of the classification result.

# In[25]:


np.round(km_microbiome.cluster_centers_, decimals=2)


# In[26]:


km_microbiome.labels_


# In[27]:


plot_2D(X_pca, km_microbiome.labels_, ["c1", "c2"], pca)


# `scikit-learn` includes a suite of well-known clustering algorithms. Like most unsupervised learning models in scikit-learn, they expect the data to be clustered to have the shape `(n_samples, n_features)`:
# 
# - `sklearn.cluster.KMeans`
#   : The simplest, yet effective, clustering algorithm. It needs to be provided with the number of clusters in advance, and assumes that the input data are normalized (though a PCA model can be used as a preprocessor).
# - `sklearn.cluster.MeanShift`
#   : Can find better-looking clusters than KMeans, but does not scale to large numbers of samples.
# - `sklearn.cluster.DBSCAN`
#   : Can detect irregularly shaped clusters based on density, i.e. sparse regions in the input space are likely to become inter-cluster boundaries. It can also detect outliers (samples that are not part of any cluster).
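# As a quick illustration of one of these alternatives, the sketch below applies DBSCAN to the PCA-reduced microbiome data. The `eps` and `min_samples` values are illustrative guesses rather than tuned settings; points labelled `-1` are treated as noise.

# In[ ]:


from sklearn.cluster import DBSCAN

# eps and min_samples chosen arbitrarily for illustration
db = DBSCAN(eps=0.5, min_samples=3).fit(X_pca)
db.labels_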
# ## Exercise: NEC
# 
# If all the odd-numbered patients are healthy controls and the even-numbered patients have necrotizing enterocolitis (NEC), see if either the tissue or stool samples cluster according to group status.

# In[28]:


## Write answer here


# ## Exercise: clustering baseball statistics
# 
# We can use clustering to try to find interesting groupings among sets of baseball statistics. Load the baseball dataset and run a clustering algorithm on the set of three statistics:
# 
# * hit rate: hits / at bats
# * strikeout rate: strikeouts / at bats
# * walk rate: bases on balls / at bats
# 
# You should probably set a minimum number of at bats to qualify for the analysis, since there are pitchers who get only a handful of at bats each year.
# 
# Since we are clustering in 3 dimensions, you can visualize the output as a series of pairwise plots.

# In[29]:


import pandas as pd

baseball = pd.read_csv("../data/baseball.csv", index_col=0)
baseball.head()


# In[30]:


## Write answer here


# ## DP-Means
# 
# The major weakness of the k-means approach to clustering is that the number of clusters needs to be specified at the outset. However, there is usually uncertainty with respect to the appropriate number of clusters for a given dataset. A flexible alternative to k-means that allows for an unknown number of clusters involves using a Bayesian non-parametric mixture model instead (Kulis and Jordan 2011). In particular, a Dirichlet process (DP) mixture model, which we have seen in a previous lecture, probabilistically assigns observations to clusters, using a stick-breaking algorithm.
# 
# Recall the definition of a finite mixture model:
# 
# $$f(y) = \sum_{h=1}^{k} \pi_h \mathcal{K}(y|\theta_h)$$
# 
# where $k$ is the number of mixture components, $\pi_h$ is the mixing coefficient for component $h$, and $\mathcal{K}$ specifies the mixture components (*e.g.* a Gaussian distribution), with parameters $\theta_h$ for each component.
# 
# A DP mixture extends this by placing a Dirichlet prior of dimension $k$ on the mixing coefficients. The distribution over the group indicators can then be specified as a categorical distribution:
# 
# $$\begin{aligned}
# \mathbf{\pi} &\sim \text{Dirichlet}(k, \pi_0) \\
# z_1,\ldots,z_n &\sim \text{Categorical}(\mathbf{\pi})
# \end{aligned}$$
# 
# We might then specify the observations as being a mixture of Gaussians, whose means are drawn from an appropriate prior distribution $P$:
# 
# $$\begin{aligned}
# \theta_1,\ldots,\theta_k &\sim P \\
# y_1,\ldots,y_n &\sim N(\theta_{z_i}, \sigma I)
# \end{aligned}$$

# In[31]:


from sklearn.decomposition import PCA
from sklearn import datasets

iris = datasets.load_iris()

pca = PCA(n_components=2, whiten=True).fit(iris.data)
X_pca = pca.transform(iris.data)


# In[32]:


X_pca.shape


# In[33]:


import pymc3 as pm
import theano.tensor as tt

K = 10
N = X_pca.shape[0]

with pm.Model() as dp_mixture:

    α = pm.Gamma('α', 1., 1.)

    # Truncated stick-breaking construction of the mixture weights
    β = pm.Beta('β', 1., α, shape=K)
    w = pm.Deterministic('w', β * tt.concatenate([[1], tt.extra_ops.cumprod(1 - β)[:-1]]))

    # Latent cluster indicator for each observation
    component = pm.Categorical('component', w, shape=N)

    # Cluster-specific means and (diagonal) standard deviations
    μ = tt.stack([pm.Normal('μ_%i' % i, mu=tt.zeros(2), sd=1000, shape=2)
                  for i in range(K)])
    σ = pm.Uniform('σ', 0, 5, shape=(K, 2))

    sigs = σ[component]
    mus = μ[component]

    obs = tt.stack([pm.MvNormal('obs_%i' % i, mus[i], cov=tt.eye(2)*(sigs[i]**2),
                                observed=X_pca[i])
                    for i in range(N)])


# In[34]:


with dp_mixture:

    step1 = pm.Metropolis(vars=[α, β, σ, μ])
    step2 = pm.CategoricalGibbsMetropolis(vars=[component])

    trace = pm.sample(10000, step=[step1, step2])


# In[40]:


pm.traceplot(trace, varnames=['α'])


# In[43]:


pm.forestplot(trace, varnames=['w'])
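# One simple way to summarize what the sampler implies about cluster membership (added here as a rough sketch; label switching between components across draws can blur it) is to take the posterior mode of the `component` indicator for each observation and colour the PCA scores by it.

# In[ ]:


# Posterior-mode cluster assignment for each observation, using the sampled
# 'component' indicators (array of shape draws x observations)
component_samples = trace['component']
z_mode = np.array([np.bincount(component_samples[:, i]).argmax()
                   for i in range(component_samples.shape[1])])

plt.scatter(*X_pca.T, c=z_mode)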
# ### DP-means algorithm
# 
# 1. Initialize the number of clusters to 1, and assign all observations to that cluster. Set the cluster mean to the global mean.
# 2. Specify a cluster penalty parameter $\lambda$
# 3. Initialize the cluster indicators: $z_1 = z_2 = \ldots = z_n = 1$
# 4. Repeat until convergence:
# 
#     + For each data point $x_i$:
# 
#         + compute the distances to the cluster means, $d_{ic} = ||x_i - \mu_c||^2$ for $c=1,\ldots,k$
#         + if $\min_c(d_{ic}) > \lambda$, set $k = k+1$, $z_i = k$, $\mu_k = x_i$
#         + otherwise, set $z_i = \text{argmin}_c(d_{ic})$
# 
#     + Generate clusters $l_1, \ldots, l_k$ from $z_1,\ldots,z_n$
# 
#     + Recompute the cluster means: $\mu_j = \frac{1}{|l_j|} \sum_{x \in l_j} x$

# In[42]:


def dp_means(x, lam, max_iter=100, tol=1e-5, metric='euclidean'):

    x = np.array(x)
    n = x.shape[0]
    k = 1

    # Initialize cluster indicators
    z = np.ones(n, int)

    # Initialize with single cluster of all observations
    mu = [x.mean(0)]

    # Initialize variables
    converged = False
    iteration = 0
    ss = np.inf

    # Iterate until converged or maxed out
    while (not converged) and (iteration < max_iter):

        # Calculate distances for all points
        d = cdist(x, np.array(mu), metric=metric)

        for i in range(n):

            if np.min(d[i]) > lam:
                # Create new group
                k += 1
                z[i] = k - 1
                mu += [x[i]]

            else:
                # Assign to closest group
                z[i] = np.argmin(d[i])

        for j in range(k):

            # Recalculate centroids
            if (z == j).sum():
                indices = np.where(z == j)[0]
                mu[j] = np.mean(x[indices], 0)

        ss_old = ss

        # Calculate sum of squared distances to use as convergence criterion
        ss = np.sum([[(x[i, j] - mu[z[i]][j])**2 for j in range(x.shape[1])]
                     for i in range(n)])

        ss_diff = ss_old - ss

        if ss_diff < tol:
            converged = True

        iteration += 1

    return dict(centers=np.array(mu), z=z, k=k, iterations=iteration,
                converged=converged, ss=ss)


# In[43]:


x, y = X_pca.T

fig, axes = plt.subplots(1, 3, figsize=(16, 4))

for i, c in enumerate([2, 3, 4]):
    dpm = dp_means(X_pca, c, metric='seuclidean')
    axes[i].scatter(x, y, c=dpm['z'])
    axes[i].scatter(*dpm['centers'].T, c='r', marker='+', s=100)
    axes[i].set_title(r'$\lambda$={0}, k={1}'.format(c, dpm['k']))


# In[44]:


clusters = []
lambdas = np.linspace(2, 4)

for c in lambdas:
    dpm = dp_means(X_pca, c, metric='euclidean')
    clusters.append(len(dpm['centers']))


# In[45]:


plt.plot(lambdas, clusters)


# ## References
# 
# Kulis, B. and Jordan, M.I. (2011) Revisiting k-means: New Algorithms via Bayesian Nonparametrics. arXiv preprint [arXiv:1111.0352](http://arxiv.org/abs/1111.0352).