See our notes on unsupervised learning, K-means, DBSCAN, and clustering validation.
For documentation of various clustering methods in scikit-learn, see http://scikit-learn.org/stable/modules/clustering.html
This code was based on the example at http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_iris.html which has the following comments:
Code source: Gaël Varoquaux
Modified for documentation by Jaques Grobler
License: BSD 3 clause
%matplotlib inline
import numpy
import matplotlib.pyplot
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn import datasets
import pandas
Before you go on, if you haven't used the iris dataset in a previous assignment, make sure you understand it. Modify the cell below to examine different parts of the dataset that are contained in the iris dictionary object.
What are the features? What are we trying to classify?
# Load the iris dataset bundled with scikit-learn; the result is a dictionary-like object
iris = datasets.load_iris()
# List the parts of the dataset that can be examined (data, target, feature names, ...)
iris.keys()
dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])
# Wrap the raw measurements in a DataFrame whose columns are named after the features,
# so that individual features can be selected by name later on
iris_df = pandas.DataFrame(iris.data, columns=iris.feature_names)
# Peek at the first few rows
iris_df.head()
sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | |
---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 |
1 | 4.9 | 3.0 | 1.4 | 0.2 |
2 | 4.7 | 3.2 | 1.3 | 0.2 |
3 | 4.6 | 3.1 | 1.5 | 0.2 |
4 | 5.0 | 3.6 | 1.4 | 0.2 |
# A 3D scatter plot has room for only three of the four iris features.
# This trio is the one the scikit-learn example code picked.
X_FEATURE, Y_FEATURE, Z_FEATURE = (
    'petal width (cm)',
    'sepal length (cm)',
    'petal length (cm)',
)

# Grid layout shared by the figures that display the clusterings obtained with
# various hyperparameter settings: each figure has room for NUM_COLS * NUM_ROWS plots.
NUM_COLS, NUM_ROWS = 4, 6

# Figure dimensions (used as figsize) grow with the grid: 4 wide and 3 tall per subplot
FIG_WIDTH = NUM_COLS * 4
FIG_HEIGHT = NUM_ROWS * 3
def add_plot(figure, subplot_num, subplot_name, data, labels):
    '''Create a new 3D scatter subplot in the figure.

    The subplot is placed in a NUM_ROWS x NUM_COLS grid and shows the
    X_FEATURE, Y_FEATURE and Z_FEATURE columns of the data, with each point
    colored according to its label.

    Parameters:
        figure: the matplotlib figure that receives the new subplot
        subplot_num: 1-based position of the subplot within the grid
        subplot_name: title displayed above the subplot
        data: table indexable by feature name (the callers pass a pandas
            DataFrame) that contains the three plotted feature columns
        labels: one value per row of data, used to pick each point's color

    Returns:
        None; the subplot is added to the figure in place.
    '''
    # create a new subplot, viewed from a fixed elevation and azimuth
    axis = figure.add_subplot(NUM_ROWS, NUM_COLS, subplot_num, projection='3d',
                              elev=48, azim=134)
    # Plot three of the four features on the graph, and set the color according to the labels
    axis.scatter(data[X_FEATURE], data[Y_FEATURE], data[Z_FEATURE], c=labels)
    # get rid of the tick numbers. Otherwise, they all overlap and it looks horrible
    # (use xaxis/yaxis/zaxis: the w_xaxis/w_yaxis/w_zaxis aliases were deprecated
    # and later removed from Matplotlib)
    for axis_obj in (axis.xaxis, axis.yaxis, axis.zaxis):
        axis_obj.set_ticklabels([])
    # label the subplot
    axis.title.set_text(subplot_name)
The plot below shows the correct labeling, based on the targets.
# Open a fresh figure, sized to hold the full grid of subplots
truth_figure = matplotlib.pyplot.figure(figsize=(FIG_WIDTH, FIG_HEIGHT))
# The first grid slot shows the samples colored by their true target values
add_plot(
    figure=truth_figure,
    subplot_num=1,
    subplot_name="Ground Truth",
    data=iris_df,
    labels=iris.target,
)
Now let's see how k-means clusters the iris dataset for different numbers of clusters.
# largest number of clusters to try; k-means is run for every k from 2 up to this value
MAX_CLUSTERS = 10
# start a new figure to hold all of the subplots
kmeans_figure = matplotlib.pyplot.figure(figsize=(FIG_WIDTH, FIG_HEIGHT))
# Plot the ground truth in the first slot, for comparison with the clusterings
add_plot(kmeans_figure, 1, "Ground Truth", iris_df, iris.target)
# the clusterings fill the grid slots that follow the ground truth plot
plot_num = 2
for num_clusters in range(2, MAX_CLUSTERS + 1):
    # train the model
    model = KMeans(n_clusters=num_clusters)
    model.fit(iris_df)
    # get the predictions of which cluster each input is in
    labels = model.labels_
    # plot this clustering
    title = '{} Clusters'.format(num_clusters)
    # convert the labels with the builtin float: the numpy.float alias it replaces
    # was deprecated and later removed from NumPy
    add_plot(kmeans_figure, plot_num, title, iris_df, labels.astype(float))
    plot_num += 1
# your code here