Choosing the right complexity for a model

In [ ]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split


iris = load_iris()
X = iris.data
y = iris.target


# Dataset for decision-function visualization: keep only the first two
# features and drop class 0, leaving a binary problem relabeled to {0, 1}.
X_2d = X[:, :2]
X_2d = X_2d[y > 0]
y_2d = y[y > 0]
y_2d -= 1

# Fixed seed so the split (and every plot below) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_2d, y_2d, random_state=0)
In [ ]:
%matplotlib inline
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100)
In [ ]:
def show_decision_function(clf, ax):
    """Draw clf's decision surface on ax and overlay the 2-d training points.

    Uses ``decision_function`` when the estimator provides one, otherwise
    falls back to ``predict_proba``.  Axis limits are fixed to the data
    range used throughout this notebook.
    """
    xx, yy = np.meshgrid(np.linspace(4.5, 8, 200), np.linspace(1.5, 4.0, 200))
    grid = np.c_[xx.ravel(), yy.ravel()]
    try:
        Z = clf.decision_function(grid)
    except AttributeError:
        # Bug fix: use the probability of the *positive* class (column 1) so
        # the color orientation matches decision_function, which is positive
        # for class 1.  Column 0 inverted the surface.
        Z = clf.predict_proba(grid)[:, 1]

    Z = Z.reshape(xx.shape)
    ax.pcolormesh(xx, yy, Z, cmap=plt.cm.jet)
    ax.set_xlim(4.5, 8)
    ax.set_ylim(1.5, 4.0)
    ax.set_xticks(())
    ax.set_yticks(())
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100)
In [ ]:
from sklearn.svm import SVC

training_scores = []
test_scores = []
fig, axes = plt.subplots(2, 3, figsize=(20, 10))
Cs = [0.01, 0.1, 1, 10, 100, 1000]

# Fit one RBF-kernel SVM per regularization value C; record accuracy on
# both splits and draw each decision surface in its own panel.
for ax, C in zip(axes.ravel(), Cs):
    svm = SVC(gamma=10, C=C)
    svm.fit(X_train, y_train)
    training_scores.append(svm.score(X_train, y_train))
    test_scores.append(svm.score(X_test, y_test))
    show_decision_function(svm, ax)
In [ ]:
# Training vs. test accuracy as a function of C: the gap between the two
# curves shows over-/under-fitting across the complexity range.
plt.figure(figsize=(20, 10))
plt.plot(training_scores, label="training scores")
plt.plot(test_scores, label="test scores")
plt.legend(loc="best")
# Derive tick count from Cs instead of hardcoding 6, and label the axes
# so the figure is readable on its own.
plt.xticks(range(len(Cs)), Cs)
plt.xlabel("C (SVM regularization parameter)")
plt.ylabel("accuracy");

Tasks

  1. Play with the n_neighbors parameter of KNeighborsClassifier on the digits dataset. Compare training set and test set performance to see how it is related to complexity.