import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import Image
We left off Part 1 with some examples of in-sample and test error estimates as a function of model complexity parameters, such as the number of nearest neighbors in KNN and the $\lambda$ parameter in the case of the Lasso and Ridge regressions.
In many of the examples, we plotted y vs. y predicted as a visualization intended to give us some intuition into where and how the model was working.
Making this idea more rigorous, we can use this as an effective measure of model complexity.
# Display a saved screenshot (referenced by filename; contents not part of this file).
Image(filename="Screen Shot 5.png", width=600)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Ridge
from sklearn import cross_validation
from sklearn import datasets
from sklearn.cross_validation import cross_val_predict
from sklearn import linear_model
# boston house prices data set (13 numeric features, median home value target)
# NOTE(review): load_boston was deprecated and removed in scikit-learn 1.2 —
# confirm the environment pins an older sklearn, or substitute another dataset.
boston = datasets.load_boston()
# print(x) with a single argument is valid in both Python 2 and Python 3,
# unlike the original Python-2-only `print x` statements.
print(boston.keys())        # bunch keys: data, target, feature_names, DESCR
#print(boston.feature_names)
print(boston.DESCR)         # full dataset description text
print(boston.target[:10])   # first ten target values
# Set up many trials on the same data set to understand
#  - train vs. test error as a function of model complexity
#  - the run-to-run variance in those estimates
# (loop indentation below was lost in the notebook flattening and is restored here)
n_trials, n_max_neighbors = 50, 100
# One accumulator slot per k in (2, n_max_neighbors]; index 0 <-> k = n_max_neighbors.
err_train, err_test = np.zeros(n_max_neighbors - 2), np.zeros(n_max_neighbors - 2)
for i in range(n_trials):
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
        boston.data, boston.target, test_size=0.3)
    x_complexity, y_train, y_test = [], [], []
    # Sweep k from many neighbors (smooth model) down to few (complex model).
    for k in range(n_max_neighbors, 2, -1):
        clf = KNeighborsRegressor(n_neighbors=k)
        clf.fit(X_train, Y_train)
        x_complexity.append(k)
        y_tmp = clf.score(X_train, Y_train)   # in-sample R^2
        y_train.append(y_tmp)
        err_train[n_max_neighbors - k] += y_tmp
        y_tmp = clf.score(X_test, Y_test)     # held-out R^2
        y_test.append(y_tmp)
        err_test[n_max_neighbors - k] += y_tmp
    # Faint curves: one per trial, showing the variance across splits.
    plt.plot(x_complexity, y_test, color="blue", alpha=0.2)
    plt.plot(x_complexity, y_train, color="red", alpha=0.2)
# Solid curves: scores averaged over all trials.
plt.plot(x_complexity, err_test / n_trials, color="blue")
plt.plot(x_complexity, err_train / n_trials, color="red")
plt.xlabel("Number of Neighbors")
# Fixed label: KNeighborsRegressor.score returns R^2, not a classification rate.
plt.ylabel("R^2 score")
plt.show()
y = boston.target
# cross_val_predict returns an array of the same size as `y` where each entry
# is a prediction obtained by cross validation (each sample is predicted by
# the fold in which it was held out).
clf = KNeighborsRegressor(n_neighbors=2)
predicted = cross_val_predict(clf, boston.data, y, cv=10)
fig, ax = plt.subplots()
ax.scatter(y, predicted)
# Effective-degrees-of-freedom-style statistic: sum of the 2x2 covariance
# matrix of (y, predicted), normalized by var(y).
# NOTE(review): np.cov uses ddof=1 while np.std defaults to ddof=0, so the
# normalization mixes conventions — confirm this is intended.
# (print(...) form is valid in both Python 2 and 3.)
print(np.cov(y, predicted).sum() / (np.std(y) * np.std(y)))
# Reference diagonal: perfect predictions would fall on this line.
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()
y = boston.target
# cross_val_predict returns an array of the same size as `y` where each entry
# is a prediction obtained by cross validation.  Here we trace how the
# dof-style statistic changes as the Lasso penalty alpha grows.
k_vec = []
c_vec = []
n_alpha = 5
for step in range(1, n_alpha + 1):
    a = 5 * step / float(n_alpha)  # alpha sweep: 1.0, 2.0, ..., 5.0
    clf = Lasso(alpha=a)
    predicted = cross_val_predict(clf, boston.data, y, cv=10)
    k_vec.append(a)
    c_vec.append(np.cov(y, predicted).sum() / (np.std(y) * np.std(y)))
fig, ax = plt.subplots()
ax.scatter(k_vec, c_vec)
ax.set_xlabel('alpha')
ax.set_ylabel('dof(y)')
plt.show()
y = boston.target
# cross_val_predict returns an array of the same size as `y` where each entry
# is a prediction obtained by cross validation.  Same dof-style statistic as
# above, now as a function of K for KNN regression.
# (Removed the unused `n_alpha = 5` copy-paste leftover from the Lasso cell.)
k_vec = []
c_vec = []
for k in range(20, 2, -1):
    clf = KNeighborsRegressor(n_neighbors=k)
    predicted = cross_val_predict(clf, boston.data, y, cv=10)
    k_vec.append(k)
    c_vec.append(np.cov(y, predicted).sum() / (np.std(y) * np.std(y)))
fig, ax = plt.subplots()
ax.scatter(k_vec, c_vec)
ax.set_xlabel('K')
ax.set_ylabel('dof(y)')
plt.show()
From Wikipedia: the AIC offers a relative estimate of the information lost when a given model is used to represent the process that generates the data. In doing so, it deals with the trade-off between the goodness of fit of the model and the complexity of the model.
# Display a saved slide image (referenced by filename; contents not part of this file).
Image(filename="Selection_008.png")
# Author: Olivier Grisel, Gael Varoquaux, Alexandre Gramfort
# License: BSD 3 clause
import time
from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC
# Diabetes regression data, padded with 14 columns of Gaussian noise so the
# information criteria have irrelevant features to reject.
diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target
rng = np.random.RandomState(42)
X = np.c_[X, rng.randn(X.shape[0], 14)]  # add some bad features
# normalize data as done by Lars to allow for comparison
X /= np.sqrt(np.sum(X ** 2, axis=0))
##############################################################################
# LassoLarsIC: least angle regression with BIC/AIC criterion
model_bic = LassoLarsIC(criterion='bic')
fit_start = time.time()
model_bic.fit(X, y)
t_bic = time.time() - fit_start  # wall-clock fit time, shown in the plot title
alpha_bic_ = model_bic.alpha_
model_aic = LassoLarsIC(criterion='aic')
model_aic.fit(X, y)
alpha_aic_ = model_aic.alpha_
def plot_ic_criterion(model, name, color):
    """Plot a fitted LassoLarsIC model's criterion curve vs. -log10(alpha).

    Draws the AIC/BIC value at every alpha on the model's regularization path
    and a vertical line at the alpha the criterion selected.
    (Body indentation was lost in the notebook flattening; restored here.)

    Parameters
    ----------
    model : fitted LassoLarsIC estimator (provides alpha_, alphas_, criterion_)
    name : str, legend label prefix, e.g. 'AIC' or 'BIC'
    color : matplotlib color used for both the curve and the vertical line
    """
    alpha_ = model.alpha_          # alpha selected by the criterion
    alphas_ = model.alphas_        # alphas along the path
    criterion_ = model.criterion_  # criterion value at each path alpha
    plt.plot(-np.log10(alphas_), criterion_, '--', color=color,
             linewidth=3, label='%s criterion' % name)
    plt.axvline(-np.log10(alpha_), color=color, linewidth=3,
                label='alpha: %s estimate' % name)
    plt.xlabel('-log(alpha)')
    plt.ylabel('criterion')
# Overlay the AIC and BIC curves so their selected alphas can be compared.
plt.figure()
for ic_model, ic_name, ic_color in ((model_aic, 'AIC', 'b'),
                                    (model_bic, 'BIC', 'r')):
    plot_ic_criterion(ic_model, ic_name, ic_color)
plt.legend()
title_text = ('Information-criterion for model selection (training time %.3fs)'
              % t_bic)
plt.title(title_text)
plt.show()
# Display saved slide images (referenced by filename; contents not part of
# this file).  NOTE(review): outside a notebook, only an explicit display
# call renders these — confirm this runs in IPython/Jupyter.
Image(filename="Selection_009.png")
Image(filename="Selection_007.png")
# Iris data restricted to the first two features (sepal length/width) so the
# decision regions can be drawn in the plane.
iris = datasets.load_iris()
X = iris.data[:, 0:2]  # we only take the first two features for visualization
y = iris.target
#
clf = KNeighborsClassifier().fit(X, y)
#
from matplotlib.colors import ListedColormap
# Light colormap for the predicted background regions, bold one for samples.
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
#
mesh_step = .02  # grid resolution in feature units
# Evaluate the classifier on a dense grid covering the data, padded by 1 unit
# on every side, to paint the decision boundary.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, mesh_step),
                     np.arange(y_min, y_max, mesh_step))
grid_points = np.c_[xx.ravel(), yy.ravel()]
Z = clf.predict(grid_points).reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
# Overlay the training samples on top of the predicted regions.
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("knn classification")
plt.show()
# knn classifier: repeat random train/test splits to see the spread of
# train vs. test accuracy as a function of the number of neighbors.
# (loop indentation below was lost in the notebook flattening and is restored here)
n_trials, n_max_neighbors = 30, 100
# One accumulator slot per neighbor count in (2, n_max_neighbors].
err_train, err_test = np.zeros(n_max_neighbors - 2), np.zeros(n_max_neighbors - 2)
for k in range(n_trials):
    # X, y come from the iris cell above (first two features only).
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
        X, y, test_size=0.3)
    x_complexity, y_train, y_test = [], [], []
    for i in range(n_max_neighbors, 2, -1):
        clf = KNeighborsClassifier(n_neighbors=i)
        clf.fit(X_train, Y_train)
        x_complexity.append(i)
        y_tmp = clf.score(X_train, Y_train)   # training accuracy
        y_train.append(y_tmp)
        err_train[n_max_neighbors - i] += y_tmp
        y_tmp = clf.score(X_test, Y_test)     # held-out accuracy
        y_test.append(y_tmp)
        err_test[n_max_neighbors - i] += y_tmp
    # Faint curves: one per trial, showing split-to-split variance.
    plt.plot(x_complexity, y_test, color="blue", alpha=0.2)
    plt.plot(x_complexity, y_train, color="red", alpha=0.2)
# Solid curves: accuracy averaged over all trials.
plt.plot(x_complexity, err_test / n_trials, color="blue")
plt.plot(x_complexity, err_train / n_trials, color="red")
plt.xlabel("Number of Neighbors")
plt.ylabel("Fraction Correct")
plt.show()
# Display saved slide images for the VC-dimension discussion below
# (referenced by filename; contents not part of this file).
Image(filename="Selection_012.png")
Image(filename="Selection_011.png")
Image(filename="10KWV.png")
You choose the points, then the adversary chooses the labeling. Finally you should be able to produce a hypothesis that correctly classifies that labeling of those points. If you are able to succeed for all labelings of the adversary, we say that the VC dimension is at least the number of points you were able to choose. – Srivatsan Jan 5 '12
From Wikipedia:
In statistical learning theory, or sometimes computational learning theory, the VC dimension (for Vapnik–Chervonenkis dimension) is a measure of the capacity (complexity, expressive power, richness, or flexibility) of a statistical classification algorithm, defined as the cardinality of the largest set of points that the algorithm can shatter. It is a core concept in Vapnik–Chervonenkis theory, and was originally defined by Vladimir Vapnik and Alexey Chervonenkis.
Three kinds of models:
It is difficult to be prescriptive with a model that doesn't predict well. Therefore, prediction and understanding errors in prediction are of central value to model building.
When learning from data examples: