# Load the digits dataset and hold out a test set before any model selection.
# sklearn.cross_validation was removed in scikit-learn 0.20; the functions live
# in sklearn.model_selection now.
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.datasets import load_digits
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)
from sklearn.svm import SVC
# Default scoring (accuracy) for two settings of the SVC regularization parameter C.
cross_val_score(SVC(C=1), X_train, y_train, cv=3)
# digits is a 10-class problem, so plain "f1" is ambiguous (modern scikit-learn
# raises for it on multiclass targets); use an explicit macro average.
cross_val_score(SVC(C=10), X_train, y_train, cv=3, scoring="f1_macro")
Let's switch to a binary task for a moment: even vs. odd digits
# Binarize the labels: y_train % 2 turns the 10-class digit problem into
# even (0) vs. odd (1), since average_precision and roc_auc require binary targets.
cross_val_score(SVC(C=10), X_train, y_train % 2, cv=3)
cross_val_score(SVC(C=10), X_train, y_train % 2, cv=3, scoring="average_precision")
cross_val_score(SVC(C=10), X_train, y_train % 2, cv=3, scoring="roc_auc")
There are other ways to do cross-validation
# ShuffleSplit draws random train/test partitions. The modern API (the old
# positional (n, n_iter, ...) signature was removed along with
# sklearn.cross_validation) infers the number of samples from the data and
# takes n_splits / test_size as keyword arguments.
from sklearn.model_selection import ShuffleSplit
cross_val_score(SVC(C=10), X_train, y_train, cv=ShuffleSplit(n_splits=10, test_size=.4))
Grid search with built-in cross-validation
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed in scikit-learn 0.20
Define parameter grid:
import numpy as np
# 6x6 logarithmic grid over C and gamma: 10**-3 ... 10**2.
param_grid = {'C': 10. ** np.arange(-3, 3), 'gamma' : 10. ** np.arange(-3, 3)}
print(param_grid)
# verbose=3 prints per-fit progress; n_jobs=3 evaluates grid points in parallel.
grid_search = GridSearchCV(SVC(), param_grid, verbose=3, n_jobs=3)
A GridSearchCV object behaves just like a normal classifier.
grid_search.fit(X_train, y_train)
# We extract just the scores
%matplotlib inline
import matplotlib.pyplot as plt
scores = [x[1] for x in grid_search.grid_scores_]
scores = np.array(scores).reshape(6, 6)
plt.matshow(scores)
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(6), param_grid['gamma'])
plt.yticks(np.arange(6), param_grid['C'])
grid_search.best_params_
grid_search.predict(X_test)
grid_search.score(X_test, y_test)
from sklearn.preprocessing import StandardScaler
Same interface as always.
# Fit the scaler on the training data only, then inspect the transformed data:
# each feature should have mean ~0 and std ~1 (constant features — e.g. always-blank
# digit pixels — keep std 0).
scaler = StandardScaler()
scaler.fit(X_train)
scaler.transform(X_train).mean(axis=0)
scaler.transform(X_train).std(axis=0)
For cross-validation, we need to estimate mean and standard deviation separately for each fold. To do that, we build a pipeline.
from sklearn.pipeline import Pipeline
# Chain scaling and the SVM so that during cross-validation the scaler is re-fit
# on each training fold only — no statistics leak from the validation folds.
pipeline = Pipeline([("scaler", scaler), ("svm", SVC())])
pipeline.fit(X_train, y_train)
pipeline.predict(X_train)
cross_val_score(pipeline, X_train, y_train)
So, yeah, don't forget the preprocessing.
# Parameters of pipeline steps are addressed as <step name>__<parameter name>.
param_grid_pipeline = {'svm__C': 10. ** np.arange(-3, 3), 'svm__gamma': 10. ** np.arange(-3, 3)}
grid_pipeline = GridSearchCV(pipeline, param_grid=param_grid_pipeline, verbose=3)
grid_pipeline.fit(X_train, y_train)
# We extract just the mean validation scores (grid_scores_ was removed in
# scikit-learn 0.20; cv_results_ replaces it).
scores = grid_pipeline.cv_results_['mean_test_score']
# Use this search's own grid for shape and tick labels (the original read the
# SVC-only param_grid here, which only worked because the values coincided).
scores = np.array(scores).reshape(len(param_grid_pipeline['svm__C']), len(param_grid_pipeline['svm__gamma']))
plt.matshow(scores)
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(len(param_grid_pipeline['svm__gamma'])), param_grid_pipeline['svm__gamma'])
plt.yticks(np.arange(len(param_grid_pipeline['svm__C'])), param_grid_pipeline['svm__C'])
grid_pipeline.score(X_test, y_test)
# RandomizedSearchCV samples parameter settings from distributions instead of
# exhaustively trying a grid (the class moved to sklearn.model_selection in 0.20).
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon
# Visualize the exponential distribution we sample C and gamma from.
# (range, not the Python-2-only xrange)
plt.hist([expon.rvs() for x in range(1000)])
params = {'svm__C': expon(), 'svm__gamma': expon()}
rs = RandomizedSearchCV(pipeline, param_distributions=params, n_iter=50, verbose=3)
rs.fit(X_train, y_train)
rs.best_params_
rs.best_score_
# grid_scores_ / mean_validation_score were removed in scikit-learn 0.20;
# cv_results_ exposes per-candidate scores and sampled parameter values.
res = rs.cv_results_
scores, Cs, gammas = res['mean_test_score'], res['param_svm__C'], res['param_svm__gamma']
plt.scatter(Cs, gammas, s=40, c=scores)
plt.xlabel("C")
plt.ylabel("gamma")
# Random data: X carries no information about y, so honest accuracy should be ~0.5.
X = np.random.normal(size=(50, 40))
y = np.random.randint(2, size=50)
X
y
from sklearn.feature_selection import SelectKBest
# NOTE(review): SelectKBest is fit on ALL of (X, y) before the cross-validated
# grid search below, so the selected features have already seen the validation
# folds — this leaks information and inflates the score. Putting the selection
# inside a Pipeline would avoid it (presumably this cell demonstrates the pitfall).
f_selection = SelectKBest(k=3).fit(X, y)
X_good = f_selection.transform(X)
from sklearn.svm import LinearSVC
grid = GridSearchCV(LinearSVC(), param_grid={'C': 10. ** np.arange(-3 ,3)})
grid.fit(X_good, y)
grid.best_params_
grid.best_score_