%pylab inline import pylab as pl import numpy as np from sklearn.datasets import fetch_lfw_people lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4) X = lfw_people.data y = lfw_people.target names = lfw_people.target_names n_samples, n_features = X.shape _, h, w = lfw_people.images.shape n_classes = len(names) print("n_samples: {}".format(n_samples)) print("n_features: {}".format(n_features)) print("n_classes: {}".format(n_classes)) def plot_gallery(images, titles, h, w, n_row=3, n_col=6): """Helper function to plot a gallery of portraits""" pl.figure(figsize=(1.7 * n_col, 2.3 * n_row)) pl.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35) for i in range(n_row * n_col): pl.subplot(n_row, n_col, i + 1) pl.imshow(images[i].reshape((h, w)), cmap=pl.cm.gray) pl.title(titles[i], size=12) pl.xticks(()) pl.yticks(()) plot_gallery(X, names[y], h, w) pl.figure(figsize=(14, 3)) y_unique = np.unique(y) counts = [(y == i).sum() for i in y_unique] pl.xticks(y_unique, names[y_unique]) locs, labels = pl.xticks() pl.setp(labels, rotation=45, size=20) _ = pl.bar(y_unique, counts) from sklearn.cross_validation import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25) from sklearn.decomposition import RandomizedPCA n_components = 150 print "Extracting the top %d eigenfaces from %d faces" % ( n_components, X_train.shape[0]) pca = RandomizedPCA(n_components=n_components, whiten=True) %time pca.fit(X_train) eigenfaces = pca.components_.reshape((n_components, h, w)) eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])] plot_gallery(eigenfaces, eigenface_titles, h, w) X_train_pca = pca.transform(X_train) from sklearn.svm import SVC svm = SVC(kernel='rbf', class_weight='auto') svm from sklearn.cross_validation import StratifiedShuffleSplit from sklearn.cross_validation import cross_val_score cv = StratifiedShuffleSplit(y_train, test_size=0.20, n_iter=3) %time svm_cv_scores = cross_val_score(svm, X_train_pca, y_train, scoring='f1', n_jobs=2) svm_cv_scores svm_cv_scores.mean(), svm_cv_scores.std() from sklearn.grid_search import GridSearchCV param_grid = { 'C': [1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } clf = GridSearchCV(svm, param_grid, scoring='f1', cv=cv, n_jobs=2) %time clf = clf.fit(X_train_pca, y_train) print("Best estimator found by randomized hyper parameter search:") print(clf.best_params_) print("Best parameters validation score: {:.3f}".format(clf.best_score_)) X_test_pca = pca.transform(X_test) y_pred = clf.predict(X_test_pca) def title(y_pred, y_test, target_names, i): pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1] true_name = target_names[y_test[i]].rsplit(' ', 1)[-1] return 'predicted: %s\ntrue: %s' % (pred_name, true_name) prediction_titles = [title(y_pred, y_test, names, i) for i in range(y_pred.shape[0])] plot_gallery(X_test, prediction_titles, h, w) from sklearn.metrics import classification_report print(classification_report(y_test, y_pred, target_names=names)) from sklearn.metrics import confusion_matrix cm = confusion_matrix(y_test, y_pred, labels=range(n_classes)) print(cm) pl.gray() _ = pl.imshow(cm, interpolation='nearest')