%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import sklearn

# Visualizes how a classifier would classify each point in a grid
# http://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html
from matplotlib.colors import ListedColormap

def decision_boundary(clf, X, Y):
    h = .02  # step size in the mesh

    # Create color maps
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max] x [y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=cmap_bold)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.show()

def plot_test_train(clf, Xtrain, Ytrain, Xtest):
    plt.prism()  # this sets a nice color map
    plt.scatter(Xtest[:, 0], Xtest[:, 1], c=clf.predict(Xtest), marker='^')
    plt.scatter(Xtrain[:, 0], Xtrain[:, 1], c=Ytrain)

from sklearn.datasets import load_digits
digits = load_digits()
print("images shape: %s" % str(digits.images.shape))
print("targets shape: %s" % str(digits.target.shape))
digit_X = digits.images.reshape(-1, 64)  # Reshape 8x8 images to length-64 vectors
digit_Y = digits.target                  # Get labels
plt.matshow(digits.images[0], cmap=plt.cm.Greys);

from sklearn.datasets import load_iris
iris = load_iris()
print(iris.data.shape)
IX = iris.data    # Get features
IY = iris.target  # Get labels
print("X shape: {}".format(IX.shape))
print("Example features:\n {}".format(IX[:5]))
print("Labels:\n {}".format(IY[:70]))

from sklearn.datasets import make_blobs
BX, BY = make_blobs(cluster_std=1.6, random_state=9)
plt.scatter(BX[:, 0], BX[:, 1], c=BY)
plt.show()

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(BX, BY)
decision_boundary(knn, BX, BY)

knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(BX, BY)
decision_boundary(knn, BX, BY)

X, Y = sklearn.utils.shuffle(IX, IY)
for k in [1, 3, 5]:
    knn = KNeighborsClassifier(n_neighbors=k)
    for n in [10, 50, 100]:
        knn.fit(X[:n], Y[:n])
        print("{} {}: {}".format(k, n, knn.score(X[n:], Y[n:])))
    print()

from sklearn.model_selection import train_test_split

# Hold out 1/3 of the data for testing
X_train, X_test, Y_train, Y_test = train_test_split(IX, IY, test_size=0.33)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)
knn.score(X_test, Y_test)

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

def score(clf, X, Y, folds=2, verbose=False, metric=accuracy_score):
    predictions = np.zeros(len(Y))
    for i, (train, test) in enumerate(KFold(n_splits=folds, shuffle=True).split(X)):
        clf.fit(X[train], Y[train])
        predictions[test] = clf.predict(X[test])
        if verbose:
            print("Fold {}: {}".format(i + 1, accuracy_score(Y[test], predictions[test])))
    if metric:
        return metric(Y, predictions)
    return Y, predictions

for k in range(1, 10, 2):
    acc = score(KNeighborsClassifier(n_neighbors=k), IX, IY, folds=30)
    print("{}: {}".format(k, acc))

from sklearn import tree
dt = tree.DecisionTreeClassifier()
dt.fit(BX, BY)
decision_boundary(dt, BX, BY)

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=10)
rf.fit(BX, BY)
decision_boundary(rf, BX, BY)
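# A quick side-by-side sketch: the `score` helper above works with any estimator,
# so we can compare the classifiers seen so far on the iris data. The fold count
# and forest size here are arbitrary illustrative choices.
for name, clf in [('knn-3', KNeighborsClassifier(n_neighbors=3)),
                  ('tree', tree.DecisionTreeClassifier()),
                  ('forest', RandomForestClassifier(n_estimators=10))]:
    print("{}: {}".format(name, score(clf, IX, IY, folds=10)))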
from sklearn import svm
clf = svm.SVC(kernel='linear')
clf.fit(BX, BY)
decision_boundary(clf, BX, BY)

from sklearn.datasets import make_circles
CX, CY = make_circles(factor=0.5, noise=0.2, random_state=1)
clf = svm.SVC(kernel='linear')
clf.fit(CX, CY)
decision_boundary(clf, CX, CY)  # Illustrate a linear SVM on the circles dataset

clf = svm.SVC(kernel='rbf')  # rbf is the default kernel type
clf.fit(CX, CY)
decision_boundary(clf, CX, CY)  # Illustrate an RBF SVM on the circles dataset

clf = svm.SVC()
clf.fit(BX, BY)
decision_boundary(clf, BX, BY)

from sklearn import metrics
clf = svm.SVC()
y, pred = score(clf, IX, IY, metric=None)
print(metrics.classification_report(y, pred))
print(metrics.confusion_matrix(y, pred))

clf = svm.SVC()
y, pred = score(clf, digit_X, digit_Y, folds=10, metric=None)
print(metrics.classification_report(y, pred))
print(metrics.confusion_matrix(y, pred))

clf = svm.SVC(kernel='linear')  # This is a case where a different kernel helps
y, pred = score(clf, digit_X, digit_Y, folds=10, metric=None)
print(metrics.classification_report(y, pred))
print(metrics.confusion_matrix(y, pred))

from sklearn.datasets import load_boston
data = load_boston()
HX = data['data']
HY = data['target']
print(data.DESCR[:1200])

from sklearn import linear_model as lm
y, pred = score(lm.LinearRegression(), HX, HY, folds=10, metric=None)
print(metrics.mean_squared_error(y, pred))
plt.hist(HY)
plt.show()

# Example discovered cluster centers
from sklearn import cluster
km = cluster.KMeans(n_clusters=3)
Y_hat = km.fit(BX).labels_
plt.scatter(BX[:, 0], BX[:, 1], c=BY, alpha=0.4)
mu = km.cluster_centers_
plt.scatter(mu[:, 0], mu[:, 1], s=100, c=np.unique(Y_hat))
plt.show()

from sklearn.decomposition import PCA
pca = PCA(n_components=2, svd_solver='randomized')
proj = pca.fit_transform(digits.data)
plt.scatter(proj[:, 0], proj[:, 1], c=digits.target)
plt.colorbar()
plt.show()

pca = PCA(n_components=2, svd_solver='randomized')
proj = pca.fit_transform(IX)
plt.scatter(proj[:, 0], proj[:, 1], c=IY)
plt.colorbar()
plt.show()

from sklearn.manifold import Isomap
iso = Isomap(n_neighbors=5, n_components=2)
proj = iso.fit_transform(digits.data)
plt.scatter(proj[:, 0], proj[:, 1], c=digits.target)
plt.colorbar()
plt.show()

# from sklearn.manifold import MDS
# mds = MDS()
# proj = mds.fit_transform(digit_X)
# plt.scatter(proj[:, 0], proj[:, 1], c=digit_Y)
# plt.colorbar()
# plt.show()

from sklearn.model_selection import GridSearchCV
param_grid = [
    {'C': [1, 10], 'kernel': ['linear']},
    {'C': [1, 10], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]
gs = GridSearchCV(svm.SVC(), param_grid)
gs.fit(digit_X, digit_Y)  # Let's try it on the digits data
print(gs.best_params_)
print(gs.cv_results_['mean_test_score'])

gs.fit(IX, IY)  # Different settings work better for the iris dataset
print(gs.best_params_)
print(gs.cv_results_['mean_test_score'])

from sklearn.pipeline import Pipeline
pca = PCA(n_components=16, svd_solver='randomized')
clf = svm.SVC(kernel='linear')
pipeline = Pipeline(steps=[('PCA', pca), ('SVM', clf)])
# fit/predict work the same way as for other classifiers:
#   pipeline.fit(X[train], Y[train])
#   pipeline.predict(X[test])
score(pipeline, digit_X, digit_Y, folds=10)
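# Usage sketch: the pipeline behaves like any other estimator, so the earlier
# train/test split workflow applies to it unchanged. The 1/4 test fraction is
# an arbitrary illustrative choice.
Xtr, Xte, Ytr, Yte = train_test_split(digit_X, digit_Y, test_size=0.25)
pipeline.fit(Xtr, Ytr)
print(pipeline.score(Xte, Yte))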