%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

# use seaborn for plot defaults
# this can be safely commented out
import seaborn; seaborn.set()

from sklearn.linear_model import LinearRegression

# hyperparameters are passed at construction time
# (the old normalize=True option was removed from scikit-learn,
#  so fit_intercept illustrates the same idea here)
model = LinearRegression(fit_intercept=True)
print(model.fit_intercept)
print(model)

x = np.arange(10)
y = 2 * x + 1
print(x)
print(y)
plt.plot(x, y, 'o');

# The input data for sklearn is 2D: (samples == 10 x features == 1)
X = x[:, np.newaxis]
print(X)
print(y)

# fit the model on our data
model.fit(X, y)

# underscore at the end indicates a fit parameter
print(model.coef_)
print(model.intercept_)

# residual sum of squares around the fit
# (the old residues_ attribute was removed from scikit-learn)
print(np.sum((model.predict(X) - y) ** 2))

from sklearn import neighbors, datasets

iris = datasets.load_iris()
X, y = iris.data, iris.target

# create the model
knn = neighbors.KNeighborsClassifier(n_neighbors=5)

# fit the model
knn.fit(X, y)

# What kind of iris has a 3cm x 5cm sepal and a 4cm x 2cm petal?
# call the "predict" method:
result = knn.predict([[3, 5, 4, 2]])
print(iris.target_names[result])

knn.predict_proba([[3, 5, 4, 2]])

# helper from the tutorial's fig_code package
from fig_code import plot_iris_knn
plot_iris_knn()

# Create some simple data
np.random.seed(0)
X = np.random.random(size=(20, 1))
y = 3 * X.squeeze() + 2 + np.random.randn(20)
plt.plot(X.squeeze(), y, 'o');

# Fit a linear model
model = LinearRegression()
model.fit(X, y)

# Plot the data and the model prediction
X_fit = np.linspace(0, 1, 100)[:, np.newaxis]
y_fit = model.predict(X_fit)
plt.plot(X.squeeze(), y, 'o')
plt.plot(X_fit.squeeze(), y_fit);

# Fit a Random Forest
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X, y)

# Plot the data and the model prediction
X_fit = np.linspace(0, 1, 100)[:, np.newaxis]
y_fit = model.predict(X_fit)
plt.plot(X.squeeze(), y, 'o')
plt.plot(X_fit.squeeze(), y_fit);

X, y = iris.data, iris.target

from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(X)
X_reduced = pca.transform(X)
print("Reduced dataset shape:", X_reduced.shape)

plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, cmap='RdYlBu')

print("Meaning of the 2 components:")
for component in pca.components_:
    print(" + ".join("%.3f x %s" % (value, name)
                     for value, name in zip(component, iris.feature_names)))

from sklearn.cluster import KMeans
k_means = KMeans(n_clusters=3, random_state=0)  # Fixing the RNG in kmeans
k_means.fit(X)
y_pred = k_means.predict(X)

plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y_pred, cmap='RdYlBu');

from sklearn.neighbors import KNeighborsClassifier

X, y = iris.data, iris.target
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X, y)
y_pred = clf.predict(X)
print(np.all(y == y_pred))

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y, y_pred))

# (train_test_split moved from sklearn.cross_validation to
#  sklearn.model_selection in scikit-learn 0.18)
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)
print(confusion_matrix(ytest, ypred))

# scikit-learn's "choosing the right estimator" cheat-sheet
from IPython.display import Image
Image("http://scikit-learn.org/dev/_static/ml_map.png")
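# A single train/test split can be noisy. As a minimal sketch beyond the
# original notebook, cross_val_score repeats the split/fit/score cycle
# several times and averages the results:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the same 1-neighbor classifier on iris
scores = cross_val_score(KNeighborsClassifier(n_neighbors=1), X, y, cv=5)
print(scores)         # one accuracy score per fold
print(scores.mean())  # averaged estimate of generalization accuracy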
from sklearn import datasets
digits = datasets.load_digits()
digits.images.shape

fig, axes = plt.subplots(10, 10, figsize=(8, 8))
fig.subplots_adjust(hspace=0.1, wspace=0.1)

for i, ax in enumerate(axes.flat):
    ax.imshow(digits.images[i], cmap='binary')
    ax.text(0.05, 0.05, str(digits.target[i]),
            transform=ax.transAxes, color='green')
    ax.set_xticks([])
    ax.set_yticks([])

# The images themselves
print(digits.images.shape)
print(digits.images[0])

# The data for use in our algorithms
print(digits.data.shape)
print(digits.data[0])

# The target label
print(digits.target)

from sklearn.manifold import Isomap
iso = Isomap(n_components=2)
data_projected = iso.fit_transform(digits.data)
data_projected.shape

plt.scatter(data_projected[:, 0], data_projected[:, 1], c=digits.target,
            edgecolor='none', alpha=0.5,
            cmap=plt.get_cmap('nipy_spectral', 10));
plt.colorbar(label='digit label', ticks=range(10))
plt.clim(-0.5, 9.5)

from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(digits.data, digits.target,
                                                random_state=2)
print(Xtrain.shape, Xtest.shape)

from sklearn.linear_model import LogisticRegression
# raise max_iter so the solver converges on the digits data
clf = LogisticRegression(penalty='l2', max_iter=1000)
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)

from sklearn.metrics import accuracy_score
accuracy_score(ytest, ypred)

from sklearn.metrics import confusion_matrix
print(confusion_matrix(ytest, ypred))

# add 1 before taking the log so empty cells don't produce -inf
plt.imshow(np.log(confusion_matrix(ytest, ypred) + 1),
           cmap='Blues', interpolation='nearest')
plt.grid(False)
plt.ylabel('true')
plt.xlabel('predicted');

fig, axes = plt.subplots(10, 10, figsize=(8, 8))
fig.subplots_adjust(hspace=0.1, wspace=0.1)

for i, ax in enumerate(axes.flat):
    ax.imshow(Xtest[i].reshape(8, 8), cmap='binary')
    ax.text(0.05, 0.05, str(ypred[i]), transform=ax.transAxes,
            color='green' if (ytest[i] == ypred[i]) else 'red')
    ax.set_xticks([])
    ax.set_yticks([])
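# Accuracy and the confusion matrix tell only part of the story. As a minimal
# sketch beyond the original notebook, classification_report summarizes
# per-class precision, recall, and f1-score in a single call:
from sklearn.metrics import classification_report

print(classification_report(ytest, ypred,
                            target_names=[str(d) for d in range(10)]))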