STAT 479: Machine Learning (Fall 2018)
Instructor: Sebastian Raschka (sraschka@wisc.edu)
Course website: http://pages.stat.wisc.edu/~sraschka/teaching/stat479-fs2018/
%load_ext watermark
%watermark -d -u -a 'Sebastian Raschka' -v -p numpy,scipy,matplotlib,sklearn
Sebastian Raschka last updated: 2018-09-09 CPython 3.6.6 IPython 6.5.0 numpy 1.15.0 scipy 1.1.0 matplotlib 2.2.2 sklearn 0.19.1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the Iris dataset from a local CSV into a DataFrame
# (assumes 'iris.csv' has Id, four measurement columns, and Species — TODO confirm schema).
df_iris = pd.read_csv('iris.csv')
# Display the last five rows to sanity-check the load
df_iris.tail()
 | Id | SepalLength[cm] | SepalWidth[cm] | PetalLength[cm] | PetalWidth[cm] | Species |
---|---|---|---|---|---|---|
145 | 146 | 6.7 | 3.0 | 5.2 | 2.3 | Iris-virginica |
146 | 147 | 6.3 | 2.5 | 5.0 | 1.9 | Iris-virginica |
147 | 148 | 6.5 | 3.0 | 5.2 | 2.0 | Iris-virginica |
148 | 149 | 6.2 | 3.4 | 5.4 | 2.3 | Iris-virginica |
149 | 150 | 5.9 | 3.0 | 5.1 | 1.8 | Iris-virginica |
# Use the two petal measurements as the feature matrix (NumPy array).
feature_columns = ['PetalLength[cm]', 'PetalWidth[cm]']
X = df_iris[feature_columns].values
# Peek at the first five feature rows
X[:5, :]
array([[1.4, 0.2], [1.4, 0.2], [1.3, 0.2], [1.5, 0.2], [1.4, 0.2]])
# Map each species name to an integer class label: setosa=0, versicolor=1, virginica=2.
label_dict = {species: class_idx
              for class_idx, species in enumerate(
                  ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])}
# Attach the numeric class labels as a new column by mapping species names
# through label_dict (unmapped species would become NaN).
df_iris['ClassLabel'] = df_iris['Species'].map(label_dict)
# Verify the new ClassLabel column on the last five rows
df_iris.tail()
 | Id | SepalLength[cm] | SepalWidth[cm] | PetalLength[cm] | PetalWidth[cm] | Species | ClassLabel |
---|---|---|---|---|---|---|---|
145 | 146 | 6.7 | 3.0 | 5.2 | 2.3 | Iris-virginica | 2 |
146 | 147 | 6.3 | 2.5 | 5.0 | 1.9 | Iris-virginica | 2 |
147 | 148 | 6.5 | 3.0 | 5.2 | 2.0 | Iris-virginica | 2 |
148 | 149 | 6.2 | 3.4 | 5.4 | 2.3 | Iris-virginica | 2 |
149 | 150 | 5.9 | 3.0 | 5.1 | 1.8 | Iris-virginica | 2 |
# Extract the integer class labels as a NumPy array.
y = df_iris['ClassLabel'].values
# First five labels (all class 0, setosa)
y[:5]
# Row positions 0..n-1; shuffled below to randomize the train/test split
indices = np.arange(len(y))
indices
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149])
# Seeded RNG so the shuffle (and hence the split) is reproducible.
rng = np.random.RandomState(123)
shuffled_indices = rng.permutation(indices)
shuffled_indices
array([ 72, 112, 132, 88, 37, 138, 87, 42, 8, 90, 141, 33, 59, 116, 135, 104, 36, 13, 63, 45, 28, 133, 24, 127, 46, 20, 31, 121, 117, 4, 130, 119, 29, 0, 62, 93, 131, 5, 16, 82, 60, 35, 143, 145, 142, 114, 136, 53, 19, 38, 110, 23, 9, 86, 91, 89, 79, 101, 65, 115, 41, 124, 95, 21, 11, 103, 74, 122, 118, 44, 51, 81, 149, 12, 129, 56, 50, 25, 128, 146, 43, 1, 71, 54, 100, 14, 6, 80, 26, 70, 139, 30, 108, 15, 18, 77, 22, 10, 58, 107, 75, 64, 69, 3, 40, 76, 134, 34, 27, 94, 85, 97, 102, 52, 92, 99, 105, 7, 48, 61, 120, 137, 125, 147, 39, 84, 2, 67, 55, 49, 68, 140, 78, 144, 111, 32, 73, 47, 148, 113, 96, 57, 123, 106, 83, 17, 98, 66, 126, 109])
# Reorder features and labels with the shuffled index, then split:
# the first 100 rows become the training set, the remaining 50 the test set.
X_shuffled = X[shuffled_indices]
y_shuffled = y[shuffled_indices]
X_train = X_shuffled[:100]
y_train = y_shuffled[:100]
X_test = X_shuffled[100:]
y_test = y_shuffled[100:]
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Equivalent split via scikit-learn: load the bundled Iris data and
# keep only the two petal columns (positions 2 and 3 of iris.data).
iris = load_iris()
X = iris.data[:, 2:]
y = iris.target

# Shuffled 70/30 train/test split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123, shuffle=True)
# Scatter-plot the training data in the 2-D petal feature space,
# one marker style per class.
class_styles = [
    (0, 'o', 'class 0 (Setosa)'),
    (1, '^', 'class 1 (Versicolor)'),
    (2, 's', 'class 2 (Virginica)'),
]
for class_value, marker, label in class_styles:
    mask = y_train == class_value
    plt.scatter(X_train[mask, 0],
                X_train[mask, 1],
                marker=marker,
                label=label)
plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.show()
from sklearn.neighbors import KNeighborsClassifier
# 3-nearest-neighbors classifier (sklearn defaults otherwise — see fitted repr below).
knn_model = KNeighborsClassifier(n_neighbors=3)
# "Fitting" a kNN model just stores the training data (lazy learner).
knn_model.fit(X_train, y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=3, p=2, weights='uniform')
# Predict labels for the held-out test set and report accuracy as a percentage.
y_pred = knn_model.predict(X_test)
num_correct_predictions = (y_pred == y_test).sum()
accuracy = 100.0 * num_correct_predictions / y_test.shape[0]
print('Test set accuracy: %.2f%%' % accuracy)
Test set accuracy: 95.56%
from mlxtend.plotting import plot_decision_regions
# Visualize the kNN decision boundaries over the TRAINING data
# (mlxtend evaluates the fitted classifier on a grid spanning the 2-D feature space).
plot_decision_regions(X_train, y_train, knn_model)
plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.show()
# Same decision-region plot, but overlaying the TEST points to show
# which held-out samples fall on the wrong side of a boundary.
plot_decision_regions(X_test, y_test, knn_model)
plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.show()