from sklearn import datasets
import numpy as np
iris = datasets.load_iris()
# selecting out the 2nd and 3rd attributes, 'petal length' and 'petal width'
X = iris.data[:, [2, 3]]
y = iris.target
Note that we are working with just two features so we can continue to visualize in 2D, and that the target values are already in integer format: 0, 1, and 2 correspond to Iris-setosa, Iris-versicolor, and Iris-virginica.
np.unique(y)
array([0, 1, 2])
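The mapping from these integers back to species names is also available on the dataset object; a quick check of my own (not in the book's listing):
print(iris.target_names)  # ['setosa' 'versicolor' 'virginica']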
There will be more on this later in the book, but let's split up the dataset so we can evaluate on a held-out test set.
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
Let's also apply basic feature scaling:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
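Worth noting: the scaler estimates the per-feature mean and standard deviation from the training set only, and those same parameters are reused to transform the test set. We can peek at what it learned (a small check of my own; in current scikit-learn the standard deviation is exposed as scale_):
print(sc.mean_)   # per-feature means estimated from X_train
print(sc.scale_)  # per-feature standard deviations estimated from X_train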
And we can apply the built-in version of the perceptron algorithm we authored in Chapter 2.
from sklearn.linear_model import Perceptron
ppn = Perceptron(n_iter=40, eta0=0.01, random_state=0)
ppn.fit(X_train_std, y_train)
y_pred = ppn.predict(X_test_std)
print('Misclassified samples: %d' % (y_test != y_pred).sum())
Misclassified samples: 4
from sklearn.metrics import accuracy_score
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
Accuracy: 0.91
It's interesting that when evaluating on unseen data we miss some points in the test set; in Chapter 2 we were merely observing that we were ultimately able to fit all of the training data. Let's verify that the scikit-learn perceptron also fully fits the training data:
y_train_predict = ppn.predict(X_train_std)
print('Misclassified samples: %d' % (y_train != y_train_predict).sum())
print('Accuracy on training data: %.2f' % accuracy_score(
y_train,
y_train_predict))
Misclassified samples: 10
Accuracy on training data: 0.90
Huh, I guess the scikit-learn perceptron is more careful not to overfit when training? Why else would it not have fit the training data 100%?
After reading ahead, it turns out we're now looking at a larger sample of data that includes all 3 classes of flower and cannot be linearly separated. See below for the details.
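As a quick sanity check (my own addition, not from the book): if we keep only the two classes that are linearly separable in this feature space (setosa vs. versicolor), the same perceptron settings should fit the training data perfectly.
two_class = y_train < 2  # keep only setosa (0) and versicolor (1)
ppn2 = Perceptron(n_iter=40, eta0=0.01, random_state=0)
ppn2.fit(X_train_std[two_class], y_train[two_class])
print('Misclassified training samples (2-class subset): %d'
      % (y_train[two_class] != ppn2.predict(X_train_std[two_class])).sum())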
Now let's plot the decision regions. This version is improved from Chapter 2.
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# %load decision_region_plot.py
"""
Helper function for plotting decision regions from Chapter 3 of Python Machine Learning.
It's improved on the Chapter 3 version in that it can plot the test vs training set differently.
"""
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import warnings
def versiontuple(v):
return tuple(map(int, (v.split("."))))
def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # plot all samples
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, c=cmap(idx),
                    marker=markers[idx], label=cl)

    # highlight test samples
    if test_idx:
        # older NumPy versions cannot fancy-index with a range object directly
        if not versiontuple(np.__version__) >= versiontuple('1.9.0'):
            X_test, y_test = X[list(test_idx), :], y[list(test_idx)]
            warnings.warn('Please update to NumPy 1.9.0 or newer')
        else:
            X_test, y_test = X[test_idx, :], y[test_idx]

        plt.scatter(X_test[:, 0], X_test[:, 1], c='',
                    alpha=1.0, linewidth=1, marker='o',
                    s=55, label='test set')
X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))
plot_decision_regions(X=X_combined_std, y=y_combined,
classifier=ppn, test_idx=range(105,150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/iris_perceptron_scikit.png', dpi=300)
plt.show()
Let's explore logistic regression, one of the most widely used linear classification algorithms and a more realistic choice than the perceptron.
See my other notebook exploring some of the intuition behind why one would use a logistic function / sigmoid in the first place.
Let's see what a sigmoid function looks like, as it's the activation function used in logistic regression.
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))
z = np.arange(-7, 7, 0.1)
phi_z = sigmoid(z)
plt.plot(z, phi_z)
plt.axvline(0.0, color='k')
plt.ylim(-0.1, 1.1)
plt.xlabel('z')
plt.ylabel('$\phi (z)$')
# y axis ticks and gridline
plt.yticks([0.0, 0.5, 1.0])
ax = plt.gca()
ax.yaxis.grid(True)
plt.tight_layout()
plt.show()
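A quick numeric check (my own addition): the sigmoid maps the net input z, which can be read as the log-odds, to a probability in (0, 1), and the logit function inverts it.
for z_val in [-2.0, 0.0, 2.0]:
    p = sigmoid(z_val)
    print('z = %5.2f -> phi(z) = %.3f -> log-odds log(p/(1-p)) = %5.2f'
          % (z_val, p, np.log(p / (1 - p))))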
Scikit-learn has a nicely optimized logistic regression classifier that supports multiclass settings; let's give it a whirl.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1000.0, random_state=0)
lr.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std, y_combined,
classifier=lr, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/logistic_regression.png', dpi=300)
plt.show()
A handy feature is that we can get the class membership probability of an individual sample, something that wasn't available with our single-layer neural network classifiers from chapter 2:
["%.3f" % el for el in lr.predict_proba(X_test_std[0,:])[0]]
['0.000', '0.063', '0.937']
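The three probabilities sum to one, and predict picks the class with the highest probability; a quick sanity check (my own addition):
probs = lr.predict_proba(X_test_std[0, :].reshape(1, -1))
print(probs.sum(axis=1))     # should be [ 1.]
print(probs.argmax(axis=1))  # index of the most probable class
print(lr.predict(X_test_std[0, :].reshape(1, -1)))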
The book notes
If we were to implement logistic regression ourselves, we could simply substitute the cost function J in our Adaline implementation from Chapter 2
The author has published a bonus notebook doing just that. In Chapter 2 I'd taken a slightly different approach to implementing Adaline, using a higher-order function instead of a class. Here is my updated version:
# %load kr_decision_region_plot.py
"""
Helper function for plotting decision regions from Chapter 2 of Python Machine Learning.
Adapted to work with my different approach to implementation.
"""
from matplotlib.colors import ListedColormap
def kr_plot_decision_regions(plt, observations, labels, predict_fn, weights, resolution=0.02,
                             xlabel='sepal length [cm]', ylabel='petal length [cm]'):
    # we define a number of colors and markers and create a color map from
    # the list of colors via ListedColormap
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(labels))])

    # We determine the minimum and maximum values for the two features and use those
    # feature vectors to create a pair of grid arrays xx1 and xx2 via the NumPy meshgrid function
    x1_min, x1_max = observations[:, 0].min() - 1, observations[:, 0].max() + 1
    x2_min, x2_max = observations[:, 1].min() - 1, observations[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))

    # Since we trained our perceptron classifier on two feature dimensions,
    # we need to flatten the grid arrays and create a matrix that has the same number of
    # columns as the Iris training subset so that we can use the predict method to
    # predict the class labels Z of the corresponding grid points.
    Z = predict_fn(np.array([xx1.ravel(), xx2.ravel()]).T, weights)
    Z = Z.reshape(xx1.shape)

    # After reshaping the predicted class labels Z into a grid with the same dimensions
    # as xx1 and xx2, we can now draw a contour plot via matplotlib's contourf function
    # that maps the different decision regions to different colors for each
    # predicted class in the grid array
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # plot class samples
    for idx, cl in enumerate(np.unique(labels)):
        plt.scatter(x=observations[labels == cl, 0], y=observations[labels == cl, 1],
                    alpha=0.8, c=cmap(idx),
                    marker=markers[idx], label=cl)

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.show()
# %load homegrown_lr.py
"""
Implementation of logistic regression for chapter 3 of "Python Machine Learning".
This is adapted from chapter 2's implementation of ADAptive LInear NEuron (Adaline).
"""
import numpy as np
def homegrown_lr(observations, labels, learning_rate=0.02, max_training_iterations=100):
    """
    Trains a (binary) logistic regression classifier, returning a function that can
    predict / classify new observations, as well as insight into how the training
    progressed via a log of the weights and errors for each iteration.
    :param observations: array of rows
    :param labels: correct label classification for each row: [1, 0, 1, 1, ...]
    :param learning_rate: how fast to update weights
    :param max_training_iterations: max number of times to iterate through observations
    :return: (prediction_fn, weights_log, squared_error_log, num_errors_log)
    """
    the_weights = np.zeros(1 + observations.shape[1])
    weights_log = []
    num_errors_log = []
    squared_error_log = []

    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    def net_input(observations, weights):
        return np.dot(observations, weights[1:]) + weights[0]

    def activation(observations, weights):
        return sigmoid(net_input(observations, weights))

    def quantized_output(output):
        return np.where(output >= 0.5, 1, 0)

    def predict(observations, weights=the_weights):
        return quantized_output(activation(observations, weights))

    for _ in range(max_training_iterations):
        weights_log.append(np.copy(the_weights))
        raw_outputs = activation(observations, the_weights)
        errors = labels - raw_outputs
        weight_deltas = learning_rate * np.dot(observations.transpose(), errors)
        the_weights[1:] += weight_deltas
        the_weights[0] += learning_rate * np.sum(errors)
        squared_errors = (errors ** 2).sum() / 2.0
        num_errors = (quantized_output(raw_outputs) != labels).sum()
        squared_error_log.append(squared_errors)
        num_errors_log.append(num_errors)
        if num_errors == 0:
            break

    return predict, weights_log, squared_error_log, num_errors_log
And here it is applied to a subset of the Iris dataset that contains only two classes (my implementation has not been adapted to perform one-vs-rest to extend to multiple classes, so as in Chapter 2, we only look at a binary example).
import pandas as pd
df = pd.read_csv('iris.data', header=None)
df.tail()
|     | 0   | 1   | 2   | 3   | 4              |
|-----|-----|-----|-----|-----|----------------|
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | Iris-virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | Iris-virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | Iris-virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | Iris-virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | Iris-virginica |
# Get the first 100 rows, selecting columns 0 and 2 (sepal and petal length)
sepal_petal_rows = df.iloc[0:100, [0, 2]].values
# select setosa and versicolor
y = df.iloc[0:100, 4].values
y = np.where(y == 'Iris-setosa', 1, 0)
# standardize
sepal_petal_rows_std = np.copy(sepal_petal_rows)
for i in range(sepal_petal_rows_std.shape[1]):
    sepal_petal_rows_std[:, i] = (sepal_petal_rows[:, i] - sepal_petal_rows[:, i].mean()) / sepal_petal_rows[:, i].std()
predict_lr_fn, weights_log, squared_error_log, num_errors_log = homegrown_lr(
sepal_petal_rows_std, y, learning_rate=0.05)
plt.title('Homegrown Logistic Regression')
kr_plot_decision_regions(
plt, sepal_petal_rows_std, y, predict_lr_fn, weights_log[-1],
xlabel = 'sepal length [standardized]',
ylabel = 'petal length [standardized]')
plt.plot(range(1, len(num_errors_log) + 1), num_errors_log, marker='o')
plt.xlabel('Epochs')
plt.ylabel('Num misclassifications')
plt.show()
weights = []
params = []
training_fits = []
test_fits = []
for i, c in enumerate(np.arange(-5, 5)):
    lr = LogisticRegression(C=10**c, random_state=0)
    lr.fit(X_train_std, y_train)
    training_accuracy = accuracy_score(y_train, lr.predict(X_train_std))
    test_accuracy = accuracy_score(y_test, lr.predict(X_test_std))
    # print("with c = 10**{c}:\nweights: {weights}\nfit {training_accuracy:.2f} training\ngeneralized to {test_accuracy:.2f} test\n".format(
    #     c=c,
    #     weights=lr.coef_[1],
    #     training_accuracy=training_accuracy,
    #     test_accuracy=test_accuracy
    # ))
    weights.append(lr.coef_[1])
    params.append(10**c)
    training_fits.append(training_accuracy)
    test_fits.append(test_accuracy)
    if i == 0 or i == 9:
        plot_decision_regions(X_combined_std, y_combined,
                              classifier=lr, test_idx=range(105, 150))
        plt.xlabel('petal length [standardized]')
        plt.ylabel('petal width [standardized]')
        plt.legend(loc='upper left')
        plt.tight_layout()
        plt.title("C = 10**{c}".format(c=c))
        # plt.savefig('./figures/logistic_regression.png', dpi=300)
        plt.show()
weights = np.array(weights)
plt.plot(params, weights[:, 0],
label='petal length')
plt.plot(params, weights[:, 1], linestyle='--',
label='petal width')
plt.ylabel('weight coefficient')
plt.xlabel('C')
plt.legend(loc='upper left')
plt.xscale('log')
plt.show()
plt.plot(params, training_fits, label='training accuracy')
plt.plot(params, test_fits, label='test accuracy')
plt.ylabel('accuracy')
plt.xlabel('C')
plt.legend(loc='upper left')
plt.xscale('log')
plt.show()
It's interesting that for logistic regression, decreasing the effect of regularization only seems to help training accuracy; there is no overfitting where gains on the training set fail to carry over to the test set.
I guess it makes sense: you can't overfit too badly with a linear model, since it can't curve around to chase individual data points.
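For reference, here's a small sketch (my own addition) that prints the same numbers the accuracy plot above is drawn from:
for c_val, train_acc, test_acc in zip(params, training_fits, test_fits):
    print('C = %8g  training = %.2f  test = %.2f' % (c_val, train_acc, test_acc))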
Next, let's try a support vector machine, starting with a linear SVM on the same Iris data:
from sklearn.svm import SVC
svm = SVC(kernel='linear', C=1.0, random_state=0)
svm.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std, y_combined, classifier=svm, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()
np.random.seed(0)
X_xor = np.random.randn(200, 2)
y_xor = np.logical_xor(X_xor[:, 0] > 0,
X_xor[:, 1] > 0)
y_xor = np.where(y_xor, 1, -1)
plt.scatter(X_xor[y_xor == 1, 0],
X_xor[y_xor == 1, 1],
c='b', marker='x',
label='1')
plt.scatter(X_xor[y_xor == -1, 0],
X_xor[y_xor == -1, 1],
c='r',
marker='s',
label='-1')
plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.legend(loc='best')
plt.tight_layout()
# plt.savefig('./figures/xor.png', dpi=300)
plt.show()
A linear SVM can't cut it, but the RBF kernel can:
for model in [svm, SVC(kernel='rbf', random_state=0, gamma=0.10, C=10.0)]:
    model.fit(X_xor, y_xor)
    plot_decision_regions(X_xor, y_xor, classifier=model)
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.show()
    print('Accuracy: %.2f' % accuracy_score(y_xor, model.predict(X_xor)))
Accuracy: 0.67
Accuracy: 0.95
Here we can see that we can tune the $\gamma$ parameter to increase variance, all the way to the point where we have overfit and get worse accuracy on the test set:
for gamma in [0.2, 0.1, 100]:
    svm = SVC(kernel='rbf', random_state=0, gamma=gamma, C=10.0)
    svm.fit(X_train_std, y_train)
    training_accuracy = accuracy_score(y_train, svm.predict(X_train_std))
    test_accuracy = accuracy_score(y_test, svm.predict(X_test_std))
    print("with gamma = {gamma:.2f}:\nfit {training_accuracy:.2f} training\ngeneralized to {test_accuracy:.2f} test\n".format(
        gamma=gamma,
        training_accuracy=training_accuracy,
        test_accuracy=test_accuracy
    ))
    plot_decision_regions(X_combined_std, y_combined,
                          classifier=svm, test_idx=range(105, 150))
    plt.xlabel('petal length [standardized]')
    plt.ylabel('petal width [standardized]')
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.title("SVM with $\gamma$ = {gamma}".format(gamma=gamma))
    plt.show()
with gamma = 0.20: fit 0.95 training generalized to 0.98 test
with gamma = 0.10: fit 0.95 training generalized to 0.98 test
with gamma = 100.00: fit 0.99 training generalized to 0.80 test
And let's revisit the bias/variance tradeoff by looking at different values of the regularization parameter C:
for i, c in enumerate(np.arange(-1, 2)):
    svm = SVC(kernel='rbf', random_state=0, gamma=0.1, C=10**c)
    svm.fit(X_train_std, y_train)
    training_accuracy = accuracy_score(y_train, svm.predict(X_train_std))
    test_accuracy = accuracy_score(y_test, svm.predict(X_test_std))
    print("with C = 10**{c:.2f}:\nfit {training_accuracy:.2f} training\ngeneralized to {test_accuracy:.2f} test\n".format(
        c=c,
        training_accuracy=training_accuracy,
        test_accuracy=test_accuracy
    ))
    plot_decision_regions(X_combined_std, y_combined,
                          classifier=svm, test_idx=range(105, 150))
    plt.xlabel('petal length [standardized]')
    plt.ylabel('petal width [standardized]')
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.title("SVM with C = {c}".format(c=10**c))
    plt.show()
with C = 10**-1.00: fit 0.82 training generalized to 0.69 test
with C = 10**0.00: fit 0.95 training generalized to 0.98 test
with C = 10**1.00: fit 0.95 training generalized to 0.98 test
Working along with the book, here are the three measures of impurity visualized:
import matplotlib.pyplot as plt
import numpy as np
def gini(p):
    return p * (1 - p) + (1 - p) * (1 - (1 - p))

def entropy(p):
    return - p * np.log2(p) - (1 - p) * np.log2((1 - p))

def error(p):
    return 1 - np.max([p, 1 - p])

x = np.arange(0.0, 1.0, 0.01)
ent = [entropy(p) if p != 0 else None for p in x]
sc_ent = [e * 0.5 if e else None for e in ent]
err = [error(i) for i in x]
fig = plt.figure()
ax = plt.subplot(111)
for i, lab, ls, c in zip([ent, sc_ent, gini(x), err],
                         ['Entropy', 'Entropy (scaled)',
                          'Gini Impurity', 'Misclassification Error'],
                         ['-', '-', '--', '-.'],
                         ['black', 'lightgray', 'red', 'green', 'cyan']):
    line = ax.plot(x, i, label=lab, linestyle=ls, lw=2, color=c)
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.15),
ncol=3, fancybox=True, shadow=False)
ax.axhline(y=0.5, linewidth=1, color='k', linestyle='--')
ax.axhline(y=1.0, linewidth=1, color='k', linestyle='--')
plt.ylim([0, 1.1])
plt.xlabel('p(i=1)')
plt.ylabel('Impurity Index')
plt.tight_layout()
#plt.savefig('./figures/impurity.png', dpi=300, bbox_inches='tight')
plt.show()
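As a small worked example (my own addition, not from the book's listing), here is the entropy-based information gain for a hypothetical split of a parent node holding 40 samples of each class into children of (30, 10) and (10, 30):
p_parent = 0.5                          # parent node: 40 / 40 class split
p_left, p_right = 30.0 / 40, 10.0 / 40  # class-1 proportion in each child
info_gain = entropy(p_parent) - 0.5 * entropy(p_left) - 0.5 * entropy(p_right)
print('Information gain: %.3f' % info_gain)  # roughly 0.19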
Next, let's build a decision tree in scikit-learn and train it on the same dataset as the other models:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
tree.fit(X_train, y_train)
X_combined = np.vstack((X_train, X_test))
y_combined = np.hstack((y_train, y_test))
plot_decision_regions(X_combined, y_combined,
classifier=tree, test_idx=range(105, 150))
plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/decision_tree_decision.png', dpi=300)
plt.show()
print("Accuracy score: {}".format(accuracy_score(y_test, tree.predict(X_test))))
Accuracy score: 0.9777777777777777
Random forests are cool. Here's an example. Note that the accuracy is slightly lower, but we can chalk that up to the forest's robustness against overfitting.
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(criterion='entropy',
n_estimators=10,
random_state=1,
n_jobs=2)
forest.fit(X_train, y_train)
plot_decision_regions(X_combined, y_combined,
classifier=forest, test_idx=range(105, 150))
plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()
print("Accuracy score: {}".format(accuracy_score(y_test, forest.predict(X_test))))
Accuracy score: 0.9555555555555556
Let's explore how overfitting with too deep a tree affects how well the model generalizes to the test dataset.
for depth in [1, 3, 5, 9]:
    tree_d = DecisionTreeClassifier(criterion='entropy', max_depth=depth, random_state=0)
    tree_d.fit(X_train, y_train)
    plot_decision_regions(X_combined, y_combined,
                          classifier=tree_d, test_idx=range(105, 150))
    plt.xlabel('petal length [cm]')
    plt.ylabel('petal width [cm]')
    plt.legend(loc='upper left')
    plt.title("Tree depth {}".format(depth))
    plt.tight_layout()
    plt.show()
    print('Training accuracy: %.2f' % accuracy_score(y_train, tree_d.predict(X_train)))
    print('Test accuracy: %.2f' % accuracy_score(y_test, tree_d.predict(X_test)))
Training accuracy: 0.70
Test accuracy: 0.60
Training accuracy: 0.98
Test accuracy: 0.98
Training accuracy: 0.99
Test accuracy: 0.96
Training accuracy: 0.99
Test accuracy: 0.96
It looks like, for this dataset, having a higher max depth didn't really hurt us.
Let's now see how trees handle the nonlinearly separable XOR dataset as well. It only takes a depth of 3 to nail it:
for depth in [1, 2, 3]:
    tree_d = DecisionTreeClassifier(criterion='entropy', max_depth=depth, random_state=0)
    tree_d.fit(X_xor, y_xor)
    plot_decision_regions(X_xor, y_xor, classifier=tree_d)
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.title("Tree Depth {}".format(depth))
    plt.show()
    print('Accuracy: %.2f' % accuracy_score(y_xor, tree_d.predict(X_xor)))
Accuracy: 0.60
Accuracy: 0.92
Accuracy: 1.00
KNN is another off-the-shelf algorithm we can easily apply:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
knn.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std, y_combined,
classifier=knn, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()
print('Training Accuracy: %.2f' % accuracy_score(y_train, knn.predict(X_train_std)))
print('Test Accuracy: %.2f' % accuracy_score(y_test, knn.predict(X_test_std)))
Training Accuracy: 0.95
Test Accuracy: 1.00