Applied exercises from Chapter 9 of An Introduction to Statistical Learning by Gareth James, Daniela Witten, Trevor Hastie and Robert Tibshirani.
import numpy as np
import pandas as pd
import patsy as pt
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn import preprocessing
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_moons, make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
# Generate noisy moon shaped data
n_samples = 100
noise = .3
X_train, y_train = make_moons(n_samples=n_samples, noise=noise, random_state=0)
X_test, y_test = make_moons(n_samples=n_samples, noise=noise, random_state=1)
# Scale data
X_train = preprocessing.scale(X_train)
X_test = preprocessing.scale(X_test)
# Plot data
df = pd.concat([pd.DataFrame(data=X_train, columns=['x1', 'x2']), pd.Series(y_train, name='y')], axis=1)
plt.figure(figsize=(10, 10))
sns.scatterplot(x='x1', y='x2', hue='y', data=df);
def plot_clf(model, df, grid_range, show_contours=False, show_support_vectors=False):
    # Decision boundary plot
    # Get grid of values in given range
    x1 = grid_range
    x2 = grid_range
    xx1, xx2 = np.meshgrid(x1, x2, sparse=False)
    Xgrid = np.stack((xx1.flatten(), xx2.flatten())).T
    # Get decision boundary values for plot grid
    decision_boundary = model.predict(Xgrid)
    decision_boundary_grid = decision_boundary.reshape(len(x2), len(x1))
    # Get decision function values for plot grid
    decision_function = model.decision_function(Xgrid)
    decision_function_grid = decision_function.reshape(len(x2), len(x1))
    fig = plt.figure(figsize=(10, 10))
    if show_contours:
        plt.contourf(x1, x2, decision_function_grid)
    plt.contour(x1, x2, decision_boundary_grid)
    sns.scatterplot(x='x1', y='x2', hue='y', data=df)
    if show_support_vectors:
        sns.scatterplot(x=model.support_vectors_[:, 0], y=model.support_vectors_[:, 1],
                        color='red', marker='+', s=500)
    plt.show()
model = svm.SVC(kernel='linear', gamma=1, C=1, random_state=0, probability=True).fit(X_train, y_train)
plot_clf(model, df, np.arange(-3, 3, .1), show_contours=True)
print(f'Training accuracy: {model.score(X_train, y_train)}')
print(f'Test accuracy : {model.score(X_test, y_test)}')
Training accuracy: 0.85
Test accuracy : 0.83
model = svm.SVC(kernel='poly', degree=3, gamma=1, C=1, random_state=0, probability=True).fit(X_train, y_train)
plot_clf(model, df, np.arange(-3, 3, .1), show_contours=True)
print(f'Training accuracy: {model.score(X_train, y_train)}')
print(f'Test accuracy : {model.score(X_test, y_test)}')
Training accuracy: 0.85
Test accuracy : 0.87
model = svm.SVC(kernel='rbf', gamma=1, C=1, random_state=0, probability=True).fit(X_train, y_train)
plot_clf(model, df, np.arange(-3, 3, .1), show_contours=True)
print(f'Training accuracy: {model.score(X_train, y_train)}')
print(f'Test accuracy : {model.score(X_test, y_test)}')
Training accuracy: 0.93
Test accuracy : 0.91
Comment: In this setting an SVM with a radial kernel outperforms both the linear and polynomial kernel models. The linear kernel model is the least effective because the data is not linearly separable.
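To back this up beyond a single train/test split, here is a minimal sketch (using the X_train and y_train generated above) that compares the three kernels by 5-fold cross-validation:

# Compare the three kernels by 5-fold CV on the training data
for kernel in ['linear', 'poly', 'rbf']:
    clf = svm.SVC(kernel=kernel, gamma=1, C=1, random_state=0)
    cv_acc = np.mean(cross_val_score(clf, X_train, y_train, cv=5))
    print(f'{kernel:6} CV accuracy: {cv_acc:.3f}')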
(a) Generate a data set with n = 500 and p = 2, such that the observations belong to two classes with a quadratic decision boundary between them. For instance, you can do this as follows:
np.random.seed(0)
x1 = np.random.uniform(0, 1, 500) - 0.5
x2 = np.random.uniform(0, 1, 500) - 0.5
y = 1*(x1**2 - x2**2 > 0)
(b) Plot the observations, colored according to their class labels. Your plot should display X1 on the x-axis, and X2 on the y-axis.
# Plot data
df = pd.DataFrame({'x1':x1, 'x2':x2, 'y':y})
plt.figure(figsize=(10, 10))
sns.scatterplot(x='x1', y='x2', hue='y', data=df);
(c) Fit a logistic regression model to the data, using X1 and X2 as predictors.
(d) Apply this model to the training data in order to obtain a predicted class label for each training observation. Plot the observations, colored according to the predicted class labels. The decision boundary should be linear.
# Pre-process data with only linear features
f = 'y ~ x1 + x2'
y, X = pt.dmatrices(f, df)
y = np.ravel(y)
train = np.random.random(len(y)) > 0.5
# Fit model on training set
model = LogisticRegression().fit(X[train], y[train])
# Predict
y_hat = model.predict(X[train])
# Plot data
plot_df = pd.DataFrame({'x1':X[train][:,1], 'x2':X[train][:,2], 'y_hat':y_hat})
plt.figure(figsize=(10, 10))
sns.scatterplot(x='x1', y='x2', hue='y_hat', data=plot_df)
plt.show();
print(f'Training accuracy: {model.score(X[train], y[train])}')
print(f'Test accuracy : {model.score(X[~train], y[~train])}')
Training accuracy: 0.5984555984555985
Test accuracy : 0.5228215767634855
(e) Now fit a logistic regression model to the data using non-linear functions of X1 and X2 as predictors (e.g. X1^2, X1 × X2, log(X2), and so forth).
(f) Apply this model to the training data in order to obtain a predicted class label for each training observation. Plot the observations, colored according to the predicted class labels. The decision boundary should be obviously non-linear. If it is not, then repeat (a)-(e) until you come up with an example in which the predicted class labels are obviously non-linear.
# Pre-process data with quadratic features
f = 'y ~ x1 + x2 + np.power(x1, 2) + np.power(x2, 2)'
y, X = pt.dmatrices(f, df)
y = np.ravel(y)
# Fit model on training set
model = LogisticRegression().fit(X[train], y[train])
# Predict
y_hat = model.predict(X[train])
# Plot data
plot_df = pd.DataFrame({'x1':X[train][:,1], 'x2':X[train][:,2], 'y_hat':y_hat})
plt.figure(figsize=(10, 10))
sns.scatterplot(x='x1', y='x2', hue='y_hat', data=plot_df)
plt.show();
print(f'Training accuracy: {model.score(X[train], y[train])}')
print(f'Test accuracy : {model.score(X[~train], y[~train])}')
Training accuracy: 0.8957528957528957
Test accuracy : 0.8506224066390041
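As a quick check that the quadratic terms are doing the work, a small sketch that pairs each design-matrix column with its fitted coefficient (this relies on the patsy design matrix X built above keeping its design_info):

# Print each design-matrix column name with its fitted coefficient
for name, coef in zip(X.design_info.column_names, model.coef_[0]):
    print(f'{name:20} {coef: .3f}')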
(g) Fit a support vector classifier to the data with X1 and X2 as predictors. Obtain a class prediction for each training observation. Plot the observations, colored according to the predicted class labels.
f = 'y ~ x1 + x2 - 1'
y, X = pt.dmatrices(f, df)
y = np.ravel(y)
model = svm.SVC(kernel='linear', gamma=1, C=1, random_state=0, probability=True).fit(X[train], y[train])
plot_clf(model, df, np.arange(-.8, .9, .005), show_contours=True)
print(f'Training accuracy: {model.score(X[train], y[train])}')
print(f'Test accuracy : {model.score(X[~train], y[~train])}')
Training accuracy: 0.6023166023166023
Test accuracy : 0.5186721991701245
(h) Fit a SVM using a non-linear kernel to the data. Obtain a class prediction for each training observation. Plot the observations, colored according to the predicted class labels.
f = 'y ~ x1 + x2 - 1'
y, X = pt.dmatrices(f, df)
y = np.ravel(y)
model = svm.SVC(kernel='rbf', gamma=1, C=1, random_state=0, probability=True).fit(X[train], y[train])
plot_clf(model, df, np.arange(-.9, .9, .005), show_contours=True)
print(f'Training accuracy: {model.score(X[train], y[train])}')
print(f'Test accuracy : {model.score(X[~train], y[~train])}')
Training accuracy: 0.9420849420849421
Test accuracy : 0.9336099585062241
(i) Comment on your results.
Polynomial logistic regression and the SVM with a radial kernel achieve similar fits to the data. The SVM performs best on the test set, with an accuracy of 93% compared to 85% from logistic regression.
The linear support vector classifier achieves poor results because the separation between classes is far from linear.
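For a closer look at the errors, a minimal sketch (assuming model is still the radial-kernel SVM fitted in (h)) that tabulates the confusion matrix on the held-out observations:

# Confusion matrix on the held-out half of the data
y_pred = model.predict(X[~train])
print(confusion_matrix(y[~train], y_pred))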
(a) Generate two-class data with p = 2 in such a way that the classes are just barely linearly separable.
# Generate sample data
X, _ = make_blobs(n_samples=200, centers=1, n_features=2, random_state=0)
y = np.zeros(len(X))
y[np.sum(X, axis=1) > 5] = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)
# Scale data
X_train = preprocessing.scale(X_train)
X_test = preprocessing.scale(X_test)
# Plot data
plot_df = pd.DataFrame({'x1':X_train[:,0], 'x2':X_train[:,1], 'y':y_train})
plt.figure(figsize=(10, 10))
sns.scatterplot(x='x1', y='x2', hue='y', data=plot_df)
plt.show();
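As a rough check that the classes really are barely separable, a sketch that fits a linear SVC with a very large cost and counts training misclassifications (a separable data set should give zero, or close to it):

# A near-hard-margin linear SVC should make few training errors if the classes are separable
check = svm.SVC(kernel='linear', C=1e6).fit(X_train, y_train)
print(f'training misclassifications: {np.sum(check.predict(X_train) != y_train)}')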
(b) Compute the cross-validation error rates for support vector classifiers with a range of cost values. How many training errors are misclassified for each value of cost considered, and how does this relate to the cross-validation errors obtained?
(c) Generate an appropriate test data set, and compute the test errors corresponding to each of the values of cost considered. Which value of cost leads to the fewest test errors, and how does this compare to the values of cost that yield the fewest training errors and the fewest cross-validation errors?
costs = np.logspace(-5, 10, 10)
scores = []
for i in costs:
    # Get cv score
    model = svm.SVC(kernel='linear', C=i, random_state=0)
    train_score = np.mean(cross_val_score(model, X_train, y_train, cv=5))
    # Get test score
    model = svm.SVC(kernel='linear', C=i, random_state=0).fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    scores += [[i, train_score, test_score]]
columns = ['Cost', 'train_accuracy', 'test_accuracy']
results_df = pd.DataFrame(data=np.asarray(scores), columns=columns)
display(results_df)
| | Cost | train_accuracy | test_accuracy |
|---|---|---|---|
0 | 1.000000e-05 | 0.530050 | 0.57 |
1 | 4.641589e-04 | 0.530050 | 0.57 |
2 | 2.154435e-02 | 0.928847 | 1.00 |
3 | 1.000000e+00 | 0.940376 | 0.94 |
4 | 4.641589e+01 | 0.969950 | 0.96 |
5 | 2.154435e+03 | 1.000000 | 0.97 |
6 | 1.000000e+05 | 1.000000 | 0.97 |
7 | 4.641589e+06 | 1.000000 | 0.97 |
8 | 2.154435e+08 | 1.000000 | 0.97 |
9 | 1.000000e+10 | 1.000000 | 0.97 |
results_df['log(Cost)'] = np.log(results_df['Cost'])
plt.figure(figsize=(10,10))
sns.lineplot(x='log(Cost)', y='train_accuracy', data=results_df, label='train')
sns.lineplot(x='log(Cost)', y='test_accuracy', data=results_df, label='test')
plt.ylabel('accuracy')
(d) Discuss your results.
Comment: For high values of cost the model achieves perfect training accuracy. The best test accuracy is also perfect, but it is achieved at a much lower cost value.
This suggests that the model starts to overfit as cost becomes too high.
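The question also asks for the raw number of misclassified training observations at each cost; a minimal sketch using the costs array defined above:

# Count misclassified training observations for each cost value
for c in costs:
    fitted = svm.SVC(kernel='linear', C=c, random_state=0).fit(X_train, y_train)
    errors = np.sum(fitted.predict(X_train) != y_train)
    print(f'C = {c:.0e}: {errors} training errors')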
(a) Create a binary variable that takes on a 1 for cars with gas mileage above the median, and a 0 for cars with gas mileage below the median.
auto_df = pd.read_csv('./data/Auto.csv')
# Remove observations with missing values (marked with '?')
auto_df = auto_df.drop(auto_df[auto_df.values == '?'].index)
# Convert quantitative columns to floats
quants = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year', 'origin']
auto_df[quants] = auto_df[quants].astype(np.float64)
# Binary response: 1 if mpg is above the median, else 0
auto_df['mpg_above_median'] = (auto_df['mpg'] > auto_df['mpg'].median()).astype(float)
auto_df.head()
| | mpg | cylinders | displacement | horsepower | weight | acceleration | year | origin | name | mpg_above_median |
|---|---|---|---|---|---|---|---|---|---|---|
0 | 18.0 | 8.0 | 307.0 | 130.0 | 3504.0 | 12.0 | 70.0 | 1.0 | chevrolet chevelle malibu | 0.0 |
1 | 15.0 | 8.0 | 350.0 | 165.0 | 3693.0 | 11.5 | 70.0 | 1.0 | buick skylark 320 | 0.0 |
2 | 18.0 | 8.0 | 318.0 | 150.0 | 3436.0 | 11.0 | 70.0 | 1.0 | plymouth satellite | 0.0 |
3 | 16.0 | 8.0 | 304.0 | 150.0 | 3433.0 | 12.0 | 70.0 | 1.0 | amc rebel sst | 0.0 |
4 | 17.0 | 8.0 | 302.0 | 140.0 | 3449.0 | 10.5 | 70.0 | 1.0 | ford torino | 0.0 |
(b) Fit a support vector classifier to the data with various values of cost, in order to predict whether a car gets high or low gas mileage. Report the cross-validation errors associated with different values of this parameter. Comment on your results.
f = 'mpg_above_median ~ cylinders + displacement + horsepower + weight + acceleration + year + C(origin)'
y, X = pt.dmatrices(f, auto_df)
# Scale data
X = preprocessing.scale(X)
y = np.ravel(y)
costs = np.logspace(-5, 2, 20)
scores = []
for i in costs:
    # Get cv score (X is already scaled above)
    model = svm.SVC(kernel='linear', C=i, random_state=0)
    score = np.mean(cross_val_score(model, X, y, cv=5))
    scores += [[i, score]]
columns = ['Cost', 'CV_accuracy']
results_df = pd.DataFrame(data=np.asarray(scores), columns=columns)
display(results_df)
| | Cost | CV_accuracy |
|---|---|---|
0 | 0.000010 | 0.787692 |
1 | 0.000023 | 0.787692 |
2 | 0.000055 | 0.787692 |
3 | 0.000127 | 0.787692 |
4 | 0.000298 | 0.787692 |
5 | 0.000695 | 0.818205 |
6 | 0.001624 | 0.877179 |
7 | 0.003793 | 0.882308 |
8 | 0.008859 | 0.902821 |
9 | 0.020691 | 0.895385 |
10 | 0.048329 | 0.887949 |
11 | 0.112884 | 0.870385 |
12 | 0.263665 | 0.862821 |
13 | 0.615848 | 0.842821 |
14 | 1.438450 | 0.855577 |
15 | 3.359818 | 0.855641 |
16 | 7.847600 | 0.855641 |
17 | 18.329807 | 0.853077 |
18 | 42.813324 | 0.853077 |
19 | 100.000000 | 0.853077 |
results_df['log(Cost)'] = np.log(results_df['Cost'])
plt.figure(figsize=(10,10))
sns.lineplot(x='log(Cost)', y='CV_accuracy', data=results_df);
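To pick out the winning cost programmatically, a one-line sketch over the results_df built above:

# The cost value with the highest cross-validated accuracy
print(results_df.loc[results_df['CV_accuracy'].idxmax()])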
(c) Now repeat (b), this time using SVMs with radial and polynomial basis kernels, with different values of gamma and degree and cost. Comment on your results.
# Grid search for best classifier
# https://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html#sphx-glr-auto-examples-svm-plot-rbf-parameters-py
# ----------------------------------------------------------------
C_range = np.logspace(-5, 3, 10)
gamma_range = np.logspace(-5, 3, 10)
kernels = ['rbf', 'poly']
degrees = [3, 5, 7, 9]  # odd values only; I noticed earlier that even degrees are slow!
# Note: degree only affects the polynomial kernel, so rbf entries repeat across degrees
param_grid = dict(gamma=gamma_range, C=C_range, kernel=kernels, degree=degrees)
rbf_grid = GridSearchCV(svm.SVC(cache_size=2000), param_grid=param_grid, cv=5,
scoring='accuracy', return_train_score=True)
rbf_grid.fit(X, y)
pd.DataFrame(rbf_grid.cv_results_).sort_values('rank_test_score', ascending=True)
| | mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_C | param_degree | param_gamma | param_kernel | params | split0_test_score | ... | mean_test_score | std_test_score | rank_test_score | split0_train_score | split1_train_score | split2_train_score | split3_train_score | split4_train_score | mean_train_score | std_train_score |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
486 | 0.001242 | 0.000037 | 0.000400 | 0.000011 | 2.15443 | 3 | 0.00464159 | rbf | {'C': 2.154434690031882, 'degree': 3, 'gamma':... | 0.9375 | ... | 0.903061 | 0.028045 | 1 | 0.916667 | 0.907643 | 0.923567 | 0.910828 | 0.917197 | 0.915180 | 0.005520 |
506 | 0.001252 | 0.000034 | 0.000397 | 0.000007 | 2.15443 | 5 | 0.00464159 | rbf | {'C': 2.154434690031882, 'degree': 5, 'gamma':... | 0.9375 | ... | 0.903061 | 0.028045 | 1 | 0.916667 | 0.907643 | 0.923567 | 0.910828 | 0.917197 | 0.915180 | 0.005520 |
526 | 0.001246 | 0.000037 | 0.000397 | 0.000006 | 2.15443 | 7 | 0.00464159 | rbf | {'C': 2.154434690031882, 'degree': 7, 'gamma':... | 0.9375 | ... | 0.903061 | 0.028045 | 1 | 0.916667 | 0.907643 | 0.923567 | 0.910828 | 0.917197 | 0.915180 | 0.005520 |
546 | 0.001248 | 0.000037 | 0.000395 | 0.000007 | 2.15443 | 9 | 0.00464159 | rbf | {'C': 2.154434690031882, 'degree': 9, 'gamma':... | 0.9375 | ... | 0.903061 | 0.028045 | 1 | 0.916667 | 0.907643 | 0.923567 | 0.910828 | 0.917197 | 0.915180 | 0.005520 |
330 | 0.002271 | 0.000100 | 0.000599 | 0.000010 | 0.0359381 | 3 | 0.278256 | rbf | {'C': 0.03593813663804626, 'degree': 3, 'gamma... | 0.9500 | ... | 0.900510 | 0.033323 | 5 | 0.900641 | 0.910828 | 0.917197 | 0.910828 | 0.914013 | 0.910701 | 0.005557 |
408 | 0.001312 | 0.000038 | 0.000409 | 0.000010 | 0.278256 | 3 | 0.0359381 | rbf | {'C': 0.2782559402207126, 'degree': 3, 'gamma'... | 0.9500 | ... | 0.900510 | 0.037041 | 5 | 0.913462 | 0.907643 | 0.920382 | 0.910828 | 0.914013 | 0.913266 | 0.004214 |
428 | 0.001307 | 0.000042 | 0.000407 | 0.000011 | 0.278256 | 5 | 0.0359381 | rbf | {'C': 0.2782559402207126, 'degree': 5, 'gamma'... | 0.9500 | ... | 0.900510 | 0.037041 | 5 | 0.913462 | 0.907643 | 0.920382 | 0.910828 | 0.914013 | 0.913266 | 0.004214 |
448 | 0.001330 | 0.000028 | 0.000407 | 0.000010 | 0.278256 | 7 | 0.0359381 | rbf | {'C': 0.2782559402207126, 'degree': 7, 'gamma'... | 0.9500 | ... | 0.900510 | 0.037041 | 5 | 0.913462 | 0.907643 | 0.920382 | 0.910828 | 0.914013 | 0.913266 | 0.004214 |
390 | 0.002348 | 0.000127 | 0.000616 | 0.000029 | 0.0359381 | 9 | 0.278256 | rbf | {'C': 0.03593813663804626, 'degree': 9, 'gamma... | 0.9500 | ... | 0.900510 | 0.033323 | 5 | 0.900641 | 0.910828 | 0.917197 | 0.910828 | 0.914013 | 0.910701 | 0.005557 |
468 | 0.001374 | 0.000165 | 0.000407 | 0.000010 | 0.278256 | 9 | 0.0359381 | rbf | {'C': 0.2782559402207126, 'degree': 9, 'gamma'... | 0.9500 | ... | 0.900510 | 0.037041 | 5 | 0.913462 | 0.907643 | 0.920382 | 0.910828 | 0.914013 | 0.913266 | 0.004214 |
350 | 0.002316 | 0.000052 | 0.000604 | 0.000010 | 0.0359381 | 5 | 0.278256 | rbf | {'C': 0.03593813663804626, 'degree': 5, 'gamma... | 0.9500 | ... | 0.900510 | 0.033323 | 5 | 0.900641 | 0.910828 | 0.917197 | 0.910828 | 0.914013 | 0.910701 | 0.005557 |
370 | 0.002351 | 0.000176 | 0.000696 | 0.000184 | 0.0359381 | 7 | 0.278256 | rbf | {'C': 0.03593813663804626, 'degree': 7, 'gamma... | 0.9500 | ... | 0.900510 | 0.033323 | 5 | 0.900641 | 0.910828 | 0.917197 | 0.910828 | 0.914013 | 0.910701 | 0.005557 |
702 | 0.001422 | 0.000192 | 0.000449 | 0.000062 | 129.155 | 9 | 7.74264e-05 | rbf | {'C': 129.15496650148827, 'degree': 9, 'gamma'... | 0.9125 | ... | 0.897959 | 0.023165 | 13 | 0.913462 | 0.907643 | 0.923567 | 0.910828 | 0.917197 | 0.914539 | 0.005496 |
624 | 0.001270 | 0.000029 | 0.000403 | 0.000007 | 16.681 | 9 | 0.000599484 | rbf | {'C': 16.681005372000556, 'degree': 9, 'gamma'... | 0.9125 | ... | 0.897959 | 0.023165 | 13 | 0.916667 | 0.907643 | 0.923567 | 0.910828 | 0.917197 | 0.915180 | 0.005520 |
642 | 0.001263 | 0.000030 | 0.000397 | 0.000008 | 129.155 | 3 | 7.74264e-05 | rbf | {'C': 129.15496650148827, 'degree': 3, 'gamma'... | 0.9125 | ... | 0.897959 | 0.023165 | 13 | 0.913462 | 0.907643 | 0.923567 | 0.910828 | 0.917197 | 0.914539 | 0.005496 |
604 | 0.001265 | 0.000031 | 0.000395 | 0.000007 | 16.681 | 7 | 0.000599484 | rbf | {'C': 16.681005372000556, 'degree': 7, 'gamma'... | 0.9125 | ... | 0.897959 | 0.023165 | 13 | 0.916667 | 0.907643 | 0.923567 | 0.910828 | 0.917197 | 0.915180 | 0.005520 |
760 | 0.001263 | 0.000026 | 0.000398 | 0.000011 | 1000 | 7 | 1e-05 | rbf | {'C': 1000.0, 'degree': 7, 'gamma': 1e-05, 'ke... | 0.9125 | ... | 0.897959 | 0.023165 | 13 | 0.913462 | 0.907643 | 0.923567 | 0.910828 | 0.917197 | 0.914539 | 0.005496 |
740 | 0.001297 | 0.000044 | 0.000407 | 0.000015 | 1000 | 5 | 1e-05 | rbf | {'C': 1000.0, 'degree': 5, 'gamma': 1e-05, 'ke... | 0.9125 | ... | 0.897959 | 0.023165 | 13 | 0.913462 | 0.907643 | 0.923567 | 0.910828 | 0.917197 | 0.914539 | 0.005496 |
682 | 0.001276 | 0.000050 | 0.000399 | 0.000009 | 129.155 | 7 | 7.74264e-05 | rbf | {'C': 129.15496650148827, 'degree': 7, 'gamma'... | 0.9125 | ... | 0.897959 | 0.023165 | 13 | 0.913462 | 0.907643 | 0.923567 | 0.910828 | 0.917197 | 0.914539 | 0.005496 |
720 | 0.001266 | 0.000035 | 0.000408 | 0.000012 | 1000 | 3 | 1e-05 | rbf | {'C': 1000.0, 'degree': 3, 'gamma': 1e-05, 'ke... | 0.9125 | ... | 0.897959 | 0.023165 | 13 | 0.913462 | 0.907643 | 0.923567 | 0.910828 | 0.917197 | 0.914539 | 0.005496 |
780 | 0.001257 | 0.000020 | 0.000394 | 0.000007 | 1000 | 9 | 1e-05 | rbf | {'C': 1000.0, 'degree': 9, 'gamma': 1e-05, 'ke... | 0.9125 | ... | 0.897959 | 0.023165 | 13 | 0.913462 | 0.907643 | 0.923567 | 0.910828 | 0.917197 | 0.914539 | 0.005496 |
584 | 0.001263 | 0.000024 | 0.000396 | 0.000010 | 16.681 | 5 | 0.000599484 | rbf | {'C': 16.681005372000556, 'degree': 5, 'gamma'... | 0.9125 | ... | 0.897959 | 0.023165 | 13 | 0.916667 | 0.907643 | 0.923567 | 0.910828 | 0.917197 | 0.915180 | 0.005520 |
662 | 0.001275 | 0.000044 | 0.000401 | 0.000015 | 129.155 | 5 | 7.74264e-05 | rbf | {'C': 129.15496650148827, 'degree': 5, 'gamma'... | 0.9125 | ... | 0.897959 | 0.023165 | 13 | 0.913462 | 0.907643 | 0.923567 | 0.910828 | 0.917197 | 0.914539 | 0.005496 |
564 | 0.001268 | 0.000027 | 0.000432 | 0.000061 | 16.681 | 3 | 0.000599484 | rbf | {'C': 16.681005372000556, 'degree': 3, 'gamma'... | 0.9125 | ... | 0.897959 | 0.023165 | 13 | 0.916667 | 0.907643 | 0.923567 | 0.910828 | 0.917197 | 0.915180 | 0.005520 |
430 | 0.001396 | 0.000041 | 0.000411 | 0.000010 | 0.278256 | 5 | 0.278256 | rbf | {'C': 0.2782559402207126, 'degree': 5, 'gamma'... | 0.9000 | ... | 0.895408 | 0.022086 | 25 | 0.919872 | 0.920382 | 0.923567 | 0.914013 | 0.914013 | 0.918369 | 0.003776 |
410 | 0.001382 | 0.000046 | 0.000408 | 0.000011 | 0.278256 | 3 | 0.278256 | rbf | {'C': 0.2782559402207126, 'degree': 3, 'gamma'... | 0.9000 | ... | 0.895408 | 0.022086 | 25 | 0.919872 | 0.920382 | 0.923567 | 0.914013 | 0.914013 | 0.918369 | 0.003776 |
450 | 0.001389 | 0.000064 | 0.000406 | 0.000008 | 0.278256 | 7 | 0.278256 | rbf | {'C': 0.2782559402207126, 'degree': 7, 'gamma'... | 0.9000 | ... | 0.895408 | 0.022086 | 25 | 0.919872 | 0.920382 | 0.923567 | 0.914013 | 0.914013 | 0.918369 | 0.003776 |
470 | 0.001390 | 0.000047 | 0.000407 | 0.000005 | 0.278256 | 9 | 0.278256 | rbf | {'C': 0.2782559402207126, 'degree': 9, 'gamma'... | 0.9000 | ... | 0.895408 | 0.022086 | 25 | 0.919872 | 0.920382 | 0.923567 | 0.914013 | 0.914013 | 0.918369 | 0.003776 |
250 | 0.002605 | 0.000087 | 0.000671 | 0.000012 | 0.00464159 | 3 | 0.278256 | rbf | {'C': 0.004641588833612777, 'degree': 3, 'gamm... | 0.9375 | ... | 0.887755 | 0.049905 | 29 | 0.894231 | 0.907643 | 0.914013 | 0.898089 | 0.910828 | 0.904961 | 0.007562 |
190 | 0.002529 | 0.000011 | 0.000657 | 0.000004 | 0.000599484 | 5 | 0.278256 | rbf | {'C': 0.0005994842503189409, 'degree': 5, 'gam... | 0.9375 | ... | 0.887755 | 0.049905 | 29 | 0.894231 | 0.907643 | 0.914013 | 0.898089 | 0.910828 | 0.904961 | 0.007562 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
76 | 0.002456 | 0.000026 | 0.000636 | 0.000013 | 1e-05 | 9 | 129.155 | rbf | {'C': 1e-05, 'degree': 9, 'gamma': 129.1549665... | 0.5000 | ... | 0.505102 | 0.006275 | 721 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
716 | 0.003039 | 0.000187 | 0.000644 | 0.000008 | 129.155 | 9 | 129.155 | rbf | {'C': 129.15496650148827, 'degree': 9, 'gamma'... | 0.5250 | ... | 0.505102 | 0.012920 | 721 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
136 | 0.002508 | 0.000059 | 0.000649 | 0.000012 | 7.74264e-05 | 7 | 129.155 | rbf | {'C': 7.742636826811278e-05, 'degree': 7, 'gam... | 0.5000 | ... | 0.505102 | 0.006275 | 721 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
416 | 0.002479 | 0.000040 | 0.000642 | 0.000009 | 0.278256 | 3 | 129.155 | rbf | {'C': 0.2782559402207126, 'degree': 3, 'gamma'... | 0.5000 | ... | 0.505102 | 0.006275 | 721 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
638 | 0.001888 | 0.000037 | 0.000503 | 0.000008 | 16.681 | 9 | 1000 | rbf | {'C': 16.681005372000556, 'degree': 9, 'gamma'... | 0.5000 | ... | 0.505102 | 0.006275 | 721 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
696 | 0.003163 | 0.000313 | 0.000679 | 0.000058 | 129.155 | 7 | 129.155 | rbf | {'C': 129.15496650148827, 'degree': 7, 'gamma'... | 0.5250 | ... | 0.505102 | 0.012920 | 721 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
258 | 0.001855 | 0.000016 | 0.000499 | 0.000003 | 0.00464159 | 3 | 1000 | rbf | {'C': 0.004641588833612777, 'degree': 3, 'gamm... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
398 | 0.002007 | 0.000159 | 0.000569 | 0.000112 | 0.0359381 | 9 | 1000 | rbf | {'C': 0.03593813663804626, 'degree': 9, 'gamma... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
278 | 0.001857 | 0.000014 | 0.000500 | 0.000003 | 0.00464159 | 5 | 1000 | rbf | {'C': 0.004641588833612777, 'degree': 5, 'gamm... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
298 | 0.001931 | 0.000082 | 0.000525 | 0.000026 | 0.00464159 | 7 | 1000 | rbf | {'C': 0.004641588833612777, 'degree': 7, 'gamm... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
358 | 0.001850 | 0.000014 | 0.000507 | 0.000010 | 0.0359381 | 5 | 1000 | rbf | {'C': 0.03593813663804626, 'degree': 5, 'gamma... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
478 | 0.001872 | 0.000028 | 0.000504 | 0.000008 | 0.278256 | 9 | 1000 | rbf | {'C': 0.2782559402207126, 'degree': 9, 'gamma'... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
458 | 0.001870 | 0.000012 | 0.000505 | 0.000009 | 0.278256 | 7 | 1000 | rbf | {'C': 0.2782559402207126, 'degree': 7, 'gamma'... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
78 | 0.001847 | 0.000011 | 0.000500 | 0.000003 | 1e-05 | 9 | 1000 | rbf | {'C': 1e-05, 'degree': 9, 'gamma': 1000.0, 'ke... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
418 | 0.001872 | 0.000025 | 0.000507 | 0.000014 | 0.278256 | 3 | 1000 | rbf | {'C': 0.2782559402207126, 'degree': 3, 'gamma'... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
98 | 0.001855 | 0.000032 | 0.000502 | 0.000006 | 7.74264e-05 | 3 | 1000 | rbf | {'C': 7.742636826811278e-05, 'degree': 3, 'gam... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
218 | 0.001866 | 0.000018 | 0.000517 | 0.000030 | 0.000599484 | 7 | 1000 | rbf | {'C': 0.0005994842503189409, 'degree': 7, 'gam... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
118 | 0.001847 | 0.000008 | 0.000501 | 0.000007 | 7.74264e-05 | 5 | 1000 | rbf | {'C': 7.742636826811278e-05, 'degree': 5, 'gam... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
18 | 0.002360 | 0.000248 | 0.000746 | 0.000191 | 1e-05 | 3 | 1000 | rbf | {'C': 1e-05, 'degree': 3, 'gamma': 1000.0, 'ke... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
438 | 0.001889 | 0.000029 | 0.000511 | 0.000015 | 0.278256 | 5 | 1000 | rbf | {'C': 0.2782559402207126, 'degree': 5, 'gamma'... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
338 | 0.002340 | 0.000443 | 0.000928 | 0.000450 | 0.0359381 | 3 | 1000 | rbf | {'C': 0.03593813663804626, 'degree': 3, 'gamma... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
318 | 0.001853 | 0.000016 | 0.000504 | 0.000009 | 0.00464159 | 9 | 1000 | rbf | {'C': 0.004641588833612777, 'degree': 9, 'gamm... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
378 | 0.001885 | 0.000015 | 0.000503 | 0.000005 | 0.0359381 | 7 | 1000 | rbf | {'C': 0.03593813663804626, 'degree': 7, 'gamma... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
38 | 0.001926 | 0.000097 | 0.000523 | 0.000023 | 1e-05 | 5 | 1000 | rbf | {'C': 1e-05, 'degree': 5, 'gamma': 1000.0, 'ke... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
58 | 0.001898 | 0.000036 | 0.000534 | 0.000045 | 1e-05 | 7 | 1000 | rbf | {'C': 1e-05, 'degree': 7, 'gamma': 1000.0, 'ke... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
178 | 0.001868 | 0.000021 | 0.000510 | 0.000010 | 0.000599484 | 3 | 1000 | rbf | {'C': 0.0005994842503189409, 'degree': 3, 'gam... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
138 | 0.001881 | 0.000021 | 0.000504 | 0.000011 | 7.74264e-05 | 7 | 1000 | rbf | {'C': 7.742636826811278e-05, 'degree': 7, 'gam... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
158 | 0.001846 | 0.000012 | 0.000504 | 0.000011 | 7.74264e-05 | 9 | 1000 | rbf | {'C': 7.742636826811278e-05, 'degree': 9, 'gam... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
238 | 0.001850 | 0.000004 | 0.000502 | 0.000008 | 0.000599484 | 9 | 1000 | rbf | {'C': 0.0005994842503189409, 'degree': 9, 'gam... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
198 | 0.001851 | 0.000013 | 0.000510 | 0.000019 | 0.000599484 | 5 | 1000 | rbf | {'C': 0.0005994842503189409, 'degree': 5, 'gam... | 0.5000 | ... | 0.500000 | 0.000000 | 777 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
800 rows × 24 columns
Comment:
The table above lists the models tested by the grid search, ranked by mean cross-validation test score.
A radial kernel performs best, with a mean CV accuracy of 0.903, which is the same as the best accuracy achieved by the linear classifier in (b) (to 3 d.p.).
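The winning configuration can also be read directly off the fitted grid-search object, as in this short sketch:

# Best parameter combination and its mean CV accuracy
print(rbf_grid.best_params_)
print(f'best mean CV accuracy: {rbf_grid.best_score_:.3f}')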
oj_df = pd.read_csv('./data/OJ.csv', index_col=0)
oj_df.head()
| | Purchase | WeekofPurchase | StoreID | PriceCH | PriceMM | DiscCH | DiscMM | SpecialCH | SpecialMM | LoyalCH | SalePriceMM | SalePriceCH | PriceDiff | Store7 | PctDiscMM | PctDiscCH | ListPriceDiff | STORE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | CH | 237 | 1 | 1.75 | 1.99 | 0.00 | 0.0 | 0 | 0 | 0.500000 | 1.99 | 1.75 | 0.24 | No | 0.000000 | 0.000000 | 0.24 | 1 |
2 | CH | 239 | 1 | 1.75 | 1.99 | 0.00 | 0.3 | 0 | 1 | 0.600000 | 1.69 | 1.75 | -0.06 | No | 0.150754 | 0.000000 | 0.24 | 1 |
3 | CH | 245 | 1 | 1.86 | 2.09 | 0.17 | 0.0 | 0 | 0 | 0.680000 | 2.09 | 1.69 | 0.40 | No | 0.000000 | 0.091398 | 0.23 | 1 |
4 | MM | 227 | 1 | 1.69 | 1.69 | 0.00 | 0.0 | 0 | 0 | 0.400000 | 1.69 | 1.69 | 0.00 | No | 0.000000 | 0.000000 | 0.00 | 1 |
5 | CH | 228 | 7 | 1.69 | 1.69 | 0.00 | 0.0 | 0 | 0 | 0.956535 | 1.69 | 1.69 | 0.00 | Yes | 0.000000 | 0.000000 | 0.00 | 0 |
(a) Create a training set containing a random sample of 800 observations, and a test set containing the remaining observations.
preds = "+".join(oj_df.columns.drop("Purchase"))
f = f'Purchase ~ {preds}'
y, X = pt.dmatrices(f, oj_df)
y = y[:, 0]
# scale data
X = preprocessing.scale(X)
# Split training test sets
test_size = len(y)-800
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=1)
(b) Fit a support vector classifier to the training data using cost=0.01, with Purchase as the response and the other variables as predictors. Use the summary() function to produce summary statistics, and describe the results obtained.
model = svm.LinearSVC(C=0.1, random_state=0).fit(X_train, y_train)  # note: C=0.1 here rather than the suggested 0.01
# Label each coefficient with its design-matrix column name (dropping the
# intercept), so the dummy-coded Store7 column lands in the right place
feature_names = pt.dmatrices(f, oj_df)[1].design_info.column_names[1:]
df = pd.DataFrame({'feature': feature_names,
                   'coef': np.ravel(model.coef_)[1:]})
plt.figure(figsize=(10,6))
sns.barplot(x='coef', y='feature', data=df);
(c) What are the training and test error rates?
accuracy_train = model.score(X_train, y_train)
accuracy_test = model.score(X_test, y_test)
print(f'train accuracy: {accuracy_train:.3f}')
print(f'test accuracy : {accuracy_test:.3f}')
train accuracy: 0.840
test accuracy : 0.830
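Since the question asks for error rates rather than accuracies, note that the error rate is simply one minus the accuracy:

# Error rates from the accuracies computed above
print(f'train error: {1 - accuracy_train:.3f}')
print(f'test error : {1 - accuracy_test:.3f}')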
(d) Use the tune() function to select an optimal cost. Consider values in the range 0.01 to 10.
C_range = np.logspace(-2, 1, 10)
param_grid = dict(C=C_range)
model_grid = GridSearchCV(svm.LinearSVC(), param_grid=param_grid, cv=5,
scoring='accuracy', return_train_score=True)
model_grid.fit(X, y)
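The selected cost can be read off the fitted grid; a minimal sketch:

# Cost value selected by cross-validation, and its mean CV accuracy
print(model_grid.best_params_)
print(f'best mean CV accuracy: {model_grid.best_score_:.3f}')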
(e) Compute the training and test error rates using this new value for cost.
accuracy_train = model_grid.score(X_train, y_train)
accuracy_test = model_grid.score(X_test, y_test)
print(f'train accuracy: {accuracy_train:.3f}')
print(f'test accuracy : {accuracy_test:.3f}')
train accuracy: 0.835
test accuracy : 0.841
(f) Repeat parts (b) through (e) using a support vector machine with a radial kernel. Use the default value for gamma.
C_range = np.logspace(-2, 1, 10)
param_grid = dict(C=C_range)
model_grid = GridSearchCV(svm.SVC(kernel='rbf'), param_grid=param_grid, cv=5,
scoring='accuracy', return_train_score=True)
model_grid.fit(X, y)
accuracy_train = model_grid.score(X_train, y_train)
accuracy_test = model_grid.score(X_test, y_test)
print(f'train accuracy: {accuracy_train:.3f}')
print(f'test accuracy : {accuracy_test:.3f}')
train accuracy: 0.849
test accuracy : 0.848
(g) Repeat parts (b) through (e) using a support vector machine with a polynomial kernel. Set degree=2.
C_range = np.logspace(-2, 1, 10)
param_grid = dict(C=C_range)
model_grid = GridSearchCV(svm.SVC(kernel='poly', degree=2), param_grid=param_grid, cv=5,
scoring='accuracy', return_train_score=True)
model_grid.fit(X, y)
accuracy_train = model_grid.score(X_train, y_train)
accuracy_test = model_grid.score(X_test, y_test)
print(f'train accuracy: {accuracy_train:.3f}')
print(f'test accuracy : {accuracy_test:.3f}')
train accuracy: 0.804
test accuracy : 0.767
(h) Overall, which approach seems to give the best results on this data?
Overall, the support vector machine with a radial kernel and cost chosen by grid search gives the best results, with a test accuracy of 0.848.