Support Vector Machines

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
%matplotlib inline

Linear SVM Classifier

We generate some data in two dimensions and shift one class so that the two classes are somewhat separated.

Code for the plot with the hyperplane and support margins was largely copied from here.

In [8]:
X = np.random.randn(20, 2)
y = np.concatenate((np.ones(10)*-1, np.ones(10)))
X[y == 1] = X[y == 1] + 1
plt.scatter(X[y == 1][:, 0], X[y == 1][:, 1], marker='o', color='red')
plt.scatter(X[y == -1][:, 0], X[y == -1][:, 1], marker='o', color='blue')
plt.xlabel("X[:,0]")
plt.ylabel("X[:,1]")
Out[8]:
<matplotlib.text.Text at 0x5bf0410>
In [11]:
# Fit the Support Vector Regressor with a linear kernel
reg = SVR(C=10.0, kernel="linear")
reg.fit(X, y)

# plot the hyperplane on the scatterplot
plt.scatter(X[y == 1][:, 0], X[y == 1][:, 1], marker='o', color='red')
plt.scatter(X[y == -1][:, 0], X[y == -1][:, 1], marker='o', color='blue')
plt.xlabel("X[:,0]")
plt.ylabel("X[:,1]")
# hyperplane
xs = range(int(np.min(X[:,0]))-1, int(np.max(X[:,0]))+1)
# the separating line satisfies w0*x0 + w1*x1 + b = 0, so x1 = -(w0/w1)*x0 - b/w1
m = -reg.coef_[0][0] / reg.coef_[0][1]
c = -reg.intercept_[0] / reg.coef_[0][1]
ys = [m * x + c for x in xs]
plt.plot(xs, ys, color='black', linewidth=2.5)
# support margins
sup = reg.support_vectors_[0]
ys_down = [m * x + (sup[1] - m * sup[0]) for x in xs]
sup = reg.support_vectors_[-1]
ys_up = [m * x + (sup[1] - m * sup[0]) for x in xs]
plt.plot(xs, ys_down, 'k--')
plt.plot(xs, ys_up, 'k--')
Out[11]:
[<matplotlib.lines.Line2D at 0x66b8d90>]
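The slope and intercept of the line come straight from the fitted model, since the separating line satisfies w0*x0 + w1*x1 + b = 0. A minimal sketch (reusing the reg object fitted above) that prints these quantities along with the support vectors:

# Inspect the fitted linear SVR: coef_ holds (w0, w1), intercept_ holds b
print("coefficients:", reg.coef_[0])
print("intercept:", reg.intercept_[0])
print("support vectors:\n", reg.support_vectors_)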
In [12]:
ypred = reg.predict(X)
mean_squared_error(y, ypred)
Out[12]:
0.48715370910430711
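Since the labels are ±1, the sign of the regression output can also be read as a class prediction. A small sketch (assuming reg, X and y from the cells above) of the resulting training misclassification rate:

# Classify by the sign of the SVR output and compare with the true +/-1 labels
train_err = np.mean(np.sign(reg.predict(X)) != y)
print("training misclassification rate:", train_err)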
In [13]:
# We use cross-validation to find a good value for C
cs = range(1, 20)
mses = []
for c in cs:
    kfold = KFold(n_splits=10)
    kmses = []
    for train, test in kfold.split(X):
        Xtrain, Xtest, ytrain, ytest = X[train], X[test], y[train], y[test]
        reg = SVR(C=c, kernel="linear")
        reg.fit(Xtrain, ytrain)
        ypred = reg.predict(Xtest)
        kmses.append(mean_squared_error(ytest, ypred))
    mses.append(np.mean(kmses))
plt.plot(cs, mses, linewidth=2.5)
plt.xlabel("C")
plt.ylabel("MSE")
Out[13]:
<matplotlib.text.Text at 0x6415650>
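The same search can be written more compactly with scikit-learn's GridSearchCV, which runs the k-fold loop internally. A minimal sketch, assuming the imports above and X, y from the earlier cells:

from sklearn.model_selection import GridSearchCV

# 10-fold CV over the same grid of C values, scored by (negated) mean squared error
grid = GridSearchCV(SVR(kernel="linear"),
                    param_grid={"C": list(range(1, 20))},
                    cv=10, scoring="neg_mean_squared_error")
grid.fit(X, y)
print("best C:", grid.best_params_["C"])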

Nonlinear SVM

In [6]:
# Data comes from http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/ESL.mixture.rda
# load(url("http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/ESL.mixture.rda"))
# dat = data.frame(y=factor(y), x)
# write.csv(dat, "ESLMixture.csv", row.names=FALSE)
eslmix_df = pd.read_csv("../data/ESLMixture.csv")
eslmix_df.head()
Out[6]:
   y        X1        X2
0  0  2.526093  0.321050
1  0  0.366954  0.031462
2  0  0.768219  0.717486
3  0  0.693436  0.777194
4  0 -0.019837  0.867254

5 rows × 3 columns
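Before modeling, a quick check that the two classes are balanced can be useful; a one-line sketch using the dataframe above:

# Count how many rows belong to each class
print(eslmix_df["y"].value_counts())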

In [7]:
# Plot the data to visualize the relationship
x1 = eslmix_df[eslmix_df["y"] == 0]["X1"]
x2 = eslmix_df[eslmix_df["y"] == 0]["X2"]
plt.scatter(x1, x2, marker='o', color='red')
x1 = eslmix_df[eslmix_df["y"] == 1]["X1"]
x2 = eslmix_df[eslmix_df["y"] == 1]["X2"]
plt.scatter(x1, x2, marker='o', color='blue')
plt.xlabel("X1")
plt.ylabel("X1")
Out[7]:
<matplotlib.text.Text at 0x55c2e10>
In [8]:
# Fit a non-linear SVM using a radial kernel
X = eslmix_df[["X1", "X2"]].values
y = eslmix_df["y"]
reg = SVR(C=5.0, kernel="rbf")
reg.fit(X, y)

# plot the fitted decision boundary on the scatterplot
xx, yy = np.meshgrid(np.linspace(-3, 5, 50), np.linspace(-2, 3, 50))
Z = reg.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
# draw the contour where the regressor predicts 0.5, the midpoint of the 0/1 labels
plt.contour(xx, yy, Z, levels=[0.5], linewidths=2, linestyles="--")

x1 = eslmix_df[eslmix_df["y"] == 0]["X1"]
x2 = eslmix_df[eslmix_df["y"] == 0]["X2"]
plt.scatter(x1, x2, marker='o', color='red')
x1 = eslmix_df[eslmix_df["y"] == 1]["X1"]
x2 = eslmix_df[eslmix_df["y"] == 1]["X2"]
plt.scatter(x1, x2, marker='o', color='blue')
plt.xlabel("X1")
plt.ylabel("X1")
Out[8]:
<matplotlib.text.Text at 0x53e8ed0>

The decision boundary produced by the radial (RBF) kernel is non-linear and follows the shape of the two classes much more closely than a linear hyperplane could.

Code for the plot above was copied from here.
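As a rough check of the claim that the radial kernel does better here than a linear one, the two fits can be compared by thresholding their predictions at 0.5 (the midpoint between the 0/1 labels). This is only a sketch on the training data, assuming X and y from the cells above:

# Fraction of training points on the correct side of each boundary
for name, kernel in [("linear", "linear"), ("rbf", "rbf")]:
    model = SVR(C=5.0, kernel=kernel).fit(X, y)
    acc = np.mean((model.predict(X) > 0.5).astype(int) == y)
    print(name, "training accuracy:", acc)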