## Programming Exercise 6 - Support Vector Machines

In [124]:
# %load ../../../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

from scipy.io import loadmat
from sklearn.svm import SVC

# Notebook-wide pandas display settings: plain-text (non-HTML) reprs,
# no limit on displayed columns or sequence items, and up to 150 rows.
pd.options.display.notebook_repr_html = False
pd.options.display.max_columns = None
pd.options.display.max_rows = 150
pd.options.display.max_seq_items = None

#%config InlineBackend.figure_formats = {'pdf',}
%matplotlib inline

import seaborn as sns
sns.set_context('notebook')
sns.set_style('white')

In [75]:
def plotData(X, y):
    """Scatter-plot a two-class data set with matplotlib's pyplot interface.

    Rows of ``X`` whose label in ``y`` equals 1 are drawn as black ``+``
    markers, rows labelled 0 as yellow circles. Only the first two columns
    of ``X`` are plotted.
    """
    labels = y.ravel()
    # (label value, colour, marker) for each class, positives drawn first.
    class_styles = ((1, 'k', '+'), (0, 'y', 'o'))
    for label, colour, marker in class_styles:
        mask = labels == label
        plt.scatter(X[mask, 0], X[mask, 1], s=60, c=colour, marker=marker, linewidths=1)

In [144]:
def plot_svc(svc, X, y, h=0.02, pad=0.25):
    """Plot the decision regions and support vectors of a fitted classifier.

    Parameters
    ----------
    svc : fitted classifier
        Must provide ``predict``, ``support_vectors_`` and ``support_``.
    X : array with at least two columns
        Feature matrix; the first two columns are used as plot coordinates.
    y : array
        Class labels, passed unchanged to ``plotData``.
    h : float, default 0.02
        Step size of the grid on which predictions are evaluated.
    pad : float, default 0.25
        Margin added around the data range on every side of the plot.

    The figure is shown immediately and the number of support vectors is
    printed afterwards.
    """
    # Plot window: the bounding box of the data grown by `pad` on each side.
    x_min, x_max = X[:, 0].min() - pad, X[:, 0].max() + pad
    y_min, y_max = X[:, 1].min() - pad, X[:, 1].max() + pad

    # Predict on a dense grid and shade the predicted class regions.
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.2)

    plotData(X, y)
    # Support vectors indicated in plot by vertical lines
    sv = svc.support_vectors_
    plt.scatter(sv[:, 0], sv[:, 1], c='k', marker='|', s=100, linewidths=1)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
    print('Number of support vectors: ', svc.support_.size)


### Support Vector Machines

#### Example Dataset 1

In [6]:
# Load example dataset 1; the last line displays the names stored in it.
data1 = loadmat('data/ex6data1.mat')
data1.keys()

Out[6]:
dict_keys(['y', '__version__', 'X', '__globals__', '__header__'])
In [67]:
# Unpack the feature matrix and the labels from the loaded data set.
X1, y1 = data1['X'], data1['y']

# Print both array shapes as a quick sanity check.
print('X1:', X1.shape)
print('y1:', y1.shape)

X1: (51, 2)
y1: (51, 1)

In [72]:
# Scatter plot of example dataset 1, one marker style per class.
plotData(X1,y1)

In [146]:
# Linear-kernel SVM with C=1.0 on example dataset 1; fit() returns the
# estimator itself, so construction and training are chained.
clf = SVC(C=1.0, kernel='linear').fit(X1, y1.ravel())
plot_svc(clf, X1, y1)

Number of support vectors:  12

In [145]:
# Re-train the same classifier with C=100 for comparison with the C=1.0 fit;
# set_params() returns the estimator, so the calls are chained.
clf.set_params(C=100).fit(X1, y1.ravel())
plot_svc(clf, X1, y1)

Number of support vectors:  3


### SVM with Gaussian Kernels

In [133]:
def gaussianKernel(x1, x2, sigma=2):
    """Gaussian (RBF) similarity between two vectors.

    Computes ``exp(-||x1 - x2||^2 / (2 * sigma^2))``.

    Parameters
    ----------
    x1, x2 : array-like
        Vectors of equal length. Lists, tuples and NumPy arrays are accepted;
        both are converted to float arrays, so plain Python sequences work
        and unsigned-integer arrays do not wrap around on subtraction.
    sigma : float, default 2
        Bandwidth of the kernel.

    Returns
    -------
    The kernel value: 1.0 for identical vectors, decreasing towards 0 as
    the vectors move apart.
    """
    diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    # Squared Euclidean distance, written as a dot product of the difference.
    norm = diff.T.dot(diff)
    return np.exp(-norm / (2 * sigma**2))

In [119]:
# Sanity-check gaussianKernel on a small hand-picked pair of 3-element vectors.
x1 = np.array([1, 2, 1])
x2 = np.array([0, 4, -1])
sigma = 2

# Last expression of the cell, so the notebook displays the kernel value.
gaussianKernel(x1, x2, sigma)

Out[119]:
0.32465246735834974

#### Example Dataset 2

In [140]:
# Load example dataset 2; the last line displays the names stored in it.
data2 = loadmat('data/ex6data2.mat')
data2.keys()

Out[140]:
dict_keys(['y', '__version__', 'X', '__globals__', '__header__'])
In [141]:
# Unpack the feature matrix and the labels from the loaded data set.
X2, y2 = data2['X'], data2['y']

# Print both array shapes as a quick sanity check.
print('X2:', X2.shape)
print('y2:', y2.shape)

X2: (863, 2)
y2: (863, 1)

In [142]:
# Scatter plot of example dataset 2, one marker style per class.
plotData(X2, y2)

In [166]:
# RBF-kernel SVM (C=50, gamma=6) trained on example dataset 2; fit() returns
# the estimator, so construction and training are chained.
clf2 = SVC(C=50, kernel='rbf', gamma=6).fit(X2, y2.ravel())
plot_svc(clf2, X2, y2)

Number of support vectors:  254


#### Example Dataset 3

In [77]:
# Load example dataset 3; the last line displays the names stored in it.
data3 = loadmat('data/ex6data3.mat')
data3.keys()

Out[77]:
dict_keys(['Xval', 'y', 'yval', '__version__', 'X', '__globals__', '__header__'])
In [79]:
# Unpack the feature matrix and the labels from the loaded data set.
X3, y3 = data3['X'], data3['y']

# Print both array shapes as a quick sanity check.
print('X3:', X3.shape)
print('y3:', y3.shape)

X3: (211, 2)
y3: (211, 1)

In [80]:
# Scatter plot of example dataset 3, one marker style per class.
plotData(X3, y3)

In [189]:
# Degree-3 polynomial-kernel SVM (C=1.0, gamma=10) trained on example
# dataset 3; fit() returns the estimator, so the calls are chained.
clf3 = SVC(C=1.0, kernel='poly', degree=3, gamma=10).fit(X3, y3.ravel())
plot_svc(clf3, X3, y3)

Number of support vectors:  78


### Spam classification

In [171]:
# Read the vocabulary file; header=None because the file has no header row,
# so the columns get integer labels. info() summarises the resulting frame.
data4 = pd.read_table('data/vocab.txt', header=None)
data4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1899 entries, 0 to 1898
Data columns (total 2 columns):
0    1899 non-null int64
1    1899 non-null object
dtypes: int64(1), object(1)
memory usage: 37.1+ KB

In [172]:
# Preview the first rows of the vocabulary table.
data4.head()

Out[172]:
   0      1
0  1     aa
1  2     ab
2  3   abil
3  4    abl
4  5  about