Based on Bishop ch 3 and ch 4.
This is a generic linear model, and it currently has one dimensional output. So it's suited for one-dimensional regression or binary classification. It supports using custom basis functions and training with either gradient descent or the normal equation.
%pylab inline
class LinearModel(object):
    """
    A generic linear regressor (one-dimensional output). Maps inputs through
    a nonlinear basis function and fits the weights with either the
    regularized normal equations or batch gradient descent.
    """

    def __init__(self, basisfunc=None):
        """
        Instantiate a linear regressor. If you want to use a custom basis
        function, specify it here. It should accept an array and output an
        array. The default basis function is the identity function.
        """
        self.w = array([])
        self.basisfunc = basisfunc if basisfunc is not None else self.identity

    def identity(self, x):
        #identity basis function - for models linear in x
        return x

    def basismap(self, X):
        """
        Return X in the new basis (the design matrix).
        Output shape is (n_samples, n_basis_features); the feature count is
        probed by applying the basis function to the first row.
        """
        Xn = zeros((X.shape[0], self.basisfunc(X[0, :]).shape[0]))
        for i, xi in enumerate(X):
            Xn[i, :] = self.basisfunc(xi)
        return Xn

    def fit_gd(self, X, y, itrs=100, learning_rate=0.1, regularization=0.1):
        """
        Fit using iterative gradient descent with least-squares loss.
        itrs - iterations of gd
        learning_rate - step size for the updates
        regularization - weight decay. Greater values -> more regularization
        """
        #force y into a column vector: a flat (n,) y would otherwise
        #broadcast against the (n,1) predictions inside grad() and
        #silently produce an (n,n) matrix, corrupting the gradient
        y = asarray(y).reshape((-1, 1))
        #map the data into the basis (the design matrix)
        Xn = self.basismap(X)
        #small random initial weights
        self.w = uniform(-0.1, 0.1, (Xn.shape[1], 1))
        print('initial loss:', self.loss(X, y))
        for i in range(itrs):
            g = self.grad(Xn, y, regularization)
            self.w = self.w - learning_rate * g
        print('final loss:', self.loss(X, y))

    def grad(self, X, y, reg):
        """
        Gradient of the regularized least-squares loss w.r.t. the weights.
        Note: X here is the design matrix (already basis-mapped), and y must
        be a column vector.
        """
        return -mean((y - dot(X, self.w)) * X, axis=0).reshape(self.w.shape) + reg * self.w

    def fit_normal_eqns(self, X, y, reg=1e-5):
        """
        Solve for the weights in closed form with the regularized normal
        equation: w = (reg*I + Xn^T Xn)^-1 Xn^T y.
        reg > 0 keeps the system invertible when the design matrix is
        rank-deficient.
        """
        Xn = self.basismap(X)
        self.w = dot(dot(inv(eye(Xn.shape[1]) * reg + dot(Xn.T, Xn)), Xn.T), y)

    def predict(self, X):
        """
        Makes predictions on a matrix of (observations x features).
        """
        Xn = self.basismap(X)
        return dot(Xn, self.w)

    def loss(self, X, y):
        #mean squared error; X is the raw data matrix (not the design matrix)
        yh = self.predict(X)
        return mean((yh - y) ** 2)
Populating the interactive namespace from numpy and matplotlib
Let's test it out. First, let's make some basis functions:
def fourier_basis(x):
    #sine and cosine waves at frequencies n = 0, 0.1, ..., 0.9 (the n=0
    #cosine doubles as a constant/bias feature).
    #Fixed: the closing paren of tuple(...) was misplaced, so a *generator*
    #was passed to hstack; newer NumPy versions reject non-sequence
    #iterables. Build the tuple of arrays first, then stack.
    sins = hstack(tuple(sin(pi * n * x) for n in arange(0, 1, 0.1)))
    coss = hstack(tuple(cos(pi * n * x) for n in arange(0, 1, 0.1)))
    return hstack((sins, coss))
def linear_basis(x):
    #augment x with a leading constant 1 so the model gets a bias/intercept
    bias_and_x = hstack((1, x))
    return bias_and_x
def polynomial_basis(x):
    #monomial features x^0 through x^9 ("degree 10" = ten terms)
    powers = [x ** exponent for exponent in range(10)]
    return hstack(powers)
def sigmoid_basis(x):
    #logistic sigmoids with offsets mu in [-9, 9).
    #Fixed: a misplaced paren passed a generator (not a tuple) to hstack;
    #newer NumPy rejects that. Build the tuple first.
    return hstack(tuple((1 + exp(-x - mu)) ** -1 for mu in arange(-9, 9, 0.5)))
def heavyside_basis(x):
    #Heaviside step functions with offsets mu in [-9, 9).
    #Fixed: a misplaced paren passed a generator (not a tuple) to hstack;
    #newer NumPy rejects that. Build the tuple first.
    return hstack(tuple(0.5 * (sign(x - mu) + 1) for mu in arange(-9, 9, 0.5)))
Now we'll generate some data.
#generate noisy sine data: X is (160, 1), y = sin(X) + Gaussian noise
X = arange(-8, 8, 0.1).reshape((-1,1))
y = sin(X) + randn(X.shape[0],X.shape[1])*0.3
scatter(X, y)
<matplotlib.collections.PathCollection at 0x13cf3588>
Linear basis, without a bias.
#identity basis (no bias): ridge-regularized normal-equation fit,
#predictions plotted on a denser grid than the training data
model = LinearModel()
model.fit_normal_eqns(X, y, reg=0.1)
Xn = arange(-8, 8, 0.05).reshape((-1,1))
yh = model.predict(Xn)
scatter(X, y)
plot(Xn, yh)
title('Linear basis, no bias')
show()
Linear basis, with a bias. Since the data are centered, this won't be any different from the above. We'll see an example later where the bias is important.
#linear basis with a bias term; on centered data this matches the no-bias fit
model = LinearModel(basisfunc=linear_basis)
model.fit_normal_eqns(X, y, 0.1)
Xn = arange(-8, 8, 0.05).reshape((-1,1))
yh = model.predict(Xn)
scatter(X, y)
plot(Xn, yh)
title('Linear Basis, with bias')
show()
We can obviously see that the data are from a sine wave. Using a sinusoidal basis...
#sinusoidal basis - well matched to data generated from a sine wave
model = LinearModel(basisfunc=fourier_basis)
model.fit_normal_eqns(X, y, 0.1)
Xn = arange(-8, 8, 0.05).reshape((-1,1))
yh = model.predict(Xn)
scatter(X, y)
plot(Xn, yh)
title('Fourier basis')
show()
Sine is an analytic function. We should be able to approximate it with a polynomial, right?
#polynomial basis (powers 0-9) on the sine data
model = LinearModel(basisfunc=polynomial_basis)
model.fit_normal_eqns(X, y, 0.1)
Xn = arange(-8, 8, 0.05).reshape((-1,1))
yh = model.predict(Xn)
scatter(X, y)
plot(Xn, yh)
title('Polynomial basis')
show()
Neural nets use sigmoids. Our sigmoid basis will be kind of like a neural net, but we won't modify the basis as we train.
#sigmoid basis: like a one-layer neural net with fixed (untrained) hidden units
model = LinearModel(basisfunc=sigmoid_basis)
model.fit_normal_eqns(X, y, 0.1)
Xn = arange(-8, 8, 0.05).reshape((-1,1))
yh = model.predict(Xn)
scatter(X, y)
plot(Xn, yh)
title('Sigmoidal basis')
show()
Suppose we know the function is locally linear. We can use heavyside step functions to approximate it.
#step-function basis: produces a piecewise-constant fit
model = LinearModel(basisfunc=heavyside_basis)
model.fit_normal_eqns(X, y, 0.1)
Xn = arange(-8, 8, 0.05).reshape((-1,1))
yh = model.predict(Xn)
scatter(X, y)
plot(Xn, yh)
title('Heavyside basis')
show()
Interesting. Let's try some different data, making it non-centered so the bias becomes important.
#new dataset: non-centered target (offset by +2) so the bias term matters
X = arange(-10, 10, 0.5)
y = 0.8*abs(X)**0.3 + 2 + randn(X.shape[0])*0.1
scatter(X,y)
X = X.reshape((-1,1))  #reshape to (n, 1) after plotting; scatter accepts 1-D
Linear basis, no bias
#identity basis without a bias cannot account for the +2 offset in the data
model = LinearModel()
model.fit_normal_eqns(X, y, reg=0.1)
Xn = arange(-10, 10, 0.05).reshape((-1,1))
yh = model.predict(Xn)
scatter(X, y)
plot(Xn, yh)
title('Linear basis, no bias')
show()
The bias allows us to compensate for non-centered data. Let's see how the predictions change once we add it.
#adding the bias lets the fit track the non-centered data
#(fit_normal_eqns uses its default reg here)
model = LinearModel(basisfunc=linear_basis)
model.fit_normal_eqns(X, y,)
Xn = arange(-10, 10, 0.05).reshape((-1,1))
yh = model.predict(Xn)
scatter(X, y)
plot(Xn, yh)
title('Linear basis, with bias')
show()
Let's try a fourier basis.
#fourier basis on the non-centered data (n=0 cosine supplies a bias)
model = LinearModel(basisfunc=fourier_basis)
model.fit_normal_eqns(X, y)
Xn = arange(-10, 10, 0.05).reshape((-1,1))
yh = model.predict(Xn)
scatter(X, y)
plot(Xn, yh)
title('Fourier basis')
show()
Polynomial basis.
#polynomial basis on the non-centered data (x^0 term supplies a bias)
model = LinearModel(basisfunc=polynomial_basis)
model.fit_normal_eqns(X, y)
Xn = arange(-10, 10, 0.05).reshape((-1,1))
yh = model.predict(Xn)
scatter(X, y)
plot(Xn, yh)
title('Polynomial basis')
show()
Sigmoid basis.
#sigmoid basis; note the sigmoid centers only span [-9, 9)
model = LinearModel(basisfunc=sigmoid_basis)
model.fit_normal_eqns(X, y, 0.1)
Xn = arange(-10, 10, 0.05).reshape((-1,1))
yh = model.predict(Xn)
scatter(X, y)
plot(Xn, yh)
title('Sigmoid basis')
show()
It loses accuracy over to the left because the basis functions stop at -8. I suspect the same problem will happen with the heavyside function
#step-function basis; ylim zooms in on the fitted range
model = LinearModel(basisfunc=heavyside_basis)
model.fit_normal_eqns(X, y, 0.2)
Xn = arange(-10, 10, 0.05).reshape((-1,1))
yh = model.predict(Xn)
scatter(X, y)
plot(Xn, yh)
ylim([2,4])
title('Heavyside step function basis')
show()
We can also fit with gradient descent.
#same fourier model, but trained iteratively with gradient descent
#instead of the closed-form normal equations
model = LinearModel(basisfunc=fourier_basis)
model.fit_gd(X, y, regularization=1e-5, itrs=100, learning_rate=0.1)
Xn = arange(-10, 10, 0.05).reshape((-1,1))
yh = model.predict(Xn)
scatter(X, y)
plot(Xn, yh)
title('Fourier basis, fit with gradient descent')
show()
initial loss: 10.60546158 final loss: 0.0112470387183
That was fun. Let's do classification.
We can use the same model for classification - just predict {0,1} instead of a real number.
Let's try it out
#synthetic 2-class, 2-feature classification problem; y reshaped to a
#column vector for the regression-style loss
from sklearn.datasets import make_classification
X,y = make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0,
n_repeated=0, n_classes=2, n_clusters_per_class=1)
scatter(X[:,0], X[:,1], c=['r' if yi == 1 else 'b' for yi in y])
y = y.reshape((-1,1))
def heavyside_basis(x):
    #Heaviside steps with offsets mu in [-5, 5), applied elementwise.
    #Fixed: a misplaced paren passed a generator (not a tuple) to hstack;
    #newer NumPy rejects that.
    return hstack(tuple(0.5 * (sign(x - mu) + 1) for mu in arange(-5, 5, 0.3)))
def linear_basis(x):
    #a constant bias feature followed by the raw inputs, as a flat vector
    with_bias = hstack((1, x))
    return with_bias.ravel()
def sigmoid_basis(x):
    #logistic sigmoids with offsets mu in [-5, 5).
    #Fixed: a misplaced paren passed a generator (not a tuple) to hstack;
    #newer NumPy rejects that.
    return hstack(tuple((1 + exp(-x - mu)) ** -1 for mu in arange(-5, 5, 0.3)))
def polynomial_basis(x):
    #monomial features x^0 .. x^4
    terms = [x ** degree for degree in range(5)]
    return hstack(terms)
def rbf_basis(x):
    #Gaussian radial-basis bumps centered at each mu in [-5, 5)
    centers = arange(-5, 5, 0.3)
    bumps = [exp(-(x - mu) ** 2) for mu in centers]
    return hstack(bumps)
#some nice plotting code
def plot_contour_scatter(model, title_text):
    """
    Plot the model's decision regions as a filled contour, with the
    training points (module-level globals X, y) scattered on top.
    Relies on matplotlib/pylab names being in the namespace.
    """
    #sample from a lattice (for the nice visualization)
    x1, x2 = meshgrid(arange(-5,5,0.1), arange(-5,5,0.1))
    Xnew = vstack((x1.ravel(), x2.ravel())).T
    Z = model.predict(Xnew).reshape((-1,1))
    #plot - contour plot and scatter superimposed; the levels split at the
    #0.5 decision threshold so the two classes get distinct region colors
    contourf(arange(-5,5,0.1), arange(-5,5,0.1), Z[:,0].reshape(x1.shape),
    cmap ='cool',levels=[-1e10,0.499,0.5, 1e10])
    colorsToUse= ['r' if yi == 1 else 'b' for yi in y]
    scatter(X[:,0],X[:,1], c=colorsToUse)
    title(title_text)
    show()
Now that we've got the auxiliary work taken care of, the fun part:
#linear basis classifier: decision boundary is a straight line
model = LinearModel(basisfunc=linear_basis)
model.fit_normal_eqns(X, y, 0.1)
#model.fit_gd(X, y, itrs=100, regularization=0.1, learning_rate=0.01)
plot_contour_scatter(model, 'Linear basis')
#step-function basis: trained with gradient descent because the
#heavyside design matrix is singular (see note below)
model = LinearModel(basisfunc=heavyside_basis)
#model.fit_normal_eqns(X, y, 0.0)
model.fit_gd(X, y, itrs=100, regularization=0.1, learning_rate=0.01)
plot_contour_scatter(model, 'Heavyside Basis')
initial loss: 0.320790210969 final loss: 0.0413932329954
Note - the heavyside design matrix is singular for this problem, so using the normal equations is futile.
#polynomial basis classifier
#NOTE(review): the title says degree 3 but polynomial_basis uses powers
#0-4 - confirm which was intended
model = LinearModel(basisfunc=polynomial_basis)
model.fit_normal_eqns(X, y, 0.1)
#model.fit_gd(X, y, itrs=100, regularization=0.1, learning_rate=0.01)
plot_contour_scatter(model, 'Polynomial Basis (degree 3)')
#sigmoid basis classifier, closed-form fit
model = LinearModel(basisfunc=sigmoid_basis)
model.fit_normal_eqns(X, y, 0.1)
#model.fit_gd(X, y, itrs=100, regularization=0.1, learning_rate=0.01)
plot_contour_scatter(model, 'Sigmoid Basis')
#RBF basis classifier, trained with gradient descent (more iterations,
#lighter regularization)
model = LinearModel(basisfunc=rbf_basis)
#model.fit_normal_eqns(X, y, .5)
model.fit_gd(X, y, itrs=1000, regularization=0.01, learning_rate=.1)
plot_contour_scatter(model, 'RBF Basis')
initial loss: 0.401877752627 final loss: 0.00210904171445
Now how about we modify the basis functions to take interaction between dimensions into account. In other words, instead of $\phi_i(x_j)$, make $\phi_i(x_0, ... , x_n)$.
I'll do this by first projecting x into a higher dimensional space of interactions (via a random matrix), and then doing the regular basis functions there.
A = uniform(-0.5, 0.5, (10,2))
def heavyside_basis(x):
    #project x through the random matrix A (module-level), then apply
    #offset Heaviside steps to the projected coordinates.
    #Fixed: a misplaced paren passed a generator (not a tuple) to hstack;
    #newer NumPy rejects that.
    xn = dot(A, x)
    return hstack(tuple(0.5 * (sign(xn - mu) + 1) for mu in arange(-5, 5, 0.5)))
def linear_basis(x):
    #random linear projection of x (via module-level A) with a leading bias
    projected = dot(A, x)
    return hstack((1, projected)).ravel()
def sigmoid_basis(x):
    #random projection (module-level A) followed by offset sigmoids.
    #Fixed: a misplaced paren passed a generator (not a tuple) to hstack;
    #newer NumPy rejects that.
    xn = dot(A, x)
    return hstack(tuple((1 + exp(-xn - mu)) ** -1 for mu in arange(-5, 5, 0.3)))
def polynomial_basis(x):
    #random projection (module-level A), then monomials of the projection
    projected = dot(A, x)
    monomials = [projected ** i for i in range(5)]
    return hstack(monomials)
def rbf_basis(x):
    #random projection (module-level A), then Gaussian bumps at each mu
    projected = dot(A, x)
    bumps = [exp(-(projected - mu) ** 2) for mu in arange(-5, 5, 0.3)]
    return hstack(bumps)
I expect linear models to be the same. Composition of linear functions is linear.
#linear basis after the random projection: still linear overall,
#since a composition of linear maps is linear
model = LinearModel(basisfunc=linear_basis)
model.fit_normal_eqns(X, y, 0.1)
#model.fit_gd(X, y, itrs=100, regularization=0.1, learning_rate=0.01)
plot_contour_scatter(model, 'Linear basis')
#projected heavyside basis, trained with gradient descent
model = LinearModel(basisfunc=heavyside_basis)
#model.fit_normal_eqns(X, y, 0.2)
model.fit_gd(X, y, itrs=100, regularization=0.1, learning_rate=0.01)
plot_contour_scatter(model, 'Heavyside basis')
initial loss: 1.38533954693 final loss: 0.0143174964811
#projected sigmoid basis, trained with gradient descent
model = LinearModel(basisfunc=sigmoid_basis)
#model.fit_normal_eqns(X, y, 0.1)
model.fit_gd(X, y, itrs=100, regularization=0.1, learning_rate=0.01)
plot_contour_scatter(model, 'Sigmoid basis')
initial loss: 0.328485404567 final loss: 0.0937578444874
#projected polynomial basis; more iterations to converge
model = LinearModel(basisfunc=polynomial_basis)
#model.fit_normal_eqns(X, y, 0.1)
model.fit_gd(X, y, itrs=1000, regularization=0.1, learning_rate=0.01)
plot_contour_scatter(model, 'Polynomial basis')
initial loss: 0.458981318706 final loss: 0.0162158979183
#projected RBF basis, trained with gradient descent
model = LinearModel(basisfunc=rbf_basis)
#model.fit_normal_eqns(X, y, 0.1)
model.fit_gd(X, y, itrs=200, regularization=0.1, learning_rate=0.01)
plot_contour_scatter(model, 'RBF basis')
initial loss: 0.384524839697 final loss: 0.0102805578989
Let's compare how regularization affects the model. Let's make a classification problem:
#small (20-sample) classification problem for the regularization demo
from sklearn.datasets import make_classification
X,y = make_classification(n_samples=20, n_features=2, n_informative=1, n_redundant=0,
n_repeated=0, n_classes=2, n_clusters_per_class=1)
scatter(X[:,0], X[:,1], c=['r' if yi == 1 else 'b' for yi in y])
<matplotlib.collections.PathCollection at 0x13d70f28>
Now we'll insidiously swap labels.
#flip the labels of the first five points to simulate label noise,
#then re-plot and reshape y into a column vector
for i in range(5):
    y[i] = 1-y[i]
scatter(X[:,0], X[:,1], c=['r' if yi == 1 else 'b' for yi in y])
y = y.reshape((-1,1))
And now let's run a classifier with no regularization
#zero regularization: the weights blow up chasing the flipped labels
model = LinearModel(basisfunc=sigmoid_basis)
model.fit_normal_eqns(X, y, 0.0)
plot_contour_scatter(model, 'Sigmoid basis, no regularization')
print 'size of weight vector:', norm(model.w)
size of weight vector: 35016673.5307
And again, with regularization
#strong regularization shrinks the weights and smooths the boundary
model = LinearModel(basisfunc=sigmoid_basis)
model.fit_normal_eqns(X, y, 0.9)
plot_contour_scatter(model, 'Sigmoid basis, with regularization')
print 'size of weight vector', norm(model.w)
size of weight vector 0.675961545611
So we can see that the regularization pulls the size (l2 norm) of the weight vector to zero, which makes the decision boundary "less complicated". It can also be seen as putting a centered gaussian prior on the weight vector. Let's look at regularization in regression.
#clean sine data on a coarse grid
X = arange(-5, 5, 0.5).reshape((-1,1))
y = sin(X)
#now add some corruption (hand-picked outliers)
y[2] += 1
y[-3] -= 1
y[5] += 2
y[7] -= 0.2
y[15] += 1
y[10] -= 1
#plot it
scatter(X,y)
<matplotlib.collections.PathCollection at 0x14969d30>
Now let's fit two models, with and without regularization.
def sigmoid_basis(x):
    #logistic sigmoids with offsets mu in [-7, 7).
    #Fixed: a misplaced paren passed a generator (not a tuple) to hstack;
    #newer NumPy rejects that.
    return hstack(tuple((1 + exp(-x - mu)) ** -1 for mu in arange(-7, 7, 0.5)))
This model doesn't have regularization.
#zero regularization: the fit interpolates the outliers and the
#weight norm explodes
model = LinearModel(basisfunc=sigmoid_basis)
model.fit_normal_eqns(X, y, 0.0)
yh = model.predict(arange(-7,7,0.1).reshape((-1,1)))
plot(arange(-7,7,0.1), yh)
scatter(X,y)
ylim([-4,2])
print 'size of weight vector', norm(model.w)
size of weight vector 2.52076671123e+13
This is obviously a horrible model - it's just hitting the training points. Even a tiny, positive amount of regularization helps.
#a tiny amount of regularization already tames the weight norm
model = LinearModel(basisfunc=sigmoid_basis)
model.fit_normal_eqns(X, y, 0.00001)
#model.fit_gd(X, y, itrs=100, regularization=0.001, learning_rate=0.1)
yh = model.predict(arange(-7,7,0.1).reshape((-1,1)))
plot(arange(-7,7,0.1), yh)
scatter(X,y)
ylim([-4,2])
print 'size of weight vector', norm(model.w)
size of weight vector 124.134729356
Better. Let's increase the regularization
#heavier regularization: smoother fit, much smaller weight norm
model = LinearModel(basisfunc=sigmoid_basis)
model.fit_normal_eqns(X, y, 0.01)
#model.fit_gd(X, y, itrs=100, regularization=0.001, learning_rate=0.1)
yh = model.predict(arange(-7,7,0.1).reshape((-1,1)))
plot(arange(-7,7,0.1), yh)
scatter(X,y)
ylim([-4,2])
print 'size of weight vector', norm(model.w)
size of weight vector 8.07893778044
And now we can see how regularization is important for models, especially when we have a lot of parameters (which my basis function mappings did).