author: weiyaszcfweiya@gmail.com
date: Dec. 28, 2017
This notebook is to implement the Section 6 in Chapter 11 of ESL
where $X^T=(X_1, X_2)$, and $X_j\sim N(0,1)$. For the parameters, $a_1=(3,3),a_2=(3,-3)$. For the gaussian noise $\epsilon$, we choose its variance to satisfy $$ \frac{Var(f(X))}{Var(\epsilon)}=4 $$
import numpy as np
from scipy.special import expit
# set up
num_train = 100
num_test = 10000
# generate X
np.random.seed(123)
X_train = np.random.normal(0, 1, (num_train, 2))
X_test = np.random.normal(0, 1, (num_test, 2))
# parameters
a_1 = np.array([3, 3])
a_2 = np.array([3, -3])
epsi_var = np.var(X_train) / 4
epsi_train = np.random.normal(0, np.sqrt(epsi_var), num_train)
epsi_test = np.random.normal(0, np.sqrt(epsi_var), num_test)
# generate y
y_train = expit(X_train.dot(a_1)) + expit(X_train.dot(a_2)) + epsi_train
y_test = expit(X_test.dot(a_1)) + expit(X_test.dot(a_2)) + epsi_test
y_train = y_train.reshape(num_train, 1)
y_test = y_test.reshape(num_test, 1)
where $$ s_{mi}=\sigma'(\alpha_{0m}+\alpha_m^Tx_i)\sum\limits_{k=1}^K\beta_{km}\delta_{ki} $$ $$ \delta_{ki}=-2(y_{ik}-f_k(x_i))g'_k(\beta_{0k}+\beta_k^Tz_i) $$ and we have $$ \begin{align} \sigma'(v)&=\sigma(v)\cdot(1-\sigma(v))\\ g_k'(T)&=1 \end{align} $$ Note: for softmax function, we have $g_k'(T) = g_k(T)\cdot (1-g_k(T)))$
def train_model(nn_hdim, num_passes = 5000, print_loss = False, reg_lambda = 0.0):
# initialize
# np.random.seed(123)
alpha = np.random.randn(2, nn_hdim) / np.sqrt(2)
a = np.zeros((1, nn_hdim))
beta = np.random.randn(nn_hdim, 1) / np.sqrt(nn_hdim)
b = np.zeros((1, 1))
gamma = 0.0005
model = {}
# gradient descent
for i in xrange(0, num_passes):
# forward pass
z = expit(X_train.dot(alpha) + a)
T = z.dot(beta) + b
pred = T
# calculate the loss
if print_loss and i%100 == 0:
data_loss = np.sum(np.square(pred-y_train)) + reg_lambda/2 * (np.sum(np.square(alpha)) + np.sum(np.square(beta)))
print "Loss after iteration %i: %f" %(i, 1./num_train * data_loss)
# back propagation
delta = -2*(y_train-pred)
dbeta = (z.T).dot(delta)
db = np.sum(delta, axis=0, keepdims=True)
s = delta.dot(beta.T) * (z*(1-z))
dalpha = np.dot(X_train.T, s)
da = np.sum(s, axis=0)
# add regularization term
dbeta += reg_lambda * beta
dalpha += reg_lambda * alpha
# gradient descent
alpha += -gamma * dalpha
a += -gamma * da
beta += -gamma * dbeta
b += -gamma * db
# assign new parameters to the model
model = {'alpha': alpha, 'a': a, 'beta': beta, 'b':b}
return model
# test
def test_model(model):
alpha, a, beta, b = model['alpha'], model['a'], model['beta'], model['b']
# forward pass
z = expit(X_test.dot(alpha) + a)
T = z.dot(beta) + b
pred = T
pred_loss = 1./num_test * np.sum(np.square(pred-y_test))
return pred_loss
# vary the number of hidden units
num_cases = 10
num_repeat = 10
mat_loss = np.zeros((num_repeat, num_cases))
for i in xrange(0, 10):
for j in xrange(0, num_repeat):
model = train_model(i+1)
mat_loss[j, i] = test_model(model)
%matplotlib inline
import matplotlib.pyplot as plt
The Bayes rate for regression with squared error is the error variance.
plt.plot([x+1 for x in range(10)], mat_loss.mean(axis=0)/epsi_var)
plt.ylabel('test error (relative to the Bayes error)')
plt.xlabel('number of hidden units')
plt.title('sum of sigmoids')
plt.show()
# boxplot
plt.boxplot(mat_loss/epsi_var)
plt.ylabel('test error (relative to the Bayes error)')
plt.xlabel('number of hidden units')
plt.title('sum of sigmoids')
plt.show()
# weight decay
# vary the number of hidden units
mat_loss_reg = np.zeros((num_repeat, num_cases))
for i in xrange(0, 10):
for j in xrange(0, num_repeat):
model = train_model(i+1, reg_lambda=0.1)
mat_loss_reg[j, i] = test_model(model)
# boxplot
plt.boxplot(mat_loss_reg/epsi_var)
plt.ylabel('test error (relative to the Bayes error)')
plt.xlabel('number of hidden units')
plt.title('sum of sigmoids (weight decay = 0.1)')
plt.show()
# vary the rate of weight decay
seq_lambda = np.arange(0.0, 0.2, 0.02)
mat_loss_vary_lambda = np.zeros((num_repeat, len(seq_lambda)))
for i in xrange(0, len(seq_lambda)):
for j in xrange(0, num_repeat):
model = train_model(10, reg_lambda=seq_lambda[i])
mat_loss_vary_lambda[j, i] = test_model(model)
# boxplot
plt.boxplot(mat_loss_vary_lambda/epsi_var)
plt.ylabel('test error (relative to the Bayes error)')
plt.xlabel('weight decay rate')
plt.xticks([x+1 for x in range(10)], seq_lambda)
plt.title('sum of sigmoids (10 hidden units)')
plt.show()