3.11 Model Selection, Underfitting and Overfitting

• Overfitting
• The phenomenon of fitting the training data more closely than the underlying distribution
• Regularization
• the techniques used to combat overfitting

3.11.1 Training Error and Generalization Error

• Training error
• The error exhibited by the model on the training data set
• Generalization error
• The expected error when applying the model to a hypothetical stream of additional (test) data drawn from the same underlying data distribution.
• Statistical Learning Theory
• We want to find a model that reduces the generalization error.
• Model Complexity
• Simple models and abundant data
• we expect the generalization error to be close to the training error.
• Complex models and fewer examples
• we expect the training error to go down but the generalization gap to grow.
• Factors that influence the generalizability of a model
• The number of tunable parameters
• When the number of tunable parameters (the number of degrees of freedom) is large, models tend to be more susceptible to overfitting.
• The values taken by the parameters
• When weights can take a wider range of values, models can be more susceptible to overfitting.
• The number of training examples
• It is trivially easy to overfit a dataset containing only one or two examples even if your model is simple.
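The parameter-count factor can be illustrated with plain NumPy (a hypothetical sketch, not from the book): with more tunable parameters than examples, least squares can interpolate the training set exactly, driving the training error to zero even when the labels are pure noise.

```python
import numpy as np

rng = np.random.RandomState(0)

# 5 training examples but 10 tunable parameters: an underdetermined system.
X = rng.normal(size=(5, 10))
y = rng.normal(size=5)  # pure noise labels

# The minimum-norm least-squares solution interpolates the training data.
w, *_ = np.linalg.lstsq(X, y, rcond=None)
train_error = np.max(np.abs(X @ w - y))
print(train_error)  # effectively zero: the model memorized the noise
```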

3.11.2 Model Selection

• We select our model based on an evaluation of the performance of several candidate models.
• The candidate models can be similar models using different hyper-parameters
• E.g., MLP
• the number of hidden layers
• the number of hidden units
• the activation functions in each hidden layer.
• Validation Data Set
• A portion of data outside of the training and testing data sets to be used in model selection.
• In practice, there may be unclear boundaries between validation and testing data sets
• In this book, the test data sets should really be considered validation sets, and the reported test accuracy is really a validation accuracy.
• K-Fold Cross-Validation
• Split the training data into K non-overlapping folds; train K times, each time validating on one fold and training on the remaining K-1, then average the K validation errors.
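The fold bookkeeping can be sketched in a few lines of NumPy (`k_fold_indices` is a hypothetical helper, not part of the book's code):

```python
import numpy as np

def k_fold_indices(n, k):
    """Yield (train_idx, valid_idx) pairs for k-fold cross-validation."""
    idx = np.arange(n)
    folds = np.array_split(idx, k)
    for i in range(k):
        valid_idx = folds[i]
        train_idx = np.concatenate([folds[j] for j in range(k) if j != i])
        yield train_idx, valid_idx

# Every example lands in exactly one validation fold.
splits = list(k_fold_indices(10, 5))
```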

3.11.3 Underfitting and Overfitting

• Underfitting
• It occurs when the model is unable to reduce the training error because the model is too simple.
• Overfitting
• It occurs when the training error is significantly lower than the error on the test data set.
• For these two problems, we focus primarily on two factors: model complexity and training data set size.
• Model Complexity
• Given training data consisting of the scalar data feature $x$ and the corresponding scalar label $y$, we try to find a polynomial of degree $d$ to estimate $y$
$$\hat{y}= \sum_{i=0}^d x^i w_i$$ where $w_i$ refers to the model's weight parameter and the bias is implicit in $w_0$ since $x^0 = 1$.
• If the model is too simple for the dataset, we are likely to see underfitting, whereas if we pick an overly complex model we see overfitting.
• Choosing an appropriately complex model for the data set is one way to avoid underfitting and overfitting
• Data Set Size
• If there are not enough samples in the training data set, especially if the number of samples is less than the number of model parameters (counted element-wise), overfitting is more likely to occur.
• Additionally, as we increase the amount of training data, the generalization error typically decreases.

3.11.4 Polynomial Regression

In [1]:
%matplotlib inline
import gluonbook as gb
from mxnet import autograd, gluon, nd
from mxnet.gluon import data as gdata, loss as gloss, nn

• Generating Data Sets

• Given $x$ we will use the following cubic polynomial to generate the labels on training and test data $$y = 5 + 1.2x - 3.4\frac{x^2}{2!} + 5.6 \frac{x^3}{3!} + \epsilon \text{ where } \epsilon \sim \mathcal{N}(0,0.1)$$

• Factorials are implemented in Gluon using the Gamma function, where $n! = \Gamma(n+1)$.
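This identity is easy to check with the Python standard library (plain `math`, independent of the `nd.gamma` call used below):

```python
import math

# The factorial can be computed through the Gamma function: n! = Gamma(n + 1).
for n in range(6):
    assert math.gamma(n + 1) == math.factorial(n)

print(math.gamma(3 + 1))  # 3! = 6.0
```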

In [18]:
maxdegree = 20                     # maximum degree of the polynomial
n_train, n_test = 100, 1000        # training and test data set sizes
true_w = nd.zeros(maxdegree)       # allocate lots of empty space
true_w[0:4] = nd.array([5, 1.2, -3.4, 5.6])

print("true_w: {0}".format(true_w))

features = nd.random.normal(shape=(n_train + n_test, 1))
features = nd.random.shuffle(features)

print("\nfeatures: {0}".format(features))

print("\nnd.arange(maxdegree).reshape((1, -1)): {0}".format(
nd.arange(maxdegree).reshape((1, -1)))
)
print("\nnd.gamma(nd.arange(maxdegree) + 1).reshape((1, -1)): {0}".format(
nd.gamma(nd.arange(maxdegree) + 1).reshape((1, -1)))
)

poly_features = nd.power(features, nd.arange(maxdegree).reshape((1, -1)))
poly_features = poly_features / (nd.gamma(nd.arange(maxdegree) + 1).reshape((1, -1)))
print("\npoly_features: {0}".format(poly_features))

labels = nd.dot(poly_features, true_w)
labels += nd.random.normal(scale=0.1, shape=labels.shape)
print("\nlabels: {0}".format(labels))

true_w:
[ 5.   1.2 -3.4  5.6  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
0.   0.   0.   0.   0.   0. ]
<NDArray 20 @cpu(0)>

features:
[[-2.3737078 ]
[ 0.70149934]
[ 0.15527226]
...
[ 1.3379871 ]
[ 0.19880056]
[ 0.25500342]]
<NDArray 1100x1 @cpu(0)>

nd.arange(maxdegree).reshape((1, -1)):
[[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17.
18. 19.]]
<NDArray 1x20 @cpu(0)>

nd.gamma(nd.arange(maxdegree) + 1).reshape((1, -1)):
[[1.00000000e+00 1.00000000e+00 2.00000000e+00 6.00000000e+00
2.40000000e+01 1.20000000e+02 7.20000000e+02 5.04000000e+03
4.03200000e+04 3.62880000e+05 3.62880000e+06 3.99168000e+07
4.79001600e+08 6.22702080e+09 8.71782892e+10 1.30767441e+12
2.09227906e+13 3.55687415e+14 6.40237353e+15 1.21645105e+17]]
<NDArray 1x20 @cpu(0)>

poly_features:
[[ 1.0000000e+00 -2.3737078e+00  2.8172443e+00 ... -6.7794548e-09
8.9402469e-10 -1.1169228e-10]
[ 1.0000000e+00  7.0149934e-01  2.4605067e-01 ...  6.7825824e-18
2.6433205e-19  9.7594081e-21]
[ 1.0000000e+00  1.5527226e-01  1.2054738e-02 ...  4.9834187e-29
4.2988148e-31  3.5130879e-33]
...
[ 1.0000000e+00  1.3379871e+00  8.9510471e-01 ...  3.9683873e-13
2.9498061e-14  2.0772642e-15]
[ 1.0000000e+00  1.9880056e-01  1.9760832e-02 ...  3.3268321e-27
3.6743117e-29  3.8445011e-31]
[ 1.0000000e+00  2.5500342e-01  3.2513373e-02 ...  2.2919949e-25
3.2470366e-27  4.3579230e-29]]
<NDArray 1100x20 @cpu(0)>

labels:
[-19.888744    5.3498735   4.951168  ...   5.8322945   5.327603
5.163274 ]
<NDArray 1100 @cpu(0)>

• Defining, Training and Testing Model
In [16]:
def semilogy(x_vals, y_vals, x_label, y_label, x2_vals=None, y2_vals=None,
             legend=None, figsize=(3.5, 2.5)):
    gb.set_figsize(figsize)
    gb.plt.xlabel(x_label)
    gb.plt.ylabel(y_label)
    gb.plt.semilogy(x_vals, y_vals)
    if x2_vals and y2_vals:
        gb.plt.semilogy(x2_vals, y2_vals, linestyle=':')
        gb.plt.legend(legend)

In [19]:
num_epochs, loss = 200, gloss.L2Loss()

def fit_and_plot(train_features, test_features, train_labels, test_labels):
    net = nn.Sequential()
    # Switch off the bias since we already catered for it in the polynomial features
    net.add(nn.Dense(1, use_bias=False))
    net.initialize()

    batch_size = min(10, train_labels.shape[0])
    train_iter = gdata.DataLoader(
        gdata.ArrayDataset(train_features, train_labels),
        batch_size,
        shuffle=True
    )
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.01})

    train_ls, test_ls = [], []

    for _ in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                l = loss(net(X), y)
            l.backward()
            trainer.step(batch_size)
        train_ls.append(loss(net(train_features), train_labels).mean().asscalar())
        test_ls.append(loss(net(test_features), test_labels).mean().asscalar())

    print('final epoch:\ntrain loss {0}, test loss {1}'.format(train_ls[-1], test_ls[-1]))

    semilogy(
        range(1, num_epochs + 1), train_ls, 'epochs', 'loss',
        range(1, num_epochs + 1), test_ls, ['train', 'test']
    )

    print('weight: {0}'.format(net[0].weight.data().asnumpy()))

• Third-order Polynomial Function Fitting (Normal)
• This model's training error rate is low.
• The trained model parameters are also close to the true values w = [5, 1.2, −3.4, 5.6].
In [20]:
num_epochs = 1000

# Pick the first four dimensions, i.e. 1, x, x^2, x^3 from the polynomial features
fit_and_plot(
poly_features[:n_train, 0:4],
poly_features[n_train:, 0:4],
labels[:n_train],
labels[n_train:]
)

final epoch: train loss 0.005385274 test loss 0.005204492
weight: [[ 5.0193963  1.2109194 -3.4137754  5.598747 ]]

• Linear Function Fitting (Underfitting)
• After the decline in the early epochs, it is difficult to decrease this model's training error rate further.
• After the last epoch iteration has been completed, the training error rate is still high.
• When used in data sets generated by non-linear models, linear models are susceptible to underfitting.
In [24]:
num_epochs = 1000

# Pick the first two dimensions, i.e. 1, x from the polynomial features
fit_and_plot(
poly_features[:n_train, 0:2],
poly_features[n_train:, 0:2],
labels[:n_train],
labels[n_train:]
)

final epoch: train loss 5.1508574 test loss 4.1340656
weight: [[3.404073  4.1500916]]

• Insufficient Training (Overfitting)
• There is insufficient data to pin down the fact that all higher degree coefficients are close to zero.
• An overly complex model is easily influenced by noise in the training data.
• Even if the training error rate is low, the test error rate will still be high.
In [25]:
num_epochs = 1000
n_degree = 20 # degree of polynomials

fit_and_plot(
poly_features[:n_train, 0:n_degree],
poly_features[n_train:, 0:n_degree],
labels[:n_train],
labels[n_train:]
)

final epoch: train loss 0.011955421 test loss 0.021252891
weight: [[ 4.9227066   1.3885185  -2.9783912   4.874407   -1.0148908   1.3443892
-0.26418862  0.12346248 -0.03515212 -0.04212591 -0.03587594 -0.03025109
-0.03272099  0.00740935  0.04079974 -0.00665187  0.01734115 -0.00834598
-0.00843566  0.0215878 ]]

• Plot the training error vs. model complexity (degree of the polynomial) (3.11.6 Problems)
In [29]:
num_epochs, loss = 200, gloss.L2Loss()

def training_error_vs_model_complexity_and_plot():
    max_degree = 20

    final_train_ls, final_test_ls = [], []

    for n_degree in range(2, max_degree):
        train_features = poly_features[:n_train, 0:n_degree]
        test_features = poly_features[n_train:, 0:n_degree]
        train_labels = labels[:n_train]
        test_labels = labels[n_train:]

        net = nn.Sequential()
        # Switch off the bias since we already catered for it in the polynomial features
        net.add(nn.Dense(1, use_bias=False))
        net.initialize()

        batch_size = min(10, train_labels.shape[0])
        train_iter = gdata.DataLoader(
            gdata.ArrayDataset(train_features, train_labels),
            batch_size,
            shuffle=True
        )
        trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.01})

        for _ in range(num_epochs):
            for X, y in train_iter:
                with autograd.record():
                    l = loss(net(X), y)
                l.backward()
                trainer.step(batch_size)

        final_train_ls.append(loss(net(train_features), train_labels).mean().asscalar())
        final_test_ls.append(loss(net(test_features), test_labels).mean().asscalar())

        print('n_degree: {0} - final epoch: train loss {1}, test loss {2}'.format(
            n_degree,
            final_train_ls[-1],
            final_test_ls[-1]
        ))

    semilogy(
        range(2, max_degree), final_train_ls, 'n_degree', 'final_loss',
        range(2, max_degree), final_test_ls, ['train', 'test']
    )

training_error_vs_model_complexity_and_plot()

n_degree: 2 - final epoch: train loss 5.150863170623779, test loss 4.130029201507568
n_degree: 3 - final epoch: train loss 1.4600944519042969, test loss 1.9892637729644775
n_degree: 4 - final epoch: train loss 0.02632659114897251, test loss 0.035089656710624695
n_degree: 5 - final epoch: train loss 0.03756941854953766, test loss 0.14369729161262512
n_degree: 6 - final epoch: train loss 0.03391999006271362, test loss 0.09279215335845947
n_degree: 7 - final epoch: train loss 0.03659401088953018, test loss 0.1075616180896759
n_degree: 8 - final epoch: train loss 0.03495902940630913, test loss 0.09917280077934265
n_degree: 9 - final epoch: train loss 0.03554520383477211, test loss 0.10370409488677979
n_degree: 10 - final epoch: train loss 0.036453355103731155, test loss 0.10658491402864456
n_degree: 11 - final epoch: train loss 0.03632418066263199, test loss 0.10707336664199829
n_degree: 12 - final epoch: train loss 0.03484361991286278, test loss 0.10083470493555069
n_degree: 13 - final epoch: train loss 0.03716722130775452, test loss 0.10853695124387741
n_degree: 14 - final epoch: train loss 0.03467931970953941, test loss 0.10067345947027206
n_degree: 15 - final epoch: train loss 0.035094935446977615, test loss 0.09937365353107452
n_degree: 16 - final epoch: train loss 0.035065729171037674, test loss 0.10172484815120697
n_degree: 17 - final epoch: train loss 0.03583936020731926, test loss 0.10514272004365921
n_degree: 18 - final epoch: train loss 0.03587636351585388, test loss 0.10423813760280609
n_degree: 19 - final epoch: train loss 0.0349552184343338, test loss 0.10139323025941849

• Plot the training error vs. amount of training data (3.11.6 Problems)
In [33]:
num_epochs, loss = 200, gloss.L2Loss()

def training_error_vs_amount_data_and_plot():
    max_n_train = 1000

    final_train_ls, final_test_ls = [], []

    for n_train in range(100, max_n_train + 1, 100):
        train_features = poly_features[:n_train, 0:4]
        test_features = poly_features[n_train:, 0:4]
        train_labels = labels[:n_train]
        test_labels = labels[n_train:]

        net = nn.Sequential()
        # Switch off the bias since we already catered for it in the polynomial features
        net.add(nn.Dense(1, use_bias=False))
        net.initialize()

        batch_size = min(10, train_labels.shape[0])
        train_iter = gdata.DataLoader(
            gdata.ArrayDataset(train_features, train_labels),
            batch_size,
            shuffle=True
        )
        trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.01})

        for _ in range(num_epochs):
            for X, y in train_iter:
                with autograd.record():
                    l = loss(net(X), y)
                l.backward()
                trainer.step(batch_size)

        final_train_ls.append(loss(net(train_features), train_labels).mean().asscalar())
        final_test_ls.append(loss(net(test_features), test_labels).mean().asscalar())

        print('n_train: {0} - final epoch: train loss {1}, test loss {2}'.format(
            n_train,
            final_train_ls[-1],
            final_test_ls[-1]
        ))

    semilogy(
        range(100, max_n_train + 1, 100), final_train_ls, 'n_train', 'final_loss',
        range(100, max_n_train + 1, 100), final_test_ls, ['train', 'test']
    )

training_error_vs_amount_data_and_plot()

n_train: 100 - final epoch: train loss 0.02717152051627636, test loss 0.03628670051693916
n_train: 200 - final epoch: train loss 0.006055120378732681, test loss 0.00601654127240181
n_train: 300 - final epoch: train loss 0.005407609045505524, test loss 0.005017670337110758
n_train: 400 - final epoch: train loss 0.005107100121676922, test loss 0.005131487734615803
n_train: 500 - final epoch: train loss 0.005201703868806362, test loss 0.005071012303233147
n_train: 600 - final epoch: train loss 0.005225712899118662, test loss 0.005004637874662876
n_train: 700 - final epoch: train loss 0.005221483763307333, test loss 0.004918545018881559
n_train: 800 - final epoch: train loss 0.005199685227125883, test loss 0.004852724261581898
n_train: 900 - final epoch: train loss 0.005092345178127289, test loss 0.0051823826506733894
n_train: 1000 - final epoch: train loss 0.005067116115242243, test loss 0.005426034331321716


3.12 Weight Decay

• Since obtaining additional training data is often costly, it is preferable to control the complexity of the model.
• Rather than crudely restricting the class of functions the model can represent, we need a more fine-grained tool for adjusting model complexity.

3.12.1 Squared Norm Regularization

• Weight Decay
• It relies on the notion that among all functions $f$ the function $f = 0$ is the simplest of all.
• We can measure functions by their proximity to zero.
• A linear function $f(\mathbf{x}) = \mathbf{w}^\top \mathbf{x}$ can be considered simple if its weight vector is small.
• We can measure this via $\|\mathbf{w}\|^2$
• One way of keeping the weight vector small is to add its norm as a penalty term to the problem of minimizing the loss.
• If the weight vector becomes too large, the learning algorithm will prioritize minimizing $\mathbf{w}$ over minimizing the training error.
• Loss in Linear Regression $$l(\mathbf{w}, b) = \frac{1}{n}\sum_{i=1}^n \frac{1}{2}\left(\mathbf{w}^\top \mathbf{x}^{(i)} + b - y^{(i)}\right)^2.$$
• The new loss function which penalizes the size of the weight vector $$l(\mathbf{w}, b) + \frac{\lambda}{2} ||\mathbf{w}||^2$$ where $\lambda$ is the regularization constant (hyperparameter). For $\lambda > 0$ we ensure that $\mathbf{w}$ cannot grow too large.
• The stochastic gradient descent update becomes \begin{aligned} \mathbf{w} & \leftarrow \left(1 - \eta\lambda\right) \mathbf{w} - \frac{\eta}{|\mathcal{B}|} \sum_{i \in \mathcal{B}} \mathbf{x}^{(i)} \left(\mathbf{w}^\top \mathbf{x}^{(i)} + b - y^{(i)}\right) \end{aligned}
• We also shrink the size of $\mathbf{w}$ towards 0, i.e. the weight ‘decays’
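The update above can be sketched in NumPy (a minimal hypothetical example, not the book's implementation): the penalty contributes a gradient of $\lambda \mathbf{w}$, which simply rescales $\mathbf{w}$ by $(1 - \eta\lambda)$ before the usual gradient step.

```python
import numpy as np

eta, lambd = 0.1, 2.0                     # learning rate and regularization constant
X = np.array([[1.0, 2.0], [3.0, 4.0]])    # a minibatch B of two examples
y = np.array([1.0, 2.0])
w, b = np.array([0.5, -0.5]), 0.0

err = X @ w + b - y                       # per-example residuals
grad_w = X.T @ err / len(y)               # gradient of the unpenalized loss
# Weight decay: first shrink w toward zero, then take the usual gradient step.
w_new = (1 - eta * lambd) * w - eta * grad_w
```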

3.12.2 High-dimensional Linear Regression

• For high-dimensional regression it is difficult to pick the ‘right’ dimensions to omit. --> Solution: weight decay
• Data Generation $$y = 0.05 + \sum_{i = 1}^d 0.01 x_i + \epsilon \text{ where } \epsilon \sim \mathcal{N}(0, 0.01)$$
• In order to observe overfitting more easily...
• we pick a high-dimensional problem with d = 200 (n_degree)
• a deliberately low number of training examples, e.g. 20 (n_train)
In [36]:
%matplotlib inline
import gluonbook as gb
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import data as gdata, loss as gloss, nn

n_train, n_test, n_degree = 20, 100, 200
true_w, true_b = nd.ones((n_degree, 1)) * 0.01, 0.05

features = nd.random.normal(shape=(n_train + n_test, n_degree))
labels = nd.dot(features, true_w) + true_b
labels += nd.random.normal(scale=0.01, shape=labels.shape)

train_features, test_features = features[:n_train, :], features[n_train:, :]
train_labels, test_labels = labels[:n_train], labels[n_train:]


3.12.3 Weight Decay from Scratch

• we simply add the $\ell_2$ penalty to the objective as an additional loss term.

• the $\ell_p$ norm is defined as $$\|\mathbf{x}\|_p^p := \sum_{i=1}^d |x_i|^p$$
• Initialize model parameters

• define a function that randomly initializes model parameters.
• This function attaches a gradient to each parameter.
In [38]:
def init_params():
    w = nd.random.normal(scale=1, shape=(n_degree, 1))
    b = nd.zeros(shape=(1,))
    for param in (w, b):
        param.attach_grad()
    return [w, b]

• Define $\ell_2$ norm penalty
In [39]:
def l2_penalty(w):
    return (w ** 2).sum() / 2

• Define training and testing
In [40]:
batch_size, num_epochs, lr = 1, 100, 0.003
net, loss = gb.linreg, gb.squared_loss

train_iter = gdata.DataLoader(
    gdata.ArrayDataset(train_features, train_labels),
    batch_size,
    shuffle=True
)

def fit_and_plot(lambd):
    w, b = init_params()
    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                # The L2 norm penalty term has been added.
                l = loss(net(X, w, b), y) + lambd * l2_penalty(w)
            l.backward()
            gb.sgd([w, b], lr, batch_size)

        train_ls.append(loss(net(train_features, w, b), train_labels).mean().asscalar())
        test_ls.append(loss(net(test_features, w, b), test_labels).mean().asscalar())

    gb.semilogy(
        range(1, num_epochs + 1), train_ls, 'epochs', 'loss',
        range(1, num_epochs + 1), test_ls, ['train', 'test'])

    print('l2 norm of w:', w.norm().asscalar())

• Training without Regularization
• When lambd = 0 we do not use weight decay.
• This is a perfect example of overfitting.
In [41]:
fit_and_plot(lambd=0)

l2 norm of w: 12.976306

• Using weight decay
• Even though the training error increased, the error on the test set decreased.
• Overfitting has been mitigated to some extent
• The $\ell_2$ norm of the weight $\mathbf{w}$ is smaller than without using weight decay.
In [42]:
fit_and_plot(lambd=3)

l2 norm of w: 0.034474522


3.12.4 Weight Decay in Gluon

• Weight decay in Gluon is quite convenient (and also a bit special) and it is typically integrated with the optimization algorithm (Trainer)
• By default, Gluon decays weight and bias simultaneously.
• Note that we can have different optimizers for different sets of parameters.
• a Trainer with weight decay for weight
• a Trainer without weight decay for bias
In [44]:
def fit_and_plot_gluon(wd):
    net = nn.Sequential()
    net.add(nn.Dense(1))
    net.initialize(init.Normal(sigma=1))

    # Decay the weight parameter. Weight names generally end with "weight".
    trainer_w = gluon.Trainer(
        params=net.collect_params('.*weight'),
        optimizer='sgd',
        optimizer_params={'learning_rate': lr, 'wd': wd}
    )

    # Do not decay the bias parameter. Bias names generally end with "bias".
    trainer_b = gluon.Trainer(
        params=net.collect_params('.*bias'),
        optimizer='sgd',
        optimizer_params={'learning_rate': lr}
    )

    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                l = loss(net(X), y)
            l.backward()

            # Call step on each of the two Trainer instances to update the
            # weight and bias separately.
            trainer_w.step(batch_size)
            trainer_b.step(batch_size)

        train_ls.append(loss(net(train_features), train_labels).mean().asscalar())
        test_ls.append(loss(net(test_features), test_labels).mean().asscalar())
    gb.semilogy(
        range(1, num_epochs + 1), train_ls, 'epochs', 'loss',
        range(1, num_epochs + 1), test_ls, ['train', 'test']
    )

    print('L2 norm of w:', net[0].weight.data().norm().asscalar())

In [45]:
fit_and_plot_gluon(0)

L2 norm of w: 13.644744

In [46]:
fit_and_plot_gluon(3)

L2 norm of w: 0.03581347

• For nonlinear functions, we simply sum over the weights of the different layers, e.g. via $\sum_l \|\mathbf{w}_l\|^2$, which is equivalent to weight decay applied to all layers.
• In more intuitive terms, we can say that weight decay encourages the model to spread out its weights among many features
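As a toy NumPy illustration (hypothetical numbers, not from the book): among weight vectors with the same total weight, the $\ell_2$ penalty is smallest for the one that spreads the weight evenly across features.

```python
import numpy as np

# Two weight vectors with the same sum of weights (the same total effect
# on a constant feature), one concentrated and one spread out.
concentrated = np.array([1.0, 0.0, 0.0, 0.0])
spread = np.array([0.25, 0.25, 0.25, 0.25])

# The l2 penalty (squared norm / 2) prefers the spread-out solution.
pen_concentrated = (concentrated ** 2).sum() / 2
pen_spread = (spread ** 2).sum() / 2
```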

3.13 Dropout

3.13.1 Overfitting Revisited

• Linear Models
• Given many more features than examples, linear models can overfit.
• Linear models can't take into account nuanced interactions between features.
• Linear models have
• high bias: they can only represent a small class of functions
• low variance: they give similar results across different random samples of the data
• Deep Neural Network Models
• Neural networks are so flexible because they are not confined to looking at each feature individually.
• Instead, they can learn complex interactions among groups of features.

3.13.2 Robustness through Perturbations

• One way to overcome overfitting is to make the model simpler.
• Yet another way is to impose a different notion of simplicity:
• the function should be robust under modest changes in the input
• A simple function is resilient to perturbations in its input.
• Simply add some noise $\epsilon \sim \mathcal{N}(0,\sigma^2)$ to inputs $\mathbf{x}$ and use this data during training via $\mathbf{x}' = \mathbf{x} + \epsilon$.
• A key property is that in expectation $\mathbf{E}[\mathbf{x}'] = \mathbf{x}$.
• For intermediate layers, though, this might not be quite so desirable since the scale of the noise might not be appropriate.
• The alternative is to perturb the activation function $h$ as follows: \begin{aligned} h' = \begin{cases} 0 & \text{ with probability } p \\ \frac{h}{1-p} & \text{ otherwise} \end{cases} \end{aligned}
• the expectation remains unchanged, i.e. $\mathbf{E}[h'] = h$.
• intermediate activations $h$ are replaced by a random variable $h′$ with matching expectation.
• The name ‘dropout’ arises from the notion that some neurons ‘drop out’ of the computation for the purpose of computing the final result.
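That the expectation is unchanged can be checked numerically; a NumPy sketch using the same mask-and-rescale rule (the values of $h$ and $p$ are arbitrary):

```python
import numpy as np

rng = np.random.RandomState(0)
h, p = 2.0, 0.5

# Average many dropout draws of a single activation h:
# keep with probability 1-p (rescaled by 1/(1-p)), zero otherwise.
draws = np.where(rng.uniform(size=100000) > p, h / (1 - p), 0.0)
print(draws.mean())  # close to h = 2.0
```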

3.13.3 Dropout in Practice

• Recall the multilayer perceptron \begin{aligned} h & = \sigma(W_1 x + b_1) \\ o & = W_2 h + b_2 \\ \hat{y} & = \mathrm{softmax}(o) \end{aligned}
• When we apply dropout to the hidden layer, it amounts to removing hidden units with probability $p$
• their output is set to 0 with the probability $p$.
• The calculation of $y$ no longer depends on $h_2$ and $h_5$
• their respective gradient also vanishes when performing backprop.
• This is exactly what we want for regularization purposes to cope with overfitting.
• At test time, we typically do not use dropout to obtain more conclusive results.

3.13.4 Dropout from Scratch

In [47]:
import gluonbook as gb
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import loss as gloss, nn

def dropout(X, drop_prob):
    assert 0 <= drop_prob <= 1
    # In this case, all elements are dropped out.
    if drop_prob == 1:
        return X.zeros_like()
    mask = nd.random.uniform(0, 1, X.shape) > drop_prob
    return mask * X / (1.0 - drop_prob)

In [48]:
X = nd.arange(16).reshape((2, 8))
print(dropout(X, 0))
print(dropout(X, 0.5))
print(dropout(X, 1))

[[ 0.  1.  2.  3.  4.  5.  6.  7.]
[ 8.  9. 10. 11. 12. 13. 14. 15.]]
<NDArray 2x8 @cpu(0)>

[[ 0.  0.  4.  6.  0. 10.  0. 14.]
[16. 18.  0. 22. 24.  0.  0. 30.]]
<NDArray 2x8 @cpu(0)>

[[0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0.]]
<NDArray 2x8 @cpu(0)>

• Defining Model Parameters
• MLP with two hidden layers.
• The two hidden layers both have 256 outputs.
In [49]:
num_inputs, num_outputs, num_hiddens1, num_hiddens2 = 784, 10, 256, 256

W1 = nd.random.normal(scale=0.01, shape=(num_inputs, num_hiddens1))
b1 = nd.zeros(num_hiddens1)
W2 = nd.random.normal(scale=0.01, shape=(num_hiddens1, num_hiddens2))
b2 = nd.zeros(num_hiddens2)
W3 = nd.random.normal(scale=0.01, shape=(num_hiddens2, num_outputs))
b3 = nd.zeros(num_outputs)

params = [W1, b1, W2, b2, W3, b3]
for param in params:
    param.attach_grad()

• Define the Model
• It is generally recommended to set a lower dropout probability closer to the input layer.
• For the first hidden layer: 0.2
• For the second hidden layer: 0.5
• By using the autograd.is_training() function, we can ensure that dropout is only active during training.
In [51]:
drop_prob1, drop_prob2 = 0.2, 0.5

def net(X):
    X = X.reshape((-1, num_inputs))
    H1 = (nd.dot(X, W1) + b1).relu()

    if autograd.is_training():        # Use dropout only when training the model.
        H1 = dropout(H1, drop_prob1)  # Add a dropout layer after the first fully connected layer.

    H2 = (nd.dot(H1, W2) + b2).relu()

    if autograd.is_training():
        H2 = dropout(H2, drop_prob2)  # Add a dropout layer after the second fully connected layer.

    return nd.dot(H2, W3) + b3

• Training and Testing
In [52]:
num_epochs, lr, batch_size = 10, 0.5, 256

loss = gloss.SoftmaxCrossEntropyLoss()
train_iter, test_iter = gb.load_data_fashion_mnist(batch_size)

gb.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, params, lr)

epoch 1, loss 1.1481, train acc 0.553, test acc 0.775
epoch 2, loss 0.5930, train acc 0.779, test acc 0.822
epoch 3, loss 0.4910, train acc 0.819, test acc 0.851
epoch 4, loss 0.4494, train acc 0.835, test acc 0.858
epoch 5, loss 0.4231, train acc 0.846, test acc 0.856
epoch 6, loss 0.3975, train acc 0.854, test acc 0.864
epoch 7, loss 0.3810, train acc 0.860, test acc 0.874
epoch 8, loss 0.3661, train acc 0.865, test acc 0.871
epoch 9, loss 0.3555, train acc 0.871, test acc 0.880
epoch 10, loss 0.3462, train acc 0.873, test acc 0.882


3.13.5 Dropout in Gluon

• In Gluon, just add the Dropout layer after the fully connected layer and specify the dropout probability.
• The Dropout layer randomly drops out the output elements of the previous layer with the specified dropout probability
• The Dropout layer simply passes the data through during testing.
In [53]:
net = nn.Sequential()

net.add(
    nn.Dense(256, activation="relu"),
    nn.Dropout(drop_prob1),  # Add a dropout layer after the first fully connected layer.
    nn.Dense(256, activation="relu"),
    nn.Dropout(drop_prob2),  # Add a dropout layer after the second fully connected layer.
    nn.Dense(10)
)

net.initialize(init.Normal(sigma=0.01))

In [54]:
num_epochs, lr, batch_size = 10, 0.5, 256

trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})

gb.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, trainer)

epoch 1, loss 1.1681, train acc 0.551, test acc 0.795
epoch 2, loss 0.5893, train acc 0.782, test acc 0.825
epoch 3, loss 0.4937, train acc 0.819, test acc 0.854
epoch 4, loss 0.4530, train acc 0.835, test acc 0.857
epoch 5, loss 0.4243, train acc 0.845, test acc 0.858
epoch 6, loss 0.4061, train acc 0.851, test acc 0.866
epoch 7, loss 0.3837, train acc 0.861, test acc 0.872
epoch 8, loss 0.3717, train acc 0.864, test acc 0.873
epoch 9, loss 0.3602, train acc 0.869, test acc 0.878
epoch 10, loss 0.3517, train acc 0.871, test acc 0.874