# 3.8 Multilayer Perceptron¶

## 3.8.1 Hidden Layers¶

• Single linear transformation $$\hat{\mathbf{o}} = \mathrm{softmax}(\mathbf{W} \mathbf{x} + \mathbf{b})$$

• Linearity is a strong assumption
• It implies that increasing an input's value always drives the output either up or down, irrespective of the values of the other inputs
• Consider classifying cats and dogs from black-and-white images
• Increasing a pixel's value would then always either increase or decrease the probability that the image depicts a dog --> not reasonable
• From one to many

• We can model more generally by incorporating one or more hidden layers.
• Multi-layer Perceptron (MLP)
• E.g., a hidden layer with 5 hidden units
• Both the hidden layer and the output layer of the multilayer perceptron are fully connected layers.
• From linear to nonlinear

• A naive mathematical model (note there is no nonlinearity yet) \begin{aligned} \mathbf{h} & = \mathbf{W}_1 \mathbf{x} + \mathbf{b}_1 \\ \mathbf{o} & = \mathbf{W}_2 \mathbf{h} + \mathbf{b}_2 \\ \hat{\mathbf{y}} & = \mathrm{softmax}(\mathbf{o}) \end{aligned}
• This gains us nothing: we can collapse out the hidden layer, obtaining an equivalently parametrized single-layer perceptron

$$\mathbf{o} = \mathbf{W}_2 \mathbf{h} + \mathbf{b}_2 = \mathbf{W}_2 (\mathbf{W}_1 \mathbf{x} + \mathbf{b}_1) + \mathbf{b}_2 = (\mathbf{W}_2 \mathbf{W}_1) \mathbf{x} + (\mathbf{W}_2 \mathbf{b}_1 + \mathbf{b}_2) = \mathbf{W} \mathbf{x} + \mathbf{b}$$
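The collapse above is easy to verify numerically. A minimal NumPy sketch (shapes chosen arbitrarily for illustration; the notes themselves use MXNet):

```python
import numpy as np

rng = np.random.default_rng(0)
# Two stacked linear layers x -> h -> o, with no nonlinearity in between
W1, b1 = rng.normal(size=(5, 3)), rng.normal(size=5)  # hidden layer: 3 inputs -> 5 units
W2, b2 = rng.normal(size=(2, 5)), rng.normal(size=2)  # output layer: 5 units -> 2 outputs
x = rng.normal(size=3)

o_stacked = W2 @ (W1 @ x + b1) + b2

# The equivalent single layer: W = W2 W1, b = W2 b1 + b2
W, b = W2 @ W1, W2 @ b1 + b2
o_single = W @ x + b

assert np.allclose(o_stacked, o_single)
```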
• To fix this we need another key ingredient: a nonlinearity $\sigma$, such as $\mathrm{max}(x,0)$, applied after each layer

\begin{aligned} \mathbf{h} & = \sigma(\mathbf{W}_1 \mathbf{x} + \mathbf{b}_1) \\ \mathbf{o} & = \mathbf{W}_2 \mathbf{h} + \mathbf{b}_2 \\ \hat{\mathbf{y}} & = \mathrm{softmax}(\mathbf{o}) \end{aligned}
• Clearly we could continue stacking such hidden layers, e.g. $\mathbf{h}_1 = \sigma(\mathbf{W}_1 \mathbf{x} + \mathbf{b}_1)$ and $\mathbf{h}_2 = \sigma(\mathbf{W}_2 \mathbf{h}_1 + \mathbf{b}_2)$ on top of each other to obtain a true multilayer perceptron.

• Multilayer perceptrons are universal approximators.

• Even for a single-hidden-layer neural network, with enough nodes, and the right set of weights, it could model any function at all!
• Actually learning that function is the hard part.
• It turns out that we can approximate functions much more compactly if we use deeper (vs wider) neural networks.
• Vectorization and mini-batch

• Denote by $\mathbf{X}$ the matrix of inputs from a minibatch.
• Then an MLP with two hidden layers can be expressed as \begin{aligned} \mathbf{H}_1 & = \sigma(\mathbf{W}_1 \mathbf{X} + \mathbf{b}_1) \\ \mathbf{H}_2 & = \sigma(\mathbf{W}_2 \mathbf{H}_1 + \mathbf{b}_2) \\ \mathbf{O} & = \mathrm{softmax}(\mathbf{W}_3 \mathbf{H}_2 + \mathbf{b}_3) \end{aligned}
• With some abuse of notation, we define the nonlinearity $\sigma$ to apply to its inputs in a row-wise fashion
• i.e. one observation at a time, often one coordinate at a time.
• This is true for most activation functions
• But batch normalization is a notable exception to that rule.

## 3.8.2 Activation Functions¶

• ReLU (rectified linear unit) function $$\mathrm{ReLU}(x) = \max(x, 0).$$
In [1]:
%matplotlib inline
import gluonbook as gb
from mxnet import autograd, nd

def xyplot(x_vals, y_vals, name):
    gb.set_figsize(figsize=(5, 2.5))
    gb.plt.plot(x_vals.asnumpy(), y_vals.asnumpy())
    gb.plt.xlabel('x')
    gb.plt.ylabel(name + '(x)')

• The ReLU activation function is piecewise linear, with two linear pieces.
• ReLU mitigates the vanishing gradient problem.
In [5]:
x = nd.arange(-8.0, 8.0, 0.1)
x.attach_grad()
with autograd.record():
    y = x.relu()
xyplot(x, y, 'relu')

• When the input is negative, the derivative of the ReLU function is 0
• When the input is positive, the derivative of the ReLU function is 1
• Note that the ReLU function is not differentiable when the input is 0.
• Instead, we pick its left-hand-side (LHS) derivative, 0, at location 0.
In [6]:
y.backward()
xyplot(x, x.grad, 'grad of relu')

• Parameterized ReLU
• https://arxiv.org/abs/1502.01852: Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification (Feb 2015) $$\mathrm{pReLU}(x) = \max(0, x) + \alpha \min(0, x)$$
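pReLU behaves like the identity for positive inputs and like a line of (learned) slope $\alpha$ for negative ones. A small NumPy sketch of the function, with an arbitrary $\alpha = 0.25$ for illustration:

```python
import numpy as np

def prelu(x, alpha=0.25):
    # max(0, x) + alpha * min(0, x): identity for x > 0, slope alpha for x < 0
    return np.maximum(0, x) + alpha * np.minimum(0, x)

x = np.array([-4.0, -1.0, 0.0, 1.0, 4.0])
print(prelu(x))  # -> [-1., -0.25, 0., 1., 4.]
```

In the paper, $\alpha$ is a parameter learned jointly with the weights, one per channel.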
• Sigmoid Function
• The Sigmoid function can transform the value of an element in $\mathbb{R}$ to the interval $(0,1)$. $$\mathrm{sigmoid}(x) = \frac{1}{1 + \exp(-x)}.$$
• In the “Recurrent Neural Network” chapter, we will see how this ability to squash values into the interval (0, 1) can be used to control the flow of information through a network.
• When the input is close to 0, the Sigmoid function approaches a linear transformation.
In [7]:
with autograd.record():
    y = x.sigmoid()
xyplot(x, y, 'sigmoid')

• The derivative of the Sigmoid function $$\frac{d}{dx} \mathrm{sigmoid}(x) = \frac{\exp(-x)}{(1 + \exp(-x))^2} = \mathrm{sigmoid}(x)\left(1-\mathrm{sigmoid}(x)\right).$$
• When the input is 0, the derivative of the Sigmoid function reaches its maximum of 0.25
• As the input deviates further from 0, the derivative of the Sigmoid function approaches 0.
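The identity $\mathrm{sigmoid}'(x) = \mathrm{sigmoid}(x)(1-\mathrm{sigmoid}(x))$ and the maximum of 0.25 can both be checked numerically; a NumPy sketch (finite differences stand in for autograd here):

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

x = np.linspace(-8.0, 8.0, 161)
grad = sigmoid(x) * (1.0 - sigmoid(x))  # the closed-form derivative

# Finite-difference check of the identity
eps = 1e-6
fd = (sigmoid(x + eps) - sigmoid(x - eps)) / (2 * eps)
assert np.allclose(grad, fd, atol=1e-8)

# The maximum derivative, 0.25, is attained at x = 0
assert np.isclose(sigmoid(0.0) * (1 - sigmoid(0.0)), 0.25)
assert np.isclose(grad.max(), 0.25)
```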
In [8]:
y.backward()
xyplot(x, x.grad, 'grad of sigmoid')

• Tanh Function
• The Tanh (Hyperbolic Tangent) function transforms the value of an element to the interval between -1 and 1: $$\text{tanh}(x) = \frac{1 - \exp(-2x)}{1 + \exp(-2x)}.$$
• The Tanh function is symmetric about the origin of the coordinate system (it is an odd function).
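A quick NumPy check of the formula, the odd symmetry, and (a well-known fact not derived in these notes) the relation $\tanh(x) = 2\,\mathrm{sigmoid}(2x) - 1$:

```python
import numpy as np

x = np.linspace(-8.0, 8.0, 161)

# Tanh written via the formula above, compared with np.tanh
t = (1 - np.exp(-2 * x)) / (1 + np.exp(-2 * x))
assert np.allclose(t, np.tanh(x))

# Odd symmetry about the origin: tanh(-x) = -tanh(x)
assert np.allclose(np.tanh(-x), -np.tanh(x))

# Relation to the sigmoid: tanh(x) = 2 * sigmoid(2x) - 1
sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
assert np.allclose(np.tanh(x), 2 * sigmoid(2 * x) - 1)
```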
In [9]:
with autograd.record():
    y = x.tanh()
xyplot(x, y, 'tanh')

• The derivative of the Tanh function $$\frac{d}{dx} \mathrm{tanh}(x) = 1 - \mathrm{tanh}^2(x).$$
• When the input is 0, the derivative of the Tanh function reaches its maximum of 1.0
• As the input deviates further from 0, the derivative of the Tanh function approaches 0.
In [10]:
y.backward()
xyplot(x, x.grad, 'grad of tanh')


# 3.9 Implementing a Multilayer Perceptron from Scratch¶

In [11]:
%matplotlib inline
import gluonbook as gb
from mxnet import nd
from mxnet.gluon import loss as gloss

In [12]:
batch_size = 256
train_iter, test_iter = gb.load_data_fashion_mnist(batch_size)


## 3.9.1 Initialize Model Parameters¶

• an MLP with one hidden layer
• the number of hidden units: 256
In [13]:
num_inputs, num_outputs, num_hiddens = 784, 10, 256

W1 = nd.random.normal(scale=0.01, shape=(num_inputs, num_hiddens))
b1 = nd.zeros(num_hiddens)
W2 = nd.random.normal(scale=0.01, shape=(num_hiddens, num_outputs))
b2 = nd.zeros(num_outputs)
params = [W1, b1, W2, b2]

for param in params:
    param.attach_grad()

## 3.9.2 Activation Function¶

In [14]:
def relu(X):
    return nd.maximum(X, 0)


## 3.9.3 The Model¶

In [17]:
def net(X):
    X = X.reshape((-1, num_inputs))  # X: (-1, 784), W1: (784, 256)
    H = relu(nd.dot(X, W1) + b1)     # nd.dot(X, W1): (-1, 256), b1: (256,)
    return nd.dot(H, W2) + b2        # nd.dot(H, W2): (-1, 10), b2: (10,)
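The shape bookkeeping in the comments above can be verified with a NumPy stand-in for the MXNet code (same parameter shapes, random data in place of Fashion-MNIST):

```python
import numpy as np

num_inputs, num_outputs, num_hiddens = 784, 10, 256
rng = np.random.default_rng(0)

# NumPy stand-ins for the mxnet parameters, with the same shapes
W1 = rng.normal(scale=0.01, size=(num_inputs, num_hiddens))
b1 = np.zeros(num_hiddens)
W2 = rng.normal(scale=0.01, size=(num_hiddens, num_outputs))
b2 = np.zeros(num_outputs)

def relu(X):
    return np.maximum(X, 0)

def net(X):
    X = X.reshape((-1, num_inputs))  # (batch, 784)
    H = relu(X @ W1 + b1)            # (batch, 256)
    return H @ W2 + b2               # (batch, 10)

X = rng.normal(size=(4, 1, 28, 28))  # a fake minibatch of 4 "images"
assert net(X).shape == (4, 10)
```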


## 3.9.4 The Loss Function¶

• For better numerical stability, we use Gluon's function, which combines the softmax calculation and the cross-entropy loss calculation.
In [18]:
loss = gloss.SoftmaxCrossEntropyLoss()
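The instability comes from exponentiating large logits; the standard remedy is the log-sum-exp trick, computing log-softmax with the maximum logit subtracted first. A NumPy sketch of the idea (illustrative only, not Gluon's actual implementation):

```python
import numpy as np

def naive_cross_entropy(logits, label):
    # Overflows for large logits: exp(1000) is inf, and inf / inf is nan
    with np.errstate(over='ignore', invalid='ignore'):
        p = np.exp(logits) / np.exp(logits).sum()
        return -np.log(p[label])

def stable_cross_entropy(logits, label):
    # Log-softmax via the log-sum-exp trick: shift by max(logits) first
    shifted = logits - logits.max()
    log_probs = shifted - np.log(np.exp(shifted).sum())
    return -log_probs[label]

logits = np.array([1000.0, -2.0, 3.0])
assert np.isnan(naive_cross_entropy(logits, 0))      # blows up
assert np.isclose(stable_cross_entropy(logits, 0), 0.0)  # correct: class 0 dominates
```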


## 3.9.5 Training¶

In [19]:
num_epochs, lr = 10, 0.5

gb.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, params, lr)

epoch 1, loss 0.8047, train acc 0.699, test acc 0.791
epoch 2, loss 0.4860, train acc 0.818, test acc 0.840
epoch 3, loss 0.4252, train acc 0.842, test acc 0.861
epoch 4, loss 0.3975, train acc 0.852, test acc 0.861
epoch 5, loss 0.3736, train acc 0.862, test acc 0.873
epoch 6, loss 0.3519, train acc 0.871, test acc 0.875
epoch 7, loss 0.3374, train acc 0.875, test acc 0.875
epoch 8, loss 0.3235, train acc 0.882, test acc 0.880
epoch 9, loss 0.3195, train acc 0.882, test acc 0.877
epoch 10, loss 0.3070, train acc 0.886, test acc 0.884

In [20]:
for X, y in test_iter:
    break

true_labels = gb.get_fashion_mnist_labels(y.asnumpy())
pred_labels = gb.get_fashion_mnist_labels(net(X).argmax(axis=1).asnumpy())
titles = [truelabel + '\n' + predlabel for truelabel, predlabel in zip(true_labels, pred_labels)]

gb.show_fashion_mnist(X[0:9], titles[0:9])


# 3.10 Multilayer Perceptron in Gluon¶

In [21]:
import gluonbook as gb
from mxnet import gluon, init
from mxnet.gluon import loss as gloss, nn


## 3.10.1 The Model¶

• Note that Gluon automagically infers the missing parameters, such as the fact that the second layer needs a matrix of size 256×10.
• This happens the first time the network is invoked.
In [22]:
net = nn.Sequential()
net.add(nn.Dense(256, activation='relu'),
        nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

In [23]:
batch_size = 256
train_iter, test_iter = gb.load_data_fashion_mnist(batch_size)

loss = gloss.SoftmaxCrossEntropyLoss()

trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.5})

num_epochs = 10
gb.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, trainer)

epoch 1, loss 0.8092, train acc 0.700, test acc 0.816
epoch 2, loss 0.4949, train acc 0.816, test acc 0.846
epoch 3, loss 0.4288, train acc 0.843, test acc 0.862
epoch 4, loss 0.3998, train acc 0.853, test acc 0.867
epoch 5, loss 0.3741, train acc 0.862, test acc 0.862
epoch 6, loss 0.3517, train acc 0.870, test acc 0.874
epoch 7, loss 0.3437, train acc 0.874, test acc 0.878
epoch 8, loss 0.3287, train acc 0.879, test acc 0.880
epoch 9, loss 0.3201, train acc 0.882, test acc 0.876
epoch 10, loss 0.3086, train acc 0.886, test acc 0.877

In [24]:
for X, y in test_iter:
    break

true_labels = gb.get_fashion_mnist_labels(y.asnumpy())
pred_labels = gb.get_fashion_mnist_labels(net(X).argmax(axis=1).asnumpy())
titles = [truelabel + '\n' + predlabel for truelabel, predlabel in zip(true_labels, pred_labels)]

gb.show_fashion_mnist(X[0:9], titles[0:9])