from mxnet import ndarray as nd
# Adadelta.
def adadelta(params, sqrs, deltas, rho, batch_size, eps_stable=1e-5):
    """Apply one Adadelta update to every parameter, in place.

    Args:
        params: Parameters to update; each must expose its gradient as
            ``.grad`` and support slice assignment.
        sqrs: One running average of squared gradients per parameter
            (overwritten in place).
        deltas: One running average of squared updates per parameter
            (overwritten in place).
        rho: Decay rate shared by both running averages.
        batch_size: Divisor applied to each gradient before it is used
            (presumably the gradient is of a summed mini-batch loss --
            verify against the caller).
        eps_stable: Small constant added under both square roots. It keeps
            the denominator away from zero and keeps the numerator non-zero
            while ``delta`` is still zero. Defaults to the previously
            hard-coded value of 1e-5.
    """
    for param, sqr, delta in zip(params, sqrs, deltas):
        g = param.grad / batch_size
        # Exponentially weighted average of the squared gradient.
        sqr[:] = rho * sqr + (1. - rho) * nd.square(g)
        # Step = RMS of past updates / RMS of gradients, times the gradient.
        cur_delta = nd.sqrt(delta + eps_stable) / nd.sqrt(sqr + eps_stable) * g
        # Exponentially weighted average of the squared update.
        delta[:] = rho * delta + (1. - rho) * cur_delta * cur_delta
        # Update the weight in place.
        param[:] -= cur_delta
import mxnet as mx
from mxnet import autograd
from mxnet import gluon
import random
# Fix the seeds of MXNet's random generator and of Python's `random` module
# (presumably so the generated data and the shuffling repeat across runs).
mx.random.seed(1)
random.seed(1)
# Generate a synthetic linear-regression data set.
num_inputs = 2  # number of features per example
num_examples = 1000  # number of rows in X / entries in y
true_w = [2, -3.4]  # weights used to produce the labels
true_b = 4.2  # bias used to produce the labels
# Features: random_normal draws with scale=1, one row per example.
X = nd.random_normal(scale=1, shape=(num_examples, num_inputs))
# Labels: weighted sum of the two feature columns plus the bias ...
y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
# ... perturbed by random_normal noise multiplied by 0.01.
y += .01 * nd.random_normal(scale=1, shape=y.shape)
# Pair features with labels.
# NOTE(review): `dataset` is not read by the code visible here (batches are
# taken from X and y directly) -- confirm whether it is still needed.
dataset = gluon.data.ArrayDataset(X, y)
# Construct data iterator.
def data_iter(batch_size):
    """Yield ``(batch index, features, labels)`` mini-batches in shuffled order.

    The example indices are reshuffled on every call; features and labels
    are gathered from the module-level ``X`` and ``y``.
    """
    order = list(range(num_examples))
    random.shuffle(order)
    batch_i = 0
    for start in range(0, num_examples, batch_size):
        # Slicing clamps at the end of the list, so the last batch may be short.
        picked = nd.array(order[start:start + batch_size])
        yield batch_i, X.take(picked), y.take(picked)
        batch_i += 1
# Initialize model parameters.
def init_params():
    """Create the model parameters and their Adadelta state.

    Returns:
        A tuple ``(params, sqrs, deltas)`` where ``params`` is ``[w, b]``
        (``w`` drawn from ``random_normal`` with shape ``(num_inputs, 1)``,
        ``b`` zeros with shape ``(1,)``), and ``sqrs`` / ``deltas`` each hold
        one zero array shaped like the corresponding parameter.
    """
    weight = nd.random_normal(scale=1, shape=(num_inputs, 1))
    bias = nd.zeros(shape=(1,))
    params = [weight, bias]
    # Allocate gradient storage on every parameter.
    for p in params:
        p.attach_grad()
    # Both running averages start at zero.
    sqrs = [p.zeros_like() for p in params]
    deltas = [p.zeros_like() for p in params]
    return params, sqrs, deltas
# Linear regression.
def net(X, w, b):
    """Linear model: the matrix product of ``X`` and ``w``, shifted by ``b``."""
    projection = nd.dot(X, w)
    return projection + b
# Loss function.
def square_loss(yhat, y):
    """Return the element-wise halved squared error between ``yhat`` and ``y``.

    ``y`` is reshaped to ``yhat``'s shape before the subtraction, so the
    result has the same shape as ``yhat``.
    """
    target = y.reshape(yhat.shape)
    residual = yhat - target
    return residual ** 2 / 2
%matplotlib inline
import matplotlib as mpl
# Set matplotlib's default figure resolution to 120 DPI.
mpl.rcParams['figure.dpi']= 120
import matplotlib.pyplot as plt
import numpy as np
def train(batch_size, rho, epochs, period):
    """Fit the linear model with Adadelta, report progress and plot the loss.

    Args:
        batch_size: Number of examples requested per mini-batch.
        rho: Decay rate forwarded to ``adadelta``.
        epochs: Number of passes over the data.
        period: The full-data loss is recorded after each batch for which
            ``batch_i * batch_size`` is a multiple of ``period``. Must be at
            least ``batch_size`` and divisible by it.
    """
    assert period >= batch_size and period % batch_size == 0
    params, sqrs, deltas = init_params()
    w, b = params

    def full_loss():
        # Mean loss over the whole data set with the current w and b.
        return np.mean(square_loss(net(X, w, b), y).asnumpy())

    # Loss before any update is the first point of the curve.
    total_loss = [full_loss()]

    # Epoch numbering starts from 1.
    for epoch in range(1, epochs + 1):
        for batch_i, data, label in data_iter(batch_size):
            with autograd.record():
                loss = square_loss(net(data, w, b), label)
            loss.backward()
            adadelta([w, b], sqrs, deltas, rho, batch_size)
            seen = batch_i * batch_size
            if seen % period == 0:
                total_loss.append(full_loss())
        print("Batch size %d, Epoch %d, loss %.4e" %
              (batch_size, epoch, total_loss[-1]))

    print('w:', np.reshape(w.asnumpy(), (1, -1)),
          'b:', b.asnumpy()[0], '\n')

    # Spread the recorded losses evenly over the epoch axis; log-scale y.
    x_axis = np.linspace(0, epochs, len(total_loss), endpoint=True)
    plt.semilogy(x_axis, total_loss)
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.show()
# Run training: mini-batches of 10, rho=0.9999, 3 epochs. period equals the
# batch size, so the full-data loss is recorded after every batch.
train(batch_size=10, rho=0.9999, epochs=3, period=10)
Batch size 10, Epoch 1, loss 5.2081e-05
Batch size 10, Epoch 2, loss 4.9538e-05
Batch size 10, Epoch 3, loss 4.9217e-05
w: [[ 1.99959445 -3.3999126 ]] b: 4.19964
For whinges or inquiries, open an issue on GitHub.