In [10]:
# Classical
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

# PyTorch
import torch
from torch import nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader  # used by the MNIST dataset / loader cells
from torch.autograd import Variable               # used in every training loop below

# Misc
import pdb
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = 15, 7


This notebook is inspired by Sebastian Ruder's awesome work from http://ruder.io/optimizing-gradient-descent/. The only thing we don't implement in this notebook is the gradient calculation, which is conveniently provided by PyTorch.

### References¶

In [2]:
# Load the MNIST training set as a raw numpy array
# (column 0 = label, columns 1..784 = pixel values).
mnist_frame = pd.read_csv('./data/MNIST/train.csv')
data = np.array(mnist_frame)

In [3]:
class MNIST(Dataset):
    """Dataset wrapping a numpy array of MNIST rows.

    Each row has the label in column 0 and the 784 flattened pixel
    values in the remaining columns.
    """

    def __init__(self, data):
        self.data = data

    def __getitem__(self, index):
        # Bug fix: the original read the module-level `data` instead of
        # `self.data`, and returned nothing (every item was None).
        X = self.data[index][1:]
        y = self.data[index][0]
        return X, y

    def __len__(self):
        return len(self.data)

In [4]:
class SimpleNet(nn.Module):
    """Fully connected feed-forward classifier.

    `layers` lists the sizes of every layer, e.g. [784, 100, 100, 10];
    one nn.Linear is created per consecutive pair.
    """

    def __init__(self, layers):
        super().__init__()
        self.linears = nn.ModuleList(
            [nn.Linear(layers[i], layers[i + 1]) for i in range(len(layers) - 1)]
        )

    def forward(self, x):
        # ReLU between hidden layers only. The original also computed a
        # ReLU of the final layer's output and then discarded it (it
        # returned log_softmax of the pre-activation), so the last ReLU
        # was pure wasted work; the returned value is unchanged.
        for lin in self.linears[:-1]:
            x = F.relu(lin(x))
        return F.log_softmax(self.linears[-1](x), dim=1)

In [5]:
# Wrap the raw array in the Dataset and build a shuffled loader.
mnist = MNIST(data)
data_dl = DataLoader(mnist, batch_size=256, shuffle=True, num_workers=0)


### Training With Different Optimizers¶

In [6]:
epochs = 3 # set epochs (passes over the training set)
criterion = nn.NLLLoss() # loss on log-probabilities, matching SimpleNet's log_softmax output


### Vanilla Mini Batch - SGD¶

In [ ]:
### Algorithm (pseudocode -- get.batch / compute.grads are illustrative)
for i in range(epochs):
    shuffled = np.random.shuffle(data)
    for batch in get.batch(shuffled, bs):
        # the update line was missing: plain mini-batch step against the gradient
        params -= lr * compute.grads(batch, params, loss_func)

In [7]:
### init architecture
snet = SimpleNet([784, 100, 100, 10])

### get weight, bias objects
wbs = [(lin.weight, lin.bias) for lin in snet.linears]

### keep track of training loss
losses = []

### params
lr = 1e-2

### Training
for epoch in range(epochs):
    print(f'epoch {epoch}')
    for i, batch in enumerate(data_dl):
        inputs, labels = batch
        inputs, labels = Variable(inputs), Variable(labels)
        outputs = snet(inputs)

        loss = criterion(outputs, labels.squeeze(-1))
        losses.append(loss)
        loss.backward()

        # update weights: the actual SGD step was missing in the original cell.
        # Also zero the grads, since .backward() accumulates across batches.
        for w, b in wbs:
            w.data -= lr * w.grad.data
            b.data -= lr * b.grad.data
            w.grad.data.zero_()
            b.grad.data.zero_()

epoch 0
epoch 1
epoch 2

In [8]:
# Convert the tracked loss Variables to plain floats, then take logs for plotting.
sgd_losses_ = []
for l in losses:
    sgd_losses_.append(l.data.numpy()[0])
sgd_log_losses_ = [np.log(l) for l in sgd_losses_]

In [11]:
# Log-loss curve for the hand-rolled vanilla SGD run.
plt.plot(sgd_log_losses_)
title = plt.title("Vanilla SGD")


### PyTorch Built-in¶

Here you can test various optimizers by simply changing `optim.<optimizer>`. It's also nice to see that each optimizer has its own nature and hence seems to need a unique learning rate. That actually shows the power of, and need for, `lr_find()`.

In [201]:
### Training with a PyTorch built-in optimizer
### init architecture
snet = SimpleNet([784, 100, 100, 10])
# Swap optim.SGD for any other optimizer (optim.Adam, optim.RMSprop, ...).
# The original cell had the placeholder `optim.(...)`, which is a SyntaxError.
optimizer = optim.SGD(lr=lr, params=snet.parameters())
losses = []
for epoch in range(epochs):
    print(f'epoch {epoch}')
    for i, batch in enumerate(data_dl):
        inputs, labels = batch
        inputs, labels = Variable(inputs), Variable(labels)
        outputs = snet(inputs)

        loss = criterion(outputs, labels.squeeze(-1))
        losses.append(loss)
        # reset accumulated grads before computing this batch's gradients
        optimizer.zero_grad()
        loss.backward()

        # update weights
        optimizer.step()

epoch 0
epoch 1
epoch 2

In [202]:
# Extract scalar losses and log-scale them for comparison plots.
pytorch_losses_ = []
for l in losses:
    pytorch_losses_.append(l.data.numpy()[0])
pytorch_log_losses_ = [np.log(l) for l in pytorch_losses_]

In [36]:
# Built-in optimizer's log-loss curve (compare with the custom loops).
plt.plot(pytorch_log_losses_)
title = plt.title("PyTorch vs Custom Loss")

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-36-4a947f3d9a63> in <module>()
----> 1 plt.plot(pytorch_log_losses_)
3 title = plt.title("PyTorch vs Custom Loss")

NameError: name 'pytorch_log_losses_' is not defined

### SGD with momentum¶

We'd like to pass saddle points with the use of momentum; even when there are no saddle points, momentum makes convergence faster.

In [ ]:
### Algorithm (pseudocode -- get.batch / compute.grads are illustrative)
v_prev = 0  # init momentum buffer (original mistakenly initialized v_new)
rho = 0.9   # momentum coefficient
for i in range(epochs):
    shuffled = np.random.shuffle(data)
    for batch in get.batch(shuffled, bs):
        v_new = rho*v_prev + lr*compute.grads(batch, weight, loss_func)
        params -= v_new
        v_prev = v_new

In [14]:
### init architecture
snet = SimpleNet([784, 100, 100, 10])

### get weight, bias objects
wbs = [(lin.weight, lin.bias) for lin in snet.linears]

### keep track of training loss
losses = []

### params
lr = 1e-3
rho = 0.9
weight_v_prev = [0 for _ in range(len(wbs))]  # momentum buffers (weights)
bias_v_prev = [0 for _ in range(len(wbs))]    # momentum buffers (biases)

epochs = 3

### Training
for epoch in range(epochs):
    print(f'epoch {epoch}')
    for i, batch in enumerate(data_dl):
        inputs, labels = batch
        inputs, labels = Variable(inputs), Variable(labels)
        outputs = snet(inputs)

        loss = criterion(outputs, labels.squeeze(-1))
        losses.append(loss)
        loss.backward()

        # update weights: v_new = rho*v_prev + lr*grad ; param -= v_new.
        # (index renamed to j so the batch index above isn't shadowed)
        for j, (w, b) in enumerate(wbs):
            # the v_new computations were missing in the original cell
            weight_v_new = rho*weight_v_prev[j] + lr*w.grad.data
            bias_v_new = rho*bias_v_prev[j] + lr*b.grad.data

            weight_v_prev[j] = weight_v_new
            bias_v_prev[j] = bias_v_new

            w.data -= weight_v_new
            b.data -= bias_v_new

            # .backward() accumulates; reset grads for the next batch
            w.grad.data.zero_()
            b.grad.data.zero_()


epoch 0
epoch 1
epoch 2

In [15]:
# Scalarize and log-scale the momentum run's losses.
sgdmom_losses_ = []
for l in losses:
    sgdmom_losses_.append(l.data.numpy()[0])
sgdmom_log_losses_ = [np.log(l) for l in sgdmom_losses_]

In [16]:
# Overlay vanilla SGD and SGD+momentum log-loss curves.
plt.plot(sgd_log_losses_)
plt.plot(sgdmom_log_losses_)
title = plt.title("Evolution of Optimizers")
plt.legend(('SGD', 'SGD Momentum'))

Out[16]:
<matplotlib.legend.Legend at 0x7fad4fe65a58>

### Nesterov¶

We'd like to have a smarter ball — a ball that has a notion of where it is going, so that it knows to slow down before the hill slopes up again. By computing gradients with respect to weight - rho*v_prev, we have a sense of what the next position of our ball will be, so we can leverage this to make a better update.

In [ ]:
### Algorithm (pseudocode -- get.batch / compute.grads are illustrative)
v_prev = 0  # init momentum buffer
rho = 0.9   # momentum coefficient
for i in range(epochs):
    shuffled = np.random.shuffle(data)
    for batch in get.batch(shuffled, bs):
        # gradients are evaluated at the look-ahead point params - rho*v_prev
        v_new = rho*v_prev + lr*compute.grads(batch, params - rho*v_prev, loss_func)
        params -= v_new
        v_prev = v_new

In [17]:
### init architecture
snet = SimpleNet([784, 100, 100, 10])

### get weight, bias objects
wbs = [(lin.weight, lin.bias) for lin in snet.linears]

### keep track of training loss
losses = []

### params
lr = 1e-3
rho = 0.9
weight_v_prev = [0 for _ in range(len(wbs))]  # momentum buffers (weights)
bias_v_prev = [0 for _ in range(len(wbs))]    # momentum buffers (biases)

epochs = 3

### Training
for epoch in range(epochs):
    print(f'epoch {epoch}')
    for n, batch in enumerate(data_dl):
        inputs, labels = batch
        inputs, labels = Variable(inputs), Variable(labels)
        outputs = snet(inputs)

        # compute loss (tracked for plotting only; gradients are taken
        # at the look-ahead point below)
        loss = criterion(outputs, labels.squeeze(-1))
        losses.append(loss)

        # update weights
        for i, wb in enumerate(wbs):
            w = wb[0]
            b = wb[1]

            ### WEIGHT UPDATE
            # Take a step into the future as if we had applied momentum.
            # clone() is required: `w.data -= ...` mutates in place, so the
            # original `w_original = w.data` aliased the already-modified tensor.
            w_original = w.data.clone()
            w.data -= rho*weight_v_prev[i]
            # gradients at the look-ahead position (the original evaluated
            # the stale `outputs` here instead of `future_outputs`)
            future_outputs = snet(inputs)
            future_loss = criterion(future_outputs, labels.squeeze(-1))
            future_loss.backward()

            weight_v_new = rho*weight_v_prev[i] + lr*w.grad.data  # was missing
            weight_v_prev[i] = weight_v_new
            w.data = w_original      # restore the original weights
            w.data -= weight_v_new   # Nesterov update
            w.grad.data.zero_()      # grads accumulate across .backward() calls

            ### BIAS UPDATE
            b_original = b.data.clone()
            b.data -= rho*bias_v_prev[i]

            future_outputs = snet(inputs)
            future_loss = criterion(future_outputs, labels.squeeze(-1))
            future_loss.backward()

            bias_v_new = rho*bias_v_prev[i] + lr*b.grad.data  # was missing
            bias_v_prev[i] = bias_v_new
            b.data = b_original
            b.data -= bias_v_new
            b.grad.data.zero_()


epoch 0
epoch 1
epoch 2

In [18]:
# Scalarize and log-scale the Nesterov run's losses.
nesterov_losses_ = []
for l in losses:
    nesterov_losses_.append(l.data.numpy()[0])
nesterov_log_losses_ = [np.log(l) for l in nesterov_losses_]

In [19]:
# Compare SGD, momentum, and Nesterov curves.
plt.plot(sgd_log_losses_)
plt.plot(sgdmom_log_losses_)
plt.plot(nesterov_log_losses_)
title = plt.title("Evolution of Optimizers")
plt.legend(('SGD', 'SGD Momentum', 'Nesterov'))

Out[19]:
<matplotlib.legend.Legend at 0x7fad4fd97198>

It adapts the learning rate to the parameters, performing larger updates for infrequent and smaller updates for frequent parameters. In its update rule, Adagrad modifies the general learning rate $\eta$ at each time step $t$ for every parameter $\theta_i$ based on the past gradients that have been computed for $\theta_i$:

One of Adagrad's main benefits is that it eliminates the need to manually tune the learning rate. Most implementations use a default value of 0.01 and leave it at that.

Adagrad's main weakness is its accumulation of the squared gradients in the denominator: Since every added term is positive, the accumulated sum keeps growing during training. This in turn causes the learning rate to shrink and eventually become infinitesimally small, at which point the algorithm is no longer able to acquire additional knowledge. The following algorithms aim to resolve this flaw.

In [ ]:
### Algorithm (pseudocode -- get.batch / compute.grads are illustrative)
noise = 1e-8
grads_squared = 0  # running sum of squared gradients
for i in range(epochs):
    shuffled = np.random.shuffle(data)
    for batch in get.batch(shuffled, bs):
        # the update body was missing: accumulate squared grads and
        # scale each step by the inverse of their square root
        grads = compute.grads(batch, params, loss_func)
        grads_squared += grads**2
        params -= lr*grads / (np.sqrt(grads_squared) + noise)

In [20]:
### init architecture
snet = SimpleNet([784, 100, 100, 10])

### get weight, bias objects
wbs = [(lin.weight, lin.bias) for lin in snet.linears]

### keep track of training loss
losses = []

### params
lr = 1e-3
# per-layer [weight, bias] accumulators of squared gradients
grads_squared = [[torch.zeros(wb[0].size()), torch.zeros(wb[1].size())] for wb in wbs]
noise = 1e-8

epochs = 3

### Training
for epoch in range(epochs):
    print(f'epoch {epoch}')
    for i, batch in enumerate(data_dl):
        inputs, labels = batch
        inputs, labels = Variable(inputs), Variable(labels)
        outputs = snet(inputs)

        loss = criterion(outputs, labels.squeeze(-1))
        losses.append(loss)
        loss.backward()

        # update weights: Adagrad accumulates squared grads and divides
        # each step by their square root. (j avoids shadowing the batch index)
        for j, (w, b) in enumerate(wbs):
            # the update body was missing in the original cell
            grads_squared[j][0] += w.grad.data**2
            grads_squared[j][1] += b.grad.data**2

            w.data -= lr*w.grad.data / (torch.sqrt(grads_squared[j][0]) + noise)
            b.data -= lr*b.grad.data / (torch.sqrt(grads_squared[j][1]) + noise)

            # reset accumulated grads for the next batch
            w.grad.data.zero_()
            b.grad.data.zero_()


epoch 0
epoch 1
epoch 2

In [21]:
adagrad_losses_ = [(l.data.numpy()[0]) for l in losses]

In [22]:
plt.plot(sgd_log_losses_)
plt.plot(sgdmom_log_losses_)
plt.plot(nesterov_log_losses_)
plt.plot(np.log(adagrad_losses_))  # Adagrad was computed above but never plotted
title = plt.title("Evolution of Optimizers")
# legend was missing (other comparison plots in this notebook label their series)
plt.legend(('SGD', 'SGD Momentum', 'Nesterov', 'Adagrad'))

Out[22]:
<matplotlib.legend.Legend at 0x7fad4fcf8f60>

Instead of inefficiently storing $w$ previous squared gradients, the sum of gradients is recursively defined as a decaying average of all past squared gradients. The running average $E[g^2]_t$ at time step $t$ then depends (as a fraction $\gamma$, similarly to the momentum term) only on the previous average and the current gradient:

RMSprop and Adadelta have both been developed independently around the same time stemming from the need to resolve Adagrad's radically diminishing learning rates. RMSprop in fact is identical to the first update vector of Adadelta that we derived above:

In [ ]:
### Algorithm (pseudocode -- get.batch / compute.grads are illustrative)
rho = 0.9          # decay for the running average of squared grads
grads_squared = 0  # exponentially decaying average of squared grads
noise = 1e-8
for i in range(epochs):
    shuffled = np.random.shuffle(data)
    for batch in get.batch(shuffled, bs):
        # the update body was missing: decaying average instead of
        # Adagrad's ever-growing sum
        grads = compute.grads(batch, params, loss_func)
        grads_squared = rho*grads_squared + (1 - rho)*grads**2
        params -= lr*grads / (np.sqrt(grads_squared) + noise)

In [23]:
### init architecture
snet = SimpleNet([784, 100, 100, 10])

### get weight, bias objects
wbs = [(lin.weight, lin.bias) for lin in snet.linears]

### keep track of training loss
losses = []

### params
lr = 1e-3
# per-layer [weight, bias] decaying averages of squared gradients
grads_squared = [[torch.zeros_like(wb[0]).data, torch.zeros_like(wb[1]).data] for wb in wbs]
noise = 1e-8
rho = 0.9
epochs = 3

### Training
for epoch in range(epochs):
    print(f'epoch {epoch}')
    for i, batch in enumerate(data_dl):
        inputs, labels = batch
        inputs, labels = Variable(inputs), Variable(labels)
        outputs = snet(inputs)

        loss = criterion(outputs, labels.squeeze(-1))
        losses.append(loss)
        loss.backward()

        # update weights: decaying average of squared grads, then a
        # scaled step. (j avoids shadowing the batch index)
        for j, (w, b) in enumerate(wbs):
            # the update body was missing in the original cell
            grads_squared[j][0] = rho*grads_squared[j][0] + (1 - rho)*(w.grad.data**2)
            grads_squared[j][1] = rho*grads_squared[j][1] + (1 - rho)*(b.grad.data**2)

            w.data -= lr*w.grad.data / (torch.sqrt(grads_squared[j][0]) + noise)
            b.data -= lr*b.grad.data / (torch.sqrt(grads_squared[j][1]) + noise)

            # reset accumulated grads for the next batch
            w.grad.data.zero_()
            b.grad.data.zero_()


epoch 0
epoch 1
epoch 2

In [24]:
# Scalarize and log-scale the RMSprop run's losses.
rmsprop_losses_ = []
for l in losses:
    rmsprop_losses_.append(l.data.numpy()[0])
rmsprop_log_losses_ = [np.log(l) for l in rmsprop_losses_]

In [25]:
plt.plot(sgd_log_losses_)
plt.plot(sgdmom_log_losses_)
plt.plot(nesterov_log_losses_)
plt.plot(rmsprop_log_losses_)
title = plt.title("Evolution of Optimizers")
# legend was missing (other comparison plots in this notebook label their series)
plt.legend(('SGD', 'SGD Momentum', 'Nesterov', 'RMSprop'))

Out[25]:
<matplotlib.legend.Legend at 0x7fad4fc04908>

In addition to storing an exponentially decaying average of past squared gradients $v_t$ like Adadelta and RMSprop, Adam also keeps an exponentially decaying average of past gradients $m_t$, similar to momentum. As $m_t$ and $v_t$ are initialized as vectors of 0's, the authors of Adam observe that they are biased towards zero, especially during the initial time steps, and especially when the decay rates are small (i.e. $\beta_1$ and $\beta_2$ are close to 1).

They counteract these biases by computing bias-corrected first and second moment estimates:

In [ ]:
### Algorithm (pseudocode -- get.batch / compute.grads are illustrative)
m = 0
v = 0
beta1 = 0.9    # original mistyped the beta values (0.999 / 1e-8)
beta2 = 0.999
noise = 1e-8
t = 0

for i in range(epochs):
    shuffled = np.random.shuffle(data)
    for batch in get.batch(shuffled, bs):
        t += 1  # per-step time, as in the implementation below
        grads = compute.grads(batch, params, loss_func)
        m = beta1*m + (1 - beta1)*grads
        v = beta2*v + (1 - beta2)*grads**2  # second-moment update was missing
        m_hat = m / (1 - beta1**t) # bias correction for first moment
        v_hat = v / (1 - beta2**t) # bias correction for second moment (was beta1)
        params -= lr*m_hat/(np.sqrt(v_hat) + noise)

In [26]:
### init architecture
snet = SimpleNet([784, 100, 100, 10])

### get weight, bias objects
wbs = [(lin.weight, lin.bias) for lin in snet.linears]

### keep track of training loss
losses = []

### params
lr = 1e-3

### [[m, v], [m, v]] first/second moment buffers per layer (weight, bias)
m_v_prevs = [[[torch.zeros_like(wb[0]).data, torch.zeros_like(wb[0]).data],
              [torch.zeros_like(wb[1]).data, torch.zeros_like(wb[1]).data]] for wb in wbs]

noise = 1e-8
beta1 = 0.9
beta2 = 0.999

epochs = 3

t = 0
### Training
for epoch in range(epochs):
    print(f'epoch {epoch}')
    for i, batch in enumerate(data_dl):

        # keep track of time (bias correction is per optimization step)
        t += 1

        inputs, labels = batch
        inputs, labels = Variable(inputs), Variable(labels)
        outputs = snet(inputs)

        loss = criterion(outputs, labels.squeeze(-1))
        losses.append(loss)
        loss.backward()

        # update weights (j avoids shadowing the batch index)
        for j, (w, b) in enumerate(wbs):

            # weight component
            w_m_t_prev, w_v_t_prev = m_v_prevs[j][0]

            w_m_t_new = beta1*w_m_t_prev + (1 - beta1)*w.grad.data
            # second-moment update was missing in the original cell
            w_v_t_new = beta2*w_v_t_prev + (1 - beta2)*(w.grad.data**2)

            w_m_t_new_hat = w_m_t_new / (1 - beta1**t)  # bias-corrected m
            w_v_t_new_hat = w_v_t_new / (1 - beta2**t)  # bias-corrected v

            m_v_prevs[j][0][0] = w_m_t_new
            m_v_prevs[j][0][1] = w_v_t_new

            w.data -= lr*w_m_t_new_hat / (torch.sqrt(w_v_t_new_hat) + noise)

            # bias component
            b_m_t_prev, b_v_t_prev = m_v_prevs[j][1]

            b_m_t_new = beta1*b_m_t_prev + (1 - beta1)*b.grad.data
            b_v_t_new = beta2*b_v_t_prev + (1 - beta2)*(b.grad.data**2)

            b_m_t_new_hat = b_m_t_new / (1 - beta1**t)
            b_v_t_new_hat = b_v_t_new / (1 - beta2**t)

            m_v_prevs[j][1][0] = b_m_t_new
            m_v_prevs[j][1][1] = b_v_t_new

            b.data -= lr*b_m_t_new_hat / (torch.sqrt(b_v_t_new_hat) + noise)

            # reset accumulated grads for the next batch
            w.grad.data.zero_()
            b.grad.data.zero_()


epoch 0
epoch 1
epoch 2

In [27]:
adam_losses_ = [(l.data.numpy()[0]) for l in losses]

In [28]:
plt.plot(sgd_log_losses_)
plt.plot(sgdmom_log_losses_)
plt.plot(nesterov_log_losses_)
plt.plot(rmsprop_log_losses_)
plt.plot(np.log(adam_losses_))  # Adam was computed above but never plotted
title = plt.title("Evolution of Optimizers")
plt.legend(('SGD', 'SGD Momentum', 'Nesterov', 'RMSprop', 'Adam'))

Out[28]:
<matplotlib.legend.Legend at 0x7fad4fb9e780>

The $v_t$ factor in the Adam update rule scales the gradient inversely proportionally to the $\ell_2$ norm of the past gradients (via the $v_{t-1}$ term) and current gradient $|g_t|^2$:

$v_t = \beta_2 v_{t-1} + (1 - \beta_2)|g_t|^2$. We can generalize this update to the $\ell_p$ norm. Note that Kingma and Ba also parameterize $\beta_2$ as $\beta_2^p$:

$v_t = \beta_2^p v_{t-1} + (1 - \beta_2^p)|g_t|^p$. Norms for large $p$ values generally become numerically unstable, which is why $\ell_1$ and $\ell_2$ norms are most common in practice. However, $\ell_\infty$ also generally exhibits stable behavior. For this reason, the authors propose AdaMax (Kingma and Ba, 2015) and show that $v_t$ with $\ell_\infty$ converges to the following more stable value. To avoid confusion with Adam, we use $u_t$ to denote the infinity-norm-constrained $v_t$:

In [188]:
### Algorithm (pseudocode -- get.batch / compute.grads are illustrative)
### Even though the original AdaMax has no epsilon, a noise term is added
### to the denominator of the update (u + noise) to avoid division by zero.
m = 0
u = 0
beta1 = 0.9    # original mistyped the beta values (0.999 / 1e-8)
beta2 = 0.999
noise = 1e-8
t = 0

for i in range(epochs):
    shuffled = np.random.shuffle(data)
    for batch in get.batch(shuffled, bs):
        t += 1  # per-step time, as in the implementation below
        grads = compute.grads(batch, params, loss_func)
        m = beta1*m + (1 - beta1)*grads
        u = np.maximum(beta2*u, np.abs(grads))  # infinity-norm update was missing
        m_hat = m / (1 - beta1**t) # bias correction for first moment
        params -= lr*m_hat/(u + noise)

In [29]:
### init architecture
snet = SimpleNet([784, 100, 100, 10])

### get weight, bias objects
wbs = [(lin.weight, lin.bias) for lin in snet.linears]

### keep track of training loss
losses = []

### params
lr = 1e-3
beta1 = 0.9
beta2 = 0.999
noise = 1e-22  # tiny guard against division by zero in the first steps

### [[m, u], [m, u]] first-moment / infinity-norm buffers per layer (weight, bias)
m_u_prevs = [[[torch.zeros_like(wb[0]).data, torch.zeros_like(wb[0]).data],
              [torch.zeros_like(wb[1]).data, torch.zeros_like(wb[1]).data]] for wb in wbs]

epochs = 3
t = 0
### Training
for epoch in range(epochs):
    print(f'epoch {epoch}')
    for i, batch in enumerate(data_dl):

        # keep track of time (bias correction is per optimization step)
        t += 1

        inputs, labels = batch
        inputs, labels = Variable(inputs), Variable(labels)
        outputs = snet(inputs)

        loss = criterion(outputs, labels.squeeze(-1))
        losses.append(loss)
        loss.backward()

        # update weights (j avoids shadowing the batch index)
        for j, (w, b) in enumerate(wbs):

            # weight component
            w_m_t_prev, w_u_t_prev = m_u_prevs[j][0]

            w_m_t_new = beta1*w_m_t_prev + (1 - beta1)*w.grad.data
            # infinity-norm update u_t = max(beta2*u_{t-1}, |g_t|)
            # was missing in the original cell
            w_u_t_new = torch.max(beta2*w_u_t_prev, torch.abs(w.grad.data))

            w_m_t_new_hat = w_m_t_new / (1 - beta1**t)  # bias-corrected m

            m_u_prevs[j][0][0] = w_m_t_new
            m_u_prevs[j][0][1] = w_u_t_new

            w.data -= lr*w_m_t_new_hat / (w_u_t_new + noise)

            # bias component
            b_m_t_prev, b_u_t_prev = m_u_prevs[j][1]

            b_m_t_new = beta1*b_m_t_prev + (1 - beta1)*b.grad.data
            b_u_t_new = torch.max(beta2*b_u_t_prev, torch.abs(b.grad.data))

            b_m_t_new_hat = b_m_t_new / (1 - beta1**t)

            m_u_prevs[j][1][0] = b_m_t_new
            m_u_prevs[j][1][1] = b_u_t_new

            b.data -= lr*b_m_t_new_hat / (b_u_t_new + noise)

            # reset accumulated grads for the next batch
            w.grad.data.zero_()
            b.grad.data.zero_()


epoch 0
epoch 1
epoch 2

In [30]:
adamax_losses_ = [(l.data.numpy()[0]) for l in losses]

In [31]:
plt.plot(sgd_log_losses_)
plt.plot(sgdmom_log_losses_)
plt.plot(nesterov_log_losses_)
plt.plot(rmsprop_log_losses_)
plt.plot(np.log(adamax_losses_))  # AdaMax was computed above but never plotted
title = plt.title("Evolution of Optimizers")
plt.legend(('SGD', 'SGD Momentum', 'Nesterov', 'RMSprop', 'AdaMax'))

Out[31]:
<matplotlib.legend.Legend at 0x7fad4ab8c278>

### Nadam (Currently Not Available in Pytorch - Merge is waiting...)¶

As we have seen before, Adam can be viewed as a combination of RMSprop and momentum: RMSprop contributes the exponentially decaying average of past squared gradients vtvt, while momentum accounts for the exponentially decaying average of past gradients mtmt. We have also seen that Nesterov accelerated gradient (NAG) is superior to vanilla momentum.

Nadam (Nesterov-accelerated Adaptive Moment Estimation) [24] thus combines Adam and NAG. In order to incorporate NAG into Adam, we need to modify its momentum term mtmt.

In [ ]:
### Algorithm (pseudocode -- get.batch / compute.grads are illustrative)
m = 0
v = 0
beta1 = 0.9    # original mistyped the beta values (0.999 / 1e-8)
beta2 = 0.999
noise = 1e-8
t = 0

for i in range(epochs):
    shuffled = np.random.shuffle(data)
    for batch in get.batch(shuffled, bs):
        t += 1  # per-step time, as in the implementation below
        grads = compute.grads(batch, params, loss_func)
        m = beta1*m + (1 - beta1)*grads
        v = beta2*v + (1 - beta2)*grads**2  # second-moment update was missing
        m_hat = m / (1 - beta1**t) # bias correction for first moment
        v_hat = v / (1 - beta2**t) # bias correction for second moment (was beta1)
        # Nesterov-flavored step (original referenced an undefined m_hat_new)
        params -= lr*(beta1*m_hat + ((1 - beta1)*grads)/(1 - beta1**t))/(np.sqrt(v_hat) + noise)

In [32]:
### init architecture
snet = SimpleNet([784, 100, 100, 10])

### get weight, bias objects
wbs = [(lin.weight, lin.bias) for lin in snet.linears]

### keep track of training loss
losses = []

### params
lr = 1e-3

### [[m, v], [m, v]] first/second moment buffers per layer (weight, bias)
m_v_prevs = [[[torch.zeros_like(wb[0]).data, torch.zeros_like(wb[0]).data],
              [torch.zeros_like(wb[1]).data, torch.zeros_like(wb[1]).data]] for wb in wbs]

noise = 1e-8
beta1 = 0.9
beta2 = 0.999

epochs = 3

t = 0
### Training
for epoch in range(epochs):
    print(f'epoch {epoch}')
    for i, batch in enumerate(data_dl):

        # keep track of time (bias correction is per optimization step)
        t += 1

        inputs, labels = batch
        inputs, labels = Variable(inputs), Variable(labels)
        outputs = snet(inputs)

        loss = criterion(outputs, labels.squeeze(-1))
        losses.append(loss)
        loss.backward()

        # update weights (j avoids shadowing the batch index)
        for j, (w, b) in enumerate(wbs):

            # weight component
            w_m_t_prev, w_v_t_prev = m_v_prevs[j][0]

            w_m_t_new = beta1*w_m_t_prev + (1 - beta1)*w.grad.data
            # second-moment update was missing in the original cell
            w_v_t_new = beta2*w_v_t_prev + (1 - beta2)*(w.grad.data**2)

            w_m_t_new_hat = w_m_t_new / (1 - beta1**t)  # bias-corrected m
            w_v_t_new_hat = w_v_t_new / (1 - beta2**t)  # bias-corrected v

            # Nadam step: Nesterov-style look-ahead on the first moment
            w.data -= lr*((beta1*w_m_t_new_hat) + ((1 - beta1)*w.grad.data)/(1 - beta1**t)) / (torch.sqrt(w_v_t_new_hat) + noise)

            m_v_prevs[j][0][0] = w_m_t_new
            m_v_prevs[j][0][1] = w_v_t_new

            # bias component
            b_m_t_prev, b_v_t_prev = m_v_prevs[j][1]

            b_m_t_new = beta1*b_m_t_prev + (1 - beta1)*b.grad.data
            b_v_t_new = beta2*b_v_t_prev + (1 - beta2)*(b.grad.data**2)

            b_m_t_new_hat = b_m_t_new / (1 - beta1**t)
            b_v_t_new_hat = b_v_t_new / (1 - beta2**t)

            b.data -= lr*((beta1*b_m_t_new_hat) + ((1 - beta1)*b.grad.data)/(1 - beta1**t)) / (torch.sqrt(b_v_t_new_hat) + noise)

            m_v_prevs[j][1][0] = b_m_t_new
            m_v_prevs[j][1][1] = b_v_t_new

            # reset accumulated grads for the next batch
            w.grad.data.zero_()
            b.grad.data.zero_()


epoch 0
epoch 1
epoch 2

In [33]:
nadam_losses_ = [(l.data.numpy()[0]) for l in losses]

In [34]:
plt.plot(sgd_log_losses_)
plt.plot(sgdmom_log_losses_)
plt.plot(nesterov_log_losses_)
plt.plot(rmsprop_log_losses_)
# adam_log_losses_ was never defined in the original (the stray
# `plt.plot(adam_log_losses_)` below the output raised NameError);
# compute the log series inline from the values that do exist.
plt.plot(np.log(adam_losses_))
plt.plot(np.log(nadam_losses_))
title = plt.title("Evolution of Optimizers")
plt.legend(('SGD', 'SGD Momentum', 'Nesterov', 'RMSprop', 'Adam', 'Nadam'))

<matplotlib.legend.Legend at 0x7fad481d4588>