from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import LogNorm
from matplotlib import animation
from IPython.display import HTML
from itertools import zip_longest
We make a simple dataset from a model that follows the equation
$$y = a x + b + \varepsilon, \quad \varepsilon \sim \mathcal{N}(0, \sigma_{x}^{2}),$$ where $a = 4$, $b = -2$, and the error term $\varepsilon$ is drawn from a normal distribution whose standard deviation $\sigma_{x}$ depends on the value of $x$ (for educational purposes).
np.random.seed(219)
N = 200
a = 4
b = -2
low = -3.0
high = 4.0
data_x = np.random.uniform(low=low, high=high, size=N)
data_y = np.zeros(N)
for i, x in enumerate(data_x):
    scale = - (x - low) * (x - high) / 3. + 1.5  # noise std: 1.5 at the edges, up to ~5.6 mid-range
data_y[i] = a * x + b + np.random.normal(loc=0.0, scale=scale, size=1)
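Equivalently, the sampling loop can be vectorized, since np.random.normal broadcasts a per-element scale. A minimal sketch (the draw order differs, so with a fixed seed the values won't match the loop version exactly):
# vectorized equivalent of the loop above
scales = - (data_x - low) * (data_x - high) / 3. + 1.5
data_y_vec = a * data_x + b + np.random.normal(loc=0.0, scale=scales)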
plt.plot(data_x, data_y, 'ro')
plt.axhline(0, color='black', lw=1)
plt.axvline(0, color='black', lw=1)
plt.show()
The linear regression model is $$y_{\textrm{pred}} = \boldsymbol{w}^{\top}\boldsymbol{x} + b$$ or $$y_{\textrm{pred}} = w_{0} + w_{1} x_{1} + w_{2} x_{2} + \cdots + w_{d} x_{d},$$ where $w_{0} = b$.
We extend the class of models by considering linear combinations of fixed nonlinear functions of the input variables:
$$y_{\textrm{pred}} = w_{0} + w_{1} \phi_{1}(\mathbf{x}) + w_{2} \phi_{2}(\mathbf{x}) + \cdots + w_{M-1} \phi_{M-1}(\mathbf{x}) = w_{0} + \sum_{j=1}^{M-1} w_{j} \phi_{j}(\mathbf{x}),$$ where each $\phi_{j}(\mathbf{x})$ is called a basis function. Adding the dummy 'basis function' $\phi_{0}(\mathbf{x}) = 1$ lets us write
$$y_{\textrm{pred}} = \sum_{j=0}^{M-1} w_{j} \phi_{j}(\mathbf{x}) = \mathbf{w}^{\top} \boldsymbol{\phi}(\mathbf{x}).$$ Collecting the basis functions evaluated at the $N$ data points gives the $N \times M$ design matrix
$$\mathbf{\Phi} = \left( \begin{array}{cccc} \phi_{0}(\mathbf{x}_{1}) & \phi_{1}(\mathbf{x}_{1}) & \cdots & \phi_{M-1}(\mathbf{x}_{1})\\ \phi_{0}(\mathbf{x}_{2}) & \phi_{1}(\mathbf{x}_{2}) & \cdots & \phi_{M-1}(\mathbf{x}_{2})\\ \vdots & \vdots & \ddots & \vdots\\ \phi_{0}(\mathbf{x}_{N}) & \phi_{1}(\mathbf{x}_{N}) & \cdots & \phi_{M-1}(\mathbf{x}_{N}) \end{array} \right),$$ and $\mathbf{Y}$ is the target vector (label data).
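Minimizing the mean squared error $L(\mathbf{w}) = \frac{1}{2N} \lVert \mathbf{\Phi} \mathbf{w} - \mathbf{Y} \rVert^{2}$ and setting the gradient to zero yields the normal equation
$$\mathbf{w} = \left( \mathbf{\Phi}^{\top} \mathbf{\Phi} \right)^{-1} \mathbf{\Phi}^{\top} \mathbf{Y},$$ which the next cell computes directly. Here $\mathbf{\Phi}$ is the array X, whose columns are $\phi_{1}(x) = x$ and $\phi_{0}(x) = 1$.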
# using data_x, data_y: build the design matrix (columns x and 1) and the target vector
X = np.concatenate((data_x.reshape(N, 1), np.ones(N).reshape(N, 1)), axis=1)
Y = data_y.reshape(N, 1)
We solve the normal equation directly, inverting $\mathbf{A} = \mathbf{\Phi}^{\top} \mathbf{\Phi}$ with np.linalg.inv(A).
%time
A = np.matmul(X.T, X)
invA = np.linalg.inv(A)
B = np.matmul(X.T, Y)
W_exact = np.matmul(invA, B)
W_exact = np.squeeze(W_exact)
CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs Wall time: 12.9 µs
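As a sanity check (a minimal sketch, not part of the original pipeline; W_check is an illustrative name), np.linalg.lstsq solves the same least-squares problem through a more numerically stable factorization than the explicit inverse:
# optional check: lstsq avoids forming inv(X^T X) explicitly
W_check, _, _, _ = np.linalg.lstsq(X, Y, rcond=None)
print(np.allclose(np.squeeze(W_check), W_exact))  # expected: True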
#minima = W_exact.reshape(2, 1) # for 3D plot and contour plot
minima = np.flip(W_exact).reshape(2, 1)  # (b, w) order, to match the axes of the 3D and contour plots
print("Real parameters used creating the data")
print("w: {:.4f} b: {:.4f}".format(a, b))
print("Exact Solution using the normal equation")
print("w: {:.4f} b: {:.4f}".format(W_exact[0], W_exact[1]))
Real parameters used creating the data
w: 4.0000 b: -2.0000
Exact Solution using the normal equation
w: 4.1533 b: -2.2426
for epoch in range(max_epochs):  # one epoch: the ENTIRE dataset is passed through the model ONCE
    for step in range(num_batches):  # num_batches = int(data_size / batch_size)
        # 1. sample a mini-batch of batch_size examples
        #    1-1. data augmentation (when you need it)
        # 2. calculate the logits     # logits = f(x)
        # 3. calculate the loss       # loss = loss_fn(logits, labels)
        # 4. calculate the gradients of the loss with respect to the weights
        # 5. update the weights
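Note that the LinearRegression class below uses the full batch at every step; a minimal mini-batch sampling sketch (batch_size is a hypothetical value) would look like:
batch_size = 32  # hypothetical value
num_batches = N // batch_size
indices = np.random.permutation(N)  # shuffle once per epoch
for step in range(num_batches):
    batch = indices[step * batch_size:(step + 1) * batch_size]
    x_batch, y_batch = data_x[batch], data_y[batch]
    # ...compute predictions, loss, gradients, and update the weights here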
class LinearRegression(object):
def __init__(self, data_x, data_y, method='sgd',
w_init=None, b_init=None, learning_rate=0.1, momentum=0.9):
assert method in ['sgd', 'momentum', 'adagrad', 'rmsprop', 'adam']
self.method = method
scale = 4.0
if w_init is not None:
self.w = w_init
else:
self.w = np.random.uniform(low=a-scale, high=a+scale)
if b_init is not None:
self.b = b_init
else:
self.b = np.random.uniform(low=b-scale, high=b+scale)
print("w_init: {:.3f}".format(self.w))
print("b_init: {:.3f}".format(self.b))
self.x = data_x
self.y = data_y
self.lr = learning_rate
if self.method == 'momentum':
self.momentum = momentum
self.vw = 0.0
self.vb = 0.0
if self.method == 'adagrad':
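            # accumulators start slightly above zero, which also damps the very first step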
self.dw_squared = 0.1
self.db_squared = 0.1
self.epsilon = 1e-7
if self.method == 'rmsprop':
self.dw_squared = 0.0
self.db_squared = 0.0
self.decay = 0.9
self.epsilon = 1e-10
if self.method == 'adam':
self.dw_first_moment = 0.0
self.dw_second_moment = 0.0
self.db_first_moment = 0.0
self.db_second_moment = 0.0
self.beta1 = 0.9
self.beta2 = 0.999
self.epsilon = 1e-8
# for accumulation of loss and path (w, b)
self.loss_history = []
self.w_history = []
self.b_history = []
def inference(self, x):
"""Inference function for a linear model
y_pred = w * x + b.
Args:
x: full-batch data, shape: (1-rank Tensor (vector) np.array)
Returns:
y_pred: full-batch y_pred, shape: (1-rank Tensor (vector) np.array)
"""
y_pred = self.w * x + self.b
return y_pred
def loss_for_plot(self, w, b):
"""List of loss function with respect to given list of (w, b).
Args:
w: shape: (1-rank Tensor (vector) np.array)
b: shape: (1-rank Tensor (vector) np.array)
Returns:
loss_for_plot: shape: (1-rank Tensor (vector) np.array)
"""
y_pred = np.matmul(np.expand_dims(self.x, axis=1), np.expand_dims(w, axis=0)) + b
loss_for_plot = 0.5 * (y_pred - np.expand_dims(self.y, axis=1))**2
loss_for_plot = np.mean(loss_for_plot, axis=0)
return loss_for_plot
def loss_fn(self, labels, predictions):
"""Loss function.
MSE loss
Args:
labels: target data y, shape: (1-rank Tensor (vector) np.array)
predictions: model inference y_pred, shape: (1-rank Tensor (vector) np.array)
Returns:
loss: mean value of loss for full-batch data, shape: (0-rank Tensor (scalar))
"""
loss = 0.5 * np.mean((predictions - labels)**2)
return loss
def loss_derivative(self):
"""Loss derivative.
Returns:
dw: dL / dw, mean value of derivatives for full-batch data, shape: (0-rank Tensor (scalar))
db: dL / db, mean value of derivatives for full-batch data, shape: (0-rank Tensor (scalar))
"""
dw = np.mean((self.y_pred - self.y) * self.x)
db = np.mean(self.y_pred - self.y)
return dw, db
def weights_update(self):
"""Weights update using Gradient descent.
w' = w - lr * dL/dw
"""
self.w = self.w - self.lr * self.dw
self.b = self.b - self.lr * self.db
self.uw = - self.lr * self.dw
self.ub = - self.lr * self.db
def weights_update_momentum(self):
"""Weights update using Momentum.
v' = gamma * v - lr * dL/dw
w' = w + v'
"""
self.vw = self.momentum * self.vw - self.lr * self.dw
self.w = self.w + self.vw
self.vb = self.momentum * self.vb - self.lr * self.db
self.b = self.b + self.vb
self.uw = self.vw
self.ub = self.vb
def weights_update_adagrad(self):
"""Weights update using adagrad.
grads2 = grads2 + grads**2
w' = w - lr * grads / (sqrt(grads2) + epsilon)
"""
self.dw_squared = self.dw_squared + self.dw**2
self.w = self.w - self.lr * self.dw / (np.sqrt(self.dw_squared) + self.epsilon)
self.db_squared = self.db_squared + self.db**2
self.b = self.b - self.lr * self.db / (np.sqrt(self.db_squared) + self.epsilon)
self.uw = - self.lr * self.dw / (np.sqrt(self.dw_squared) + self.epsilon)
self.ub = - self.lr * self.db / (np.sqrt(self.db_squared) + self.epsilon)
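        # note: the accumulators only grow, so the effective step size decays monotonically;
        # adagrad can stall before reaching the minimum (visible in the 50-epoch run below)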
def weights_update_rmsprop(self):
"""Weights update using RMSprop.
        grads2 = decay * grads2 + (1 - decay) * grads**2
        w' = w - lr * grads / (sqrt(grads2) + epsilon)
"""
self.dw_squared = self.decay * self.dw_squared + (1. - self.decay) * self.dw**2
self.w = self.w - self.lr * self.dw / (np.sqrt(self.dw_squared) + self.epsilon)
self.db_squared = self.decay * self.db_squared + (1. - self.decay) * self.db**2
self.b = self.b - self.lr * self.db / (np.sqrt(self.db_squared) + self.epsilon)
self.uw = - self.lr * self.dw / (np.sqrt(self.dw_squared) + self.epsilon)
self.ub = - self.lr * self.db / (np.sqrt(self.db_squared) + self.epsilon)
    def weights_update_adam(self, t):
        """Weights update using Adam.
        g1 = beta1 * g1 + (1 - beta1) * grads
        g2 = beta2 * g2 + (1 - beta2) * grads**2
        g1_unbiased = g1 / (1 - beta1**t)
        g2_unbiased = g2 / (1 - beta2**t)
        w = w - lr * g1_unbiased / (sqrt(g2_unbiased) + epsilon)
        """
        self.dw_first_moment = self.beta1 * self.dw_first_moment + (1. - self.beta1) * self.dw
        self.dw_second_moment = self.beta2 * self.dw_second_moment + (1. - self.beta2) * self.dw**2
        self.db_first_moment = self.beta1 * self.db_first_moment + (1. - self.beta1) * self.db
        self.db_second_moment = self.beta2 * self.db_second_moment + (1. - self.beta2) * self.db**2
        # bias correction: the moments start at zero, so early estimates are scaled up
        self.dw_first_moment_unbiased = self.dw_first_moment / (1. - self.beta1**t)
        self.dw_second_moment_unbiased = self.dw_second_moment / (1. - self.beta2**t)
        self.db_first_moment_unbiased = self.db_first_moment / (1. - self.beta1**t)
        self.db_second_moment_unbiased = self.db_second_moment / (1. - self.beta2**t)
self.w = self.w - self.lr * self.dw_first_moment_unbiased / (np.sqrt(self.dw_second_moment_unbiased) + self.epsilon)
self.b = self.b - self.lr * self.db_first_moment_unbiased / (np.sqrt(self.db_second_moment_unbiased) + self.epsilon)
self.uw = - self.lr * self.dw_first_moment_unbiased / (np.sqrt(self.dw_second_moment_unbiased) + self.epsilon)
self.ub = - self.lr * self.db_first_moment_unbiased / (np.sqrt(self.db_second_moment_unbiased) + self.epsilon)
def history_update(self, loss, w, b):
"""Accumulate all interesting variables
"""
self.loss_history.append(loss)
self.w_history.append(w)
self.b_history.append(b)
def train(self, max_epochs):
pre_loss = 0.0
print("epochs: {} w: {:.5f} b: {:.5f}".format(0, self.w, self.b))
for epoch in range(max_epochs):
self.y_pred = self.inference(self.x)
self.loss = self.loss_fn(self.y, self.y_pred)
self.history_update(self.loss, self.w, self.b)
self.dw, self.db = self.loss_derivative()
if self.method == 'momentum':
self.weights_update_momentum()
elif self.method == 'adagrad':
self.weights_update_adagrad()
elif self.method == 'rmsprop':
self.weights_update_rmsprop()
elif self.method == 'adam':
self.weights_update_adam(epoch+1)
else:
self.weights_update()
if epoch % 10 == 0:
print("epochs: {} loss: {:.6f} w: {:.5f} b: {:.5f} dw: {:.5f} db: {:.5f} uw: {:.5f} ub: {:.5f}".format(epoch+1, self.loss, self.w, self.b, self.dw, self.db, self.uw, self.ub))
            if np.abs(pre_loss - self.loss) < 1e-6:
                # recompute the loss for the final weights before recording and stopping
                self.y_pred = self.inference(self.x)
                self.loss = self.loss_fn(self.y, self.y_pred)
                self.history_update(self.loss, self.w, self.b)
                break
pre_loss = self.loss
self.w_history = np.array(self.w_history)
self.b_history = np.array(self.b_history)
#self.path = np.concatenate((np.expand_dims(self.w_history, 1), np.expand_dims(self.b_history, 1)), axis=1).T
        self.path = np.concatenate((np.expand_dims(self.b_history, 1), np.expand_dims(self.w_history, 1)), axis=1).T  # (b, w) row order, to match the contour axes
LinearRegression class
w_init = 2.0
b_init = -15.0
lr = 0.37
model1 = LinearRegression(data_x, data_y, 'sgd', w_init=w_init, b_init=b_init, learning_rate=lr)
model2 = LinearRegression(data_x, data_y, 'momentum', w_init=w_init, b_init=b_init, learning_rate=lr, momentum=0.2)
model3 = LinearRegression(data_x, data_y, 'adagrad', w_init=w_init, b_init=b_init, learning_rate=lr*2)
model4 = LinearRegression(data_x, data_y, 'rmsprop', w_init=w_init, b_init=b_init, learning_rate=lr*2)
model5 = LinearRegression(data_x, data_y, 'adam', w_init=w_init, b_init=b_init, learning_rate=lr*2)
w_init: 2.000
b_init: -15.000
w_init: 2.000
b_init: -15.000
w_init: 2.000
b_init: -15.000
w_init: 2.000
b_init: -15.000
w_init: 2.000
b_init: -15.000
%time
max_epochs = 50
model1.train(max_epochs)
CPU times: user 14 µs, sys: 1 µs, total: 15 µs
Wall time: 21 µs
epochs: 0 w: 2.00000 b: -15.00000
epochs: 1 loss: 121.202918 w: 9.00188 b: -9.75108 dw: -18.92400 db: -14.18627 uw: 7.00188 ub: 5.24892
epochs: 11 loss: 11.661103 w: 4.77643 b: -2.29243 dw: -3.51170 db: -0.79135 uw: 1.29933 ub: 0.29280
epochs: 21 loss: 10.376291 w: 4.25610 b: -2.22824 dw: -0.60596 db: -0.10511 uw: 0.22420 ub: 0.03889
epochs: 31 loss: 10.339332 w: 4.17082 b: -2.23970 dw: -0.10378 db: -0.01742 uw: 0.03840 ub: 0.00645
epochs: 41 loss: 10.338250 w: 4.15630 b: -2.24208 dw: -0.01776 db: -0.00297 uw: 0.00657 ub: 0.00110
%time
model2.train(max_epochs)
CPU times: user 5 µs, sys: 6 µs, total: 11 µs
Wall time: 18.8 µs
epochs: 0 w: 2.00000 b: -15.00000
epochs: 1 loss: 121.202918 w: 9.00188 b: -9.75108 dw: -18.92400 db: -14.18627 uw: 7.00188 ub: 5.24892
epochs: 11 loss: 10.338239 w: 4.15385 b: -2.24344 dw: -0.01092 db: -0.00565 uw: 0.00217 ub: 0.00371
%time
model3.train(max_epochs)
CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 11.2 µs
epochs: 0 w: 2.00000 b: -15.00000
epochs: 1 loss: 121.202918 w: 2.73990 b: -14.26018 dw: -18.92400 db: -14.18627 uw: 0.73990 ub: 0.73982
epochs: 11 loss: 51.619360 w: 4.76537 b: -11.43049 dw: -3.63806 db: -9.01757 uw: 0.08098 ub: 0.18206
epochs: 21 loss: 38.584575 w: 5.12483 b: -9.99936 dw: -0.56888 db: -7.24088 uw: 0.01248 ub: 0.12049
epochs: 31 loss: 31.436343 w: 5.13047 b: -8.96075 dw: 0.25374 db: -6.15879 uw: -0.00556 ub: 0.09272
epochs: 41 loss: 26.537054 w: 5.04772 b: -8.13209 dw: 0.43182 db: -5.36556 uw: -0.00946 ub: 0.07583
%time
model4.train(max_epochs)
CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 6.91 µs
epochs: 0 w: 2.00000 b: -15.00000
epochs: 1 loss: 121.202918 w: 4.34009 b: -12.65991 dw: -18.92400 db: -14.18627 uw: 2.34009 ub: 2.34009
epochs: 11 loss: 15.425745 w: 4.61289 b: -5.15518 dw: 0.35077 db: -2.99011 uw: -0.06906 ub: 0.42829
epochs: 21 loss: 10.500984 w: 4.22766 b: -2.71807 dw: 0.05745 db: -0.53566 uw: -0.01913 ub: 0.12220
epochs: 31 loss: 10.338585 w: 4.15621 b: -2.26117 dw: 0.00272 db: -0.02545 uw: -0.00153 ub: 0.00981
%time
model5.train(max_epochs)
CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 11.9 µs
epochs: 0 w: 2.00000 b: -15.00000
epochs: 1 loss: 121.202918 w: 2.74000 b: -14.26000 dw: -18.92400 db: -14.18627 uw: 0.74000 ub: 0.74000
epochs: 11 loss: 34.044398 w: 6.74071 b: -7.39225 dw: 9.16197 db: -3.98266 uw: -0.08532 ub: 0.60650
epochs: 21 loss: 10.807880 w: 3.74512 b: -2.70585 dw: -1.25564 db: -0.91920 uw: -0.26199 ub: 0.35891
epochs: 31 loss: 11.952498 w: 3.67744 b: -0.47402 dw: -1.84514 db: 1.26399 uw: 0.13179 ub: 0.10134
epochs: 41 loss: 11.548322 w: 4.21616 b: -0.90209 dw: 1.47482 db: 1.54073 uw: -0.03959 ub: -0.13227
print("Real parameters used creating the data")
print("w: {:.4f} b: {:.4f}".format(a, b))
print("Exact Solution using the normal equation")
print("w: {:.4f} b: {:.4f}".format(W_exact[0], W_exact[1]))
print("Solution using the gradient descent")
print("w: {:.4f} b: {:.4f}".format(model1.w, model1.b))
print("Solution using the gradient descent with momentum")
print("w: {:.4f} b: {:.4f}".format(model2.w, model2.b))
print("Solution using the gradient descent with adagrad")
print("w: {:.4f} b: {:.4f}".format(model3.w, model3.b))
print("Solution using the gradient descent with rmsprop")
print("w: {:.4f} b: {:.4f}".format(model4.w, model4.b))
print("Solution using the gradient descent with adam")
print("w: {:.4f} b: {:.4f}".format(model5.w, model5.b))
Real parameters used creating the data
w: 4.0000 b: -2.0000
Exact Solution using the normal equation
w: 4.1533 b: -2.2426
Solution using the gradient descent
w: 4.1540 b: -2.2425
Solution using the gradient descent with momentum
w: 4.1531 b: -2.2424
Solution using the gradient descent with adagrad
w: 4.9612 b: -7.5061
Solution using the gradient descent with rmsprop
w: 4.1534 b: -2.2429
Solution using the gradient descent with adam
w: 3.9050 b: -2.1469
# plot the loss history of each optimizer over the epochs
plt.title('Loss Function L')
plt.xlabel('Number of epochs')
plt.ylabel('Loss')
plt.plot(model1.loss_history, label='gradient descent')
plt.plot(model2.loss_history, label='momentum')
plt.plot(model3.loss_history, label='adagrad')
plt.plot(model4.loss_history, label='rmsprop')
plt.plot(model5.loss_history, label='adam')
plt.legend()
plt.show()
plt.plot(data_x, data_y, 'ro', label='Real data')
plt.plot(data_x, model1.w * data_x + model1.b, lw=5, label='gradient descent')
plt.plot(data_x, model2.w * data_x + model2.b, lw=5, label='momentum')
plt.plot(data_x, model3.w * data_x + model3.b, lw=5, label='adagrad')
plt.plot(data_x, model4.w * data_x + model4.b, lw=5, label='rmsprop')
plt.plot(data_x, model5.w * data_x + model5.b, lw=5, label='adam')
plt.axhline(0, color='black', lw=1)
plt.axvline(0, color='black', lw=1)
plt.legend()
plt.show()
# putting together our points to plot in a 3D plot
number_of_points = 50
w_margin = 9.
b_margin = 16.
w_min = a - w_margin
w_max = a + w_margin
b_min = b - b_margin
b_max = b + b_margin
w_points = np.linspace(w_min, w_max, number_of_points)
b_points = np.linspace(b_min, b_max, number_of_points)
w_mesh, b_mesh = np.meshgrid(w_points, b_points)
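# evaluate the loss over the (w, b) mesh, one row of 50 candidate pairs at a time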
loss_ = np.array([model1.loss_for_plot(wps, bps) for wps, bps in zip(w_mesh, b_mesh)])
#%matplotlib inline
#%matplotlib notebook
#%pylab
path1 = model1.path
path2 = model2.path
path3 = model3.path
path4 = model4.path
path5 = model5.path
fig = plt.figure(figsize=(10, 8))
ax = plt.axes(projection='3d', elev=30, azim=-60)
ax.plot_surface(b_mesh, w_mesh, loss_, norm=LogNorm(), rstride=1, cstride=1,
edgecolor='none', alpha=.8, cmap=plt.cm.jet)
ax.plot(*minima, model1.loss_for_plot(*np.flip(minima)), 'r*', markersize=20)
#ax.quiver(path1[0,:-1], path1[1,:-1], model1.loss_for_plot(*path1[::,:-1]),
# path1[0,1:]-path1[0,:-1], path1[1,1:]-path1[1,:-1],
# model1.loss_for_plot(*path1[::,1:]) - model1.loss_for_plot(*path1[::,:-1]),
# color='k', length=0.5, normalize=True)
ax.set_xlabel('b')
ax.set_ylabel('w')
ax.set_zlabel('loss')
ax.set_xlim((b_min, b_max))
ax.set_ylim((w_min, w_max))
#plt.draw()
plt.show()
import matplotlib.pylab as pylab
#params = {'legend.fontsize': 'x-large',
# 'axes.labelsize': 'x-large',
# 'axes.titlesize':'x-large',
# 'xtick.labelsize':'x-large',
# 'ytick.labelsize':'x-large'}
size = 25
params = {'legend.fontsize': size,
'axes.labelsize': size,
'axes.titlesize': size,
'xtick.labelsize': size,
'ytick.labelsize': size}
pylab.rcParams.update(params)
fig, ax = plt.subplots(figsize=(16, 9))
colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple']
ax.contour(b_mesh, w_mesh, loss_, levels=np.logspace(-1, 3, 45), norm=LogNorm(), cmap=plt.cm.jet)
ax.plot(*minima, 'r*', markersize=20)
ax.quiver(path1[0,:-1], path1[1,:-1], path1[0,1:]-path1[0,:-1], path1[1,1:]-path1[1,:-1],
scale_units='xy', angles='xy', scale=1.0, color=colors[0])
ax.quiver(path2[0,:-1], path2[1,:-1], path2[0,1:]-path2[0,:-1], path2[1,1:]-path2[1,:-1],
scale_units='xy', angles='xy', scale=1.0, color=colors[1])
ax.quiver(path3[0,:-1], path3[1,:-1], path3[0,1:]-path3[0,:-1], path3[1,1:]-path3[1,:-1],
scale_units='xy', angles='xy', scale=1.0, color=colors[2])
ax.quiver(path4[0,:-1], path4[1,:-1], path4[0,1:]-path4[0,:-1], path4[1,1:]-path4[1,:-1],
scale_units='xy', angles='xy', scale=1.0, color=colors[3])
ax.quiver(path5[0,:-1], path5[1,:-1], path5[0,1:]-path5[0,:-1], path5[1,1:]-path5[1,:-1],
scale_units='xy', angles='xy', scale=1.0, color=colors[4])
ax.set_xlabel('b')
ax.set_ylabel('w')
ax.set_xlim((b_min, b_max))
ax.set_ylim((w_min, w_max))
plt.subplots_adjust(left=0.1, right=0.95, top=0.95, bottom=0.1)
plt.savefig('regression.all.plot.jpg')
plt.show()
class TrajectoryAnimation(animation.FuncAnimation):
def __init__(self, *paths, labels=[], colors=[], fig=None, ax=None, frames=None,
interval=60, repeat_delay=5, blit=True, **kwargs):
if fig is None:
if ax is None:
fig, ax = plt.subplots()
else:
fig = ax.get_figure()
else:
if ax is None:
ax = fig.gca()
self.fig = fig
self.ax = ax
self.paths = paths
if frames is None:
frames = max(path.shape[1] for path in paths)
self.lines = [ax.plot([], [], label=label, color=color, lw=3)[0]
for _, label, color in zip_longest(paths, labels, colors)]
self.points = [ax.plot([], [], 'o', color=line.get_color())[0]
for line in self.lines]
super(TrajectoryAnimation, self).__init__(fig, self.animate, init_func=self.init_anim,
frames=frames, interval=interval, blit=blit,
repeat_delay=repeat_delay, **kwargs)
def init_anim(self):
for line, point in zip(self.lines, self.points):
line.set_data([], [])
point.set_data([], [])
return self.lines + self.points
def animate(self, i):
for line, point, path in zip(self.lines, self.points, self.paths):
line.set_data(*path[::,:i])
point.set_data(*path[::,i-1:i])
return self.lines + self.points
fig, ax = plt.subplots(figsize=(16, 9))
ax.contour(b_mesh, w_mesh, loss_, levels=np.logspace(-1, 3, 45), norm=LogNorm(), cmap=plt.cm.jet)
ax.plot(*minima, 'r*', markersize=20)
ax.set_xlabel('b')
ax.set_ylabel('w')
ax.set_xlim((b_min, b_max))
ax.set_ylim((w_min, w_max))
all_paths = [path1, path2, path3, path4, path5]
opt_names = ['gradient descent', 'momentum', 'adagrad', 'rmsprop', 'adam']
#path_animation = TrajectoryAnimation(*all_paths, labels=opt_names, colors=colors, ax=ax)
path_animation = TrajectoryAnimation(*all_paths, labels=opt_names, colors=colors, ax=ax, interval=600)
ax.legend(loc='upper left', prop={'size': 15})
HTML(path_animation.to_html5_video())
path_animation.save('regression.allplot.animation.mp4')
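Note: both to_html5_video() and the .mp4 save rely on Matplotlib finding an FFmpeg writer on the system (an assumption about the local environment).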