%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import optimize as spo
a_0 = 1.2
a_1 = 10.3
a_2 = 5
def non_convex_f(x, y):
res_pos = (a_0 * x ** 2) + (a_1 * y ** 2) + (a_2 * x * y)
res_neg = (a_1 * x ** 2) + (a_0 * y ** 2) + (a_2 * x * y)
return np.where(x > 0, res_pos, res_neg)
def non_convex_df(w):
x, y = w[0], w[1]
if x > 0:
return np.array([(a_0 * 2 * x) + (a_2 * y), (a_1 * 2 * y) + (a_2 * x)])
else:
return np.array([(a_1 * 2 * x) + (a_2 * y), (a_0 * 2 * y) + (a_2 * x)])
def plot_background(f, x_space, y_space):
xx, yy = np.meshgrid(x_space, y_space)
extent = [min(x_space), max(x_space), min(y_space), max(y_space)]
objective = non_convex_f(xx, yy)
plt.imshow(objective, extent=extent, cmap=plt.cm.Reds, origin='lower')
plt.contour(objective, extent=extent, cmap=plt.cm.Reds)
def plot_convergence(x, y, df, n_iter=10, color='b', step=None, tol=1e-5, **kwargs):
ctx = {}
for i in range(n_iter):
(new_x, new_y), change = step(np.array([x, y]), df, ctx, **kwargs)
if change < tol:
print("%s converged in %03d steps" % (step.__name__, i + 1))
break
plt.arrow(x, y, new_x - x, new_y - y,
width=0.001, length_includes_head=True, alpha=0.2, color=color)
plt.text(x, y, i, fontsize=12, color=color, clip_on=True,
horizontalalignment='center',
verticalalignment='center')
x, y = new_x, new_y
if i == n_iter -1:
print("%s did not converged in %03d steps" % (step.__name__, i + 1))
print(x, y)
def gradient_step(w, df, ctx, step_size=0.1):
d = df(w)
update = - step_size * d
ctx['update'] = update
ctx['gradient'] = d
return w + update, np.abs(update).max()
def moment_step(w, df, ctx, step_size=0.1, moment=0.9):
update = ctx.get('update', np.zeros_like(w))
d = df(w)
update = moment * update - step_size * d
ctx['update'] = update
ctx['gradient'] = d
return w + update, np.abs(update).max()
def nesterov_moment_step(w, df, ctx, step_size=0.1, moment=0.9):
update = ctx.get('update', np.zeros_like(w))
d = df(w + moment * update)
update = moment * update - step_size * d
ctx['update'] = update
ctx['gradient'] = d
return w + update, np.abs(update).max()
ada_rate_inc = 1.5
ada_rate_dec = ada_rate_inc * 2
def adasign_step(w, df, ctx, step_size=0.1):
iter_ = ctx.get('iter', 0)
step_size_ = ctx.get('step_size', np.ones_like(w) * step_size)
update = ctx.get('update', np.zeros_like(w))
prev_d = ctx.get('gradient', np.zeros_like(w))
d = df(w)
if iter_ > 0:
# Prevent exploding gradients
eps = np.finfo(prev_d.dtype).eps
rates = (np.abs(d) + eps) / (np.abs(prev_d) + eps)
exploding = rates > 10
step_size_ = np.where(exploding, step_size_ / rates, step_size_)
increased = step_size_ * ada_rate_inc
increased[increased > 1] = 1.
decreased = step_size / ada_rate_dec
step_size_ = np.where(np.logical_and(d * prev_d > 0, ~exploding), increased, decreased)
update = - step_size_ * d
ctx['update'] = update
ctx['gradient'] = d
ctx['step_size'] = step_size_
ctx['iter'] = iter_ + 1
return w + update, np.abs(update).max()
def adasign_scalar_step(w, df, ctx, step_size=0.1):
iter_ = ctx.get('iter', 0)
step_size_ = ctx.get('step_size', step_size)
update = ctx.get('update', np.zeros_like(w))
prev_d = ctx.get('gradient', np.zeros_like(w))
d = df(w)
if iter_ > 0:
if np.all(d * prev_d > 0):
step_size_ = step_size_ * ada_rate_inc
else:
d_norm = np.linalg.norm(d)
prev_d_norm = np.linalg.norm(prev_d)
exploding = False
if d_norm > 0 and prev_d_norm > 0:
rate = d_norm / prev_d_norm
# Try to counter balance exploding gradients
if rate > 10:
exploding = True
step_size_ /= rate
if not exploding:
step_size_ = step_size_ / ada_rate_dec
update = - step_size_ * d
ctx['update'] = update
ctx['gradient'] = d
ctx['step_size'] = step_size_
ctx['iter'] = iter_ + 1
return w + update, np.abs(update).max()
n_iter = 100
step_size = 0.01
moment = 0.8
x_min, x_max = -3., 3.
y_min, y_max = x_min, x_max
x_space = np.linspace(x_min, x_max, 100)
y_space = np.linspace(y_max, y_min, 100)
x0 = np.random.uniform(low=x_min, high=x_max)
y0 = np.random.uniform(low=y_min, high=y_max)
# x0, y0 = -3., -3.
print("start position:")
print(x0, y0)
plt.figure(figsize=(12, 12))
plt.grid()
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
# df = spo.rosen_der
df = non_convex_df
plot_background(non_convex_f, x_space, y_space)
plot_convergence(x0, y0, df, n_iter=n_iter, color='r', step=gradient_step, step_size=step_size)
plot_convergence(x0, y0, df, n_iter=n_iter, color='g', step=moment_step, step_size=step_size, moment=moment)
plot_convergence(x0, y0, df, n_iter=n_iter, color='b', step=nesterov_moment_step, step_size=step_size, moment=moment)
plot_convergence(x0, y0, df, n_iter=n_iter, color='m', step=adasign_step, step_size=step_size)
plot_convergence(x0, y0, df, n_iter=n_iter, color='c', step=adasign_scalar_step, step_size=step_size)
start position: 0.4963600454707464 -2.907575569792645 gradient_step did not converged in 100 steps 0.379177446555 -0.0973090242079 moment_step did not converged in 100 steps 0.000182893980843 -4.66895585568e-05 nesterov_moment_step did not converged in 100 steps 0.00063155206412 -0.000162076224797 adasign_step converged in 042 steps -5.66878709981e-05 0.000243244310997 adasign_scalar_step converged in 072 steps 9.48637257609e-05 -2.77119577602e-05
from sklearn.datasets import load_digits
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelBinarizer
digits = load_digits()
X, y = shuffle(digits.data, digits.target, random_state=0)
X = MinMaxScaler().fit_transform(X)
Y = LabelBinarizer(neg_label=-1, pos_label=1).fit_transform(y)
class NN(object):
def __init__(self, n_nodes=(64, 100, 100, 100, 10), batch_size=100, seed=1):
rng = np.random.RandomState(seed)
self.weights = []
self.intercepts = []
self.activations = []
for i in range(len(n_nodes) - 1):
self.weights.append(rng.randn(n_nodes[i], n_nodes[i + 1]))
self.intercepts.append(rng.randn(n_nodes[i + 1]) + 0.01)
self.activations.append(np.empty(shape=(batch_size, n_nodes[i + 1]), dtype=np.float64))
def forward(self, data):
for i, activation in enumerate(self.activations):
data = np.dot(data, self.weights[i])
data += self.intercepts[i]
np.clip(data, 0, np.finfo(data.dtype).max, out=data) # inplace relu
activation[:] = data
return data
def loss(self, data, target):
"""OvA squared hinge loss"""
predicted = self.forward(data)
# target is expected to be binarized in range {-1, 1}
return np.clip(1 - predicted * target, 0, np.finfo(data.dtype).max) ** 2
nn = NN(batch_size=3)
nn.forward(X[:3])
array([[ 0. , 627.64539123, 0. , 892.07000028, 0. , 0. , 797.73018565, 0. , 1719.56396912, 0. ], [ 0. , 855.95058061, 0. , 324.41462942, 0. , 0. , 563.53507541, 0. , 529.90421564, 0. ], [ 0. , 862.76249815, 0. , 1243.34652193, 0. , 0. , 1127.12543415, 0. , 1921.21407034, 0. ]])
nn.loss(X[:3], Y[:3])
array([[ 1.00000000e+00, 3.95195028e+05, 1.00000000e+00, 7.97574025e+05, 1.00000000e+00, 1.00000000e+00, 6.37969909e+05, 1.00000000e+00, 2.96034037e+06, 1.00000000e+00], [ 1.00000000e+00, 7.34364298e+05, 1.00000000e+00, 1.05894681e+05, 1.00000000e+00, 1.00000000e+00, 3.18699851e+05, 1.00000000e+00, 0.00000000e+00, 1.00000000e+00], [ 1.00000000e+00, 7.46085653e+05, 1.00000000e+00, 1.54839827e+06, 1.00000000e+00, 1.00000000e+00, 1.27266700e+06, 1.00000000e+00, 3.69490693e+06, 1.00000000e+00]])
def loss(nn, data, target, w_x, w_y, x_idx=(1, 0, 0), y_idx=(3, 1, 1)):
results = []
for w_x_i, w_y_i in zip(w_x.ravel(), w_y.ravel()):
nn.weights[x_idx[0]][x_idx[1:2]] = w_x_i
nn.weights[y_idx[0]][y_idx[1:2]] = w_y_i
results.append(nn.loss(data, target).sum())
return np.array(results).reshape(w_x.shape)
from sklearn.cross_validation import train_test_split
batch_size = 50
nn = NN(batch_size=batch_size)
radius = 50
resolution = 20
w_x_space = np.linspace(-radius, radius, resolution)
w_y_space = np.linspace(-radius, radius, resolution)
w_xx, w_yy = np.meshgrid(w_x_space, w_y_space)
extent = [min(w_x_space), max(w_x_space), min(w_y_space), max(w_y_space)]
plt.figure(figsize=(16, 4))
n_plots = 8
for i in range(n_plots):
plt.subplot(1, n_plots, i + 1)
X_batch, _, Y_batch, _ = train_test_split(X, Y, train_size=batch_size,
random_state=i)
plt.grid()
objective = loss(nn, X_batch, Y_batch, w_xx, w_yy)
plt.imshow(objective, extent=extent, cmap=plt.cm.Reds, origin='lower')
plt.contour(objective, extent=extent, cmap=plt.cm.Reds)
ax = plt.gca()
ax.xaxis.set_major_formatter(plt.NullFormatter())
ax.yaxis.set_major_formatter(plt.NullFormatter())