Notebook

In [27]:

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import optimize as spo

In [28]:

a_0 = 1.2
a_1 = 10.3
a_2 = 5

def non_convex_f(x, y):
    res_pos = (a_0 * x ** 2) + (a_1 * y ** 2) + (a_2 * x * y)
    res_neg = (a_1 * x ** 2) + (a_0 * y ** 2) + (a_2 * x * y)
    return np.where(x > 0, res_pos, res_neg)


def non_convex_df(w):
    x, y = w[0], w[1]
    if x > 0:
        return np.array([(a_0 * 2 * x) + (a_2 * y), (a_1 * 2 * y) + (a_2 * x)])
    else:
        return np.array([(a_1 * 2 * x) + (a_2 * y), (a_0 * 2 * y) + (a_2 * x)])

In [35]:

def plot_background(f, x_space, y_space):
    xx, yy = np.meshgrid(x_space, y_space)
    extent = [min(x_space), max(x_space), min(y_space), max(y_space)]
    objective = non_convex_f(xx, yy)
    plt.imshow(objective, extent=extent, cmap=plt.cm.Reds, origin='lower')
    plt.contour(objective, extent=extent, cmap=plt.cm.Reds)


def plot_convergence(x, y, df, n_iter=10, color='b', step=None, tol=1e-5, **kwargs):
    ctx = {}
    for i in range(n_iter):
        (new_x, new_y), change = step(np.array([x, y]), df, ctx, **kwargs)
        if change < tol:
            print("%s converged in %03d steps" % (step.__name__, i + 1))
            break
        plt.arrow(x, y, new_x - x, new_y - y,
                  width=0.001, length_includes_head=True, alpha=0.2, color=color)
        plt.text(x, y, i, fontsize=12, color=color, clip_on=True,
                 horizontalalignment='center',
                 verticalalignment='center')
        x, y = new_x, new_y
    if i == n_iter -1:
        print("%s did not converged in %03d steps" % (step.__name__, i + 1))
    print(x, y)

In [36]:

def gradient_step(w, df, ctx, step_size=0.1):
    d = df(w)
    update = - step_size * d
    ctx['update'] = update
    ctx['gradient'] = d
    return w + update, np.abs(update).max()


def moment_step(w, df, ctx, step_size=0.1, moment=0.9):
    update = ctx.get('update', np.zeros_like(w))
    d = df(w)
    update = moment * update - step_size * d
    ctx['update'] = update
    ctx['gradient'] = d
    return w + update, np.abs(update).max()


def nesterov_moment_step(w, df, ctx, step_size=0.1, moment=0.9):
    update = ctx.get('update', np.zeros_like(w))
    d = df(w + moment * update)
    update = moment * update - step_size * d
    ctx['update'] = update
    ctx['gradient'] = d
    return w + update, np.abs(update).max()


ada_rate_inc = 1.5
ada_rate_dec = ada_rate_inc * 2

def adasign_step(w, df, ctx, step_size=0.1):
    iter_ = ctx.get('iter', 0)
    step_size_ = ctx.get('step_size', np.ones_like(w) * step_size)
    update = ctx.get('update', np.zeros_like(w))
    prev_d = ctx.get('gradient', np.zeros_like(w))
    d = df(w)
    if iter_ > 0:
        # Prevent exploding gradients
        eps = np.finfo(prev_d.dtype).eps
        rates = (np.abs(d) + eps) / (np.abs(prev_d) + eps)
        exploding = rates > 10
        step_size_ = np.where(exploding, step_size_ / rates, step_size_)
        
        increased = step_size_ * ada_rate_inc
        increased[increased > 1] = 1.
        decreased = step_size / ada_rate_dec
        step_size_ = np.where(np.logical_and(d * prev_d > 0, ~exploding), increased, decreased)

    update = - step_size_ * d
    ctx['update'] = update
    ctx['gradient'] = d
    ctx['step_size'] = step_size_
    ctx['iter'] = iter_ + 1
    return w + update, np.abs(update).max()


def adasign_scalar_step(w, df, ctx, step_size=0.1):
    iter_ = ctx.get('iter', 0)
    step_size_ = ctx.get('step_size', step_size)
    update = ctx.get('update', np.zeros_like(w))
    prev_d = ctx.get('gradient', np.zeros_like(w))
    d = df(w)
    if iter_ > 0:
        if np.all(d * prev_d > 0):
            step_size_ = step_size_ * ada_rate_inc
        else:
            d_norm = np.linalg.norm(d)
            prev_d_norm = np.linalg.norm(prev_d)
            exploding = False
            if d_norm > 0 and prev_d_norm > 0:
                rate = d_norm / prev_d_norm
                # Try to counter balance exploding gradients
                if rate > 10:
                    exploding = True
                    step_size_ /= rate
            if not exploding:
                step_size_ = step_size_ / ada_rate_dec
    
    update = - step_size_ * d
    ctx['update'] = update
    ctx['gradient'] = d
    ctx['step_size'] = step_size_
    ctx['iter'] = iter_ + 1
    return w + update, np.abs(update).max()

In [43]:

n_iter = 100
step_size = 0.01
moment = 0.8

x_min, x_max = -3., 3.
y_min, y_max = x_min, x_max 
x_space = np.linspace(x_min, x_max, 100)
y_space = np.linspace(y_max, y_min, 100)
x0 = np.random.uniform(low=x_min, high=x_max)
y0 = np.random.uniform(low=y_min, high=y_max)
# x0, y0 = -3., -3.
print("start position:")
print(x0, y0)


plt.figure(figsize=(12, 12))
plt.grid()
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)

# df = spo.rosen_der
df = non_convex_df

plot_background(non_convex_f, x_space, y_space)
plot_convergence(x0, y0, df, n_iter=n_iter, color='r', step=gradient_step, step_size=step_size)
plot_convergence(x0, y0, df, n_iter=n_iter, color='g', step=moment_step, step_size=step_size, moment=moment)
plot_convergence(x0, y0, df, n_iter=n_iter, color='b', step=nesterov_moment_step, step_size=step_size, moment=moment)
plot_convergence(x0, y0, df, n_iter=n_iter, color='m', step=adasign_step, step_size=step_size)
plot_convergence(x0, y0, df, n_iter=n_iter, color='c', step=adasign_scalar_step, step_size=step_size)

start position:
0.4963600454707464 -2.907575569792645
gradient_step did not converged in 100 steps
0.379177446555 -0.0973090242079
moment_step did not converged in 100 steps
0.000182893980843 -4.66895585568e-05
nesterov_moment_step did not converged in 100 steps
0.00063155206412 -0.000162076224797
adasign_step converged in 042 steps
-5.66878709981e-05 0.000243244310997
adasign_scalar_step converged in 072 steps
9.48637257609e-05 -2.77119577602e-05

In [38]:

from sklearn.datasets import load_digits
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelBinarizer

digits = load_digits()
X, y = shuffle(digits.data, digits.target, random_state=0)
X = MinMaxScaler().fit_transform(X)
Y = LabelBinarizer(neg_label=-1, pos_label=1).fit_transform(y)


class NN(object):
    
    def __init__(self, n_nodes=(64, 100, 100, 100, 10), batch_size=100, seed=1):
        rng = np.random.RandomState(seed)
        self.weights = []
        self.intercepts = []
        self.activations = []
        for i in range(len(n_nodes) - 1):
            self.weights.append(rng.randn(n_nodes[i], n_nodes[i + 1]))
            self.intercepts.append(rng.randn(n_nodes[i + 1]) + 0.01)
            self.activations.append(np.empty(shape=(batch_size, n_nodes[i + 1]), dtype=np.float64))
    
    def forward(self, data):
        for i, activation in enumerate(self.activations):
            data = np.dot(data, self.weights[i])
            data += self.intercepts[i]
            np.clip(data, 0, np.finfo(data.dtype).max, out=data)  # inplace relu
            activation[:] = data
        return data
    
    def loss(self, data, target):
        """OvA squared hinge loss"""
        predicted = self.forward(data)
        # target is expected to be binarized in range {-1, 1}
        return np.clip(1 - predicted * target, 0, np.finfo(data.dtype).max) ** 2
        
nn = NN(batch_size=3)
nn.forward(X[:3])

Out[38]:

array([[    0.        ,   627.64539123,     0.        ,   892.07000028,
            0.        ,     0.        ,   797.73018565,     0.        ,
         1719.56396912,     0.        ],
       [    0.        ,   855.95058061,     0.        ,   324.41462942,
            0.        ,     0.        ,   563.53507541,     0.        ,
          529.90421564,     0.        ],
       [    0.        ,   862.76249815,     0.        ,  1243.34652193,
            0.        ,     0.        ,  1127.12543415,     0.        ,
         1921.21407034,     0.        ]])

In [39]:

nn.loss(X[:3], Y[:3])

Out[39]:

array([[  1.00000000e+00,   3.95195028e+05,   1.00000000e+00,
          7.97574025e+05,   1.00000000e+00,   1.00000000e+00,
          6.37969909e+05,   1.00000000e+00,   2.96034037e+06,
          1.00000000e+00],
       [  1.00000000e+00,   7.34364298e+05,   1.00000000e+00,
          1.05894681e+05,   1.00000000e+00,   1.00000000e+00,
          3.18699851e+05,   1.00000000e+00,   0.00000000e+00,
          1.00000000e+00],
       [  1.00000000e+00,   7.46085653e+05,   1.00000000e+00,
          1.54839827e+06,   1.00000000e+00,   1.00000000e+00,
          1.27266700e+06,   1.00000000e+00,   3.69490693e+06,
          1.00000000e+00]])

In [40]:

def loss(nn, data, target, w_x, w_y, x_idx=(1, 0, 0), y_idx=(3, 1, 1)):
    results = []
    for w_x_i, w_y_i in zip(w_x.ravel(), w_y.ravel()):
        nn.weights[x_idx[0]][x_idx[1:2]] = w_x_i
        nn.weights[y_idx[0]][y_idx[1:2]] = w_y_i
        results.append(nn.loss(data, target).sum())
    return np.array(results).reshape(w_x.shape)

In [47]:

from sklearn.cross_validation import train_test_split

batch_size = 50
nn = NN(batch_size=batch_size)

radius = 50
resolution = 20
w_x_space = np.linspace(-radius, radius, resolution)
w_y_space = np.linspace(-radius, radius, resolution)

w_xx, w_yy = np.meshgrid(w_x_space, w_y_space)
extent = [min(w_x_space), max(w_x_space), min(w_y_space), max(w_y_space)]

plt.figure(figsize=(16, 4))
n_plots = 8
for i in range(n_plots):
    plt.subplot(1, n_plots, i + 1)
    X_batch, _, Y_batch, _ = train_test_split(X, Y, train_size=batch_size,
                                              random_state=i)
    plt.grid()
    objective = loss(nn, X_batch, Y_batch, w_xx, w_yy)
    plt.imshow(objective, extent=extent, cmap=plt.cm.Reds, origin='lower')
    plt.contour(objective, extent=extent, cmap=plt.cm.Reds)
    ax = plt.gca()
    ax.xaxis.set_major_formatter(plt.NullFormatter())
    ax.yaxis.set_major_formatter(plt.NullFormatter())

In [95]:

In [ ]: