RLN keras tutorial

This is a quick tutorial on how to use the Keras RLN implementation.
First, let's import and create the train and test set. In this tutorial, we're using the Boston housing price regression dataset, with additional noise features.

In [72]:
import numpy as np
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler
from keras import regularizers
from Keras_implementation import RLNCallback
from keras.datasets import boston_housing
from keras.models import Sequential
from keras.layers import Dense
from numpy.random import seed
from keras.backend import eval as keras_eval
import warnings
warnings.filterwarnings("ignore")

# Load the Boston housing regression dataset (train/test split provided by Keras).
(x_train, y_train), (x_test, y_test) = boston_housing.load_data()

# Add noisy features: append standard-normal columns that carry no signal.
noise_features = 1000
x_train = np.concatenate([x_train, np.random.normal(size=(x_train.shape[0], noise_features))], axis=1)
x_test = np.concatenate([x_test, np.random.normal(size=(x_test.shape[0], noise_features))], axis=1)

# Scale features: fit the scaler on the training set only and reuse its
# statistics for the test set. StandardScaler does not use the targets, and
# the second positional argument of `transform` is not a target in current
# scikit-learn, so only the feature matrices are passed.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Number of input features (original columns plus the noise columns).
INPUT_DIM = x_train.shape[1]

Let's create a basic Keras model and a test function:

In [ ]:
def base_model(layers=4, l1=0):
    """Return a factory that builds a fully connected Keras regression model.

    The hidden layer widths form a geometric series that shrinks from
    ``INPUT_DIM`` towards 1: the k-th hidden layer has roughly
    ``INPUT_DIM ** ((layers - k) / layers)`` units. A single linear output
    unit is appended and the model is compiled with mean squared error loss
    and the RMSprop optimizer.

    Args:
        layers: total number of Dense layers, counting the output layer.
            Must be greater than 1.
        l1: L1 penalty coefficient applied to the kernel of the first hidden
            layer only.

    Returns:
        A zero-argument function that creates and compiles a new model.
    """
    assert layers > 1

    def build_fn():
        model = Sequential()

        # Hidden widths: exp(log(INPUT_DIM) * k / layers) for k = layers-1, ..., 1.
        exponents = np.arange(layers - 1, 0, -1)
        hidden_widths = [int(np.round(raw_width))
                         for raw_width in np.exp(np.log(INPUT_DIM) * exponents / layers)]
        # Each hidden layer is fed by the previous one; the first by the input.
        fan_ins = [INPUT_DIM] + hidden_widths[:-1]

        for position, (fan_in, units) in enumerate(zip(fan_ins, hidden_widths)):
            # For efficiency only the first layer carries the L1 penalty.
            penalty = l1 if position == 0 else 0
            model.add(Dense(units, input_dim=fan_in, kernel_initializer='glorot_normal', activation='relu',
                            kernel_regularizer=regularizers.l1(penalty)))

        # Single linear output unit for regression.
        model.add(Dense(1, kernel_initializer='glorot_normal'))

        model.compile(loss='mean_squared_error', optimizer='rmsprop')
        return model

    return build_fn

MJTCP = 32292 # Michael Jordan total career points

def test_model(build_fn, modle_name, num_repeates=10):
    seed(MJTCP)
    results = np.zeros(num_repeates)
    for i in range(num_repeates):
        reg = KerasRegressor(build_fn=build_fn, epochs=100, batch_size=10, verbose=0)
        reg.fit(x_train, y_train)
        results[i] = reg.score(x_test, y_test)
    print("%s: %.2f (%.2f) MSE" % (modle_name, results.mean(), results.std()))
    return results.mean()

Let's optimize the depth and L1 regularization of the network:

In [76]:
# Grow the network one layer at a time until the test score stops improving.
layers = 1

prev_score = None
cur_score = None

# `prev_score is None` is tested first so that None is never ordered against a
# number (a TypeError on Python 3). The first two depths are always evaluated;
# after that the loop continues only while the newest score beats the previous one.
while prev_score is None or cur_score < prev_score:
    prev_score = cur_score
    layers += 1
    cur_score = test_model(base_model(layers=layers), "Network with %d layers" % layers)

# The loop stops one layer past the best depth, so step back.
layers -= 1
print("The best results of an unregularized network are achieved with depth %d" % layers)
Network with 2 layers: 136.02 (5.17) MSE
Network with 3 layers: 112.52 (5.58) MSE
Network with 4 layers: 106.84 (10.62) MSE
Network with 5 layers: 135.84 (103.32) MSE
The best results of an unregularized network are achieved with depth 4
In [114]:
# Increase the L1 coefficient by a factor of 10 per step (starting from 1E-02)
# until the test score stops improving.
l1 = 0.001

prev_score = None
cur_score = None

# `prev_score is None` is tested first so that None is never ordered against a
# number (a TypeError on Python 3). The first two coefficients are always
# evaluated; after that the loop continues only while the score keeps improving.
while prev_score is None or cur_score < prev_score:
    prev_score = cur_score
    l1 *= 10
    cur_score = test_model(base_model(layers=layers, l1=l1), "L1 regularization of %.0E" % l1)

# The last evaluated coefficient was worse, so the best score is the previous one.
best_l1_score = prev_score

# Step back to the coefficient that produced the best score.
l1 /= 10
print("The best L1 regularization is achieved with l1 = %.0E" % l1)
L1 regularization of 1E-02: 53.05 (6.07) MSE
L1 regularization of 1E-01: 48.57 (4.31) MSE
L1 regularization of 1E+00: 144.00 (9.74) MSE
The best L1 regularization is achieved with l1 = 1E-01

The Keras RLN implementation uses callbacks to change the weights of the layer. The callback gets the specific layer it regularizes as a parameter, and is passed to the fit function:

In [158]:
def RLN(layers=4, **rln_kwargs):
    """Return a factory that builds a base model trained with an RLN callback.

    The model comes from ``base_model(layers=layers)`` (no L1 penalty). Its
    ``fit`` method is replaced by a wrapper that always appends an
    ``RLNCallback`` bound to the first layer to the callbacks of the call.

    Args:
        layers: total number of Dense layers, forwarded to ``base_model``.
        **rln_kwargs: keyword arguments forwarded to ``RLNCallback``.

    Returns:
        A zero-argument function that creates the model with the wrapped ``fit``.
    """
    def build_fn():
        model = base_model(layers=layers)()

        # For efficiency we only regularize the first layer
        rln_callback = RLNCallback(model.layers[0], **rln_kwargs)

        # Change the fit function of the model to accept rln_callback:
        orig_fit = model.fit

        def rln_fit(*args, **fit_kwargs):
            # Pop (not get) the caller's callbacks so 'callbacks' is not passed
            # twice to orig_fit; `or []` also covers an explicit callbacks=None.
            caller_callbacks = fit_kwargs.pop('callbacks', None) or []
            # Build a new list so the caller's list is left unmodified.
            rln_callbacks = list(caller_callbacks) + [rln_callback]
            return orig_fit(*args, callbacks=rln_callbacks, **fit_kwargs)

        model.fit = rln_fit

        return model

    return build_fn

Let's apply RLN to this dataset and search for the best average regularization (Theta) and learning rate (nu). A few things to keep in mind:

  • The average regularization (Theta) is in the log scale, while the regularization of Keras (l1) is not.
  • RLNs tend to require much smaller average regularization, and typically we have that exp(Theta) << l1
  • The learning rate (nu) is a very important parameter that can have dramatic effects on the performance of the network. It is very important to tune it well.
  • Because we optimize very small coefficients in the log scale, the gradients tend to be quite small, so a large learning rate is required.
In [144]:
# Small grid search over the RLN average regularization (Theta) and the
# base-10 log of its learning rate; the best-scoring pair is kept.
best_rln_score = np.inf
Theta, learning_rate = None, None

for cur_Theta, log_learning_rate in [(-8, 6), (-10, 5), (-10, 6), (-10, 7), (-12, 6)]:
    cur_learning_rate = np.power(10, log_learning_rate)
    cur_score = test_model(RLN(layers=layers, norm=1, avg_reg=cur_Theta, learning_rate=cur_learning_rate),
                           "RLN with Theta=%s and learning_rate=%.1E" % (cur_Theta, cur_learning_rate))
    # Lower score is better; remember the best configuration seen so far.
    if cur_score < best_rln_score:
        Theta, learning_rate = cur_Theta, cur_learning_rate
        best_rln_score = cur_score

print("The best RLN is achieved with Theta=%d and learning_rate=%.1E" % (Theta, learning_rate))
RLN with Theta=-8 and learning_rate=1.0E+06: 38.91 (2.31) MSE
RLN with Theta=-10 and learning_rate=1.0E+05: 34.60 (4.27) MSE
RLN with Theta=-10 and learning_rate=1.0E+06: 21.34 (2.37) MSE
RLN with Theta=-10 and learning_rate=1.0E+07: 99.58 (8.51) MSE
RLN with Theta=-12 and learning_rate=1.0E+06: 86.83 (4.99) MSE
The best RLN is achieved with Theta=-10 and learning_rate=1.0E+06
In [174]:
# Compare the best RLN score with the best L1 score, and exp(Theta) with the best l1 coefficient.
print("We see that RLN outperforms L1 regularization on this dataset, %.2f < %.2f" % (best_rln_score, best_l1_score))
print("We also see that the average regularization required in RLN is much smaller than required in L1 regularized models:")
# Theta is in log scale, so it is exponentiated before comparing it with l1.
print("%.1E << %.1E" % (np.exp(Theta), l1))
We see that RLN outperforms L1 regularization on this dataset, 21.34 < 48.57
We also see that the average regularization required in RLN is much smaller than required in L1 regularized models:
4.5E-05 << 1.0E-01