This is a quick tutorial on using the Keras RLN implementation.
First, let's import the required packages and create the train and test sets. In this tutorial we use the Boston housing price regression dataset, with additional noise features.
import numpy as np
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler
from keras import regularizers
from Keras_implementation import RLNCallback
from keras.datasets import boston_housing
from keras.models import Sequential
from keras.layers import Dense
from numpy.random import seed
from keras.backend import eval as keras_eval
import warnings
warnings.filterwarnings("ignore")

(x_train, y_train), (x_test, y_test) = boston_housing.load_data()

# Add noisy features
noise_features = 1000
x_train = np.concatenate([x_train, np.random.normal(size=(x_train.shape[0], noise_features))], axis=1)
x_test = np.concatenate([x_test, np.random.normal(size=(x_test.shape[0], noise_features))], axis=1)

# Scale features
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train, y_train)
x_test = scaler.transform(x_test)

INPUT_DIM = x_train.shape[1]
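As a quick sanity check of the setup above (the Boston housing data has 13 original features, so with the added noise each sample should have 1013 columns):

# Sanity check: 13 original features + 1000 noise features = 1013 columns expected.
print("x_train:", x_train.shape)
print("x_test:", x_test.shape)
print("INPUT_DIM:", INPUT_DIM)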
Let's create a basic Keras model and a test function:
def base_model(layers=4, l1=0):
    assert layers > 1

    def build_fn():
        inner_l1 = l1
        # Create model
        model = Sequential()
        # Construct the layers of the model so their widths form a geometric series
        prev_width = INPUT_DIM
        for width in np.exp(np.log(INPUT_DIM) * np.arange(layers - 1, 0, -1) / layers):
            width = int(np.round(width))
            model.add(Dense(width, input_dim=prev_width, kernel_initializer='glorot_normal',
                            activation='relu', kernel_regularizer=regularizers.l1(inner_l1)))
            # For efficiency we only regularize the first layer
            inner_l1 = 0
            prev_width = width
        model.add(Dense(1, kernel_initializer='glorot_normal'))
        # Compile model
        model.compile(loss='mean_squared_error', optimizer='rmsprop')
        return model

    return build_fn


MJTCP = 32292  # Michael Jordan total career points


def test_model(build_fn, model_name, num_repeats=10):
    seed(MJTCP)
    results = np.zeros(num_repeats)
    for i in range(num_repeats):
        reg = KerasRegressor(build_fn=build_fn, epochs=100, batch_size=10, verbose=0)
        reg.fit(x_train, y_train)
        results[i] = reg.score(x_test, y_test)
    print("%s: %.2f (%.2f) MSE" % (model_name, results.mean(), results.std()))
    return results.mean()
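As a quick usage example (a minimal sketch; the model name and repeat count here are arbitrary illustration values, not part of the sweeps below), a single baseline network can be evaluated like this:

# Evaluate one unregularized 4-layer baseline network.
baseline_score = test_model(base_model(layers=4), "4-layer baseline", num_repeats=3)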
Let's optimize the depth and L1 regularization of the network:
layers = 1
prev_score, cur_score = np.inf, None
while cur_score is None or cur_score < prev_score:
    if cur_score is not None:
        prev_score = cur_score
    layers += 1
    cur_score = test_model(base_model(layers=layers), "Network with %d layers" % layers)
layers -= 1
print("The best results of an unregularized network are achieved with depth %d" % layers)
Network with 2 layers: 136.02 (5.17) MSE
Network with 3 layers: 112.52 (5.58) MSE
Network with 4 layers: 106.84 (10.62) MSE
Network with 5 layers: 135.84 (103.32) MSE
The best results of an unregularized network are achieved with depth 4
l1 = 0.001
prev_score, cur_score = np.inf, None
while cur_score is None or cur_score < prev_score:
    if cur_score is not None:
        prev_score = cur_score
    l1 *= 10
    cur_score = test_model(base_model(layers=layers, l1=l1), "L1 regularization of %.0E" % l1)
best_l1_score = prev_score
l1 /= 10
print("The best L1 regularization is achieved with l1 = %.0E" % l1)
L1 regularization of 1E-02: 53.05 (6.07) MSE
L1 regularization of 1E-01: 48.57 (4.31) MSE
L1 regularization of 1E+00: 144.00 (9.74) MSE
The best L1 regularization is achieved with l1 = 1E-01
The Keras RLN implementation uses a callback to change the weights of the regularized layer. The callback receives the specific layer it regularizes as a constructor parameter, and is then passed to the model's fit function:
def RLN(layers=4, **rln_kwargs):
    def build_fn():
        model = base_model(layers=layers)()

        # For efficiency we only regularize the first layer
        rln_callback = RLNCallback(model.layers[0], **rln_kwargs)

        # Change the fit function of the model to accept rln_callback:
        orig_fit = model.fit

        def rln_fit(*args, **fit_kwargs):
            orig_callbacks = fit_kwargs.pop('callbacks', [])
            rln_callbacks = orig_callbacks + [rln_callback]
            return orig_fit(*args, callbacks=rln_callbacks, **fit_kwargs)

        model.fit = rln_fit

        return model

    return build_fn
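The wrapper above is only there to make RLN fit the KerasRegressor interface; if you train the model yourself, the callback can simply be passed to fit directly. A minimal sketch, assuming the same RLNCallback arguments used in the sweep below (norm, avg_reg, learning_rate):

# Build an unregularized network and attach the RLN callback to its first layer.
model = base_model(layers=4)()
rln_callback = RLNCallback(model.layers[0], norm=1, avg_reg=-10, learning_rate=1e6)

# The callback adjusts the per-weight regularization of that layer during training.
model.fit(x_train, y_train, epochs=100, batch_size=10, verbose=0,
          callbacks=[rln_callback])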
Now let's apply RLN to this dataset and search for the best average regularization (Theta) and learning rate (nu):
best_rln_score = np.inf
Theta, learning_rate = None, None
for cur_Theta, log_learning_rate in [(-8, 6), (-10, 5), (-10, 6), (-10, 7), (-12, 6)]:
    cur_learning_rate = np.power(10, log_learning_rate)
    cur_score = test_model(RLN(layers=layers, norm=1, avg_reg=cur_Theta, learning_rate=cur_learning_rate),
                           "RLN with Theta=%s and learning_rate=%.1E" % (cur_Theta, cur_learning_rate))
    if cur_score < best_rln_score:
        Theta, learning_rate = cur_Theta, cur_learning_rate
        best_rln_score = cur_score
print("The best RLN is achieved with Theta=%d and learning_rate=%.1E" % (Theta, learning_rate))
RLN with Theta=-8 and learning_rate=1.0E+06: 38.91 (2.31) MSE
RLN with Theta=-10 and learning_rate=1.0E+05: 34.60 (4.27) MSE
RLN with Theta=-10 and learning_rate=1.0E+06: 21.34 (2.37) MSE
RLN with Theta=-10 and learning_rate=1.0E+07: 99.58 (8.51) MSE
RLN with Theta=-12 and learning_rate=1.0E+06: 86.83 (4.99) MSE
The best RLN is achieved with Theta=-10 and learning_rate=1.0E+06
print "We see that RLN outperforms L1 regularization on this dataset, %.2f < %.2f" %(best_rln_score, best_l1_score) print "We also see that the average regularization required in RLN is much smaller than required in L1 regularized models:" print "%.1E << %.1E" % (np.exp(Theta), l1)
We see that RLN outperforms L1 regularization on this dataset, 21.34 < 48.57
We also see that the average regularization required in RLN is much smaller than required in L1 regularized models:
4.5E-05 << 1.0E-01