Chapter 11 – Deep Learning
This notebook contains all the sample code and solutions to the exercises in chapter 11.
First, let's make sure this notebook works well in both Python 2 and 3, import a few common modules, ensure matplotlib plots figures inline, and prepare a function to save the figures:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals
# Common imports
import numpy as np
import os
# to make this notebook's output stable across runs
# (note: this assumes TensorFlow has already been imported as tf by the time reset_graph() is called)
def reset_graph(seed=42):
tf.reset_default_graph()
tf.set_random_seed(seed)
np.random.seed(seed)
# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "deep"
def save_fig(fig_id, tight_layout=True):
path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
print("Saving figure", fig_id)
if tight_layout:
plt.tight_layout()
plt.savefig(path, format='png', dpi=300)
def logit(z):
return 1 / (1 + np.exp(-z)) # despite the function's name, this is the logistic (sigmoid) function
z = np.linspace(-5, 5, 200)
plt.plot([-5, 5], [0, 0], 'k-')
plt.plot([-5, 5], [1, 1], 'k--')
plt.plot([0, 0], [-0.2, 1.2], 'k-')
plt.plot([-5, 5], [-3/4, 7/4], 'g--')
plt.plot(z, logit(z), "b-", linewidth=2)
props = dict(facecolor='black', shrink=0.1)
plt.annotate('Saturating', xytext=(3.5, 0.7), xy=(5, 1), arrowprops=props, fontsize=14, ha="center")
plt.annotate('Saturating', xytext=(-3.5, 0.3), xy=(-5, 0), arrowprops=props, fontsize=14, ha="center")
plt.annotate('Linear', xytext=(2, 0.2), xy=(0, 0.5), arrowprops=props, fontsize=14, ha="center")
plt.grid(True)
plt.title("Sigmoid activation function", fontsize=14)
plt.axis([-5, 5, -0.2, 1.2])
save_fig("sigmoid_saturation_plot")
plt.show()
Saving figure sigmoid_saturation_plot
Note: the book uses tensorflow.contrib.layers.fully_connected() rather than tf.layers.dense() (which did not exist when this chapter was written). It is now preferable to use tf.layers.dense(), because anything in the contrib module may change or be deleted without notice. The dense() function is almost identical to the fully_connected() function. The main differences relevant to this chapter are:
- scope becomes name, activation_fn becomes activation (and similarly the _fn suffix is removed from other parameters such as normalizer_fn), weights_initializer becomes kernel_initializer, etc.
- activation is now None rather than tf.nn.relu.
- it is not compatible with tensorflow.contrib.framework.arg_scope() (introduced later in chapter 11).
import tensorflow as tf
reset_graph()
n_inputs = 28 * 28 # MNIST
n_hidden1 = 300
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
he_init = tf.contrib.layers.variance_scaling_initializer()
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu,
kernel_initializer=he_init, name="hidden1")
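For comparison, here is a rough sketch of the equivalent call with the contrib function used in the book; it is shown only to illustrate the parameter renaming (do not build both versions under the same scope in one graph):
# Old contrib API (illustration only): scope / activation_fn / weights_initializer
# correspond to name / activation / kernel_initializer in tf.layers.dense().
hidden1_contrib = tf.contrib.layers.fully_connected(X, n_hidden1,
                                                    activation_fn=tf.nn.relu,
                                                    weights_initializer=he_init,
                                                    scope="hidden1_contrib")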
def leaky_relu(z, alpha=0.01):
return np.maximum(alpha*z, z)
plt.plot(z, leaky_relu(z, 0.05), "b-", linewidth=2)
plt.plot([-5, 5], [0, 0], 'k-')
plt.plot([0, 0], [-0.5, 4.2], 'k-')
plt.grid(True)
props = dict(facecolor='black', shrink=0.1)
plt.annotate('Leak', xytext=(-3.5, 0.5), xy=(-5, -0.2), arrowprops=props, fontsize=14, ha="center")
plt.title("Leaky ReLU activation function", fontsize=14)
plt.axis([-5, 5, -0.5, 4.2])
save_fig("leaky_relu_plot")
plt.show()
Saving figure leaky_relu_plot
Implementing Leaky ReLU in TensorFlow:
reset_graph()
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
def leaky_relu(z, name=None):
return tf.maximum(0.01 * z, z, name=name)
hidden1 = tf.layers.dense(X, n_hidden1, activation=leaky_relu, name="hidden1")
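If you want the leak hyperparameter to be configurable per layer, one option (just a small sketch, not from the book; the helper name is made up) is to give the function an alpha argument and bind it with functools.partial() when building a layer:
from functools import partial

def parametric_leaky_relu(z, alpha=0.01, name=None):
    # same idea as leaky_relu() above, but alpha can be chosen per layer
    return tf.maximum(alpha * z, z, name=name)

# e.g. a layer with a 0.05 leak (any callable works as the activation argument)
hidden1_leaky = tf.layers.dense(X, n_hidden1,
                                activation=partial(parametric_leaky_relu, alpha=0.05),
                                name="hidden1_leaky")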
Let's train a neural network on MNIST using the Leaky ReLU. First let's create the graph:
reset_graph()
n_inputs = 28 * 28 # MNIST
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")
with tf.name_scope("dnn"):
hidden1 = tf.layers.dense(X, n_hidden1, activation=leaky_relu, name="hidden1")
hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=leaky_relu, name="hidden2")
logits = tf.layers.dense(hidden2, n_outputs, name="outputs")
with tf.name_scope("loss"):
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
learning_rate = 0.01
with tf.name_scope("train"):
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
training_op = optimizer.minimize(loss)
with tf.name_scope("eval"):
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
init = tf.global_variables_initializer()
saver = tf.train.Saver()
Let's load the data:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/")
Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz
n_epochs = 40
batch_size = 50
with tf.Session() as sess:
init.run()
for epoch in range(n_epochs):
for iteration in range(mnist.train.num_examples // batch_size):
X_batch, y_batch = mnist.train.next_batch(batch_size)
sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
if epoch % 5 == 0:
acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
acc_test = accuracy.eval(feed_dict={X: mnist.validation.images, y: mnist.validation.labels})
print(epoch, "Batch accuracy:", acc_train, "Validation accuracy:", acc_test)
save_path = saver.save(sess, "./my_model_final.ckpt")
0 Batch accuracy: 0.86 Validation accuracy: 0.9044
5 Batch accuracy: 0.94 Validation accuracy: 0.9508
10 Batch accuracy: 0.96 Validation accuracy: 0.9666
15 Batch accuracy: 1.0 Validation accuracy: 0.9722
20 Batch accuracy: 1.0 Validation accuracy: 0.975
25 Batch accuracy: 1.0 Validation accuracy: 0.9766
30 Batch accuracy: 0.98 Validation accuracy: 0.9782
35 Batch accuracy: 0.96 Validation accuracy: 0.9792
def elu(z, alpha=1):
return np.where(z < 0, alpha * (np.exp(z) - 1), z)
plt.plot(z, elu(z), "b-", linewidth=2)
plt.plot([-5, 5], [0, 0], 'k-')
plt.plot([-5, 5], [-1, -1], 'k--')
plt.plot([0, 0], [-2.2, 3.2], 'k-')
plt.grid(True)
plt.title(r"ELU activation function ($\alpha=1$)", fontsize=14)
plt.axis([-5, 5, -2.2, 3.2])
save_fig("elu_plot")
plt.show()
Saving figure elu_plot
Implementing ELU in TensorFlow is trivial: just specify the activation function when building each layer:
reset_graph()
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.elu, name="hidden1")
The SELU activation function was proposed in this great paper by Günter Klambauer, Thomas Unterthiner and Andreas Mayr, published in June 2017 (I will definitely add it to the book). It outperforms the other activation functions very significantly for deep neural networks, so you should really try it out.
def selu(z,
scale=1.0507009873554804934193349852946,
alpha=1.6732632423543772848170429916717):
return scale * elu(z, alpha)
plt.plot(z, selu(z), "b-", linewidth=2)
plt.plot([-5, 5], [0, 0], 'k-')
plt.plot([-5, 5], [-1.758, -1.758], 'k--')
plt.plot([0, 0], [-2.2, 3.2], 'k-')
plt.grid(True)
plt.title(r"SELU activation function", fontsize=14)
plt.axis([-5, 5, -2.2, 3.2])
save_fig("selu_plot")
plt.show()
Saving figure selu_plot
With this activation function, even a 100-layer-deep neural network preserves roughly mean 0 and standard deviation 1 across all layers, avoiding the exploding/vanishing gradients problem:
np.random.seed(42)
Z = np.random.normal(size=(500, 100))
for layer in range(100):
W = np.random.normal(size=(100, 100), scale=np.sqrt(1/100))
Z = selu(np.dot(Z, W))
means = np.mean(Z, axis=1)
stds = np.std(Z, axis=1)
if layer % 10 == 0:
print("Layer {}: {:.2f} < mean < {:.2f}, {:.2f} < std deviation < {:.2f}".format(
layer, means.min(), means.max(), stds.min(), stds.max()))
Layer 0: -0.26 < mean < 0.27, 0.74 < std deviation < 1.27
Layer 10: -0.24 < mean < 0.27, 0.74 < std deviation < 1.27
Layer 20: -0.17 < mean < 0.18, 0.74 < std deviation < 1.24
Layer 30: -0.27 < mean < 0.24, 0.78 < std deviation < 1.20
Layer 40: -0.38 < mean < 0.39, 0.74 < std deviation < 1.25
Layer 50: -0.27 < mean < 0.31, 0.73 < std deviation < 1.27
Layer 60: -0.26 < mean < 0.43, 0.74 < std deviation < 1.35
Layer 70: -0.19 < mean < 0.21, 0.75 < std deviation < 1.21
Layer 80: -0.18 < mean < 0.16, 0.72 < std deviation < 1.19
Layer 90: -0.19 < mean < 0.16, 0.75 < std deviation < 1.20
Here's a TensorFlow implementation (there will almost certainly be a tf.nn.selu()
function in future TensorFlow versions):
def selu(z,
scale=1.0507009873554804934193349852946,
alpha=1.6732632423543772848170429916717):
return scale * tf.where(z >= 0.0, z, alpha * tf.nn.elu(z))
SELUs can also be combined with dropout; check out this implementation by the Institute of Bioinformatics, Johannes Kepler University Linz.
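To give a rough idea of what that combination involves, here is a NumPy sketch of the "alpha dropout" technique described in the SELU paper (this is only an illustration of the idea, not the linked implementation): dropped units are set to the SELU negative saturation value rather than 0, and an affine correction keeps the mean and variance unchanged:
def alpha_dropout(x, keep_prob, alpha_prime=-1.7580993408473766):
    # alpha_prime = -scale * alpha, the SELU negative saturation value
    mask = np.random.rand(*x.shape) < keep_prob
    dropped = np.where(mask, x, alpha_prime)  # drop to alpha_prime instead of 0
    # affine correction so mean 0 and variance 1 are preserved
    a = (keep_prob + alpha_prime ** 2 * keep_prob * (1 - keep_prob)) ** -0.5
    b = -a * alpha_prime * (1 - keep_prob)
    return a * dropped + b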
Let's create a neural net for MNIST using the SELU activation function:
reset_graph()
n_inputs = 28 * 28 # MNIST
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")
with tf.name_scope("dnn"):
hidden1 = tf.layers.dense(X, n_hidden1, activation=selu, name="hidden1")
hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=selu, name="hidden2")
logits = tf.layers.dense(hidden2, n_outputs, name="outputs")
with tf.name_scope("loss"):
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
learning_rate = 0.01
with tf.name_scope("train"):
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
training_op = optimizer.minimize(loss)
with tf.name_scope("eval"):
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
init = tf.global_variables_initializer()
saver = tf.train.Saver()
n_epochs = 40
batch_size = 50
Now let's train it. Do not forget to scale the inputs to mean 0 and standard deviation 1:
means = mnist.train.images.mean(axis=0, keepdims=True)
stds = mnist.train.images.std(axis=0, keepdims=True) + 1e-10
with tf.Session() as sess:
init.run()
for epoch in range(n_epochs):
for iteration in range(mnist.train.num_examples // batch_size):
X_batch, y_batch = mnist.train.next_batch(batch_size)
X_batch_scaled = (X_batch - means) / stds
sess.run(training_op, feed_dict={X: X_batch_scaled, y: y_batch})
if epoch % 5 == 0:
acc_train = accuracy.eval(feed_dict={X: X_batch_scaled, y: y_batch})
X_val_scaled = (mnist.validation.images - means) / stds
acc_test = accuracy.eval(feed_dict={X: X_val_scaled, y: mnist.validation.labels})
print(epoch, "Batch accuracy:", acc_train, "Validation accuracy:", acc_test)
save_path = saver.save(sess, "./my_model_final_selu.ckpt")
0 Batch accuracy: 0.96 Validation accuracy: 0.924
5 Batch accuracy: 1.0 Validation accuracy: 0.9568
10 Batch accuracy: 0.94 Validation accuracy: 0.9668
15 Batch accuracy: 0.98 Validation accuracy: 0.9684
20 Batch accuracy: 1.0 Validation accuracy: 0.9712
25 Batch accuracy: 1.0 Validation accuracy: 0.9694
30 Batch accuracy: 1.0 Validation accuracy: 0.97
35 Batch accuracy: 1.0 Validation accuracy: 0.971
Note: the book uses tensorflow.contrib.layers.batch_norm() rather than tf.layers.batch_normalization() (which did not exist when this chapter was written). It is now preferable to use tf.layers.batch_normalization(), because anything in the contrib module may change or be deleted without notice. Instead of using the batch_norm() function as a regularizer parameter to the fully_connected() function, we now use batch_normalization() and we explicitly create a distinct layer. The parameters are a bit different, in particular:
- decay is renamed to momentum,
- is_training is renamed to training,
- updates_collections is removed: the update operations needed by batch normalization are added to the UPDATE_OPS collection and you need to explicitly run these operations during training (see the execution phase below),
- we don't need to specify scale=True, as that is the default.
Also note that in order to run batch norm just before each hidden layer's activation function, we apply the ELU activation function manually, right after the batch norm layer.
Note: since the tf.layers.dense()
function is incompatible with tf.contrib.layers.arg_scope()
(which is used in the book), we now use Python's functools.partial()
function instead. It makes it easy to create a my_dense_layer()
function that just calls tf.layers.dense()
with the desired parameters automatically set (unless they are overridden when calling my_dense_layer()
). As you can see, the code remains very similar.
reset_graph()
import tensorflow as tf
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
training = tf.placeholder_with_default(False, shape=(), name='training')
hidden1 = tf.layers.dense(X, n_hidden1, name="hidden1")
bn1 = tf.layers.batch_normalization(hidden1, training=training, momentum=0.9)
bn1_act = tf.nn.elu(bn1)
hidden2 = tf.layers.dense(bn1_act, n_hidden2, name="hidden2")
bn2 = tf.layers.batch_normalization(hidden2, training=training, momentum=0.9)
bn2_act = tf.nn.elu(bn2)
logits_before_bn = tf.layers.dense(bn2_act, n_outputs, name="outputs")
logits = tf.layers.batch_normalization(logits_before_bn, training=training,
momentum=0.9)
reset_graph()
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
training = tf.placeholder_with_default(False, shape=(), name='training')
To avoid repeating the same parameters over and over again, we can use Python's partial()
function:
from functools import partial
my_batch_norm_layer = partial(tf.layers.batch_normalization,
training=training, momentum=0.9)
hidden1 = tf.layers.dense(X, n_hidden1, name="hidden1")
bn1 = my_batch_norm_layer(hidden1)
bn1_act = tf.nn.elu(bn1)
hidden2 = tf.layers.dense(bn1_act, n_hidden2, name="hidden2")
bn2 = my_batch_norm_layer(hidden2)
bn2_act = tf.nn.elu(bn2)
logits_before_bn = tf.layers.dense(bn2_act, n_outputs, name="outputs")
logits = my_batch_norm_layer(logits_before_bn)
Let's build a neural net for MNIST, using the ELU activation function and Batch Normalization at each layer:
reset_graph()
batch_norm_momentum = 0.9
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")
training = tf.placeholder_with_default(False, shape=(), name='training')
with tf.name_scope("dnn"):
he_init = tf.contrib.layers.variance_scaling_initializer()
my_batch_norm_layer = partial(
tf.layers.batch_normalization,
training=training,
momentum=batch_norm_momentum)
my_dense_layer = partial(
tf.layers.dense,
kernel_initializer=he_init)
hidden1 = my_dense_layer(X, n_hidden1, name="hidden1")
bn1 = tf.nn.elu(my_batch_norm_layer(hidden1))
hidden2 = my_dense_layer(bn1, n_hidden2, name="hidden2")
bn2 = tf.nn.elu(my_batch_norm_layer(hidden2))
logits_before_bn = my_dense_layer(bn2, n_outputs, name="outputs")
logits = my_batch_norm_layer(logits_before_bn)
with tf.name_scope("loss"):
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
with tf.name_scope("train"):
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
training_op = optimizer.minimize(loss)
with tf.name_scope("eval"):
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
init = tf.global_variables_initializer()
saver = tf.train.Saver()
Note: since we are using tf.layers.batch_normalization()
rather than tf.contrib.layers.batch_norm()
(as in the book), we need to explicitly run the extra update operations needed by batch normalization (sess.run([training_op, extra_update_ops],...
).
n_epochs = 20
batch_size = 200
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.Session() as sess:
init.run()
for epoch in range(n_epochs):
for iteration in range(mnist.train.num_examples // batch_size):
X_batch, y_batch = mnist.train.next_batch(batch_size)
sess.run([training_op, extra_update_ops],
feed_dict={training: True, X: X_batch, y: y_batch})
accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,
y: mnist.test.labels})
print(epoch, "Test accuracy:", accuracy_val)
save_path = saver.save(sess, "./my_model_final.ckpt")
0 Test accuracy: 0.8727
1 Test accuracy: 0.8981
2 Test accuracy: 0.9129
3 Test accuracy: 0.922
4 Test accuracy: 0.9292
5 Test accuracy: 0.9342
6 Test accuracy: 0.9381
7 Test accuracy: 0.9419
8 Test accuracy: 0.9451
9 Test accuracy: 0.9471
10 Test accuracy: 0.9507
11 Test accuracy: 0.9521
12 Test accuracy: 0.9553
13 Test accuracy: 0.956
14 Test accuracy: 0.957
15 Test accuracy: 0.9583
16 Test accuracy: 0.9613
17 Test accuracy: 0.9608
18 Test accuracy: 0.9627
19 Test accuracy: 0.963
What!? That's not a great accuracy for MNIST. Of course, if you train for longer it will get much better accuracy, but with such a shallow network, Batch Norm and ELU are unlikely to have a very positive impact: they shine mostly for much deeper nets.
Note that you could also make the training operation depend on the update operations:
with tf.name_scope("train"):
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(extra_update_ops):
training_op = optimizer.minimize(loss)
This way, you would just have to evaluate the training_op
during training; TensorFlow would automatically run the update operations as well:
sess.run(training_op, feed_dict={training: True, X: X_batch, y: y_batch})
One more thing: notice that the list of trainable variables is shorter than the list of all global variables. This is because the moving averages are non-trainable variables. If you want to reuse a pretrained neural network (see below), you must not forget these non-trainable variables.
[v.name for v in tf.trainable_variables()]
['hidden1/kernel:0', 'hidden1/bias:0', 'batch_normalization/beta:0', 'batch_normalization/gamma:0', 'hidden2/kernel:0', 'hidden2/bias:0', 'batch_normalization_1/beta:0', 'batch_normalization_1/gamma:0', 'outputs/kernel:0', 'outputs/bias:0', 'batch_normalization_2/beta:0', 'batch_normalization_2/gamma:0']
[v.name for v in tf.global_variables()]
['hidden1/kernel:0', 'hidden1/bias:0', 'batch_normalization/beta:0', 'batch_normalization/gamma:0', 'batch_normalization/moving_mean:0', 'batch_normalization/moving_variance:0', 'hidden2/kernel:0', 'hidden2/bias:0', 'batch_normalization_1/beta:0', 'batch_normalization_1/gamma:0', 'batch_normalization_1/moving_mean:0', 'batch_normalization_1/moving_variance:0', 'outputs/kernel:0', 'outputs/bias:0', 'batch_normalization_2/beta:0', 'batch_normalization_2/gamma:0', 'batch_normalization_2/moving_mean:0', 'batch_normalization_2/moving_variance:0']
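For example, if you later want to reuse the lower layers of this network, a minimal sketch (assuming the variable names listed above) is to build the restore Saver from GLOBAL_VARIABLES rather than TRAINABLE_VARIABLES, so that the moving averages are restored too:
# Collect everything under hidden1, hidden2 and the batch norm layers,
# including the non-trainable moving_mean / moving_variance variables.
reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                               scope="hidden[12]|batch_normalization")
restore_saver = tf.train.Saver({var.op.name: var for var in reuse_vars})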
Let's create a simple neural net for MNIST and add gradient clipping. The first part is the same as earlier (except we added a few more layers to demonstrate reusing pretrained models, see below):
reset_graph()
n_inputs = 28 * 28 # MNIST
n_hidden1 = 300
n_hidden2 = 50
n_hidden3 = 50
n_hidden4 = 50
n_hidden5 = 50
n_outputs = 10
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")
with tf.name_scope("dnn"):
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1")
hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name="hidden2")
hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, name="hidden3")
hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu, name="hidden4")
hidden5 = tf.layers.dense(hidden4, n_hidden5, activation=tf.nn.relu, name="hidden5")
logits = tf.layers.dense(hidden5, n_outputs, name="outputs")
with tf.name_scope("loss"):
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
learning_rate = 0.01
Now we apply gradient clipping. For this, we need to get the gradients, use the clip_by_value()
function to clip them, then apply them:
threshold = 1.0
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
grads_and_vars = optimizer.compute_gradients(loss)
capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)
for grad, var in grads_and_vars]
training_op = optimizer.apply_gradients(capped_gvs)
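Note that clipping each gradient component to [-threshold, threshold] is not the only option: you could instead clip the gradients jointly by their global norm. Here is a minimal sketch using tf.clip_by_global_norm() (not used in the rest of this notebook):
grads, variables = zip(*optimizer.compute_gradients(loss))
clipped_grads, _ = tf.clip_by_global_norm(grads, clip_norm=threshold)
training_op_by_norm = optimizer.apply_gradients(zip(clipped_grads, variables))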
The rest is the same as usual:
with tf.name_scope("eval"):
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")
init = tf.global_variables_initializer()
saver = tf.train.Saver()
n_epochs = 20
batch_size = 200
with tf.Session() as sess:
init.run()
for epoch in range(n_epochs):
for iteration in range(mnist.train.num_examples // batch_size):
X_batch, y_batch = mnist.train.next_batch(batch_size)
sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,
y: mnist.test.labels})
print(epoch, "Test accuracy:", accuracy_val)
save_path = saver.save(sess, "./my_model_final.ckpt")
0 Test accuracy: 0.3139
1 Test accuracy: 0.8001
2 Test accuracy: 0.8806
3 Test accuracy: 0.9037
4 Test accuracy: 0.9124
5 Test accuracy: 0.9197
6 Test accuracy: 0.9243
7 Test accuracy: 0.9299
8 Test accuracy: 0.9331
9 Test accuracy: 0.9387
10 Test accuracy: 0.9431
11 Test accuracy: 0.9445
12 Test accuracy: 0.9455
13 Test accuracy: 0.9485
14 Test accuracy: 0.9524
15 Test accuracy: 0.9511
16 Test accuracy: 0.9562
17 Test accuracy: 0.9583
18 Test accuracy: 0.9559
19 Test accuracy: 0.9605
First you need to load the graph's structure. The import_meta_graph()
function does just that, loading the graph's operations into the default graph, and returning a Saver
that you can then use to restore the model's state. Note that by default, a Saver
saves the structure of the graph into a .meta
file, so that's the file you should load:
reset_graph()
saver = tf.train.import_meta_graph("./my_model_final.ckpt.meta")
Next you need to get a handle on all the operations you will need for training. If you don't know the graph's structure, you can list all the operations:
for op in tf.get_default_graph().get_operations():
print(op.name)
X y hidden1/kernel/Initializer/random_uniform/shape hidden1/kernel/Initializer/random_uniform/min hidden1/kernel/Initializer/random_uniform/max hidden1/kernel/Initializer/random_uniform/RandomUniform hidden1/kernel/Initializer/random_uniform/sub hidden1/kernel/Initializer/random_uniform/mul hidden1/kernel/Initializer/random_uniform hidden1/kernel hidden1/kernel/Assign hidden1/kernel/read hidden1/bias/Initializer/Const hidden1/bias hidden1/bias/Assign hidden1/bias/read dnn/hidden1/MatMul dnn/hidden1/BiasAdd dnn/hidden1/Relu hidden2/kernel/Initializer/random_uniform/shape hidden2/kernel/Initializer/random_uniform/min hidden2/kernel/Initializer/random_uniform/max hidden2/kernel/Initializer/random_uniform/RandomUniform hidden2/kernel/Initializer/random_uniform/sub hidden2/kernel/Initializer/random_uniform/mul hidden2/kernel/Initializer/random_uniform hidden2/kernel hidden2/kernel/Assign hidden2/kernel/read hidden2/bias/Initializer/Const hidden2/bias hidden2/bias/Assign hidden2/bias/read dnn/hidden2/MatMul dnn/hidden2/BiasAdd dnn/hidden2/Relu hidden3/kernel/Initializer/random_uniform/shape hidden3/kernel/Initializer/random_uniform/min hidden3/kernel/Initializer/random_uniform/max hidden3/kernel/Initializer/random_uniform/RandomUniform hidden3/kernel/Initializer/random_uniform/sub hidden3/kernel/Initializer/random_uniform/mul hidden3/kernel/Initializer/random_uniform hidden3/kernel hidden3/kernel/Assign hidden3/kernel/read hidden3/bias/Initializer/Const hidden3/bias hidden3/bias/Assign hidden3/bias/read dnn/hidden3/MatMul dnn/hidden3/BiasAdd dnn/hidden3/Relu hidden4/kernel/Initializer/random_uniform/shape hidden4/kernel/Initializer/random_uniform/min hidden4/kernel/Initializer/random_uniform/max hidden4/kernel/Initializer/random_uniform/RandomUniform hidden4/kernel/Initializer/random_uniform/sub hidden4/kernel/Initializer/random_uniform/mul hidden4/kernel/Initializer/random_uniform hidden4/kernel hidden4/kernel/Assign hidden4/kernel/read hidden4/bias/Initializer/Const hidden4/bias hidden4/bias/Assign hidden4/bias/read dnn/hidden4/MatMul dnn/hidden4/BiasAdd dnn/hidden4/Relu hidden5/kernel/Initializer/random_uniform/shape hidden5/kernel/Initializer/random_uniform/min hidden5/kernel/Initializer/random_uniform/max hidden5/kernel/Initializer/random_uniform/RandomUniform hidden5/kernel/Initializer/random_uniform/sub hidden5/kernel/Initializer/random_uniform/mul hidden5/kernel/Initializer/random_uniform hidden5/kernel hidden5/kernel/Assign hidden5/kernel/read hidden5/bias/Initializer/Const hidden5/bias hidden5/bias/Assign hidden5/bias/read dnn/hidden5/MatMul dnn/hidden5/BiasAdd dnn/hidden5/Relu outputs/kernel/Initializer/random_uniform/shape outputs/kernel/Initializer/random_uniform/min outputs/kernel/Initializer/random_uniform/max outputs/kernel/Initializer/random_uniform/RandomUniform outputs/kernel/Initializer/random_uniform/sub outputs/kernel/Initializer/random_uniform/mul outputs/kernel/Initializer/random_uniform outputs/kernel outputs/kernel/Assign outputs/kernel/read outputs/bias/Initializer/Const outputs/bias outputs/bias/Assign outputs/bias/read dnn/outputs/MatMul dnn/outputs/BiasAdd loss/SparseSoftmaxCrossEntropyWithLogits/Shape loss/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits loss/Const loss/loss gradients/Shape gradients/Const gradients/Fill gradients/loss/loss_grad/Reshape/shape gradients/loss/loss_grad/Reshape gradients/loss/loss_grad/Shape gradients/loss/loss_grad/Tile gradients/loss/loss_grad/Shape_1 gradients/loss/loss_grad/Shape_2 
gradients/loss/loss_grad/Const gradients/loss/loss_grad/Prod gradients/loss/loss_grad/Const_1 gradients/loss/loss_grad/Prod_1 gradients/loss/loss_grad/Maximum/y gradients/loss/loss_grad/Maximum gradients/loss/loss_grad/floordiv gradients/loss/loss_grad/Cast gradients/loss/loss_grad/truediv gradients/zeros_like gradients/loss/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits_grad/PreventGradient gradients/loss/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim gradients/loss/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits_grad/ExpandDims gradients/loss/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits_grad/mul gradients/dnn/outputs/BiasAdd_grad/BiasAddGrad gradients/dnn/outputs/BiasAdd_grad/tuple/group_deps gradients/dnn/outputs/BiasAdd_grad/tuple/control_dependency gradients/dnn/outputs/BiasAdd_grad/tuple/control_dependency_1 gradients/dnn/outputs/MatMul_grad/MatMul gradients/dnn/outputs/MatMul_grad/MatMul_1 gradients/dnn/outputs/MatMul_grad/tuple/group_deps gradients/dnn/outputs/MatMul_grad/tuple/control_dependency gradients/dnn/outputs/MatMul_grad/tuple/control_dependency_1 gradients/dnn/hidden5/Relu_grad/ReluGrad gradients/dnn/hidden5/BiasAdd_grad/BiasAddGrad gradients/dnn/hidden5/BiasAdd_grad/tuple/group_deps gradients/dnn/hidden5/BiasAdd_grad/tuple/control_dependency gradients/dnn/hidden5/BiasAdd_grad/tuple/control_dependency_1 gradients/dnn/hidden5/MatMul_grad/MatMul gradients/dnn/hidden5/MatMul_grad/MatMul_1 gradients/dnn/hidden5/MatMul_grad/tuple/group_deps gradients/dnn/hidden5/MatMul_grad/tuple/control_dependency gradients/dnn/hidden5/MatMul_grad/tuple/control_dependency_1 gradients/dnn/hidden4/Relu_grad/ReluGrad gradients/dnn/hidden4/BiasAdd_grad/BiasAddGrad gradients/dnn/hidden4/BiasAdd_grad/tuple/group_deps gradients/dnn/hidden4/BiasAdd_grad/tuple/control_dependency gradients/dnn/hidden4/BiasAdd_grad/tuple/control_dependency_1 gradients/dnn/hidden4/MatMul_grad/MatMul gradients/dnn/hidden4/MatMul_grad/MatMul_1 gradients/dnn/hidden4/MatMul_grad/tuple/group_deps gradients/dnn/hidden4/MatMul_grad/tuple/control_dependency gradients/dnn/hidden4/MatMul_grad/tuple/control_dependency_1 gradients/dnn/hidden3/Relu_grad/ReluGrad gradients/dnn/hidden3/BiasAdd_grad/BiasAddGrad gradients/dnn/hidden3/BiasAdd_grad/tuple/group_deps gradients/dnn/hidden3/BiasAdd_grad/tuple/control_dependency gradients/dnn/hidden3/BiasAdd_grad/tuple/control_dependency_1 gradients/dnn/hidden3/MatMul_grad/MatMul gradients/dnn/hidden3/MatMul_grad/MatMul_1 gradients/dnn/hidden3/MatMul_grad/tuple/group_deps gradients/dnn/hidden3/MatMul_grad/tuple/control_dependency gradients/dnn/hidden3/MatMul_grad/tuple/control_dependency_1 gradients/dnn/hidden2/Relu_grad/ReluGrad gradients/dnn/hidden2/BiasAdd_grad/BiasAddGrad gradients/dnn/hidden2/BiasAdd_grad/tuple/group_deps gradients/dnn/hidden2/BiasAdd_grad/tuple/control_dependency gradients/dnn/hidden2/BiasAdd_grad/tuple/control_dependency_1 gradients/dnn/hidden2/MatMul_grad/MatMul gradients/dnn/hidden2/MatMul_grad/MatMul_1 gradients/dnn/hidden2/MatMul_grad/tuple/group_deps gradients/dnn/hidden2/MatMul_grad/tuple/control_dependency gradients/dnn/hidden2/MatMul_grad/tuple/control_dependency_1 gradients/dnn/hidden1/Relu_grad/ReluGrad gradients/dnn/hidden1/BiasAdd_grad/BiasAddGrad gradients/dnn/hidden1/BiasAdd_grad/tuple/group_deps gradients/dnn/hidden1/BiasAdd_grad/tuple/control_dependency gradients/dnn/hidden1/BiasAdd_grad/tuple/control_dependency_1 
gradients/dnn/hidden1/MatMul_grad/MatMul gradients/dnn/hidden1/MatMul_grad/MatMul_1 gradients/dnn/hidden1/MatMul_grad/tuple/group_deps gradients/dnn/hidden1/MatMul_grad/tuple/control_dependency gradients/dnn/hidden1/MatMul_grad/tuple/control_dependency_1 clip_by_value/Minimum/y clip_by_value/Minimum clip_by_value/y clip_by_value clip_by_value_1/Minimum/y clip_by_value_1/Minimum clip_by_value_1/y clip_by_value_1 clip_by_value_2/Minimum/y clip_by_value_2/Minimum clip_by_value_2/y clip_by_value_2 clip_by_value_3/Minimum/y clip_by_value_3/Minimum clip_by_value_3/y clip_by_value_3 clip_by_value_4/Minimum/y clip_by_value_4/Minimum clip_by_value_4/y clip_by_value_4 clip_by_value_5/Minimum/y clip_by_value_5/Minimum clip_by_value_5/y clip_by_value_5 clip_by_value_6/Minimum/y clip_by_value_6/Minimum clip_by_value_6/y clip_by_value_6 clip_by_value_7/Minimum/y clip_by_value_7/Minimum clip_by_value_7/y clip_by_value_7 clip_by_value_8/Minimum/y clip_by_value_8/Minimum clip_by_value_8/y clip_by_value_8 clip_by_value_9/Minimum/y clip_by_value_9/Minimum clip_by_value_9/y clip_by_value_9 clip_by_value_10/Minimum/y clip_by_value_10/Minimum clip_by_value_10/y clip_by_value_10 clip_by_value_11/Minimum/y clip_by_value_11/Minimum clip_by_value_11/y clip_by_value_11 GradientDescent/learning_rate GradientDescent/update_hidden1/kernel/ApplyGradientDescent GradientDescent/update_hidden1/bias/ApplyGradientDescent GradientDescent/update_hidden2/kernel/ApplyGradientDescent GradientDescent/update_hidden2/bias/ApplyGradientDescent GradientDescent/update_hidden3/kernel/ApplyGradientDescent GradientDescent/update_hidden3/bias/ApplyGradientDescent GradientDescent/update_hidden4/kernel/ApplyGradientDescent GradientDescent/update_hidden4/bias/ApplyGradientDescent GradientDescent/update_hidden5/kernel/ApplyGradientDescent GradientDescent/update_hidden5/bias/ApplyGradientDescent GradientDescent/update_outputs/kernel/ApplyGradientDescent GradientDescent/update_outputs/bias/ApplyGradientDescent GradientDescent eval/InTopK eval/Cast eval/Const eval/accuracy init save/Const save/SaveV2/tensor_names save/SaveV2/shape_and_slices save/SaveV2 save/control_dependency save/RestoreV2/tensor_names save/RestoreV2/shape_and_slices save/RestoreV2 save/Assign save/RestoreV2_1/tensor_names save/RestoreV2_1/shape_and_slices save/RestoreV2_1 save/Assign_1 save/RestoreV2_2/tensor_names save/RestoreV2_2/shape_and_slices save/RestoreV2_2 save/Assign_2 save/RestoreV2_3/tensor_names save/RestoreV2_3/shape_and_slices save/RestoreV2_3 save/Assign_3 save/RestoreV2_4/tensor_names save/RestoreV2_4/shape_and_slices save/RestoreV2_4 save/Assign_4 save/RestoreV2_5/tensor_names save/RestoreV2_5/shape_and_slices save/RestoreV2_5 save/Assign_5 save/RestoreV2_6/tensor_names save/RestoreV2_6/shape_and_slices save/RestoreV2_6 save/Assign_6 save/RestoreV2_7/tensor_names save/RestoreV2_7/shape_and_slices save/RestoreV2_7 save/Assign_7 save/RestoreV2_8/tensor_names save/RestoreV2_8/shape_and_slices save/RestoreV2_8 save/Assign_8 save/RestoreV2_9/tensor_names save/RestoreV2_9/shape_and_slices save/RestoreV2_9 save/Assign_9 save/RestoreV2_10/tensor_names save/RestoreV2_10/shape_and_slices save/RestoreV2_10 save/Assign_10 save/RestoreV2_11/tensor_names save/RestoreV2_11/shape_and_slices save/RestoreV2_11 save/Assign_11 save/restore_all
Oops, that's a lot of operations! It's much easier to use TensorBoard to visualize the graph. The following hack will allow you to visualize the graph within Jupyter (if it does not work with your browser, you will need to use a FileWriter
to save the graph and then visualize it in TensorBoard):
from IPython.display import clear_output, Image, display, HTML
def strip_consts(graph_def, max_const_size=32):
"""Strip large constant values from graph_def."""
strip_def = tf.GraphDef()
for n0 in graph_def.node:
n = strip_def.node.add()
n.MergeFrom(n0)
if n.op == 'Const':
tensor = n.attr['value'].tensor
size = len(tensor.tensor_content)
if size > max_const_size:
tensor.tensor_content = b"<stripped %d bytes>"%size
return strip_def
def show_graph(graph_def, max_const_size=32):
"""Visualize TensorFlow graph."""
if hasattr(graph_def, 'as_graph_def'):
graph_def = graph_def.as_graph_def()
strip_def = strip_consts(graph_def, max_const_size=max_const_size)
code = """
<script>
function load() {{
document.getElementById("{id}").pbtxt = {data};
}}
</script>
<link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
<div style="height:600px">
<tf-graph-basic id="{id}"></tf-graph-basic>
</div>
""".format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))
iframe = """
<iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
""".format(code.replace('"', '"'))
display(HTML(iframe))
show_graph(tf.get_default_graph())
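If the inline visualization above does not work in your browser, a minimal FileWriter-based fallback could look like this (the log directory name is arbitrary); you can then run tensorboard --logdir tf_logs and open the Graphs tab:
# Write the default graph to disk so TensorBoard can display it.
file_writer = tf.summary.FileWriter("tf_logs/restored_graph",
                                    tf.get_default_graph())
file_writer.close()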
Once you know which operations you need, you can get a handle on them using the graph's get_operation_by_name()
or get_tensor_by_name()
methods:
X = tf.get_default_graph().get_tensor_by_name("X:0")
y = tf.get_default_graph().get_tensor_by_name("y:0")
accuracy = tf.get_default_graph().get_tensor_by_name("eval/accuracy:0")
training_op = tf.get_default_graph().get_operation_by_name("GradientDescent")
If you are the author of the original model, you could make things easier for people who will reuse your model by giving operations very clear names and documenting them. Another approach is to create a collection containing all the important operations that people will want to get a handle on:
for op in (X, y, accuracy, training_op):
tf.add_to_collection("my_important_ops", op)
This way people who reuse your model will be able to simply write:
X, y, accuracy, training_op = tf.get_collection("my_important_ops")
Now you can start a session, restore the model's state and continue training on your data:
with tf.Session() as sess:
saver.restore(sess, "./my_model_final.ckpt")
# continue training the model...
INFO:tensorflow:Restoring parameters from ./my_model_final.ckpt
Actually, let's test this for real!
with tf.Session() as sess:
saver.restore(sess, "./my_model_final.ckpt")
for epoch in range(n_epochs):
for iteration in range(mnist.train.num_examples // batch_size):
X_batch, y_batch = mnist.train.next_batch(batch_size)
sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,
y: mnist.test.labels})
print(epoch, "Test accuracy:", accuracy_val)
save_path = saver.save(sess, "./my_new_model_final.ckpt")
INFO:tensorflow:Restoring parameters from ./my_model_final.ckpt
0 Test accuracy: 0.9609
1 Test accuracy: 0.9608
2 Test accuracy: 0.9617
3 Test accuracy: 0.9613
4 Test accuracy: 0.9639
5 Test accuracy: 0.9649
6 Test accuracy: 0.9663
7 Test accuracy: 0.9627
8 Test accuracy: 0.9665
9 Test accuracy: 0.9669
10 Test accuracy: 0.9662
11 Test accuracy: 0.9674
12 Test accuracy: 0.9678
13 Test accuracy: 0.9679
14 Test accuracy: 0.9688
15 Test accuracy: 0.9684
16 Test accuracy: 0.9687
17 Test accuracy: 0.9702
18 Test accuracy: 0.9673
19 Test accuracy: 0.9687
Alternatively, if you have access to the Python code that built the original graph, you can use it instead of import_meta_graph()
:
reset_graph()
n_inputs = 28 * 28 # MNIST
n_hidden1 = 300
n_hidden2 = 50
n_hidden3 = 50
n_hidden4 = 50
n_hidden5 = 50
n_outputs = 10
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")
with tf.name_scope("dnn"):
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1")
hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name="hidden2")
hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, name="hidden3")
hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu, name="hidden4")
hidden5 = tf.layers.dense(hidden4, n_hidden5, activation=tf.nn.relu, name="hidden5")
logits = tf.layers.dense(hidden5, n_outputs, name="outputs")
with tf.name_scope("loss"):
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
with tf.name_scope("eval"):
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")
learning_rate = 0.01
threshold = 1.0
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
grads_and_vars = optimizer.compute_gradients(loss)
capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)
for grad, var in grads_and_vars]
training_op = optimizer.apply_gradients(capped_gvs)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
And continue training:
with tf.Session() as sess:
saver.restore(sess, "./my_model_final.ckpt")
for epoch in range(n_epochs):
for iteration in range(mnist.train.num_examples // batch_size):
X_batch, y_batch = mnist.train.next_batch(batch_size)
sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,
y: mnist.test.labels})
print(epoch, "Test accuracy:", accuracy_val)
save_path = saver.save(sess, "./my_new_model_final.ckpt")
INFO:tensorflow:Restoring parameters from ./my_model_final.ckpt
0 Test accuracy: 0.9611
1 Test accuracy: 0.9619
2 Test accuracy: 0.9622
3 Test accuracy: 0.9619
4 Test accuracy: 0.9644
5 Test accuracy: 0.9633
6 Test accuracy: 0.9647
7 Test accuracy: 0.9648
8 Test accuracy: 0.9671
9 Test accuracy: 0.9677
10 Test accuracy: 0.9676
11 Test accuracy: 0.9679
12 Test accuracy: 0.9687
13 Test accuracy: 0.9688
14 Test accuracy: 0.9683
15 Test accuracy: 0.9693
16 Test accuracy: 0.9677
17 Test accuracy: 0.9697
18 Test accuracy: 0.9692
19 Test accuracy: 0.9707
In general you will want to reuse only the lower layers. If you are using import_meta_graph()
it will load the whole graph, but you can simply ignore the parts you do not need. In this example, we add a new 4th hidden layer on top of the pretrained 3rd layer (ignoring the old 4th hidden layer). We also build a new output layer, the loss for this new output, and a new optimizer to minimize it. We also need another saver to save the whole graph (containing both the entire old graph and the new operations), and an initialization operation to initialize all the new variables:
reset_graph()
n_hidden4 = 20 # new layer
n_outputs = 10 # new layer
saver = tf.train.import_meta_graph("./my_model_final.ckpt.meta")
X = tf.get_default_graph().get_tensor_by_name("X:0")
y = tf.get_default_graph().get_tensor_by_name("y:0")
hidden3 = tf.get_default_graph().get_tensor_by_name("dnn/hidden3/Relu:0") # output of the pretrained 3rd hidden layer
new_hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu, name="new_hidden4")
new_logits = tf.layers.dense(new_hidden4, n_outputs, name="new_outputs")
with tf.name_scope("new_loss"):
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=new_logits)
loss = tf.reduce_mean(xentropy, name="loss")
with tf.name_scope("new_eval"):
correct = tf.nn.in_top_k(new_logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")
with tf.name_scope("new_train"):
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
training_op = optimizer.minimize(loss)
init = tf.global_variables_initializer()
new_saver = tf.train.Saver()
And we can train this new model:
with tf.Session() as sess:
init.run()
saver.restore(sess, "./my_model_final.ckpt")
for epoch in range(n_epochs):
for iteration in range(mnist.train.num_examples // batch_size):
X_batch, y_batch = mnist.train.next_batch(batch_size)
sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,
y: mnist.test.labels})
print(epoch, "Test accuracy:", accuracy_val)
save_path = new_saver.save(sess, "./my_new_model_final.ckpt")
INFO:tensorflow:Restoring parameters from ./my_model_final.ckpt
0 Test accuracy: 0.9142
1 Test accuracy: 0.9346
2 Test accuracy: 0.9437
3 Test accuracy: 0.9486
4 Test accuracy: 0.9517
5 Test accuracy: 0.9544
6 Test accuracy: 0.9544
7 Test accuracy: 0.9562
8 Test accuracy: 0.9588
9 Test accuracy: 0.9619
10 Test accuracy: 0.9617
11 Test accuracy: 0.9617
12 Test accuracy: 0.9624
13 Test accuracy: 0.9644
14 Test accuracy: 0.9622
15 Test accuracy: 0.964
16 Test accuracy: 0.9666
17 Test accuracy: 0.9668
18 Test accuracy: 0.9673
19 Test accuracy: 0.9687
If you have access to the Python code that built the original graph, you can just reuse the parts you need and drop the rest:
reset_graph()
n_inputs = 28 * 28 # MNIST
n_hidden1 = 300 # reused
n_hidden2 = 50 # reused
n_hidden3 = 50 # reused
n_hidden4 = 20 # new!
n_outputs = 10 # new!
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")
with tf.name_scope("dnn"):
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1") # reused
hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name="hidden2") # reused
hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, name="hidden3") # reused
hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu, name="hidden4") # new!
logits = tf.layers.dense(hidden4, n_outputs, name="outputs") # new!
with tf.name_scope("loss"):
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
with tf.name_scope("eval"):
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")
with tf.name_scope("train"):
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
training_op = optimizer.minimize(loss)
However, you must create one Saver
to restore the pretrained model (giving it the list of variables to restore, or else it will complain that the graphs don't match), and another Saver
to save the new model, once it is trained:
reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
scope="hidden[123]") # regular expression
reuse_vars_dict = dict([(var.op.name, var) for var in reuse_vars])
restore_saver = tf.train.Saver(reuse_vars_dict) # to restore layers 1-3
init = tf.global_variables_initializer()
saver = tf.train.Saver()
with tf.Session() as sess:
init.run()
restore_saver.restore(sess, "./my_model_final.ckpt")
for epoch in range(n_epochs): # not shown in the book
for iteration in range(mnist.train.num_examples // batch_size): # not shown
X_batch, y_batch = mnist.train.next_batch(batch_size) # not shown
sess.run(training_op, feed_dict={X: X_batch, y: y_batch}) # not shown
accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images, # not shown
y: mnist.test.labels}) # not shown
print(epoch, "Test accuracy:", accuracy_val) # not shown
save_path = saver.save(sess, "./my_new_model_final.ckpt")
INFO:tensorflow:Restoring parameters from ./my_model_final.ckpt
0 Test accuracy: 0.9022
1 Test accuracy: 0.9302
2 Test accuracy: 0.9393
3 Test accuracy: 0.9429
4 Test accuracy: 0.9484
5 Test accuracy: 0.9511
6 Test accuracy: 0.9517
7 Test accuracy: 0.9539
8 Test accuracy: 0.9545
9 Test accuracy: 0.9572
10 Test accuracy: 0.9599
11 Test accuracy: 0.9602
12 Test accuracy: 0.9606
13 Test accuracy: 0.9619
14 Test accuracy: 0.9619
15 Test accuracy: 0.9636
16 Test accuracy: 0.9633
17 Test accuracy: 0.9643
18 Test accuracy: 0.9651
19 Test accuracy: 0.9657
In this example, for each variable we want to reuse, we find its initializer's assignment operation, and we get its second input, which corresponds to the initialization value. When we run the initializer, we replace the initialization values with the ones we want, using a feed_dict
:
reset_graph()
n_inputs = 2
n_hidden1 = 3
original_w = [[1., 2., 3.], [4., 5., 6.]] # Load the weights from the other framework
original_b = [7., 8., 9.] # Load the biases from the other framework
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1")
# [...] Build the rest of the model
# Get a handle on the assignment nodes for the hidden1 variables
graph = tf.get_default_graph()
assign_kernel = graph.get_operation_by_name("hidden1/kernel/Assign")
assign_bias = graph.get_operation_by_name("hidden1/bias/Assign")
init_kernel = assign_kernel.inputs[1]
init_bias = assign_bias.inputs[1]
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init, feed_dict={init_kernel: original_w, init_bias: original_b})
# [...] Train the model on your new task
print(hidden1.eval(feed_dict={X: [[10.0, 11.0]]})) # not shown in the book
[[ 61. 83. 105.]]
Note: the weights variable created by the tf.layers.dense() function is called "kernel" (instead of "weights" as with the tf.contrib.layers.fully_connected() function used in the book), and the biases variable is called "bias" instead of "biases".
Another approach (initially used in the book) would be to create dedicated assignment nodes and dedicated placeholders. This is more verbose and less efficient, but you may find it more explicit:
reset_graph()
n_inputs = 2
n_hidden1 = 3
original_w = [[1., 2., 3.], [4., 5., 6.]] # Load the weights from the other framework
original_b = [7., 8., 9.] # Load the biases from the other framework
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1")
# [...] Build the rest of the model
# Get a handle on the variables of layer hidden1
with tf.variable_scope("", default_name="", reuse=True): # root scope
hidden1_weights = tf.get_variable("hidden1/kernel")
hidden1_biases = tf.get_variable("hidden1/bias")
# Create dedicated placeholders and assignment nodes
original_weights = tf.placeholder(tf.float32, shape=(n_inputs, n_hidden1))
original_biases = tf.placeholder(tf.float32, shape=n_hidden1)
assign_hidden1_weights = tf.assign(hidden1_weights, original_weights)
assign_hidden1_biases = tf.assign(hidden1_biases, original_biases)
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
sess.run(assign_hidden1_weights, feed_dict={original_weights: original_w})
sess.run(assign_hidden1_biases, feed_dict={original_biases: original_b})
# [...] Train the model on your new task
print(hidden1.eval(feed_dict={X: [[10.0, 11.0]]}))
[[ 61. 83. 105.]]
Note that we could also get a handle on the variables using get_collection()
and specifying the scope
:
tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="hidden1")
[<tf.Variable 'hidden1/kernel:0' shape=(2, 3) dtype=float32_ref>, <tf.Variable 'hidden1/bias:0' shape=(3,) dtype=float32_ref>]
Or we could use the graph's get_tensor_by_name()
method:
tf.get_default_graph().get_tensor_by_name("hidden1/kernel:0")
<tf.Tensor 'hidden1/kernel:0' shape=(2, 3) dtype=float32_ref>
tf.get_default_graph().get_tensor_by_name("hidden1/bias:0")
<tf.Tensor 'hidden1/bias:0' shape=(3,) dtype=float32_ref>
reset_graph()
n_inputs = 28 * 28 # MNIST
n_hidden1 = 300 # reused
n_hidden2 = 50 # reused
n_hidden3 = 50 # reused
n_hidden4 = 20 # new!
n_outputs = 10 # new!
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")
with tf.name_scope("dnn"):
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1") # reused
hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name="hidden2") # reused
hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, name="hidden3") # reused
hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu, name="hidden4") # new!
logits = tf.layers.dense(hidden4, n_outputs, name="outputs") # new!
with tf.name_scope("loss"):
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
with tf.name_scope("eval"):
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")
with tf.name_scope("train"): # not shown in the book
optimizer = tf.train.GradientDescentOptimizer(learning_rate) # not shown
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
scope="hidden[34]|outputs")
training_op = optimizer.minimize(loss, var_list=train_vars)
init = tf.global_variables_initializer()
new_saver = tf.train.Saver()
reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
scope="hidden[123]") # regular expression
reuse_vars_dict = dict([(var.op.name, var) for var in reuse_vars])
restore_saver = tf.train.Saver(reuse_vars_dict) # to restore layers 1-3
init = tf.global_variables_initializer()
saver = tf.train.Saver()
with tf.Session() as sess:
init.run()
restore_saver.restore(sess, "./my_model_final.ckpt")
for epoch in range(n_epochs):
for iteration in range(mnist.train.num_examples // batch_size):
X_batch, y_batch = mnist.train.next_batch(batch_size)
sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,
y: mnist.test.labels})
print(epoch, "Test accuracy:", accuracy_val)
save_path = saver.save(sess, "./my_new_model_final.ckpt")
INFO:tensorflow:Restoring parameters from ./my_model_final.ckpt
0 Test accuracy: 0.8987
1 Test accuracy: 0.9311
2 Test accuracy: 0.9375
3 Test accuracy: 0.9414
4 Test accuracy: 0.9437
5 Test accuracy: 0.9479
6 Test accuracy: 0.9495
7 Test accuracy: 0.9521
8 Test accuracy: 0.9517
9 Test accuracy: 0.9525
10 Test accuracy: 0.9535
11 Test accuracy: 0.9538
12 Test accuracy: 0.9534
13 Test accuracy: 0.9546
14 Test accuracy: 0.9538
15 Test accuracy: 0.9553
16 Test accuracy: 0.9552
17 Test accuracy: 0.9549
18 Test accuracy: 0.9553
19 Test accuracy: 0.9557
reset_graph()
n_inputs = 28 * 28 # MNIST
n_hidden1 = 300 # reused
n_hidden2 = 50 # reused
n_hidden3 = 50 # reused
n_hidden4 = 20 # new!
n_outputs = 10 # new!
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")
with tf.name_scope("dnn"):
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu,
name="hidden1") # reused frozen
hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu,
name="hidden2") # reused frozen
hidden2_stop = tf.stop_gradient(hidden2)
hidden3 = tf.layers.dense(hidden2_stop, n_hidden3, activation=tf.nn.relu,
name="hidden3") # reused, not frozen
hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu,
name="hidden4") # new!
logits = tf.layers.dense(hidden4, n_outputs, name="outputs") # new!
with tf.name_scope("loss"):
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
with tf.name_scope("eval"):
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")
with tf.name_scope("train"):
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
training_op = optimizer.minimize(loss)
The training code is exactly the same as earlier:
reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
scope="hidden[123]") # regular expression
reuse_vars_dict = dict([(var.op.name, var) for var in reuse_vars])
restore_saver = tf.train.Saver(reuse_vars_dict) # to restore layers 1-3
init = tf.global_variables_initializer()
saver = tf.train.Saver()
with tf.Session() as sess:
init.run()
restore_saver.restore(sess, "./my_model_final.ckpt")
for epoch in range(n_epochs):
for iteration in range(mnist.train.num_examples // batch_size):
X_batch, y_batch = mnist.train.next_batch(batch_size)
sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,
y: mnist.test.labels})
print(epoch, "Test accuracy:", accuracy_val)
save_path = saver.save(sess, "./my_new_model_final.ckpt")
INFO:tensorflow:Restoring parameters from ./my_model_final.ckpt
0 Test accuracy: 0.9031
1 Test accuracy: 0.932
2 Test accuracy: 0.94
3 Test accuracy: 0.9435
4 Test accuracy: 0.9473
5 Test accuracy: 0.9492
6 Test accuracy: 0.9498
7 Test accuracy: 0.9493
8 Test accuracy: 0.9515
9 Test accuracy: 0.9519
10 Test accuracy: 0.9529
11 Test accuracy: 0.9536
12 Test accuracy: 0.9529
13 Test accuracy: 0.9532
14 Test accuracy: 0.9522
15 Test accuracy: 0.9534
16 Test accuracy: 0.953
17 Test accuracy: 0.955
18 Test accuracy: 0.955
19 Test accuracy: 0.9552
reset_graph()
n_inputs = 28 * 28 # MNIST
n_hidden1 = 300 # reused
n_hidden2 = 50 # reused
n_hidden3 = 50 # reused
n_hidden4 = 20 # new!
n_outputs = 10 # new!
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")
with tf.name_scope("dnn"):
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu,
name="hidden1") # reused frozen
hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu,
name="hidden2") # reused frozen & cached
hidden2_stop = tf.stop_gradient(hidden2)
hidden3 = tf.layers.dense(hidden2_stop, n_hidden3, activation=tf.nn.relu,
name="hidden3") # reused, not frozen
hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu,
name="hidden4") # new!
logits = tf.layers.dense(hidden4, n_outputs, name="outputs") # new!
with tf.name_scope("loss"):
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
with tf.name_scope("eval"):
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")
with tf.name_scope("train"):
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
training_op = optimizer.minimize(loss)
reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
scope="hidden[123]") # regular expression
reuse_vars_dict = dict([(var.op.name, var) for var in reuse_vars])
restore_saver = tf.train.Saver(reuse_vars_dict) # to restore layers 1-3
init = tf.global_variables_initializer()
saver = tf.train.Saver()
import numpy as np
n_batches = mnist.train.num_examples // batch_size
with tf.Session() as sess:
init.run()
restore_saver.restore(sess, "./my_model_final.ckpt")
h2_cache = sess.run(hidden2, feed_dict={X: mnist.train.images})
h2_cache_test = sess.run(hidden2, feed_dict={X: mnist.test.images}) # not shown in the book
for epoch in range(n_epochs):
shuffled_idx = np.random.permutation(mnist.train.num_examples)
hidden2_batches = np.array_split(h2_cache[shuffled_idx], n_batches)
y_batches = np.array_split(mnist.train.labels[shuffled_idx], n_batches)
for hidden2_batch, y_batch in zip(hidden2_batches, y_batches):
sess.run(training_op, feed_dict={hidden2:hidden2_batch, y:y_batch})
accuracy_val = accuracy.eval(feed_dict={hidden2: h2_cache_test, # not shown
y: mnist.test.labels}) # not shown
print(epoch, "Test accuracy:", accuracy_val) # not shown
save_path = saver.save(sess, "./my_new_model_final.ckpt")
INFO:tensorflow:Restoring parameters from ./my_model_final.ckpt
0 Test accuracy: 0.9033
1 Test accuracy: 0.9322
2 Test accuracy: 0.9423
3 Test accuracy: 0.9449
4 Test accuracy: 0.9471
5 Test accuracy: 0.9477
6 Test accuracy: 0.951
7 Test accuracy: 0.9507
8 Test accuracy: 0.9514
9 Test accuracy: 0.9522
10 Test accuracy: 0.9512
11 Test accuracy: 0.9521
12 Test accuracy: 0.9522
13 Test accuracy: 0.9539
14 Test accuracy: 0.9536
15 Test accuracy: 0.9534
16 Test accuracy: 0.9547
17 Test accuracy: 0.9537
18 Test accuracy: 0.9542
19 Test accuracy: 0.9547
# Momentum optimization
optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
momentum=0.9)
# Nesterov Accelerated Gradient
optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
momentum=0.9, use_nesterov=True)
# AdaGrad
optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate)
# RMSProp
optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
momentum=0.9, decay=0.9, epsilon=1e-10)
# Adam optimization
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
reset_graph()
n_inputs = 28 * 28 # MNIST
n_hidden1 = 300
n_hidden2 = 50
n_outputs = 10
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")
with tf.name_scope("dnn"):
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1")
hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name="hidden2")
logits = tf.layers.dense(hidden2, n_outputs, name="outputs")
with tf.name_scope("loss"):
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
with tf.name_scope("eval"):
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")
with tf.name_scope("train"): # not shown in the book
initial_learning_rate = 0.1
decay_steps = 10000
decay_rate = 1/10
global_step = tf.Variable(0, trainable=False, name="global_step")
learning_rate = tf.train.exponential_decay(initial_learning_rate, global_step,
decay_steps, decay_rate)
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)
training_op = optimizer.minimize(loss, global_step=global_step)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
n_epochs = 5
batch_size = 50
with tf.Session() as sess:
init.run()
for epoch in range(n_epochs):
for iteration in range(mnist.train.num_examples // batch_size):
X_batch, y_batch = mnist.train.next_batch(batch_size)
sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,
y: mnist.test.labels})
print(epoch, "Test accuracy:", accuracy_val)
save_path = saver.save(sess, "./my_model_final.ckpt")
0 Test accuracy: 0.9579 1 Test accuracy: 0.9691 2 Test accuracy: 0.976 3 Test accuracy: 0.9793 4 Test accuracy: 0.9811
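For reference, tf.train.exponential_decay computes the decayed rate as initial_learning_rate * decay_rate ** (global_step / decay_steps) (with staircase=False, the default). Here is a quick sketch, not part of the book's code, of the values this schedule produces with the hyperparameters defined above:
# Sketch: decayed learning rate at a few global steps, using the formula
# implemented by tf.train.exponential_decay (staircase=False).
for step in [0, 5000, 10000, 20000, 50000]:
    lr = initial_learning_rate * decay_rate ** (step / decay_steps)
    print("step {:>6}: learning rate = {:.6f}".format(step, lr))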
Let's implement $\ell_1$ regularization manually. First, we create the model, as usual (with just one hidden layer this time, for simplicity):
reset_graph()
n_inputs = 28 * 28 # MNIST
n_hidden1 = 300
n_outputs = 10
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")
with tf.name_scope("dnn"):
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1")
logits = tf.layers.dense(hidden1, n_outputs, name="outputs")
Next, we get a handle on the layer weights, and we compute the total loss, which is equal to the sum of the usual cross entropy loss and the $\ell_1$ loss (i.e., the sum of the absolute values of the weights):
W1 = tf.get_default_graph().get_tensor_by_name("hidden1/kernel:0")
W2 = tf.get_default_graph().get_tensor_by_name("outputs/kernel:0")
scale = 0.001 # l1 regularization hyperparameter
with tf.name_scope("loss"):
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,
logits=logits)
base_loss = tf.reduce_mean(xentropy, name="avg_xentropy")
reg_losses = tf.reduce_sum(tf.abs(W1)) + tf.reduce_sum(tf.abs(W2))
loss = tf.add(base_loss, scale * reg_losses, name="loss")
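As a side note, you don't have to fetch each weight tensor by name; here is a minimal sketch (not from the book) that builds the same $\ell_1$ penalty from every kernel variable in the graph:
# Sketch: gather all dense-layer kernels and sum the absolute values of their
# elements, instead of getting W1 and W2 individually.
all_kernels = [v for v in tf.trainable_variables() if v.name.endswith("kernel:0")]
l1_penalty = tf.add_n([tf.reduce_sum(tf.abs(k)) for k in all_kernels])
loss_all = tf.add(base_loss, scale * l1_penalty, name="loss_all")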
The rest is just as usual:
with tf.name_scope("eval"):
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")
learning_rate = 0.01
with tf.name_scope("train"):
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
training_op = optimizer.minimize(loss)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
n_epochs = 20
batch_size = 200
with tf.Session() as sess:
init.run()
for epoch in range(n_epochs):
for iteration in range(mnist.train.num_examples // batch_size):
X_batch, y_batch = mnist.train.next_batch(batch_size)
sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,
y: mnist.test.labels})
print(epoch, "Test accuracy:", accuracy_val)
save_path = saver.save(sess, "./my_model_final.ckpt")
0 Test accuracy: 0.8343 1 Test accuracy: 0.8726 2 Test accuracy: 0.8832 3 Test accuracy: 0.8899 4 Test accuracy: 0.8958 5 Test accuracy: 0.8986 6 Test accuracy: 0.9011 7 Test accuracy: 0.9032 8 Test accuracy: 0.9046 9 Test accuracy: 0.9047 10 Test accuracy: 0.9065 11 Test accuracy: 0.9059 12 Test accuracy: 0.9072 13 Test accuracy: 0.9072 14 Test accuracy: 0.9069 15 Test accuracy: 0.9071 16 Test accuracy: 0.9064 17 Test accuracy: 0.9071 18 Test accuracy: 0.9068 19 Test accuracy: 0.9063
Alternatively, we can pass a regularization function to the tf.layers.dense() function, which will use it to create the operations that compute the regularization loss and add them to the collection of regularization losses. The beginning is the same as above:
reset_graph()
n_inputs = 28 * 28 # MNIST
n_hidden1 = 300
n_hidden2 = 50
n_outputs = 10
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")
Next, we will use Python's partial()
function to avoid repeating the same arguments over and over again. Note that we set the kernel_regularizer
argument:
from functools import partial

scale = 0.001
my_dense_layer = partial(
    tf.layers.dense, activation=tf.nn.relu,
    kernel_regularizer=tf.contrib.layers.l1_regularizer(scale))
with tf.name_scope("dnn"):
hidden1 = my_dense_layer(X, n_hidden1, name="hidden1")
hidden2 = my_dense_layer(hidden1, n_hidden2, name="hidden2")
logits = my_dense_layer(hidden2, n_outputs, activation=None,
name="outputs")
Next we must add the regularization losses to the base loss:
with tf.name_scope("loss"): # not shown in the book
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits( # not shown
labels=y, logits=logits) # not shown
base_loss = tf.reduce_mean(xentropy, name="avg_xentropy") # not shown
reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
loss = tf.add_n([base_loss] + reg_losses, name="loss")
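Nothing here is specific to $\ell_1$: the same pattern works with any regularizer. For example, here is a sketch (not from the book) using an $\ell_2$ penalty instead, with a hypothetical scale value:
# Sketch: the same partial() trick, but with an l2 regularizer.
scale_l2 = 0.001  # hypothetical l2 regularization strength
my_l2_dense_layer = partial(
    tf.layers.dense, activation=tf.nn.relu,
    kernel_regularizer=tf.contrib.layers.l2_regularizer(scale_l2))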
And the rest is the same as usual:
with tf.name_scope("eval"):
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")
learning_rate = 0.01
with tf.name_scope("train"):
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
training_op = optimizer.minimize(loss)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
n_epochs = 20
batch_size = 200
with tf.Session() as sess:
init.run()
for epoch in range(n_epochs):
for iteration in range(mnist.train.num_examples // batch_size):
X_batch, y_batch = mnist.train.next_batch(batch_size)
sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,
y: mnist.test.labels})
print(epoch, "Test accuracy:", accuracy_val)
save_path = saver.save(sess, "./my_model_final.ckpt")
0 Test accuracy: 0.8298 1 Test accuracy: 0.8778 2 Test accuracy: 0.8917 3 Test accuracy: 0.9017 4 Test accuracy: 0.9068 5 Test accuracy: 0.9103 6 Test accuracy: 0.9125 7 Test accuracy: 0.9137 8 Test accuracy: 0.9149 9 Test accuracy: 0.9174 10 Test accuracy: 0.9176 11 Test accuracy: 0.9184 12 Test accuracy: 0.9191 13 Test accuracy: 0.9183 14 Test accuracy: 0.9195 15 Test accuracy: 0.9201 16 Test accuracy: 0.9181 17 Test accuracy: 0.9184 18 Test accuracy: 0.9181 19 Test accuracy: 0.9174
Note: the book uses tf.contrib.layers.dropout()
rather than tf.layers.dropout()
(which did not exist when this chapter was written). It is now preferable to use tf.layers.dropout()
, because anything in the contrib module may change or be deleted without notice. The tf.layers.dropout()
function is almost identical to the tf.contrib.layers.dropout()
function, except for a few minor differences. Most importantly:
- you must specify the dropout rate (rate) rather than the keep probability (keep_prob), where rate is simply equal to 1 - keep_prob,
- the is_training parameter is renamed to training.
reset_graph()
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")
training = tf.placeholder_with_default(False, shape=(), name='training')
dropout_rate = 0.5 # == 1 - keep_prob
X_drop = tf.layers.dropout(X, dropout_rate, training=training)
with tf.name_scope("dnn"):
hidden1 = tf.layers.dense(X_drop, n_hidden1, activation=tf.nn.relu,
name="hidden1")
hidden1_drop = tf.layers.dropout(hidden1, dropout_rate, training=training)
hidden2 = tf.layers.dense(hidden1_drop, n_hidden2, activation=tf.nn.relu,
name="hidden2")
hidden2_drop = tf.layers.dropout(hidden2, dropout_rate, training=training)
logits = tf.layers.dense(hidden2_drop, n_outputs, name="outputs")
with tf.name_scope("loss"):
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
with tf.name_scope("train"):
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)
training_op = optimizer.minimize(loss)
with tf.name_scope("eval"):
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
init = tf.global_variables_initializer()
saver = tf.train.Saver()
n_epochs = 20
batch_size = 50
with tf.Session() as sess:
init.run()
for epoch in range(n_epochs):
for iteration in range(mnist.train.num_examples // batch_size):
X_batch, y_batch = mnist.train.next_batch(batch_size)
sess.run(training_op, feed_dict={training: True, X: X_batch, y: y_batch})
acc_test = accuracy.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels})
print(epoch, "Test accuracy:", acc_test)
save_path = saver.save(sess, "./my_model_final.ckpt")
0 Test accuracy: 0.9205 1 Test accuracy: 0.9418 2 Test accuracy: 0.9486 3 Test accuracy: 0.9508 4 Test accuracy: 0.954 5 Test accuracy: 0.957 6 Test accuracy: 0.9604 7 Test accuracy: 0.9585 8 Test accuracy: 0.9598 9 Test accuracy: 0.9663 10 Test accuracy: 0.9644 11 Test accuracy: 0.9646 12 Test accuracy: 0.9675 13 Test accuracy: 0.9657 14 Test accuracy: 0.9645 15 Test accuracy: 0.9668 16 Test accuracy: 0.969 17 Test accuracy: 0.9682 18 Test accuracy: 0.9698 19 Test accuracy: 0.9682
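As a quick sanity check (not in the book), you can verify that dropout leaves the inputs untouched when training keeps its default value (False), and that it drops roughly half of the non-zero pixels when training=True with rate=0.5:
# Sketch: at evaluation time dropout is a no-op; at training time it zeroes
# each element with probability 0.5 and scales the survivors by 1/(1 - rate).
with tf.Session() as sess:
    init.run()
    X_sample = mnist.test.images[:1]
    out_eval = sess.run(X_drop, feed_dict={X: X_sample})  # training defaults to False
    out_train = sess.run(X_drop, feed_dict={X: X_sample, training: True})
    print("unchanged at eval time:", np.allclose(out_eval, X_sample))
    nonzero = X_sample[0] != 0
    print("fraction of non-zero pixels dropped:", np.mean(out_train[0][nonzero] == 0))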
Let's go back to a plain and simple neural net for MNIST with just 2 hidden layers:
reset_graph()
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 50
n_outputs = 10
learning_rate = 0.01
momentum = 0.9
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")
with tf.name_scope("dnn"):
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1")
hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name="hidden2")
logits = tf.layers.dense(hidden2, n_outputs, name="outputs")
with tf.name_scope("loss"):
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
with tf.name_scope("train"):
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
training_op = optimizer.minimize(loss)
with tf.name_scope("eval"):
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
Next, let's get a handle on the first hidden layer's weights and create an operation that will compute the clipped weights using the clip_by_norm() function. Then we create an assignment operation to assign the clipped weights to the weights variable:
threshold = 1.0
weights = tf.get_default_graph().get_tensor_by_name("hidden1/kernel:0")
clipped_weights = tf.clip_by_norm(weights, clip_norm=threshold, axes=1)
clip_weights = tf.assign(weights, clipped_weights)
We can do this as well for the second hidden layer:
weights2 = tf.get_default_graph().get_tensor_by_name("hidden2/kernel:0")
clipped_weights2 = tf.clip_by_norm(weights2, clip_norm=threshold, axes=1)
clip_weights2 = tf.assign(weights2, clipped_weights2)
Let's add an initializer and a saver:
init = tf.global_variables_initializer()
saver = tf.train.Saver()
And now we can train the model. It's pretty much as usual, except that right after running the training_op
, we run the clip_weights
and clip_weights2
operations:
n_epochs = 20
batch_size = 50
with tf.Session() as sess: # not shown in the book
init.run() # not shown
for epoch in range(n_epochs): # not shown
for iteration in range(mnist.train.num_examples // batch_size): # not shown
X_batch, y_batch = mnist.train.next_batch(batch_size) # not shown
sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
clip_weights.eval()
clip_weights2.eval() # not shown
acc_test = accuracy.eval(feed_dict={X: mnist.test.images, # not shown
y: mnist.test.labels}) # not shown
print(epoch, "Test accuracy:", acc_test) # not shown
save_path = saver.save(sess, "./my_model_final.ckpt") # not shown
0 Test accuracy: 0.9517 1 Test accuracy: 0.9674 2 Test accuracy: 0.9712 3 Test accuracy: 0.9759 4 Test accuracy: 0.975 5 Test accuracy: 0.9761 6 Test accuracy: 0.9765 7 Test accuracy: 0.9796 8 Test accuracy: 0.9791 9 Test accuracy: 0.9794 10 Test accuracy: 0.9805 11 Test accuracy: 0.9809 12 Test accuracy: 0.9807 13 Test accuracy: 0.9799 14 Test accuracy: 0.982 15 Test accuracy: 0.9816 16 Test accuracy: 0.9825 17 Test accuracy: 0.9825 18 Test accuracy: 0.9816 19 Test accuracy: 0.9822
The implementation above is straightforward and it works fine, but it is a bit messy. A better approach is to define a max_norm_regularizer()
function:
def max_norm_regularizer(threshold, axes=1, name="max_norm",
collection="max_norm"):
def max_norm(weights):
clipped = tf.clip_by_norm(weights, clip_norm=threshold, axes=axes)
clip_weights = tf.assign(weights, clipped, name=name)
tf.add_to_collection(collection, clip_weights)
return None # there is no regularization loss term
return max_norm
Then you can call this function to get a max norm regularizer (with the threshold you want). When you create a hidden layer, you can pass this regularizer to the kernel_regularizer
argument:
reset_graph()
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 50
n_outputs = 10
learning_rate = 0.01
momentum = 0.9
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")
max_norm_reg = max_norm_regularizer(threshold=1.0)
with tf.name_scope("dnn"):
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu,
kernel_regularizer=max_norm_reg, name="hidden1")
hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu,
kernel_regularizer=max_norm_reg, name="hidden2")
logits = tf.layers.dense(hidden2, n_outputs, name="outputs")
with tf.name_scope("loss"):
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
with tf.name_scope("train"):
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
training_op = optimizer.minimize(loss)
with tf.name_scope("eval"):
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
init = tf.global_variables_initializer()
saver = tf.train.Saver()
Training is as usual, except that you must run the weight-clipping operations after each training operation:
n_epochs = 20
batch_size = 50
clip_all_weights = tf.get_collection("max_norm")
with tf.Session() as sess:
init.run()
for epoch in range(n_epochs):
for iteration in range(mnist.train.num_examples // batch_size):
X_batch, y_batch = mnist.train.next_batch(batch_size)
sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
sess.run(clip_all_weights)
acc_test = accuracy.eval(feed_dict={X: mnist.test.images, # not shown in the book
y: mnist.test.labels}) # not shown
print(epoch, "Test accuracy:", acc_test) # not shown
save_path = saver.save(sess, "./my_model_final.ckpt") # not shown
0 Test accuracy: 0.9527 1 Test accuracy: 0.9653 2 Test accuracy: 0.97 3 Test accuracy: 0.9751 4 Test accuracy: 0.9752 5 Test accuracy: 0.9742 6 Test accuracy: 0.9754 7 Test accuracy: 0.9784 8 Test accuracy: 0.9775 9 Test accuracy: 0.9789 10 Test accuracy: 0.9808 11 Test accuracy: 0.9797 12 Test accuracy: 0.9802 13 Test accuracy: 0.9799 14 Test accuracy: 0.9808 15 Test accuracy: 0.9809 16 Test accuracy: 0.9807 17 Test accuracy: 0.9803 18 Test accuracy: 0.9816 19 Test accuracy: 0.9812
See appendix A.
Exercise: Build a DNN with five hidden layers of 100 neurons each, He initialization, and the ELU activation function.
We will need similar DNNs in the next exercises, so let's create a function to build this DNN:
he_init = tf.contrib.layers.variance_scaling_initializer()
def dnn(inputs, n_hidden_layers=5, n_neurons=100, name=None,
activation=tf.nn.elu, initializer=he_init):
with tf.variable_scope(name, "dnn"):
for layer in range(n_hidden_layers):
inputs = tf.layers.dense(inputs, n_neurons, activation=activation,
kernel_initializer=initializer,
name="hidden%d" % (layer + 1))
return inputs
n_inputs = 28 * 28 # MNIST
n_outputs = 5
reset_graph()
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")
dnn_outputs = dnn(X)
logits = tf.layers.dense(dnn_outputs, n_outputs, kernel_initializer=he_init, name="logits")
Y_proba = tf.nn.softmax(logits, name="Y_proba")
Exercise: Using Adam optimization and early stopping, try training it on MNIST but only on digits 0 to 4, as we will use transfer learning for digits 5 to 9 in the next exercise. You will need a softmax output layer with five neurons, and as always make sure to save checkpoints at regular intervals and save the final model so you can reuse it later.
Let's complete the graph with the cost function, the training op, and all the other usual components:
learning_rate = 0.01
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss, name="training_op")
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")
init = tf.global_variables_initializer()
saver = tf.train.Saver()
Let's fetch the MNIST dataset:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/")
Extracting /tmp/data/train-images-idx3-ubyte.gz Extracting /tmp/data/train-labels-idx1-ubyte.gz Extracting /tmp/data/t10k-images-idx3-ubyte.gz Extracting /tmp/data/t10k-labels-idx1-ubyte.gz
Now let's create the training, validation, and test sets (we need the validation set to implement early stopping):
X_train1 = mnist.train.images[mnist.train.labels < 5]
y_train1 = mnist.train.labels[mnist.train.labels < 5]
X_valid1 = mnist.validation.images[mnist.validation.labels < 5]
y_valid1 = mnist.validation.labels[mnist.validation.labels < 5]
X_test1 = mnist.test.images[mnist.test.labels < 5]
y_test1 = mnist.test.labels[mnist.test.labels < 5]
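A quick check (not in the book) that the reduced dataset only contains the digits 0 to 4:
# Sketch: print the shapes and the classes present in each split.
print("X_train1:", X_train1.shape, " classes:", np.unique(y_train1))
print("X_valid1:", X_valid1.shape)
print("X_test1: ", X_test1.shape)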
n_epochs = 1000
batch_size = 20
max_checks_without_progress = 20
checks_without_progress = 0
best_loss = np.infty
with tf.Session() as sess:
init.run()
for epoch in range(n_epochs):
rnd_idx = np.random.permutation(len(X_train1))
for rnd_indices in np.array_split(rnd_idx, len(X_train1) // batch_size):
X_batch, y_batch = X_train1[rnd_indices], y_train1[rnd_indices]
sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
loss_val, acc_val = sess.run([loss, accuracy], feed_dict={X: X_valid1, y: y_valid1})
if loss_val < best_loss:
save_path = saver.save(sess, "./my_mnist_model_0_to_4.ckpt")
best_loss = loss_val
checks_without_progress = 0
else:
checks_without_progress += 1
if checks_without_progress > max_checks_without_progress:
print("Early stopping!")
break
print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(
epoch, loss_val, best_loss, acc_val * 100))
with tf.Session() as sess:
saver.restore(sess, "./my_mnist_model_0_to_4.ckpt")
acc_test = accuracy.eval(feed_dict={X: X_test1, y: y_test1})
print("Final test accuracy: {:.2f}%".format(acc_test * 100))
0 Validation loss: 0.128663 Best loss: 0.128663 Accuracy: 96.64% 1 Validation loss: 0.448317 Best loss: 0.128663 Accuracy: 78.19% 2 Validation loss: 0.190859 Best loss: 0.128663 Accuracy: 95.54% 3 Validation loss: 0.146951 Best loss: 0.128663 Accuracy: 96.79% 4 Validation loss: 0.086076 Best loss: 0.086076 Accuracy: 97.69% 5 Validation loss: 0.115353 Best loss: 0.086076 Accuracy: 97.77% 6 Validation loss: 0.239142 Best loss: 0.086076 Accuracy: 95.15% 7 Validation loss: 0.088810 Best loss: 0.086076 Accuracy: 98.12% 8 Validation loss: 0.108763 Best loss: 0.086076 Accuracy: 97.81% 9 Validation loss: 0.300808 Best loss: 0.086076 Accuracy: 96.17% 10 Validation loss: 0.179260 Best loss: 0.086076 Accuracy: 97.46% 11 Validation loss: 0.125690 Best loss: 0.086076 Accuracy: 98.48% 12 Validation loss: 0.738371 Best loss: 0.086076 Accuracy: 77.72% 13 Validation loss: 1.894743 Best loss: 0.086076 Accuracy: 78.54% 14 Validation loss: 0.415678 Best loss: 0.086076 Accuracy: 78.50% 15 Validation loss: 0.537646 Best loss: 0.086076 Accuracy: 75.45% 16 Validation loss: 1.009708 Best loss: 0.086076 Accuracy: 53.99% 17 Validation loss: 1.228350 Best loss: 0.086076 Accuracy: 38.15% 18 Validation loss: 1.510606 Best loss: 0.086076 Accuracy: 29.44% 19 Validation loss: 1.632344 Best loss: 0.086076 Accuracy: 22.01% 20 Validation loss: 1.628246 Best loss: 0.086076 Accuracy: 22.01% 21 Validation loss: 1.626765 Best loss: 0.086076 Accuracy: 22.01% 22 Validation loss: 1.651615 Best loss: 0.086076 Accuracy: 18.73% 23 Validation loss: 1.663751 Best loss: 0.086076 Accuracy: 19.27% 24 Validation loss: 1.675138 Best loss: 0.086076 Accuracy: 22.01% Early stopping! INFO:tensorflow:Restoring parameters from ./my_mnist_model_0_to_4.ckpt Final test accuracy: 98.05%
We get 98.05% accuracy on the test set. That's not too bad, but let's see if we can do better by tuning the hyperparameters.
Exercise: Tune the hyperparameters using cross-validation and see what precision you can achieve.
Let's create a DNNClassifier class, compatible with Scikit-Learn's RandomizedSearchCV class, to perform hyperparameter tuning. Here are the key points of this implementation:
- The __init__() method (constructor) does nothing more than create instance variables for each of the hyperparameters.
- The fit() method creates the graph, starts a session and trains the model:
  - it calls the _build_graph() method to build the graph (much like the graph we defined earlier). Once this method is done creating the graph, it saves all the important operations as instance variables for easy access by other methods.
  - the _dnn() method builds the hidden layers, just like the dnn() function above, but also with support for batch normalization and dropout (for the next exercises).
  - if the fit() method is given a validation set (X_valid and y_valid), then it implements early stopping. This implementation does not save the best model to disk, but rather to memory: it uses the _get_model_params() method to get all the graph's variables and their values, and the _restore_model_params() method to restore the variable values (of the best model found). This trick helps speed up training.
  - after the fit() method has finished training the model, it keeps the session open so that predictions can be made quickly, without having to save a model to disk and restore it for every prediction. You can close the session by calling the close_session() method.
- The predict_proba() method uses the trained model to predict the class probabilities.
- The predict() method calls predict_proba() and returns the class with the highest probability, for each instance.
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.exceptions import NotFittedError
class DNNClassifier(BaseEstimator, ClassifierMixin):
def __init__(self, n_hidden_layers=5, n_neurons=100, optimizer_class=tf.train.AdamOptimizer,
learning_rate=0.01, batch_size=20, activation=tf.nn.elu, initializer=he_init,
batch_norm_momentum=None, dropout_rate=None, random_state=None):
"""Initialize the DNNClassifier by simply storing all the hyperparameters."""
self.n_hidden_layers = n_hidden_layers
self.n_neurons = n_neurons
self.optimizer_class = optimizer_class
self.learning_rate = learning_rate
self.batch_size = batch_size
self.activation = activation
self.initializer = initializer
self.batch_norm_momentum = batch_norm_momentum
self.dropout_rate = dropout_rate
self.random_state = random_state
self._session = None
def _dnn(self, inputs):
"""Build the hidden layers, with support for batch normalization and dropout."""
for layer in range(self.n_hidden_layers):
if self.dropout_rate:
inputs = tf.layers.dropout(inputs, self.dropout_rate, training=self._training)
inputs = tf.layers.dense(inputs, self.n_neurons,
kernel_initializer=self.initializer,
name="hidden%d" % (layer + 1))
if self.batch_norm_momentum:
inputs = tf.layers.batch_normalization(inputs, momentum=self.batch_norm_momentum,
training=self._training)
inputs = self.activation(inputs, name="hidden%d_out" % (layer + 1))
return inputs
def _build_graph(self, n_inputs, n_outputs):
"""Build the same model as earlier"""
if self.random_state is not None:
tf.set_random_seed(self.random_state)
np.random.seed(self.random_state)
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")
if self.batch_norm_momentum or self.dropout_rate:
self._training = tf.placeholder_with_default(False, shape=(), name='training')
else:
self._training = None
dnn_outputs = self._dnn(X)
logits = tf.layers.dense(dnn_outputs, n_outputs, kernel_initializer=he_init, name="logits")
Y_proba = tf.nn.softmax(logits, name="Y_proba")
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,
logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
optimizer = self.optimizer_class(learning_rate=self.learning_rate)
training_op = optimizer.minimize(loss)
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")
init = tf.global_variables_initializer()
saver = tf.train.Saver()
# Make the important operations available easily through instance variables
self._X, self._y = X, y
self._Y_proba, self._loss = Y_proba, loss
self._training_op, self._accuracy = training_op, accuracy
self._init, self._saver = init, saver
def close_session(self):
if self._session:
self._session.close()
def _get_model_params(self):
"""Get all variable values (used for early stopping, faster than saving to disk)"""
with self._graph.as_default():
gvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
return {gvar.op.name: value for gvar, value in zip(gvars, self._session.run(gvars))}
def _restore_model_params(self, model_params):
"""Set all variables to the given values (for early stopping, faster than loading from disk)"""
gvar_names = list(model_params.keys())
assign_ops = {gvar_name: self._graph.get_operation_by_name(gvar_name + "/Assign")
for gvar_name in gvar_names}
init_values = {gvar_name: assign_op.inputs[1] for gvar_name, assign_op in assign_ops.items()}
feed_dict = {init_values[gvar_name]: model_params[gvar_name] for gvar_name in gvar_names}
self._session.run(assign_ops, feed_dict=feed_dict)
def fit(self, X, y, n_epochs=100, X_valid=None, y_valid=None):
"""Fit the model to the training set. If X_valid and y_valid are provided, use early stopping."""
self.close_session()
# infer n_inputs and n_outputs from the training set.
n_inputs = X.shape[1]
self.classes_ = np.unique(y)
n_outputs = len(self.classes_)
# Translate the labels vector to a vector of sorted class indices, containing
# integers from 0 to n_outputs - 1.
# For example, if y is equal to [8, 8, 9, 5, 7, 6, 6, 6], then the sorted class
# labels (self.classes_) will be equal to [5, 6, 7, 8, 9], and the labels vector
# will be translated to [3, 3, 4, 0, 2, 1, 1, 1]
self.class_to_index_ = {label: index
for index, label in enumerate(self.classes_)}
y = np.array([self.class_to_index_[label]
for label in y], dtype=np.int32)
self._graph = tf.Graph()
with self._graph.as_default():
self._build_graph(n_inputs, n_outputs)
# extra ops for batch normalization
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
# needed in case of early stopping
max_checks_without_progress = 20
checks_without_progress = 0
best_loss = np.infty
best_params = None
# Now train the model!
self._session = tf.Session(graph=self._graph)
with self._session.as_default() as sess:
self._init.run()
for epoch in range(n_epochs):
rnd_idx = np.random.permutation(len(X))
for rnd_indices in np.array_split(rnd_idx, len(X) // self.batch_size):
X_batch, y_batch = X[rnd_indices], y[rnd_indices]
feed_dict = {self._X: X_batch, self._y: y_batch}
if self._training is not None:
feed_dict[self._training] = True
sess.run(self._training_op, feed_dict=feed_dict)
if extra_update_ops:
sess.run(extra_update_ops, feed_dict=feed_dict)
if X_valid is not None and y_valid is not None:
loss_val, acc_val = sess.run([self._loss, self._accuracy],
feed_dict={self._X: X_valid,
self._y: y_valid})
if loss_val < best_loss:
best_params = self._get_model_params()
best_loss = loss_val
checks_without_progress = 0
else:
checks_without_progress += 1
print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(
epoch, loss_val, best_loss, acc_val * 100))
if checks_without_progress > max_checks_without_progress:
print("Early stopping!")
break
else:
loss_train, acc_train = sess.run([self._loss, self._accuracy],
feed_dict={self._X: X_batch,
self._y: y_batch})
print("{}\tLast training batch loss: {:.6f}\tAccuracy: {:.2f}%".format(
epoch, loss_train, acc_train * 100))
# If we used early stopping then rollback to the best model found
if best_params:
self._restore_model_params(best_params)
return self
def predict_proba(self, X):
if not self._session:
raise NotFittedError("This %s instance is not fitted yet" % self.__class__.__name__)
with self._session.as_default() as sess:
return self._Y_proba.eval(feed_dict={self._X: X})
def predict(self, X):
class_indices = np.argmax(self.predict_proba(X), axis=1)
return np.array([[self.classes_[class_index]]
for class_index in class_indices], np.int32)
def save(self, path):
self._saver.save(self._session, path)
Let's see if we get the exact same accuracy as earlier using this class (without dropout or batch norm):
dnn_clf = DNNClassifier(random_state=42)
dnn_clf.fit(X_train1, y_train1, n_epochs=1000, X_valid=X_valid1, y_valid=y_valid1)
0 Validation loss: 0.128663 Best loss: 0.128663 Accuracy: 96.64% 1 Validation loss: 0.448317 Best loss: 0.128663 Accuracy: 78.19% 2 Validation loss: 0.190859 Best loss: 0.128663 Accuracy: 95.54% 3 Validation loss: 0.146951 Best loss: 0.128663 Accuracy: 96.79% 4 Validation loss: 0.086076 Best loss: 0.086076 Accuracy: 97.69% 5 Validation loss: 0.115353 Best loss: 0.086076 Accuracy: 97.77% 6 Validation loss: 0.239142 Best loss: 0.086076 Accuracy: 95.15% 7 Validation loss: 0.088810 Best loss: 0.086076 Accuracy: 98.12% 8 Validation loss: 0.108763 Best loss: 0.086076 Accuracy: 97.81% 9 Validation loss: 0.300808 Best loss: 0.086076 Accuracy: 96.17% 10 Validation loss: 0.179260 Best loss: 0.086076 Accuracy: 97.46% 11 Validation loss: 0.125690 Best loss: 0.086076 Accuracy: 98.48% 12 Validation loss: 0.738371 Best loss: 0.086076 Accuracy: 77.72% 13 Validation loss: 1.894743 Best loss: 0.086076 Accuracy: 78.54% 14 Validation loss: 0.415678 Best loss: 0.086076 Accuracy: 78.50% 15 Validation loss: 0.537646 Best loss: 0.086076 Accuracy: 75.45% 16 Validation loss: 1.009708 Best loss: 0.086076 Accuracy: 53.99% 17 Validation loss: 1.228350 Best loss: 0.086076 Accuracy: 38.15% 18 Validation loss: 1.510606 Best loss: 0.086076 Accuracy: 29.44% 19 Validation loss: 1.632344 Best loss: 0.086076 Accuracy: 22.01% 20 Validation loss: 1.628246 Best loss: 0.086076 Accuracy: 22.01% 21 Validation loss: 1.626765 Best loss: 0.086076 Accuracy: 22.01% 22 Validation loss: 1.651615 Best loss: 0.086076 Accuracy: 18.73% 23 Validation loss: 1.663751 Best loss: 0.086076 Accuracy: 19.27% 24 Validation loss: 1.675138 Best loss: 0.086076 Accuracy: 22.01% 25 Validation loss: 1.743664 Best loss: 0.086076 Accuracy: 18.73% Early stopping!
DNNClassifier(activation=<function elu at 0x7fd9e8a620d0>, batch_norm_momentum=None, batch_size=20, dropout_rate=None, initializer=<function variance_scaling_initializer.<locals>._initializer at 0x7fd9d5e628c8>, learning_rate=0.01, n_hidden_layers=5, n_neurons=100, optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>, random_state=42)
The model is trained, let's see if it gets the same accuracy as earlier:
from sklearn.metrics import accuracy_score
y_pred = dnn_clf.predict(X_test1)
accuracy_score(y_test1, y_pred)
0.98054096127651291
Yep! Working fine. Now we can use Scikit-Learn's RandomizedSearchCV
class to search for better hyperparameters (this may take over an hour, depending on your system):
from sklearn.model_selection import RandomizedSearchCV
def leaky_relu(alpha=0.01):
def parametrized_leaky_relu(z, name=None):
return tf.maximum(alpha * z, z, name=name)
return parametrized_leaky_relu
param_distribs = {
"n_neurons": [10, 30, 50, 70, 90, 100, 120, 140, 160],
"batch_size": [10, 50, 100, 500],
"learning_rate": [0.01, 0.02, 0.05, 0.1],
"activation": [tf.nn.relu, tf.nn.elu, leaky_relu(alpha=0.01), leaky_relu(alpha=0.1)],
# you could also try exploring different numbers of hidden layers, different optimizers, etc.
#"n_hidden_layers": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
#"optimizer_class": [tf.train.AdamOptimizer, partial(tf.train.MomentumOptimizer, momentum=0.95)],
}
rnd_search = RandomizedSearchCV(DNNClassifier(random_state=42), param_distribs, n_iter=50,
fit_params={"X_valid": X_valid1, "y_valid": y_valid1, "n_epochs": 1000},
random_state=42, verbose=2)
rnd_search.fit(X_train1, y_train1)
Fitting 3 folds for each of 50 candidates, totalling 150 fits [CV] n_neurons=10, learning_rate=0.05, activation=<function elu at 0x7fd9e8a620d0>, batch_size=100 0 Validation loss: 0.132355 Best loss: 0.132355 Accuracy: 96.44% 1 Validation loss: 0.126329 Best loss: 0.126329 Accuracy: 96.21% 2 Validation loss: 0.138284 Best loss: 0.126329 Accuracy: 96.76% 3 Validation loss: 0.142094 Best loss: 0.126329 Accuracy: 96.25% 4 Validation loss: 0.128141 Best loss: 0.126329 Accuracy: 96.76% 5 Validation loss: 0.119928 Best loss: 0.119928 Accuracy: 97.26% 6 Validation loss: 0.137134 Best loss: 0.119928 Accuracy: 96.72% 7 Validation loss: 0.156194 Best loss: 0.119928 Accuracy: 96.79% 8 Validation loss: 0.283938 Best loss: 0.119928 Accuracy: 94.53% 9 Validation loss: 1.104801 Best loss: 0.119928 Accuracy: 52.38% 10 Validation loss: 0.966833 Best loss: 0.119928 Accuracy: 53.09% 11 Validation loss: 0.854368 Best loss: 0.119928 Accuracy: 57.47% 12 Validation loss: 1.857330 Best loss: 0.119928 Accuracy: 38.98% 13 Validation loss: 1.642338 Best loss: 0.119928 Accuracy: 18.73% 14 Validation loss: 1.612854 Best loss: 0.119928 Accuracy: 22.01% 15 Validation loss: 1.617682 Best loss: 0.119928 Accuracy: 22.01% 16 Validation loss: 1.616873 Best loss: 0.119928 Accuracy: 22.01% 17 Validation loss: 1.618228 Best loss: 0.119928 Accuracy: 19.27% 18 Validation loss: 1.619055 Best loss: 0.119928 Accuracy: 19.27% 19 Validation loss: 1.643334 Best loss: 0.119928 Accuracy: 19.08% 20 Validation loss: 1.621200 Best loss: 0.119928 Accuracy: 19.08% 21 Validation loss: 1.629823 Best loss: 0.119928 Accuracy: 19.27% 22 Validation loss: 1.624553 Best loss: 0.119928 Accuracy: 18.73% 23 Validation loss: 1.610214 Best loss: 0.119928 Accuracy: 20.91% 24 Validation loss: 1.621143 Best loss: 0.119928 Accuracy: 22.01% 25 Validation loss: 1.623761 Best loss: 0.119928 Accuracy: 22.01% 26 Validation loss: 1.641760 Best loss: 0.119928 Accuracy: 18.73% Early stopping! [CV] n_neurons=10, learning_rate=0.05, activation=<function elu at 0x7fd9e8a620d0>, batch_size=100, total= 5.6s [CV] n_neurons=10, learning_rate=0.05, activation=<function elu at 0x7fd9e8a620d0>, batch_size=100
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 5.6s remaining: 0.0s
0 Validation loss: 0.153707 Best loss: 0.153707 Accuracy: 95.74% 1 Validation loss: 0.120703 Best loss: 0.120703 Accuracy: 96.56% 2 Validation loss: 0.164706 Best loss: 0.120703 Accuracy: 96.05% 3 Validation loss: 0.177875 Best loss: 0.120703 Accuracy: 95.19% 4 Validation loss: 0.171004 Best loss: 0.120703 Accuracy: 95.19% 5 Validation loss: 0.114746 Best loss: 0.114746 Accuracy: 96.83% 6 Validation loss: 0.109637 Best loss: 0.109637 Accuracy: 97.26% 7 Validation loss: 0.261533 Best loss: 0.109637 Accuracy: 94.96% 8 Validation loss: 0.316743 Best loss: 0.109637 Accuracy: 94.02% 9 Validation loss: 0.486484 Best loss: 0.109637 Accuracy: 77.56% 10 Validation loss: 4.635532 Best loss: 0.109637 Accuracy: 53.95% 11 Validation loss: 1.172422 Best loss: 0.109637 Accuracy: 48.36% 12 Validation loss: 1.029865 Best loss: 0.109637 Accuracy: 55.98% 13 Validation loss: 1.298800 Best loss: 0.109637 Accuracy: 36.08% 14 Validation loss: 1.141950 Best loss: 0.109637 Accuracy: 38.08% 15 Validation loss: 1.132486 Best loss: 0.109637 Accuracy: 38.90% 16 Validation loss: 1.078486 Best loss: 0.109637 Accuracy: 45.78% 17 Validation loss: 1.128344 Best loss: 0.109637 Accuracy: 45.07% 18 Validation loss: 1.336244 Best loss: 0.109637 Accuracy: 34.40% 19 Validation loss: 1.199178 Best loss: 0.109637 Accuracy: 39.87% 20 Validation loss: 1.175845 Best loss: 0.109637 Accuracy: 40.11% 21 Validation loss: 1.200430 Best loss: 0.109637 Accuracy: 40.30% 22 Validation loss: 1.390084 Best loss: 0.109637 Accuracy: 34.60% 23 Validation loss: 1.268129 Best loss: 0.109637 Accuracy: 40.23% 24 Validation loss: 1.192210 Best loss: 0.109637 Accuracy: 40.30% 25 Validation loss: 1.190541 Best loss: 0.109637 Accuracy: 41.99% 26 Validation loss: 1.227676 Best loss: 0.109637 Accuracy: 38.62% 27 Validation loss: 1.187587 Best loss: 0.109637 Accuracy: 39.44% Early stopping! 
[CV] n_neurons=10, learning_rate=0.05, activation=<function elu at 0x7fd9e8a620d0>, batch_size=100, total= 5.9s [CV] n_neurons=10, learning_rate=0.05, activation=<function elu at 0x7fd9e8a620d0>, batch_size=100 0 Validation loss: 0.182619 Best loss: 0.182619 Accuracy: 94.29% 1 Validation loss: 0.152706 Best loss: 0.152706 Accuracy: 95.97% 2 Validation loss: 0.193820 Best loss: 0.152706 Accuracy: 93.82% 3 Validation loss: 0.195413 Best loss: 0.152706 Accuracy: 95.54% 4 Validation loss: 0.171277 Best loss: 0.152706 Accuracy: 95.19% 5 Validation loss: 0.140087 Best loss: 0.140087 Accuracy: 95.70% 6 Validation loss: 0.170798 Best loss: 0.140087 Accuracy: 95.00% 7 Validation loss: 0.163649 Best loss: 0.140087 Accuracy: 96.29% 8 Validation loss: 0.199048 Best loss: 0.140087 Accuracy: 96.09% 9 Validation loss: 1.552870 Best loss: 0.140087 Accuracy: 52.15% 10 Validation loss: 0.813273 Best loss: 0.140087 Accuracy: 60.40% 11 Validation loss: 0.775555 Best loss: 0.140087 Accuracy: 60.67% 12 Validation loss: 0.775275 Best loss: 0.140087 Accuracy: 59.77% 13 Validation loss: 0.770521 Best loss: 0.140087 Accuracy: 59.30% 14 Validation loss: 0.734035 Best loss: 0.140087 Accuracy: 59.85% 15 Validation loss: 0.744980 Best loss: 0.140087 Accuracy: 59.66% 16 Validation loss: 0.785848 Best loss: 0.140087 Accuracy: 59.66% 17 Validation loss: 0.776138 Best loss: 0.140087 Accuracy: 59.42% 18 Validation loss: 0.764496 Best loss: 0.140087 Accuracy: 59.46% 19 Validation loss: 0.763633 Best loss: 0.140087 Accuracy: 59.54% 20 Validation loss: 0.743879 Best loss: 0.140087 Accuracy: 60.75% 21 Validation loss: 0.763295 Best loss: 0.140087 Accuracy: 60.36% 22 Validation loss: 0.717175 Best loss: 0.140087 Accuracy: 60.63% 23 Validation loss: 1.869954 Best loss: 0.140087 Accuracy: 29.28% 24 Validation loss: 1.215518 Best loss: 0.140087 Accuracy: 38.86% 25 Validation loss: 1.196626 Best loss: 0.140087 Accuracy: 38.62% 26 Validation loss: 1.170714 Best loss: 0.140087 Accuracy: 42.38% Early stopping! 
[CV] n_neurons=10, learning_rate=0.05, activation=<function elu at 0x7fd9e8a620d0>, batch_size=100, total= 6.9s [CV] n_neurons=30, learning_rate=0.02, activation=<function relu at 0x7fd9e8a660d0>, batch_size=500 0 Validation loss: 0.171512 Best loss: 0.171512 Accuracy: 95.07% 1 Validation loss: 0.095914 Best loss: 0.095914 Accuracy: 97.03% 2 Validation loss: 0.099199 Best loss: 0.095914 Accuracy: 96.91% 3 Validation loss: 0.093873 Best loss: 0.093873 Accuracy: 97.15% 4 Validation loss: 0.073461 Best loss: 0.073461 Accuracy: 98.01% 5 Validation loss: 0.084562 Best loss: 0.073461 Accuracy: 97.65% 6 Validation loss: 0.071800 Best loss: 0.071800 Accuracy: 98.01% 7 Validation loss: 0.088435 Best loss: 0.071800 Accuracy: 97.73% 8 Validation loss: 0.082038 Best loss: 0.071800 Accuracy: 97.77% 9 Validation loss: 0.080673 Best loss: 0.071800 Accuracy: 97.69% 10 Validation loss: 0.081036 Best loss: 0.071800 Accuracy: 97.93% 11 Validation loss: 0.092700 Best loss: 0.071800 Accuracy: 97.93% 12 Validation loss: 0.081003 Best loss: 0.071800 Accuracy: 98.20% 13 Validation loss: 0.075607 Best loss: 0.071800 Accuracy: 98.20% 14 Validation loss: 0.092970 Best loss: 0.071800 Accuracy: 98.08% 15 Validation loss: 0.108005 Best loss: 0.071800 Accuracy: 97.77% 16 Validation loss: 0.082602 Best loss: 0.071800 Accuracy: 98.05% 17 Validation loss: 0.114629 Best loss: 0.071800 Accuracy: 97.73% 18 Validation loss: 0.099099 Best loss: 0.071800 Accuracy: 97.69% 19 Validation loss: 0.075535 Best loss: 0.071800 Accuracy: 98.20% 20 Validation loss: 0.102847 Best loss: 0.071800 Accuracy: 98.08% 21 Validation loss: 0.089735 Best loss: 0.071800 Accuracy: 98.36% 22 Validation loss: 0.080781 Best loss: 0.071800 Accuracy: 97.93% 23 Validation loss: 0.073017 Best loss: 0.071800 Accuracy: 98.32% 24 Validation loss: 0.091643 Best loss: 0.071800 Accuracy: 97.93% 25 Validation loss: 0.113891 Best loss: 0.071800 Accuracy: 98.05% 26 Validation loss: 0.094774 Best loss: 0.071800 Accuracy: 98.28% 27 Validation loss: 0.086041 Best loss: 0.071800 Accuracy: 98.20% Early stopping! 
[CV] n_neurons=30, learning_rate=0.02, activation=<function relu at 0x7fd9e8a660d0>, batch_size=500, total= 6.8s [CV] n_neurons=30, learning_rate=0.02, activation=<function relu at 0x7fd9e8a660d0>, batch_size=500 0 Validation loss: 0.113188 Best loss: 0.113188 Accuracy: 96.60% 1 Validation loss: 0.081384 Best loss: 0.081384 Accuracy: 97.58% 2 Validation loss: 0.068770 Best loss: 0.068770 Accuracy: 98.12% 3 Validation loss: 0.077316 Best loss: 0.068770 Accuracy: 97.73% 4 Validation loss: 0.074333 Best loss: 0.068770 Accuracy: 97.97% 5 Validation loss: 0.084735 Best loss: 0.068770 Accuracy: 97.30% 6 Validation loss: 0.082893 Best loss: 0.068770 Accuracy: 97.69% 7 Validation loss: 0.075860 Best loss: 0.068770 Accuracy: 97.65% 8 Validation loss: 0.078686 Best loss: 0.068770 Accuracy: 97.77% 9 Validation loss: 0.080869 Best loss: 0.068770 Accuracy: 97.77% 10 Validation loss: 0.082026 Best loss: 0.068770 Accuracy: 98.12% 11 Validation loss: 0.086516 Best loss: 0.068770 Accuracy: 97.69% 12 Validation loss: 0.076660 Best loss: 0.068770 Accuracy: 98.12% 13 Validation loss: 0.073815 Best loss: 0.068770 Accuracy: 98.08% 14 Validation loss: 0.077873 Best loss: 0.068770 Accuracy: 98.20% 15 Validation loss: 0.078704 Best loss: 0.068770 Accuracy: 97.93% 16 Validation loss: 0.077061 Best loss: 0.068770 Accuracy: 98.28% 17 Validation loss: 0.075423 Best loss: 0.068770 Accuracy: 97.93% 18 Validation loss: 0.085646 Best loss: 0.068770 Accuracy: 98.24% 19 Validation loss: 0.082202 Best loss: 0.068770 Accuracy: 98.05% 20 Validation loss: 0.103338 Best loss: 0.068770 Accuracy: 97.46% 21 Validation loss: 0.068182 Best loss: 0.068182 Accuracy: 98.40% 22 Validation loss: 0.067592 Best loss: 0.067592 Accuracy: 97.93% 23 Validation loss: 0.076756 Best loss: 0.067592 Accuracy: 98.28% 24 Validation loss: 0.072327 Best loss: 0.067592 Accuracy: 98.48% 25 Validation loss: 0.075613 Best loss: 0.067592 Accuracy: 98.44% 26 Validation loss: 0.072291 Best loss: 0.067592 Accuracy: 98.40% 27 Validation loss: 0.084550 Best loss: 0.067592 Accuracy: 98.28% 28 Validation loss: 0.075566 Best loss: 0.067592 Accuracy: 98.36% 29 Validation loss: 0.071688 Best loss: 0.067592 Accuracy: 98.28% 30 Validation loss: 0.075556 Best loss: 0.067592 Accuracy: 98.24% 31 Validation loss: 0.065671 Best loss: 0.065671 Accuracy: 98.40% 32 Validation loss: 0.083471 Best loss: 0.065671 Accuracy: 98.40% 33 Validation loss: 0.086415 Best loss: 0.065671 Accuracy: 98.59% 34 Validation loss: 0.085613 Best loss: 0.065671 Accuracy: 98.36% 35 Validation loss: 0.099534 Best loss: 0.065671 Accuracy: 98.28% 36 Validation loss: 0.102709 Best loss: 0.065671 Accuracy: 98.32% 37 Validation loss: 0.093125 Best loss: 0.065671 Accuracy: 98.20% 38 Validation loss: 0.109501 Best loss: 0.065671 Accuracy: 97.85% 39 Validation loss: 0.109443 Best loss: 0.065671 Accuracy: 98.44% 40 Validation loss: 0.087260 Best loss: 0.065671 Accuracy: 98.36% 41 Validation loss: 0.106365 Best loss: 0.065671 Accuracy: 98.36% 42 Validation loss: 0.102789 Best loss: 0.065671 Accuracy: 98.05% 43 Validation loss: 0.094281 Best loss: 0.065671 Accuracy: 98.48% 44 Validation loss: 0.094514 Best loss: 0.065671 Accuracy: 98.40% [...and much later...] 
20 Validation loss: 0.046808 Best loss: 0.033867 Accuracy: 98.83% 21 Validation loss: 0.052966 Best loss: 0.033867 Accuracy: 98.91% 22 Validation loss: 0.095892 Best loss: 0.033867 Accuracy: 98.08% 23 Validation loss: 0.054250 Best loss: 0.033867 Accuracy: 98.87% 24 Validation loss: 0.061026 Best loss: 0.033867 Accuracy: 98.87% 25 Validation loss: 0.081977 Best loss: 0.033867 Accuracy: 98.67% 26 Validation loss: 0.079819 Best loss: 0.033867 Accuracy: 98.71% 27 Validation loss: 0.059824 Best loss: 0.033867 Accuracy: 98.75% 28 Validation loss: 0.057758 Best loss: 0.033867 Accuracy: 98.94% 29 Validation loss: 0.087165 Best loss: 0.033867 Accuracy: 98.91% 30 Validation loss: 0.052274 Best loss: 0.033867 Accuracy: 99.10% 31 Validation loss: 0.059831 Best loss: 0.033867 Accuracy: 98.79% 32 Validation loss: 0.054240 Best loss: 0.033867 Accuracy: 98.91% 33 Validation loss: 0.048165 Best loss: 0.033867 Accuracy: 98.94% 34 Validation loss: 0.040565 Best loss: 0.033867 Accuracy: 99.18% 35 Validation loss: 0.103207 Best loss: 0.033867 Accuracy: 98.28% 36 Validation loss: 400.716797 Best loss: 0.033867 Accuracy: 71.46% 37 Validation loss: 11.996887 Best loss: 0.033867 Accuracy: 96.09% 38 Validation loss: 2.623182 Best loss: 0.033867 Accuracy: 96.56% 39 Validation loss: 1.344962 Best loss: 0.033867 Accuracy: 97.69% 40 Validation loss: 1.125381 Best loss: 0.033867 Accuracy: 97.42% Early stopping!
RandomizedSearchCV(cv=None, error_score='raise', estimator=DNNClassifier(activation=<function elu at 0x7fd9e8a620d0>, batch_norm_momentum=None, batch_size=20, dropout_rate=None, initializer=<function variance_scaling_initializer.<locals>._initializer at 0x7fd9d5e628c8>, learning_rate=0.01, n_hidden_layers=5, n_neurons=100, optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>, random_state=42), fit_params={'y_valid': array([0, 4, ..., 1, 2], dtype=uint8), 'X_valid': array([[ 0., 0., ..., 0., 0.], [ 0., 0., ..., 0., 0.], ..., [ 0., 0., ..., 0., 0.], [ 0., 0., ..., 0., 0.]], dtype=float32), 'n_epochs': 1000}, iid=True, n_iter=50, n_jobs=1, param_distributions={'n_neurons': [10, 30, 50, 70, 90, 100, 120, 140, 160], 'learning_rate': [0.01, 0.02, 0.05, 0.1], 'activation': [<function relu at 0x7fd9e8a660d0>, <function elu at 0x7fd9e8a620d0>, <function leaky_relu.<locals>.parametrized_leaky_relu at 0x7fd9db0b30d0>, <function leaky_relu.<locals>.parametrized_leaky_relu at 0x7fd9d4ddca60>], 'batch_size': [10, 50, 100, 500]}, pre_dispatch='2*n_jobs', random_state=42, refit=True, return_train_score=True, scoring=None, verbose=2)
rnd_search.best_params_
{'activation': <function __main__.leaky_relu.<locals>.parametrized_leaky_relu>, 'batch_size': 500, 'learning_rate': 0.01, 'n_neurons': 140}
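You can also look at the mean cross-validated accuracy of this best combination (best_score_ is part of RandomizedSearchCV's standard API):
# Mean cross-validated accuracy of the best hyperparameter combination found
rnd_search.best_score_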
y_pred = rnd_search.predict(X_test1)
accuracy_score(y_test1, y_pred)
0.99318933644677954
Wonderful! Tuning the hyperparameters got us up to 99.32% accuracy! It may not sound like a great improvement to go from 98.05% to 99.32% accuracy, but consider the error rate: it went from roughly 2% to 0.7%. That's about a 65% reduction in the number of errors this model makes!
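Spelling out that arithmetic (a quick sketch, not in the book):
# Error rate went from 1 - 0.9805 = 1.95% down to 1 - 0.9932 = 0.68%,
# i.e. roughly a 65% relative reduction in errors.
error_before = 1 - 0.9805
error_after = 1 - 0.9932
print("relative reduction in errors: {:.0%}".format((error_before - error_after) / error_before))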
It's a good idea to save this model:
rnd_search.best_estimator_.save("./my_best_mnist_model_0_to_4")
Exercise: Now try adding Batch Normalization and compare the learning curves: is it converging faster than before? Does it produce a better model?
Let's train the best model found, once again, to see how fast it converges (alternatively, you could tweak the code above to make it write summaries for TensorBoard, so you can visualize the learning curve):
dnn_clf = DNNClassifier(activation=leaky_relu(alpha=0.1), batch_size=500, learning_rate=0.01,
n_neurons=140, random_state=42)
dnn_clf.fit(X_train1, y_train1, n_epochs=1000, X_valid=X_valid1, y_valid=y_valid1)
0 Validation loss: 0.090732 Best loss: 0.090732 Accuracy: 97.22% 1 Validation loss: 0.052198 Best loss: 0.052198 Accuracy: 98.40% 2 Validation loss: 0.040040 Best loss: 0.040040 Accuracy: 98.94% 3 Validation loss: 0.057495 Best loss: 0.040040 Accuracy: 98.55% 4 Validation loss: 0.045600 Best loss: 0.040040 Accuracy: 98.75% 5 Validation loss: 0.062344 Best loss: 0.040040 Accuracy: 98.48% 6 Validation loss: 0.048719 Best loss: 0.040040 Accuracy: 98.67% 7 Validation loss: 0.050346 Best loss: 0.040040 Accuracy: 98.79% 8 Validation loss: 0.051224 Best loss: 0.040040 Accuracy: 98.79% 9 Validation loss: 0.036505 Best loss: 0.036505 Accuracy: 98.98% 10 Validation loss: 0.052532 Best loss: 0.036505 Accuracy: 98.71% 11 Validation loss: 0.057086 Best loss: 0.036505 Accuracy: 99.10% 12 Validation loss: 0.036754 Best loss: 0.036505 Accuracy: 99.06% 13 Validation loss: 0.046782 Best loss: 0.036505 Accuracy: 98.87% 14 Validation loss: 0.048929 Best loss: 0.036505 Accuracy: 98.91% 15 Validation loss: 0.052919 Best loss: 0.036505 Accuracy: 98.75% 16 Validation loss: 0.054287 Best loss: 0.036505 Accuracy: 98.67% 17 Validation loss: 0.047722 Best loss: 0.036505 Accuracy: 98.79% 18 Validation loss: 0.040474 Best loss: 0.036505 Accuracy: 99.14% 19 Validation loss: 0.033867 Best loss: 0.033867 Accuracy: 99.14% 20 Validation loss: 0.046808 Best loss: 0.033867 Accuracy: 98.83% 21 Validation loss: 0.052966 Best loss: 0.033867 Accuracy: 98.91% 22 Validation loss: 0.095892 Best loss: 0.033867 Accuracy: 98.08% 23 Validation loss: 0.054250 Best loss: 0.033867 Accuracy: 98.87% 24 Validation loss: 0.061026 Best loss: 0.033867 Accuracy: 98.87% 25 Validation loss: 0.081977 Best loss: 0.033867 Accuracy: 98.67% 26 Validation loss: 0.079819 Best loss: 0.033867 Accuracy: 98.71% 27 Validation loss: 0.059824 Best loss: 0.033867 Accuracy: 98.75% 28 Validation loss: 0.057758 Best loss: 0.033867 Accuracy: 98.94% 29 Validation loss: 0.087165 Best loss: 0.033867 Accuracy: 98.91% 30 Validation loss: 0.052274 Best loss: 0.033867 Accuracy: 99.10% 31 Validation loss: 0.059831 Best loss: 0.033867 Accuracy: 98.79% 32 Validation loss: 0.054240 Best loss: 0.033867 Accuracy: 98.91% 33 Validation loss: 0.048165 Best loss: 0.033867 Accuracy: 98.94% 34 Validation loss: 0.040565 Best loss: 0.033867 Accuracy: 99.18% 35 Validation loss: 0.103207 Best loss: 0.033867 Accuracy: 98.28% 36 Validation loss: 400.716797 Best loss: 0.033867 Accuracy: 71.46% 37 Validation loss: 11.996887 Best loss: 0.033867 Accuracy: 96.09% 38 Validation loss: 2.623182 Best loss: 0.033867 Accuracy: 96.56% 39 Validation loss: 1.344962 Best loss: 0.033867 Accuracy: 97.69% 40 Validation loss: 1.125381 Best loss: 0.033867 Accuracy: 97.42% Early stopping!
DNNClassifier(activation=<function leaky_relu.<locals>.parametrized_leaky_relu at 0x7fd9d19e37b8>, batch_norm_momentum=None, batch_size=500, dropout_rate=None, initializer=<function variance_scaling_initializer.<locals>._initializer at 0x7fd9d5e628c8>, learning_rate=0.01, n_hidden_layers=5, n_neurons=140, optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>, random_state=42)
The best loss is reached at epoch 19, but it was already within 10% of that result at epoch 9.
Let's check that we do indeed get 99.32% accuracy on the test set:
y_pred = dnn_clf.predict(X_test1)
accuracy_score(y_test1, y_pred)
0.99318933644677954
Good, now let's train a similar model, but this time with batch normalization (note that this run uses n_neurons=90 rather than 140):
dnn_clf_bn = DNNClassifier(activation=leaky_relu(alpha=0.1), batch_size=500, learning_rate=0.01,
n_neurons=90, random_state=42,
batch_norm_momentum=0.95)
dnn_clf_bn.fit(X_train1, y_train1, n_epochs=1000, X_valid=X_valid1, y_valid=y_valid1)
0 Validation loss: 0.046053 Best loss: 0.046053 Accuracy: 98.67% 1 Validation loss: 0.032228 Best loss: 0.032228 Accuracy: 98.83% 2 Validation loss: 0.032974 Best loss: 0.032228 Accuracy: 98.83% 3 Validation loss: 0.035961 Best loss: 0.032228 Accuracy: 98.94% 4 Validation loss: 0.040250 Best loss: 0.032228 Accuracy: 98.94% 5 Validation loss: 0.033051 Best loss: 0.032228 Accuracy: 99.06% 6 Validation loss: 0.056053 Best loss: 0.032228 Accuracy: 98.32% 7 Validation loss: 0.031729 Best loss: 0.031729 Accuracy: 99.18% 8 Validation loss: 0.027662 Best loss: 0.027662 Accuracy: 99.26% 9 Validation loss: 0.034074 Best loss: 0.027662 Accuracy: 98.94% 10 Validation loss: 0.032173 Best loss: 0.027662 Accuracy: 99.06% 11 Validation loss: 0.030538 Best loss: 0.027662 Accuracy: 99.10% 12 Validation loss: 0.030337 Best loss: 0.027662 Accuracy: 99.10% 13 Validation loss: 0.022219 Best loss: 0.022219 Accuracy: 99.45% 14 Validation loss: 0.036824 Best loss: 0.022219 Accuracy: 99.14% 15 Validation loss: 0.033945 Best loss: 0.022219 Accuracy: 99.18% 16 Validation loss: 0.032533 Best loss: 0.022219 Accuracy: 98.98% 17 Validation loss: 0.037204 Best loss: 0.022219 Accuracy: 99.02% 18 Validation loss: 0.026982 Best loss: 0.022219 Accuracy: 99.34% 19 Validation loss: 0.022094 Best loss: 0.022094 Accuracy: 99.53% 20 Validation loss: 0.026196 Best loss: 0.022094 Accuracy: 99.26% 21 Validation loss: 0.022107 Best loss: 0.022094 Accuracy: 99.49% 22 Validation loss: 0.021436 Best loss: 0.021436 Accuracy: 99.53% 23 Validation loss: 0.025607 Best loss: 0.021436 Accuracy: 99.37% 24 Validation loss: 0.038882 Best loss: 0.021436 Accuracy: 99.22% 25 Validation loss: 0.032011 Best loss: 0.021436 Accuracy: 99.26% 26 Validation loss: 0.027673 Best loss: 0.021436 Accuracy: 99.22% 27 Validation loss: 0.026874 Best loss: 0.021436 Accuracy: 99.30% 28 Validation loss: 0.021123 Best loss: 0.021123 Accuracy: 99.41% 29 Validation loss: 0.024784 Best loss: 0.021123 Accuracy: 99.45% 30 Validation loss: 0.024108 Best loss: 0.021123 Accuracy: 99.49% 31 Validation loss: 0.028439 Best loss: 0.021123 Accuracy: 99.37% 32 Validation loss: 0.032366 Best loss: 0.021123 Accuracy: 99.22% 33 Validation loss: 0.037057 Best loss: 0.021123 Accuracy: 99.18% 34 Validation loss: 0.042305 Best loss: 0.021123 Accuracy: 98.98% 35 Validation loss: 0.039662 Best loss: 0.021123 Accuracy: 99.14% 36 Validation loss: 0.036299 Best loss: 0.021123 Accuracy: 99.14% 37 Validation loss: 0.026997 Best loss: 0.021123 Accuracy: 99.53% 38 Validation loss: 0.034407 Best loss: 0.021123 Accuracy: 99.22% 39 Validation loss: 0.027668 Best loss: 0.021123 Accuracy: 99.41% 40 Validation loss: 0.029128 Best loss: 0.021123 Accuracy: 99.30% 41 Validation loss: 0.033564 Best loss: 0.021123 Accuracy: 99.14% 42 Validation loss: 0.033810 Best loss: 0.021123 Accuracy: 99.30% 43 Validation loss: 0.044953 Best loss: 0.021123 Accuracy: 98.98% 44 Validation loss: 0.026280 Best loss: 0.021123 Accuracy: 99.26% 45 Validation loss: 0.020275 Best loss: 0.020275 Accuracy: 99.61% 46 Validation loss: 0.018810 Best loss: 0.018810 Accuracy: 99.45% 47 Validation loss: 0.027529 Best loss: 0.018810 Accuracy: 99.18% 48 Validation loss: 0.018120 Best loss: 0.018120 Accuracy: 99.53% 49 Validation loss: 0.019378 Best loss: 0.018120 Accuracy: 99.45% 50 Validation loss: 0.029760 Best loss: 0.018120 Accuracy: 99.34% 51 Validation loss: 0.035702 Best loss: 0.018120 Accuracy: 99.26% 52 Validation loss: 0.032662 Best loss: 0.018120 Accuracy: 99.02% 53 Validation loss: 0.026943 Best loss: 0.018120 Accuracy: 99.37% 
54 Validation loss: 0.029007 Best loss: 0.018120 Accuracy: 99.53% 55 Validation loss: 0.021956 Best loss: 0.018120 Accuracy: 99.49% 56 Validation loss: 0.018983 Best loss: 0.018120 Accuracy: 99.61% 57 Validation loss: 0.022788 Best loss: 0.018120 Accuracy: 99.49% 58 Validation loss: 0.019578 Best loss: 0.018120 Accuracy: 99.61% 59 Validation loss: 0.021676 Best loss: 0.018120 Accuracy: 99.61% 60 Validation loss: 0.021580 Best loss: 0.018120 Accuracy: 99.65% 61 Validation loss: 0.021467 Best loss: 0.018120 Accuracy: 99.65% 62 Validation loss: 0.020513 Best loss: 0.018120 Accuracy: 99.65% 63 Validation loss: 0.020252 Best loss: 0.018120 Accuracy: 99.65% 64 Validation loss: 0.021724 Best loss: 0.018120 Accuracy: 99.65% 65 Validation loss: 0.021499 Best loss: 0.018120 Accuracy: 99.69% 66 Validation loss: 0.021627 Best loss: 0.018120 Accuracy: 99.69% 67 Validation loss: 0.021569 Best loss: 0.018120 Accuracy: 99.69% 68 Validation loss: 0.021727 Best loss: 0.018120 Accuracy: 99.69% 69 Validation loss: 0.021104 Best loss: 0.018120 Accuracy: 99.69% Early stopping!
DNNClassifier(activation=<function leaky_relu.<locals>.parametrized_leaky_relu at 0x7fd9d19e3c80>, batch_norm_momentum=0.95, batch_size=500, dropout_rate=None, initializer=<function variance_scaling_initializer.<locals>._initializer at 0x7fd9d5e628c8>, learning_rate=0.01, n_hidden_layers=5, n_neurons=90, optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>, random_state=42)
The best parameters are reached at epoch 48, which is actually slower convergence than earlier. Let's check the accuracy:
y_pred = dnn_clf_bn.predict(X_test1)
accuracy_score(y_test1, y_pred)
0.99241097489784003
Well, batch normalization did not improve accuracy. Let's see if we can find a good set of hyperparameters that will work well with batch normalization:
from sklearn.model_selection import RandomizedSearchCV
param_distribs = {
"n_neurons": [10, 30, 50, 70, 90, 100, 120, 140, 160],
"batch_size": [10, 50, 100, 500],
"learning_rate": [0.01, 0.02, 0.05, 0.1],
"activation": [tf.nn.relu, tf.nn.elu, leaky_relu(alpha=0.01), leaky_relu(alpha=0.1)],
# you could also try exploring different numbers of hidden layers, different optimizers, etc.
#"n_hidden_layers": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
#"optimizer_class": [tf.train.AdamOptimizer, partial(tf.train.MomentumOptimizer, momentum=0.95)],
"batch_norm_momentum": [0.9, 0.95, 0.98, 0.99, 0.999],
}
rnd_search_bn = RandomizedSearchCV(DNNClassifier(random_state=42), param_distribs, n_iter=50,
fit_params={"X_valid": X_valid1, "y_valid": y_valid1, "n_epochs": 1000},
random_state=42, verbose=2)
rnd_search_bn.fit(X_train1, y_train1)
Fitting 3 folds for each of 50 candidates, totalling 150 fits [CV] activation=<function relu at 0x7fd9e8a660d0>, n_neurons=70, learning_rate=0.01, batch_norm_momentum=0.99, batch_size=50 0 Validation loss: 0.113224 Best loss: 0.113224 Accuracy: 97.30% 1 Validation loss: 0.064190 Best loss: 0.064190 Accuracy: 98.24% 2 Validation loss: 0.080173 Best loss: 0.064190 Accuracy: 98.28% 3 Validation loss: 0.059603 Best loss: 0.059603 Accuracy: 98.28% 4 Validation loss: 0.043533 Best loss: 0.043533 Accuracy: 98.48% 5 Validation loss: 0.040107 Best loss: 0.040107 Accuracy: 98.87% 6 Validation loss: 0.051212 Best loss: 0.040107 Accuracy: 98.24% 7 Validation loss: 0.046029 Best loss: 0.040107 Accuracy: 98.71% 8 Validation loss: 0.053079 Best loss: 0.040107 Accuracy: 98.59% 9 Validation loss: 0.066891 Best loss: 0.040107 Accuracy: 98.28% 10 Validation loss: 0.037712 Best loss: 0.037712 Accuracy: 98.83% 11 Validation loss: 0.055569 Best loss: 0.037712 Accuracy: 98.55% 12 Validation loss: 0.040949 Best loss: 0.037712 Accuracy: 98.98% 13 Validation loss: 0.077433 Best loss: 0.037712 Accuracy: 98.36% 14 Validation loss: 0.065955 Best loss: 0.037712 Accuracy: 98.63% 15 Validation loss: 0.038968 Best loss: 0.037712 Accuracy: 99.02% 16 Validation loss: 0.039190 Best loss: 0.037712 Accuracy: 99.06% 17 Validation loss: 0.050690 Best loss: 0.037712 Accuracy: 98.71% 18 Validation loss: 0.043054 Best loss: 0.037712 Accuracy: 99.02% 19 Validation loss: 0.063156 Best loss: 0.037712 Accuracy: 98.71% 20 Validation loss: 0.043066 Best loss: 0.037712 Accuracy: 99.14% 21 Validation loss: 0.058145 Best loss: 0.037712 Accuracy: 98.79% 22 Validation loss: 0.039590 Best loss: 0.037712 Accuracy: 99.06% 23 Validation loss: 0.049981 Best loss: 0.037712 Accuracy: 98.75% 24 Validation loss: 0.047458 Best loss: 0.037712 Accuracy: 99.10% 25 Validation loss: 0.040638 Best loss: 0.037712 Accuracy: 99.06% 26 Validation loss: 0.041426 Best loss: 0.037712 Accuracy: 98.98% 27 Validation loss: 0.041325 Best loss: 0.037712 Accuracy: 98.98% 28 Validation loss: 0.054609 Best loss: 0.037712 Accuracy: 98.91% 29 Validation loss: 0.067671 Best loss: 0.037712 Accuracy: 98.75% 30 Validation loss: 0.037608 Best loss: 0.037608 Accuracy: 98.79% 31 Validation loss: 0.047441 Best loss: 0.037608 Accuracy: 98.98% 32 Validation loss: 0.053716 Best loss: 0.037608 Accuracy: 99.02% 33 Validation loss: 0.045445 Best loss: 0.037608 Accuracy: 98.83% 34 Validation loss: 0.046023 Best loss: 0.037608 Accuracy: 98.94% 35 Validation loss: 0.050073 Best loss: 0.037608 Accuracy: 98.91% 36 Validation loss: 0.051887 Best loss: 0.037608 Accuracy: 98.87% 37 Validation loss: 0.050272 Best loss: 0.037608 Accuracy: 99.02% 38 Validation loss: 0.043531 Best loss: 0.037608 Accuracy: 99.10% 39 Validation loss: 0.054661 Best loss: 0.037608 Accuracy: 98.87% 40 Validation loss: 0.047607 Best loss: 0.037608 Accuracy: 98.87% 41 Validation loss: 0.051862 Best loss: 0.037608 Accuracy: 99.14% 42 Validation loss: 0.044218 Best loss: 0.037608 Accuracy: 99.14% 43 Validation loss: 0.043707 Best loss: 0.037608 Accuracy: 99.06% 44 Validation loss: 0.039602 Best loss: 0.037608 Accuracy: 99.06% 45 Validation loss: 0.048998 Best loss: 0.037608 Accuracy: 99.02% 46 Validation loss: 0.045562 Best loss: 0.037608 Accuracy: 99.14% 47 Validation loss: 0.042198 Best loss: 0.037608 Accuracy: 99.10% 48 Validation loss: 0.027679 Best loss: 0.027679 Accuracy: 99.10% 49 Validation loss: 0.033783 Best loss: 0.027679 Accuracy: 98.94% 50 Validation loss: 0.032935 Best loss: 0.027679 Accuracy: 99.41% 51 
Validation loss: 0.042930 Best loss: 0.027679 Accuracy: 98.98% 52 Validation loss: 0.045454 Best loss: 0.027679 Accuracy: 99.06% 53 Validation loss: 0.047336 Best loss: 0.027679 Accuracy: 98.91% 54 Validation loss: 0.036523 Best loss: 0.027679 Accuracy: 99.14% 55 Validation loss: 0.064401 Best loss: 0.027679 Accuracy: 98.94% 56 Validation loss: 0.047686 Best loss: 0.027679 Accuracy: 98.83% 57 Validation loss: 0.049083 Best loss: 0.027679 Accuracy: 98.98% 58 Validation loss: 0.057310 Best loss: 0.027679 Accuracy: 99.10% 59 Validation loss: 0.043757 Best loss: 0.027679 Accuracy: 99.14% 60 Validation loss: 0.058742 Best loss: 0.027679 Accuracy: 99.02% 61 Validation loss: 0.055049 Best loss: 0.027679 Accuracy: 99.06% 62 Validation loss: 0.039837 Best loss: 0.027679 Accuracy: 99.18% 63 Validation loss: 0.057108 Best loss: 0.027679 Accuracy: 99.06% 64 Validation loss: 0.043212 Best loss: 0.027679 Accuracy: 98.98% 65 Validation loss: 0.046874 Best loss: 0.027679 Accuracy: 99.18% 66 Validation loss: 0.052819 Best loss: 0.027679 Accuracy: 99.10% 67 Validation loss: 0.045977 Best loss: 0.027679 Accuracy: 99.14% 68 Validation loss: 0.053290 Best loss: 0.027679 Accuracy: 99.10% 69 Validation loss: 0.052941 Best loss: 0.027679 Accuracy: 99.06% Early stopping! [CV] activation=<function relu at 0x7fd9e8a660d0>, n_neurons=70, learning_rate=0.01, batch_norm_momentum=0.99, batch_size=50, total= 2.7min [CV] activation=<function relu at 0x7fd9e8a660d0>, n_neurons=70, learning_rate=0.01, batch_norm_momentum=0.99, batch_size=50
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 2.7min remaining: 0.0s
0 Validation loss: 0.144984 Best loss: 0.144984 Accuracy: 96.40% 1 Validation loss: 0.067873 Best loss: 0.067873 Accuracy: 98.44% 2 Validation loss: 0.091854 Best loss: 0.067873 Accuracy: 97.30% 3 Validation loss: 0.074647 Best loss: 0.067873 Accuracy: 98.05% 4 Validation loss: 0.053722 Best loss: 0.053722 Accuracy: 98.48% 5 Validation loss: 0.049216 Best loss: 0.049216 Accuracy: 98.44% 6 Validation loss: 0.057619 Best loss: 0.049216 Accuracy: 98.48% 7 Validation loss: 0.045842 Best loss: 0.045842 Accuracy: 98.75% 8 Validation loss: 0.042398 Best loss: 0.042398 Accuracy: 98.63% 9 Validation loss: 0.052629 Best loss: 0.042398 Accuracy: 98.63% 10 Validation loss: 0.056892 Best loss: 0.042398 Accuracy: 98.63% 11 Validation loss: 0.051838 Best loss: 0.042398 Accuracy: 98.75% 12 Validation loss: 0.042647 Best loss: 0.042398 Accuracy: 98.67% 13 Validation loss: 0.061297 Best loss: 0.042398 Accuracy: 98.59% 14 Validation loss: 0.049706 Best loss: 0.042398 Accuracy: 98.87% 15 Validation loss: 0.061934 Best loss: 0.042398 Accuracy: 98.79% 16 Validation loss: 0.049027 Best loss: 0.042398 Accuracy: 98.87% 17 Validation loss: 0.052187 Best loss: 0.042398 Accuracy: 98.79% 18 Validation loss: 0.052031 Best loss: 0.042398 Accuracy: 98.94% [...and much later...] 13 Validation loss: 0.043686 Best loss: 0.040332 Accuracy: 99.02% 14 Validation loss: 0.046940 Best loss: 0.040332 Accuracy: 99.18% 15 Validation loss: 0.045355 Best loss: 0.040332 Accuracy: 99.14% 16 Validation loss: 0.084697 Best loss: 0.040332 Accuracy: 98.87% 17 Validation loss: 0.123538 Best loss: 0.040332 Accuracy: 97.81% 18 Validation loss: 0.296928 Best loss: 0.040332 Accuracy: 97.50% 19 Validation loss: 0.053660 Best loss: 0.040332 Accuracy: 98.91% 20 Validation loss: 0.045684 Best loss: 0.040332 Accuracy: 98.94% 21 Validation loss: 0.051971 Best loss: 0.040332 Accuracy: 99.14% 22 Validation loss: 0.071830 Best loss: 0.040332 Accuracy: 99.06% 23 Validation loss: 0.069619 Best loss: 0.040332 Accuracy: 98.79% 24 Validation loss: 0.086642 Best loss: 0.040332 Accuracy: 98.71% 25 Validation loss: 0.072563 Best loss: 0.040332 Accuracy: 98.83% 26 Validation loss: 0.058974 Best loss: 0.040332 Accuracy: 99.06% 27 Validation loss: 0.048388 Best loss: 0.040332 Accuracy: 98.98% 28 Validation loss: 0.054847 Best loss: 0.040332 Accuracy: 99.06% 29 Validation loss: 0.077242 Best loss: 0.040332 Accuracy: 98.91% 30 Validation loss: 0.556978 Best loss: 0.040332 Accuracy: 95.54% Early stopping! [CV] activation=<function elu at 0x7fd9e8a620d0>, n_neurons=140, learning_rate=0.05, batch_norm_momentum=0.99, batch_size=50, total= 1.9min
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed: 355.8min finished
0 Validation loss: 0.076371 Best loss: 0.076371 Accuracy: 97.85% 1 Validation loss: 0.049312 Best loss: 0.049312 Accuracy: 98.63% 2 Validation loss: 0.033071 Best loss: 0.033071 Accuracy: 98.94% 3 Validation loss: 0.027357 Best loss: 0.027357 Accuracy: 99.10% 4 Validation loss: 0.028748 Best loss: 0.027357 Accuracy: 99.26% 5 Validation loss: 0.036602 Best loss: 0.027357 Accuracy: 98.94% 6 Validation loss: 0.048089 Best loss: 0.027357 Accuracy: 98.94% 7 Validation loss: 0.030332 Best loss: 0.027357 Accuracy: 99.30% 8 Validation loss: 0.029336 Best loss: 0.027357 Accuracy: 99.22% 9 Validation loss: 0.033328 Best loss: 0.027357 Accuracy: 99.26% 10 Validation loss: 0.041745 Best loss: 0.027357 Accuracy: 98.98% 11 Validation loss: 0.048739 Best loss: 0.027357 Accuracy: 98.75% 12 Validation loss: 0.049520 Best loss: 0.027357 Accuracy: 98.94% 13 Validation loss: 0.034222 Best loss: 0.027357 Accuracy: 99.18% 14 Validation loss: 0.040270 Best loss: 0.027357 Accuracy: 99.34% 15 Validation loss: 0.033074 Best loss: 0.027357 Accuracy: 99.37% 16 Validation loss: 0.035130 Best loss: 0.027357 Accuracy: 99.06% 17 Validation loss: 0.031875 Best loss: 0.027357 Accuracy: 99.18% 18 Validation loss: 0.034898 Best loss: 0.027357 Accuracy: 99.37% 19 Validation loss: 0.019222 Best loss: 0.019222 Accuracy: 99.53% 20 Validation loss: 0.043814 Best loss: 0.019222 Accuracy: 99.37% 21 Validation loss: 0.028773 Best loss: 0.019222 Accuracy: 99.34% 22 Validation loss: 0.024850 Best loss: 0.019222 Accuracy: 99.45% 23 Validation loss: 0.021789 Best loss: 0.019222 Accuracy: 99.45% 24 Validation loss: 0.028846 Best loss: 0.019222 Accuracy: 99.37% 25 Validation loss: 0.064211 Best loss: 0.019222 Accuracy: 98.98% 26 Validation loss: 0.024425 Best loss: 0.019222 Accuracy: 99.49% 27 Validation loss: 0.035453 Best loss: 0.019222 Accuracy: 99.22% 28 Validation loss: 0.023940 Best loss: 0.019222 Accuracy: 99.37% 29 Validation loss: 0.041495 Best loss: 0.019222 Accuracy: 99.18% 30 Validation loss: 0.028030 Best loss: 0.019222 Accuracy: 99.37% 31 Validation loss: 0.028003 Best loss: 0.019222 Accuracy: 99.49% 32 Validation loss: 0.026579 Best loss: 0.019222 Accuracy: 99.45% 33 Validation loss: 0.037838 Best loss: 0.019222 Accuracy: 98.91% 34 Validation loss: 0.026082 Best loss: 0.019222 Accuracy: 99.49% 35 Validation loss: 0.031529 Best loss: 0.019222 Accuracy: 99.34% 36 Validation loss: 0.028220 Best loss: 0.019222 Accuracy: 99.18% 37 Validation loss: 0.038546 Best loss: 0.019222 Accuracy: 99.10% 38 Validation loss: 0.041586 Best loss: 0.019222 Accuracy: 98.75% 39 Validation loss: 0.038835 Best loss: 0.019222 Accuracy: 99.41% 40 Validation loss: 0.042555 Best loss: 0.019222 Accuracy: 99.14% Early stopping!
RandomizedSearchCV(cv=None, error_score='raise', estimator=DNNClassifier(activation=<function elu at 0x7fd9e8a620d0>, batch_norm_momentum=None, batch_size=20, dropout_rate=None, initializer=<function variance_scaling_initializer.<locals>._initializer at 0x7fd9d5e628c8>, learning_rate=0.01, n_hidden_layers=5, n_neurons=100, optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>, random_state=42), fit_params={'y_valid': array([0, 4, ..., 1, 2], dtype=uint8), 'X_valid': array([[ 0., 0., ..., 0., 0.], [ 0., 0., ..., 0., 0.], ..., [ 0., 0., ..., 0., 0.], [ 0., 0., ..., 0., 0.]], dtype=float32), 'n_epochs': 1000}, iid=True, n_iter=50, n_jobs=1, param_distributions={'batch_norm_momentum': [0.9, 0.95, 0.98, 0.99, 0.999], 'n_neurons': [10, 30, 50, 70, 90, 100, 120, 140, 160], 'learning_rate': [0.01, 0.02, 0.05, 0.1], 'activation': [<function relu at 0x7fd9e8a660d0>, <function elu at 0x7fd9e8a620d0>, <function leaky_relu.<locals>.parametrized_leaky_relu at 0x7fd9d19e3bf8>, <function leaky_relu.<locals>.parametrized_leaky_relu at 0x7fd9d19e3a60>], 'batch_size': [10, 50, 100, 500]}, pre_dispatch='2*n_jobs', random_state=42, refit=True, return_train_score=True, scoring=None, verbose=2)
rnd_search_bn.best_params_
{'activation': <function tensorflow.python.ops.gen_nn_ops.relu>, 'batch_norm_momentum': 0.98, 'batch_size': 100, 'learning_rate': 0.01, 'n_neurons': 160}
y_pred = rnd_search_bn.predict(X_test1)
accuracy_score(y_test1, y_pred)
0.99396769799571905
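Note that rnd_search_bn.predict() delegates to the best model found by the search, which was refit on the whole training set (refit=True is the default). If you want a handle on that model directly, here is a small sketch (not in the original notebook; best_dnn_bn is just an illustrative name):
best_dnn_bn = rnd_search_bn.best_estimator_  # the refit DNNClassifier with the winning hyperparameters
y_pred = best_dnn_bn.predict(X_test1)        # equivalent to calling rnd_search_bn.predict(X_test1)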
Slightly better than earlier: 99.4% vs 99.3%. Let's see if dropout can do better.
Exercise: is the model overfitting the training set? Try adding dropout to every layer and try again. Does it help?
Let's go back to the best model we trained earlier and see how it performs on the training set:
y_pred = dnn_clf.predict(X_train1)
accuracy_score(y_train1, y_pred)
0.99914401883158566
The model performs significantly better on the training set than on the test set (99.91% vs 99.32%), which means it is overfitting the training set. A bit of regularization may help. Let's try adding dropout with a 50% dropout rate:
dnn_clf_dropout = DNNClassifier(activation=leaky_relu(alpha=0.1), batch_size=500, learning_rate=0.01,
n_neurons=90, random_state=42,
dropout_rate=0.5)
dnn_clf_dropout.fit(X_train1, y_train1, n_epochs=1000, X_valid=X_valid1, y_valid=y_valid1)
0 Validation loss: 0.162759 Best loss: 0.162759 Accuracy: 95.15% 1 Validation loss: 0.120510 Best loss: 0.120510 Accuracy: 96.64% 2 Validation loss: 0.110715 Best loss: 0.110715 Accuracy: 96.91% 3 Validation loss: 0.104193 Best loss: 0.104193 Accuracy: 97.22% 4 Validation loss: 0.103560 Best loss: 0.103560 Accuracy: 97.81% 5 Validation loss: 0.087045 Best loss: 0.087045 Accuracy: 97.89% 6 Validation loss: 0.087227 Best loss: 0.087045 Accuracy: 97.65% 7 Validation loss: 0.079840 Best loss: 0.079840 Accuracy: 98.16% 8 Validation loss: 0.083102 Best loss: 0.079840 Accuracy: 97.50% 9 Validation loss: 0.076794 Best loss: 0.076794 Accuracy: 98.01% 10 Validation loss: 0.074914 Best loss: 0.074914 Accuracy: 97.93% 11 Validation loss: 0.073794 Best loss: 0.073794 Accuracy: 98.12% 12 Validation loss: 0.079777 Best loss: 0.073794 Accuracy: 97.89% 13 Validation loss: 0.080277 Best loss: 0.073794 Accuracy: 97.54% 14 Validation loss: 0.072409 Best loss: 0.072409 Accuracy: 98.08% 15 Validation loss: 0.071988 Best loss: 0.071988 Accuracy: 98.12% 16 Validation loss: 0.074609 Best loss: 0.071988 Accuracy: 97.93% 17 Validation loss: 0.069488 Best loss: 0.069488 Accuracy: 98.28% 18 Validation loss: 0.080863 Best loss: 0.069488 Accuracy: 98.40% 19 Validation loss: 0.074966 Best loss: 0.069488 Accuracy: 98.20% 20 Validation loss: 0.071082 Best loss: 0.069488 Accuracy: 98.12% 21 Validation loss: 0.070138 Best loss: 0.069488 Accuracy: 98.20% 22 Validation loss: 0.066032 Best loss: 0.066032 Accuracy: 98.28% 23 Validation loss: 0.061130 Best loss: 0.061130 Accuracy: 98.36% 24 Validation loss: 0.067107 Best loss: 0.061130 Accuracy: 98.16% 25 Validation loss: 0.071372 Best loss: 0.061130 Accuracy: 98.16% 26 Validation loss: 0.068535 Best loss: 0.061130 Accuracy: 98.36% 27 Validation loss: 0.065336 Best loss: 0.061130 Accuracy: 98.48% 28 Validation loss: 0.066783 Best loss: 0.061130 Accuracy: 98.40% 29 Validation loss: 0.092769 Best loss: 0.061130 Accuracy: 97.77% 30 Validation loss: 0.075746 Best loss: 0.061130 Accuracy: 98.01% 31 Validation loss: 0.084024 Best loss: 0.061130 Accuracy: 97.81% 32 Validation loss: 0.116428 Best loss: 0.061130 Accuracy: 98.44% 33 Validation loss: 0.079498 Best loss: 0.061130 Accuracy: 97.89% 34 Validation loss: 0.078189 Best loss: 0.061130 Accuracy: 97.97% 35 Validation loss: 0.083723 Best loss: 0.061130 Accuracy: 97.81% 36 Validation loss: 0.088210 Best loss: 0.061130 Accuracy: 97.19% 37 Validation loss: 0.080040 Best loss: 0.061130 Accuracy: 97.93% 38 Validation loss: 0.086932 Best loss: 0.061130 Accuracy: 97.89% 39 Validation loss: 0.240580 Best loss: 0.061130 Accuracy: 91.67% 40 Validation loss: 0.166662 Best loss: 0.061130 Accuracy: 94.29% 41 Validation loss: 0.125562 Best loss: 0.061130 Accuracy: 97.15% 42 Validation loss: 0.124890 Best loss: 0.061130 Accuracy: 95.82% 43 Validation loss: 0.127020 Best loss: 0.061130 Accuracy: 96.76% 44 Validation loss: 0.121540 Best loss: 0.061130 Accuracy: 96.05% Early stopping!
DNNClassifier(activation=<function leaky_relu.<locals>.parametrized_leaky_relu at 0x7fd9b2368d08>, batch_norm_momentum=None, batch_size=500, dropout_rate=0.5, initializer=<function variance_scaling_initializer.<locals>._initializer at 0x7fd9d5e628c8>, learning_rate=0.01, n_hidden_layers=5, n_neurons=90, optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>, random_state=42)
The best parameters are reached at epoch 23; dropout somewhat slowed down convergence.
Let's check the accuracy:
y_pred = dnn_clf_dropout.predict(X_test1)
accuracy_score(y_test1, y_pred)
0.98657326328079398
We are out of luck: dropout does not seem to help either. Let's try tuning the hyperparameters; perhaps we can squeeze a bit more performance out of this model:
from sklearn.model_selection import RandomizedSearchCV
param_distribs = {
"n_neurons": [10, 30, 50, 70, 90, 100, 120, 140, 160],
"batch_size": [10, 50, 100, 500],
"learning_rate": [0.01, 0.02, 0.05, 0.1],
"activation": [tf.nn.relu, tf.nn.elu, leaky_relu(alpha=0.01), leaky_relu(alpha=0.1)],
# you could also try exploring different numbers of hidden layers, different optimizers, etc.
#"n_hidden_layers": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
#"optimizer_class": [tf.train.AdamOptimizer, partial(tf.train.MomentumOptimizer, momentum=0.95)],
"dropout_rate": [0.2, 0.3, 0.4, 0.5, 0.6],
}
rnd_search_dropout = RandomizedSearchCV(DNNClassifier(random_state=42), param_distribs, n_iter=50,
fit_params={"X_valid": X_valid1, "y_valid": y_valid1, "n_epochs": 1000},
random_state=42, verbose=2)
rnd_search_dropout.fit(X_train1, y_train1)
Fitting 3 folds for each of 50 candidates, totalling 150 fits [CV] dropout_rate=0.5, n_neurons=70, learning_rate=0.01, activation=<function relu at 0x7fd9e8a660d0>, batch_size=100 0 Validation loss: 0.355079 Best loss: 0.355079 Accuracy: 91.44% 1 Validation loss: 0.280624 Best loss: 0.280624 Accuracy: 94.10% 2 Validation loss: 0.279819 Best loss: 0.279819 Accuracy: 92.77% 3 Validation loss: 0.223614 Best loss: 0.223614 Accuracy: 94.10% 4 Validation loss: 0.199802 Best loss: 0.199802 Accuracy: 95.11% 5 Validation loss: 0.214481 Best loss: 0.199802 Accuracy: 95.47% 6 Validation loss: 0.216195 Best loss: 0.199802 Accuracy: 95.78% 7 Validation loss: 0.209172 Best loss: 0.199802 Accuracy: 94.80% 8 Validation loss: 0.182841 Best loss: 0.182841 Accuracy: 95.70% 9 Validation loss: 0.214252 Best loss: 0.182841 Accuracy: 95.82% 10 Validation loss: 0.198762 Best loss: 0.182841 Accuracy: 95.62% 11 Validation loss: 0.186415 Best loss: 0.182841 Accuracy: 95.82% 12 Validation loss: 0.222924 Best loss: 0.182841 Accuracy: 96.05% 13 Validation loss: 0.199636 Best loss: 0.182841 Accuracy: 95.97% 14 Validation loss: 0.214436 Best loss: 0.182841 Accuracy: 95.97% 15 Validation loss: 0.213507 Best loss: 0.182841 Accuracy: 95.47% 16 Validation loss: 0.191497 Best loss: 0.182841 Accuracy: 95.78% 17 Validation loss: 0.179503 Best loss: 0.179503 Accuracy: 95.93% 18 Validation loss: 0.210343 Best loss: 0.179503 Accuracy: 95.74% 19 Validation loss: 0.212626 Best loss: 0.179503 Accuracy: 95.27% 20 Validation loss: 0.187110 Best loss: 0.179503 Accuracy: 96.09% 21 Validation loss: 0.175171 Best loss: 0.175171 Accuracy: 95.78% 22 Validation loss: 0.217172 Best loss: 0.175171 Accuracy: 95.66% 23 Validation loss: 0.181060 Best loss: 0.175171 Accuracy: 96.44% 24 Validation loss: 0.163630 Best loss: 0.163630 Accuracy: 95.93% 25 Validation loss: 0.225873 Best loss: 0.163630 Accuracy: 95.58% 26 Validation loss: 0.204975 Best loss: 0.163630 Accuracy: 95.66% 27 Validation loss: 0.183588 Best loss: 0.163630 Accuracy: 95.97% 28 Validation loss: 0.231080 Best loss: 0.163630 Accuracy: 95.11% 29 Validation loss: 0.204342 Best loss: 0.163630 Accuracy: 95.74% 30 Validation loss: 0.183963 Best loss: 0.163630 Accuracy: 95.93% 31 Validation loss: 0.200975 Best loss: 0.163630 Accuracy: 95.23% 32 Validation loss: 0.211165 Best loss: 0.163630 Accuracy: 95.23% 33 Validation loss: 0.217777 Best loss: 0.163630 Accuracy: 95.07% 34 Validation loss: 0.193184 Best loss: 0.163630 Accuracy: 95.39% 35 Validation loss: 0.203809 Best loss: 0.163630 Accuracy: 95.58% 36 Validation loss: 0.221673 Best loss: 0.163630 Accuracy: 94.57% 37 Validation loss: 0.215750 Best loss: 0.163630 Accuracy: 95.39% 38 Validation loss: 0.189653 Best loss: 0.163630 Accuracy: 96.09% 39 Validation loss: 0.191333 Best loss: 0.163630 Accuracy: 95.19% 40 Validation loss: 0.207714 Best loss: 0.163630 Accuracy: 96.01% 41 Validation loss: 0.174490 Best loss: 0.163630 Accuracy: 95.39% 42 Validation loss: 0.177445 Best loss: 0.163630 Accuracy: 95.82% 43 Validation loss: 0.166708 Best loss: 0.163630 Accuracy: 96.09% 44 Validation loss: 0.190829 Best loss: 0.163630 Accuracy: 95.70% 45 Validation loss: 0.225985 Best loss: 0.163630 Accuracy: 96.25% Early stopping! [CV] dropout_rate=0.5, n_neurons=70, learning_rate=0.01, activation=<function relu at 0x7fd9e8a660d0>, batch_size=100, total= 39.0s
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 39.1s remaining: 0.0s
[CV] dropout_rate=0.5, n_neurons=70, learning_rate=0.01, activation=<function relu at 0x7fd9e8a660d0>, batch_size=100 0 Validation loss: 0.748480 Best loss: 0.748480 Accuracy: 57.70% 1 Validation loss: 0.516088 Best loss: 0.516088 Accuracy: 78.50% 2 Validation loss: 0.448866 Best loss: 0.448866 Accuracy: 78.89% 3 Validation loss: 0.435606 Best loss: 0.435606 Accuracy: 78.54% 4 Validation loss: 0.435243 Best loss: 0.435243 Accuracy: 79.40% 5 Validation loss: 0.450605 Best loss: 0.435243 Accuracy: 78.42% 6 Validation loss: 0.430706 Best loss: 0.430706 Accuracy: 78.62% 7 Validation loss: 0.449289 Best loss: 0.430706 Accuracy: 78.30% 8 Validation loss: 0.413226 Best loss: 0.413226 Accuracy: 79.05% 9 Validation loss: 0.436053 Best loss: 0.413226 Accuracy: 78.46% 10 Validation loss: 0.459932 Best loss: 0.413226 Accuracy: 79.24% 11 Validation loss: 0.424138 Best loss: 0.413226 Accuracy: 79.24% 12 Validation loss: 0.409538 Best loss: 0.409538 Accuracy: 79.55% 13 Validation loss: 0.416324 Best loss: 0.409538 Accuracy: 75.41% 14 Validation loss: 0.440273 Best loss: 0.409538 Accuracy: 78.46% 15 Validation loss: 0.435736 Best loss: 0.409538 Accuracy: 79.05% 16 Validation loss: 0.428412 Best loss: 0.409538 Accuracy: 79.20% 17 Validation loss: 0.450156 Best loss: 0.409538 Accuracy: 80.02% 18 Validation loss: 0.421057 Best loss: 0.409538 Accuracy: 79.24% 19 Validation loss: 0.442284 Best loss: 0.409538 Accuracy: 79.01% 20 Validation loss: 0.426907 Best loss: 0.409538 Accuracy: 79.16% 21 Validation loss: 0.439567 Best loss: 0.409538 Accuracy: 79.05% 22 Validation loss: 0.452601 Best loss: 0.409538 Accuracy: 79.67% 23 Validation loss: 0.424887 Best loss: 0.409538 Accuracy: 79.09% 24 Validation loss: 0.441096 Best loss: 0.409538 Accuracy: 78.97% 25 Validation loss: 0.417390 Best loss: 0.409538 Accuracy: 78.89% 26 Validation loss: 0.418550 Best loss: 0.409538 Accuracy: 79.05% 27 Validation loss: 0.426065 Best loss: 0.409538 Accuracy: 78.66% 28 Validation loss: 0.413968 Best loss: 0.409538 Accuracy: 79.36% 29 Validation loss: 0.425434 Best loss: 0.409538 Accuracy: 79.24% 30 Validation loss: 0.455391 Best loss: 0.409538 Accuracy: 74.71% 31 Validation loss: 0.429498 Best loss: 0.409538 Accuracy: 79.20% 32 Validation loss: 0.427383 Best loss: 0.409538 Accuracy: 79.52% 33 Validation loss: 0.422621 Best loss: 0.409538 Accuracy: 78.62% Early stopping! [CV] dropout_rate=0.5, n_neurons=70, learning_rate=0.01, activation=<function relu at 0x7fd9e8a660d0>, batch_size=100, total= 27.4s [CV] dropout_rate=0.5, n_neurons=70, learning_rate=0.01, activation=<function relu at 0x7fd9e8a660d0>, batch_size=100 0 Validation loss: 0.497714 Best loss: 0.497714 Accuracy: 86.71% 1 Validation loss: 0.248258 Best loss: 0.248258 Accuracy: 93.51% 2 Validation loss: 0.279785 Best loss: 0.248258 Accuracy: 93.71% 3 Validation loss: 0.248663 Best loss: 0.248258 Accuracy: 94.61% 4 Validation loss: 0.269139 Best loss: 0.248258 Accuracy: 94.76% 5 Validation loss: 0.188808 Best loss: 0.188808 Accuracy: 95.39% 6 Validation loss: 0.196049 Best loss: 0.188808 Accuracy: 95.58% 7 Validation loss: 0.204966 Best loss: 0.188808 Accuracy: 95.15% 8 Validation loss: 0.238414 Best loss: 0.188808 Accuracy: 94.61% 9 Validation loss: 0.192095 Best loss: 0.188808 Accuracy: 95.97% [...and much later...] 
19 Validation loss: 1.939112 Best loss: 1.619874 Accuracy: 22.01% 20 Validation loss: 1.825761 Best loss: 1.619874 Accuracy: 19.27% 21 Validation loss: 1.732937 Best loss: 1.619874 Accuracy: 22.01% 22 Validation loss: 1.832995 Best loss: 1.619874 Accuracy: 20.91% 23 Validation loss: 1.659557 Best loss: 1.619874 Accuracy: 20.91% 24 Validation loss: 1.828380 Best loss: 1.619874 Accuracy: 18.73% 25 Validation loss: 1.719589 Best loss: 1.619874 Accuracy: 22.01% 26 Validation loss: 1.842429 Best loss: 1.619874 Accuracy: 18.73% 27 Validation loss: 1.717596 Best loss: 1.619874 Accuracy: 19.27% 28 Validation loss: 1.863441 Best loss: 1.619874 Accuracy: 19.08% 29 Validation loss: 1.952335 Best loss: 1.619874 Accuracy: 19.08% 30 Validation loss: 1.853776 Best loss: 1.619874 Accuracy: 20.91% 31 Validation loss: 1.894134 Best loss: 1.619874 Accuracy: 22.01% 32 Validation loss: 1.711688 Best loss: 1.619874 Accuracy: 19.08% 33 Validation loss: 1.651240 Best loss: 1.619874 Accuracy: 18.73% 34 Validation loss: 1.760639 Best loss: 1.619874 Accuracy: 20.91% 35 Validation loss: 1.667938 Best loss: 1.619874 Accuracy: 22.01% 36 Validation loss: 1.641116 Best loss: 1.619874 Accuracy: 20.91% 37 Validation loss: 1.694960 Best loss: 1.619874 Accuracy: 19.08% 38 Validation loss: 1.816517 Best loss: 1.619874 Accuracy: 18.73% 39 Validation loss: 1.647246 Best loss: 1.619874 Accuracy: 18.73% Early stopping! [CV] dropout_rate=0.5, n_neurons=140, learning_rate=0.05, activation=<function elu at 0x7fd9e8a620d0>, batch_size=100, total= 1.0min
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed: 130.6min finished
0 Validation loss: 0.165751 Best loss: 0.165751 Accuracy: 95.47% 1 Validation loss: 0.111834 Best loss: 0.111834 Accuracy: 96.99% 2 Validation loss: 0.102867 Best loss: 0.102867 Accuracy: 96.83% 3 Validation loss: 0.089197 Best loss: 0.089197 Accuracy: 97.85% 4 Validation loss: 0.093953 Best loss: 0.089197 Accuracy: 97.77% 5 Validation loss: 0.079498 Best loss: 0.079498 Accuracy: 98.08% 6 Validation loss: 0.081214 Best loss: 0.079498 Accuracy: 98.01% 7 Validation loss: 0.086096 Best loss: 0.079498 Accuracy: 97.77% 8 Validation loss: 0.074422 Best loss: 0.074422 Accuracy: 97.73% 9 Validation loss: 0.079650 Best loss: 0.074422 Accuracy: 97.89% 10 Validation loss: 0.077278 Best loss: 0.074422 Accuracy: 97.77% 11 Validation loss: 0.077608 Best loss: 0.074422 Accuracy: 98.24% 12 Validation loss: 0.074337 Best loss: 0.074337 Accuracy: 98.05% 13 Validation loss: 0.066028 Best loss: 0.066028 Accuracy: 98.28% 14 Validation loss: 0.072845 Best loss: 0.066028 Accuracy: 98.16% 15 Validation loss: 0.066652 Best loss: 0.066028 Accuracy: 98.05% 16 Validation loss: 0.065729 Best loss: 0.065729 Accuracy: 98.16% 17 Validation loss: 0.061191 Best loss: 0.061191 Accuracy: 98.51% 18 Validation loss: 0.062528 Best loss: 0.061191 Accuracy: 98.44% 19 Validation loss: 0.065407 Best loss: 0.061191 Accuracy: 98.36% 20 Validation loss: 0.065273 Best loss: 0.061191 Accuracy: 98.44% 21 Validation loss: 0.061035 Best loss: 0.061035 Accuracy: 98.40% 22 Validation loss: 0.056312 Best loss: 0.056312 Accuracy: 98.59% 23 Validation loss: 0.069074 Best loss: 0.056312 Accuracy: 98.40% 24 Validation loss: 0.057482 Best loss: 0.056312 Accuracy: 98.51% 25 Validation loss: 0.068342 Best loss: 0.056312 Accuracy: 98.44% 26 Validation loss: 0.063494 Best loss: 0.056312 Accuracy: 98.48% 27 Validation loss: 0.057257 Best loss: 0.056312 Accuracy: 98.51% 28 Validation loss: 0.058659 Best loss: 0.056312 Accuracy: 98.59% 29 Validation loss: 0.059009 Best loss: 0.056312 Accuracy: 98.48% 30 Validation loss: 0.058227 Best loss: 0.056312 Accuracy: 98.55% 31 Validation loss: 0.062198 Best loss: 0.056312 Accuracy: 98.44% 32 Validation loss: 0.058043 Best loss: 0.056312 Accuracy: 98.40% 33 Validation loss: 0.055970 Best loss: 0.055970 Accuracy: 98.51% 34 Validation loss: 0.060111 Best loss: 0.055970 Accuracy: 98.67% 35 Validation loss: 0.058786 Best loss: 0.055970 Accuracy: 98.44% 36 Validation loss: 0.059944 Best loss: 0.055970 Accuracy: 98.32% 37 Validation loss: 0.058087 Best loss: 0.055970 Accuracy: 98.63% 38 Validation loss: 0.063003 Best loss: 0.055970 Accuracy: 98.36% 39 Validation loss: 0.052073 Best loss: 0.052073 Accuracy: 98.67% 40 Validation loss: 0.058115 Best loss: 0.052073 Accuracy: 98.40% 41 Validation loss: 0.059997 Best loss: 0.052073 Accuracy: 98.63% 42 Validation loss: 0.052416 Best loss: 0.052073 Accuracy: 98.75% 43 Validation loss: 0.053840 Best loss: 0.052073 Accuracy: 98.59% 44 Validation loss: 0.054563 Best loss: 0.052073 Accuracy: 98.67% 45 Validation loss: 0.049410 Best loss: 0.049410 Accuracy: 98.55% 46 Validation loss: 0.057060 Best loss: 0.049410 Accuracy: 98.24% 47 Validation loss: 0.062434 Best loss: 0.049410 Accuracy: 98.48% 48 Validation loss: 0.054523 Best loss: 0.049410 Accuracy: 98.59% 49 Validation loss: 0.052774 Best loss: 0.049410 Accuracy: 98.36% 50 Validation loss: 0.056562 Best loss: 0.049410 Accuracy: 98.32% 51 Validation loss: 0.060280 Best loss: 0.049410 Accuracy: 98.51% 52 Validation loss: 0.055685 Best loss: 0.049410 Accuracy: 98.55% 53 Validation loss: 0.056077 Best loss: 0.049410 Accuracy: 98.44% 
54 Validation loss: 0.057951 Best loss: 0.049410 Accuracy: 98.44% 55 Validation loss: 0.056315 Best loss: 0.049410 Accuracy: 98.75% 56 Validation loss: 0.055744 Best loss: 0.049410 Accuracy: 98.55% 57 Validation loss: 0.054228 Best loss: 0.049410 Accuracy: 98.48% 58 Validation loss: 0.057836 Best loss: 0.049410 Accuracy: 98.71% 59 Validation loss: 0.053361 Best loss: 0.049410 Accuracy: 98.71% 60 Validation loss: 0.056389 Best loss: 0.049410 Accuracy: 98.48% 61 Validation loss: 0.061350 Best loss: 0.049410 Accuracy: 98.48% 62 Validation loss: 0.052135 Best loss: 0.049410 Accuracy: 98.67% 63 Validation loss: 0.053853 Best loss: 0.049410 Accuracy: 98.48% 64 Validation loss: 0.056641 Best loss: 0.049410 Accuracy: 98.71% 65 Validation loss: 0.052790 Best loss: 0.049410 Accuracy: 98.63% 66 Validation loss: 0.053514 Best loss: 0.049410 Accuracy: 98.44% Early stopping!
RandomizedSearchCV(cv=None, error_score='raise', estimator=DNNClassifier(activation=<function elu at 0x7fd9e8a620d0>, batch_norm_momentum=None, batch_size=20, dropout_rate=None, initializer=<function variance_scaling_initializer.<locals>._initializer at 0x7fd9d5e628c8>, learning_rate=0.01, n_hidden_layers=5, n_neurons=100, optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>, random_state=42), fit_params={'y_valid': array([0, 4, ..., 1, 2], dtype=uint8), 'X_valid': array([[ 0., 0., ..., 0., 0.], [ 0., 0., ..., 0., 0.], ..., [ 0., 0., ..., 0., 0.], [ 0., 0., ..., 0., 0.]], dtype=float32), 'n_epochs': 1000}, iid=True, n_iter=50, n_jobs=1, param_distributions={'dropout_rate': [0.2, 0.3, 0.4, 0.5, 0.6], 'n_neurons': [10, 30, 50, 70, 90, 100, 120, 140, 160], 'learning_rate': [0.01, 0.02, 0.05, 0.1], 'activation': [<function relu at 0x7fd9e8a660d0>, <function elu at 0x7fd9e8a620d0>, <function leaky_relu.<locals>.parametrized_leaky_relu at 0x7fd9b2368950>, <function leaky_relu.<locals>.parametrized_leaky_relu at 0x7fd9b23687b8>], 'batch_size': [10, 50, 100, 500]}, pre_dispatch='2*n_jobs', random_state=42, refit=True, return_train_score=True, scoring=None, verbose=2)
rnd_search_dropout.best_params_
{'activation': <function __main__.leaky_relu.<locals>.parametrized_leaky_relu>, 'batch_size': 500, 'dropout_rate': 0.4, 'learning_rate': 0.01, 'n_neurons': 50}
y_pred = rnd_search_dropout.predict(X_test1)
accuracy_score(y_test1, y_pred)
0.98812998637867289
Oh well, dropout did not improve the model. Better luck next time! :)
But that's okay: we have ourselves a nice DNN that achieves 99.40% accuracy on the test set using Batch Normalization, or 99.32% without BN. Let's see if some of this expertise on digits 0 to 4 can be transferred to the task of classifying digits 5 to 9. For the sake of simplicity we will reuse the DNN without BN, since it is almost as good.
Exercise: create a new DNN that reuses all the pretrained hidden layers of the previous model, freezes them, and replaces the softmax output layer with a new one.
Let's load the best model's graph and get a handle on all the important operations we will need. Note that instead of creating a new softmax output layer, we will just reuse the existing one (since it has the same number of outputs as we need for the new task). We will reinitialize its parameters before training.
reset_graph()
restore_saver = tf.train.import_meta_graph("./my_best_mnist_model_0_to_4.meta")
X = tf.get_default_graph().get_tensor_by_name("X:0")
y = tf.get_default_graph().get_tensor_by_name("y:0")
loss = tf.get_default_graph().get_tensor_by_name("loss:0")
Y_proba = tf.get_default_graph().get_tensor_by_name("Y_proba:0")
logits = Y_proba.op.inputs[0]  # the logits tensor is the input of the softmax op that produces Y_proba
accuracy = tf.get_default_graph().get_tensor_by_name("accuracy:0")
To freeze the lower layers, we will exclude their variables from the optimizer's list of trainable variables, keeping only the output layer's trainable variables:
learning_rate = 0.01
output_layer_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="logits")
optimizer = tf.train.AdamOptimizer(learning_rate, name="Adam2")
training_op = optimizer.minimize(loss, var_list=output_layer_vars)
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")
init = tf.global_variables_initializer()
five_frozen_saver = tf.train.Saver()
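As a quick sanity check (this snippet is an addition, not part of the original notebook), we can list the variables that the optimizer is allowed to update; only the output layer's kernel and bias should appear:
for var in output_layer_vars:
    print(var.op.name)  # expected: something like logits/kernel and logits/bias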
Exercise: train this new DNN on digits 5 to 9, using only 100 images per digit, and time how long it takes. Despite this small number of examples, can you achieve high precision?
Let's create the training, validation and test sets. We need to subtract 5 from the labels because TensorFlow expects integers from 0 to n_classes-1.
X_train2_full = mnist.train.images[mnist.train.labels >= 5]
y_train2_full = mnist.train.labels[mnist.train.labels >= 5] - 5
X_valid2_full = mnist.validation.images[mnist.validation.labels >= 5]
y_valid2_full = mnist.validation.labels[mnist.validation.labels >= 5] - 5
X_test2 = mnist.test.images[mnist.test.labels >= 5]
y_test2 = mnist.test.labels[mnist.test.labels >= 5] - 5
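A quick check (again, an addition to the notebook) that the shifted labels are indeed in the 0 to 4 range:
print(np.unique(y_train2_full))  # should print [0 1 2 3 4]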
Also, for the purpose of this exercise, we want to keep only 100 instances per class in the training set (and let's keep only 30 instances per class in the validation set). Let's create a small function to do that:
def sample_n_instances_per_class(X, y, n=100):
    Xs, ys = [], []
    for label in np.unique(y):
        idx = (y == label)
        Xc = X[idx][:n]
        yc = y[idx][:n]
        Xs.append(Xc)
        ys.append(yc)
    return np.concatenate(Xs), np.concatenate(ys)
X_train2, y_train2 = sample_n_instances_per_class(X_train2_full, y_train2_full, n=100)
X_valid2, y_valid2 = sample_n_instances_per_class(X_valid2_full, y_valid2_full, n=30)
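Let's also double-check the sampling (a small addition to the notebook): each class should contribute exactly 100 training instances and 30 validation instances:
print(np.bincount(y_train2))  # should print [100 100 100 100 100]
print(np.bincount(y_valid2))  # should print [30 30 30 30 30]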
Now let's train the model. This is the same training code as earlier, using early stopping, except for the initialization: we first initialize all the variables, then we restore the best model trained earlier (on digits 0 to 4), and finally we reinitialize the output layer variables.
import time
n_epochs = 1000
batch_size = 20
max_checks_without_progress = 20
checks_without_progress = 0
best_loss = np.infty
with tf.Session() as sess:
    init.run()
    restore_saver.restore(sess, "./my_best_mnist_model_0_to_4")
    for var in output_layer_vars:
        var.initializer.run()  # reinitialize the output layer's variables only

    t0 = time.time()

    for epoch in range(n_epochs):
        rnd_idx = np.random.permutation(len(X_train2))
        for rnd_indices in np.array_split(rnd_idx, len(X_train2) // batch_size):
            X_batch, y_batch = X_train2[rnd_indices], y_train2[rnd_indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        loss_val, acc_val = sess.run([loss, accuracy], feed_dict={X: X_valid2, y: y_valid2})
        if loss_val < best_loss:
            save_path = five_frozen_saver.save(sess, "./my_mnist_model_5_to_9_five_frozen")
            best_loss = loss_val
            checks_without_progress = 0
        else:
            checks_without_progress += 1
            if checks_without_progress > max_checks_without_progress:
                print("Early stopping!")
                break
        print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(
            epoch, loss_val, best_loss, acc_val * 100))

    t1 = time.time()
    print("Total training time: {:.1f}s".format(t1 - t0))
with tf.Session() as sess:
    five_frozen_saver.restore(sess, "./my_mnist_model_5_to_9_five_frozen")
    acc_test = accuracy.eval(feed_dict={X: X_test2, y: y_test2})
    print("Final test accuracy: {:.2f}%".format(acc_test * 100))
INFO:tensorflow:Restoring parameters from ./my_best_mnist_model_0_to_4 0 Validation loss: 0.967851 Best loss: 0.967851 Accuracy: 67.33% 1 Validation loss: 0.861747 Best loss: 0.861747 Accuracy: 71.33% 2 Validation loss: 0.777535 Best loss: 0.777535 Accuracy: 72.00% 3 Validation loss: 0.699915 Best loss: 0.699915 Accuracy: 75.33% 4 Validation loss: 0.786714 Best loss: 0.699915 Accuracy: 78.00% 5 Validation loss: 0.735406 Best loss: 0.699915 Accuracy: 76.67% 6 Validation loss: 0.732264 Best loss: 0.699915 Accuracy: 78.00% 7 Validation loss: 0.691741 Best loss: 0.691741 Accuracy: 76.00% 8 Validation loss: 0.672757 Best loss: 0.672757 Accuracy: 80.00% 9 Validation loss: 0.666520 Best loss: 0.666520 Accuracy: 80.00% 10 Validation loss: 0.639375 Best loss: 0.639375 Accuracy: 81.33% 11 Validation loss: 0.645089 Best loss: 0.639375 Accuracy: 82.00% 12 Validation loss: 0.646768 Best loss: 0.639375 Accuracy: 80.00% 13 Validation loss: 0.623784 Best loss: 0.623784 Accuracy: 82.67% 14 Validation loss: 0.663026 Best loss: 0.623784 Accuracy: 80.00% 15 Validation loss: 0.704513 Best loss: 0.623784 Accuracy: 79.33% 16 Validation loss: 0.684003 Best loss: 0.623784 Accuracy: 79.33% 17 Validation loss: 0.658575 Best loss: 0.623784 Accuracy: 82.67% 18 Validation loss: 0.669875 Best loss: 0.623784 Accuracy: 79.33% 19 Validation loss: 0.664581 Best loss: 0.623784 Accuracy: 78.67% 20 Validation loss: 0.653490 Best loss: 0.623784 Accuracy: 80.00% 21 Validation loss: 0.707304 Best loss: 0.623784 Accuracy: 79.33% 22 Validation loss: 0.706012 Best loss: 0.623784 Accuracy: 80.67% 23 Validation loss: 0.681227 Best loss: 0.623784 Accuracy: 78.67% 24 Validation loss: 0.786823 Best loss: 0.623784 Accuracy: 78.00% 25 Validation loss: 0.686110 Best loss: 0.623784 Accuracy: 79.33% 26 Validation loss: 0.675166 Best loss: 0.623784 Accuracy: 82.67% 27 Validation loss: 0.667711 Best loss: 0.623784 Accuracy: 82.67% 28 Validation loss: 0.612220 Best loss: 0.612220 Accuracy: 83.33% 29 Validation loss: 0.701196 Best loss: 0.612220 Accuracy: 78.00% 30 Validation loss: 0.687806 Best loss: 0.612220 Accuracy: 81.33% 31 Validation loss: 0.776596 Best loss: 0.612220 Accuracy: 79.33% 32 Validation loss: 0.674172 Best loss: 0.612220 Accuracy: 80.00% 33 Validation loss: 0.719044 Best loss: 0.612220 Accuracy: 83.33% 34 Validation loss: 0.856403 Best loss: 0.612220 Accuracy: 74.00% 35 Validation loss: 0.744627 Best loss: 0.612220 Accuracy: 80.00% 36 Validation loss: 0.779348 Best loss: 0.612220 Accuracy: 78.00% 37 Validation loss: 0.763777 Best loss: 0.612220 Accuracy: 78.00% 38 Validation loss: 0.727376 Best loss: 0.612220 Accuracy: 78.00% 39 Validation loss: 0.823514 Best loss: 0.612220 Accuracy: 78.00% 40 Validation loss: 0.725053 Best loss: 0.612220 Accuracy: 80.67% 41 Validation loss: 0.678497 Best loss: 0.612220 Accuracy: 80.67% 42 Validation loss: 0.709977 Best loss: 0.612220 Accuracy: 80.67% 43 Validation loss: 0.737200 Best loss: 0.612220 Accuracy: 77.33% 44 Validation loss: 0.757937 Best loss: 0.612220 Accuracy: 77.33% 45 Validation loss: 0.732024 Best loss: 0.612220 Accuracy: 80.00% 46 Validation loss: 0.756428 Best loss: 0.612220 Accuracy: 80.67% 47 Validation loss: 0.757610 Best loss: 0.612220 Accuracy: 78.67% 48 Validation loss: 0.844137 Best loss: 0.612220 Accuracy: 80.00% Early stopping! Total training time: 2.3s INFO:tensorflow:Restoring parameters from ./my_mnist_model_5_to_9_five_frozen Final test accuracy: 76.30%
Well, that's not a great accuracy, is it? Of course, with such a tiny training set and only one layer to tweak, we should not expect miracles.
Exercise: try caching the frozen layers, and train the model again: how much faster is it now?
Let's start by getting a handle on the output of the last frozen layer:
hidden5_out = tf.get_default_graph().get_tensor_by_name("hidden5_out:0")
Now let's train the model using roughly the same code as earlier. The difference is that we compute the output of the top frozen layer at the beginning (both for the training set and the validation set), and we cache it. This makes training roughly 1.5 to 3 times faster in this example (this may vary greatly, depending on your system):
import time
n_epochs = 1000
batch_size = 20
max_checks_without_progress = 20
checks_without_progress = 0
best_loss = np.infty
with tf.Session() as sess:
    init.run()
    restore_saver.restore(sess, "./my_best_mnist_model_0_to_4")
    for var in output_layer_vars:
        var.initializer.run()

    t0 = time.time()

    # compute the output of the top frozen layer once, for the training and validation sets
    hidden5_train = hidden5_out.eval(feed_dict={X: X_train2, y: y_train2})
    hidden5_valid = hidden5_out.eval(feed_dict={X: X_valid2, y: y_valid2})

    for epoch in range(n_epochs):
        rnd_idx = np.random.permutation(len(X_train2))
        for rnd_indices in np.array_split(rnd_idx, len(X_train2) // batch_size):
            h5_batch, y_batch = hidden5_train[rnd_indices], y_train2[rnd_indices]
            # feed the cached activations instead of X: TensorFlow uses the fed value
            # for hidden5_out and skips the frozen layers entirely
            sess.run(training_op, feed_dict={hidden5_out: h5_batch, y: y_batch})
        loss_val, acc_val = sess.run([loss, accuracy], feed_dict={hidden5_out: hidden5_valid, y: y_valid2})
        if loss_val < best_loss:
            save_path = five_frozen_saver.save(sess, "./my_mnist_model_5_to_9_five_frozen")
            best_loss = loss_val
            checks_without_progress = 0
        else:
            checks_without_progress += 1
            if checks_without_progress > max_checks_without_progress:
                print("Early stopping!")
                break
        print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(
            epoch, loss_val, best_loss, acc_val * 100))

    t1 = time.time()
    print("Total training time: {:.1f}s".format(t1 - t0))
with tf.Session() as sess:
    five_frozen_saver.restore(sess, "./my_mnist_model_5_to_9_five_frozen")
    acc_test = accuracy.eval(feed_dict={X: X_test2, y: y_test2})
    print("Final test accuracy: {:.2f}%".format(acc_test * 100))
INFO:tensorflow:Restoring parameters from ./my_best_mnist_model_0_to_4 0 Validation loss: 1.109053 Best loss: 1.109053 Accuracy: 60.67% 1 Validation loss: 0.813156 Best loss: 0.813156 Accuracy: 72.00% 2 Validation loss: 0.755930 Best loss: 0.755930 Accuracy: 76.67% 3 Validation loss: 0.744004 Best loss: 0.744004 Accuracy: 74.67% 4 Validation loss: 0.685080 Best loss: 0.685080 Accuracy: 78.00% 5 Validation loss: 0.702316 Best loss: 0.685080 Accuracy: 78.00% 6 Validation loss: 0.646487 Best loss: 0.646487 Accuracy: 80.00% 7 Validation loss: 0.686437 Best loss: 0.646487 Accuracy: 79.33% 8 Validation loss: 0.750047 Best loss: 0.646487 Accuracy: 79.33% 9 Validation loss: 0.688554 Best loss: 0.646487 Accuracy: 79.33% 10 Validation loss: 0.785184 Best loss: 0.646487 Accuracy: 78.67% 11 Validation loss: 0.634506 Best loss: 0.634506 Accuracy: 80.67% 12 Validation loss: 0.656797 Best loss: 0.634506 Accuracy: 81.33% 13 Validation loss: 0.645497 Best loss: 0.634506 Accuracy: 81.33% 14 Validation loss: 0.618038 Best loss: 0.618038 Accuracy: 83.33% 15 Validation loss: 0.641752 Best loss: 0.618038 Accuracy: 78.67% 16 Validation loss: 0.645671 Best loss: 0.618038 Accuracy: 80.67% 17 Validation loss: 0.654640 Best loss: 0.618038 Accuracy: 82.00% 18 Validation loss: 0.670569 Best loss: 0.618038 Accuracy: 79.33% 19 Validation loss: 0.670985 Best loss: 0.618038 Accuracy: 82.00% 20 Validation loss: 0.659538 Best loss: 0.618038 Accuracy: 82.67% 21 Validation loss: 0.622648 Best loss: 0.618038 Accuracy: 83.33% 22 Validation loss: 0.736155 Best loss: 0.618038 Accuracy: 79.33% 23 Validation loss: 0.739367 Best loss: 0.618038 Accuracy: 76.67% 24 Validation loss: 0.699710 Best loss: 0.618038 Accuracy: 78.00% 25 Validation loss: 0.709630 Best loss: 0.618038 Accuracy: 81.33% 26 Validation loss: 0.692474 Best loss: 0.618038 Accuracy: 79.33% 27 Validation loss: 0.807931 Best loss: 0.618038 Accuracy: 77.33% 28 Validation loss: 0.676134 Best loss: 0.618038 Accuracy: 82.00% 29 Validation loss: 0.738905 Best loss: 0.618038 Accuracy: 79.33% 30 Validation loss: 0.664826 Best loss: 0.618038 Accuracy: 81.33% 31 Validation loss: 0.694714 Best loss: 0.618038 Accuracy: 80.00% 32 Validation loss: 0.739238 Best loss: 0.618038 Accuracy: 83.33% 33 Validation loss: 0.697210 Best loss: 0.618038 Accuracy: 80.00% 34 Validation loss: 0.817373 Best loss: 0.618038 Accuracy: 79.33% Early stopping! Total training time: 0.9s INFO:tensorflow:Restoring parameters from ./my_mnist_model_5_to_9_five_frozen Final test accuracy: 76.51%
Exercise: try again reusing just four hidden layers instead of five. Can you achieve a higher precision?
Let's load the best model again, but this time we will create a new softmax output layer on top of the 4th hidden layer:
reset_graph()
n_outputs = 5
restore_saver = tf.train.import_meta_graph("./my_best_mnist_model_0_to_4.meta")
X = tf.get_default_graph().get_tensor_by_name("X:0")
y = tf.get_default_graph().get_tensor_by_name("y:0")
hidden4_out = tf.get_default_graph().get_tensor_by_name("hidden4_out:0")
logits = tf.layers.dense(hidden4_out, n_outputs, kernel_initializer=he_init, name="new_logits")
Y_proba = tf.nn.softmax(logits)
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy)
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")
And now let's create the training operation. We want to freeze all the layers except for the new output layer:
learning_rate = 0.01
output_layer_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="new_logits")
optimizer = tf.train.AdamOptimizer(learning_rate, name="Adam2")
training_op = optimizer.minimize(loss, var_list=output_layer_vars)
init = tf.global_variables_initializer()
four_frozen_saver = tf.train.Saver()
And once again we train the model with the same code as earlier. Note: we could of course write a function once and use it multiple times, rather than copying almost the same training code over and over again, but as we keep tweaking the code slightly, the function would need multiple arguments and if statements, and it would have to be at the beginning of the notebook, where it would not make much sense to readers. In short, it would be very confusing, so we're better off with copy & paste.
n_epochs = 1000
batch_size = 20
max_checks_without_progress = 20
checks_without_progress = 0
best_loss = np.infty
with tf.Session() as sess:
    init.run()
    restore_saver.restore(sess, "./my_best_mnist_model_0_to_4")

    for epoch in range(n_epochs):
        rnd_idx = np.random.permutation(len(X_train2))
        for rnd_indices in np.array_split(rnd_idx, len(X_train2) // batch_size):
            X_batch, y_batch = X_train2[rnd_indices], y_train2[rnd_indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        loss_val, acc_val = sess.run([loss, accuracy], feed_dict={X: X_valid2, y: y_valid2})
        if loss_val < best_loss:
            save_path = four_frozen_saver.save(sess, "./my_mnist_model_5_to_9_four_frozen")
            best_loss = loss_val
            checks_without_progress = 0
        else:
            checks_without_progress += 1
            if checks_without_progress > max_checks_without_progress:
                print("Early stopping!")
                break
        print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(
            epoch, loss_val, best_loss, acc_val * 100))
with tf.Session() as sess:
    four_frozen_saver.restore(sess, "./my_mnist_model_5_to_9_four_frozen")
    acc_test = accuracy.eval(feed_dict={X: X_test2, y: y_test2})
    print("Final test accuracy: {:.2f}%".format(acc_test * 100))
INFO:tensorflow:Restoring parameters from ./my_best_mnist_model_0_to_4 0 Validation loss: 0.923460 Best loss: 0.923460 Accuracy: 69.33% 1 Validation loss: 0.796192 Best loss: 0.796192 Accuracy: 77.33% 2 Validation loss: 0.812068 Best loss: 0.796192 Accuracy: 78.67% 3 Validation loss: 0.697938 Best loss: 0.697938 Accuracy: 80.67% 4 Validation loss: 0.877122 Best loss: 0.697938 Accuracy: 74.67% 5 Validation loss: 0.708524 Best loss: 0.697938 Accuracy: 81.33% 6 Validation loss: 0.689500 Best loss: 0.689500 Accuracy: 84.00% 7 Validation loss: 0.758315 Best loss: 0.689500 Accuracy: 81.33% 8 Validation loss: 0.711138 Best loss: 0.689500 Accuracy: 78.67% 9 Validation loss: 0.687304 Best loss: 0.687304 Accuracy: 81.33% 10 Validation loss: 0.639222 Best loss: 0.639222 Accuracy: 81.33% 11 Validation loss: 0.716750 Best loss: 0.639222 Accuracy: 82.67% 12 Validation loss: 0.693442 Best loss: 0.639222 Accuracy: 80.67% 13 Validation loss: 0.727682 Best loss: 0.639222 Accuracy: 84.00% 14 Validation loss: 0.637289 Best loss: 0.637289 Accuracy: 84.67% 15 Validation loss: 0.741304 Best loss: 0.637289 Accuracy: 83.33% 16 Validation loss: 0.651895 Best loss: 0.637289 Accuracy: 82.67% 17 Validation loss: 0.641192 Best loss: 0.637289 Accuracy: 80.67% 18 Validation loss: 0.690386 Best loss: 0.637289 Accuracy: 80.67% 19 Validation loss: 0.648541 Best loss: 0.637289 Accuracy: 82.67% 20 Validation loss: 0.779663 Best loss: 0.637289 Accuracy: 83.33% 21 Validation loss: 0.768834 Best loss: 0.637289 Accuracy: 82.67% 22 Validation loss: 0.706279 Best loss: 0.637289 Accuracy: 82.67% 23 Validation loss: 0.745840 Best loss: 0.637289 Accuracy: 82.00% 24 Validation loss: 0.740068 Best loss: 0.637289 Accuracy: 83.33% 25 Validation loss: 0.604927 Best loss: 0.604927 Accuracy: 84.67% 26 Validation loss: 0.635410 Best loss: 0.604927 Accuracy: 82.00% 27 Validation loss: 0.776003 Best loss: 0.604927 Accuracy: 82.67% 28 Validation loss: 0.621502 Best loss: 0.604927 Accuracy: 82.00% 29 Validation loss: 0.695963 Best loss: 0.604927 Accuracy: 83.33% 30 Validation loss: 0.668194 Best loss: 0.604927 Accuracy: 84.67% 31 Validation loss: 0.768975 Best loss: 0.604927 Accuracy: 82.67% 32 Validation loss: 0.594731 Best loss: 0.594731 Accuracy: 84.00% 33 Validation loss: 0.665088 Best loss: 0.594731 Accuracy: 84.00% 34 Validation loss: 0.716284 Best loss: 0.594731 Accuracy: 81.33% 35 Validation loss: 0.782680 Best loss: 0.594731 Accuracy: 84.00% 36 Validation loss: 0.816441 Best loss: 0.594731 Accuracy: 84.00% 37 Validation loss: 0.749341 Best loss: 0.594731 Accuracy: 84.00% 38 Validation loss: 0.728754 Best loss: 0.594731 Accuracy: 82.00% 39 Validation loss: 0.838166 Best loss: 0.594731 Accuracy: 84.00% 40 Validation loss: 0.714871 Best loss: 0.594731 Accuracy: 84.00% 41 Validation loss: 0.765463 Best loss: 0.594731 Accuracy: 84.67% 42 Validation loss: 0.744043 Best loss: 0.594731 Accuracy: 82.00% 43 Validation loss: 0.726922 Best loss: 0.594731 Accuracy: 83.33% 44 Validation loss: 0.641118 Best loss: 0.594731 Accuracy: 82.67% 45 Validation loss: 0.657861 Best loss: 0.594731 Accuracy: 84.00% 46 Validation loss: 0.803642 Best loss: 0.594731 Accuracy: 86.00% 47 Validation loss: 0.754644 Best loss: 0.594731 Accuracy: 84.67% 48 Validation loss: 0.865141 Best loss: 0.594731 Accuracy: 84.00% 49 Validation loss: 0.709169 Best loss: 0.594731 Accuracy: 84.67% 50 Validation loss: 0.723139 Best loss: 0.594731 Accuracy: 84.00% 51 Validation loss: 0.745109 Best loss: 0.594731 Accuracy: 84.67% 52 Validation loss: 0.803908 Best loss: 0.594731 Accuracy: 
82.67% Early stopping! INFO:tensorflow:Restoring parameters from ./my_mnist_model_5_to_9_four_frozen Final test accuracy: 80.17%
Still not fantastic, but much better.
Exercise: now unfreeze the top two hidden layers and continue training: can you get the model to perform even better?
learning_rate = 0.01
unfrozen_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="hidden[34]|new_logits")
optimizer = tf.train.AdamOptimizer(learning_rate, name="Adam3")
training_op = optimizer.minimize(loss, var_list=unfrozen_vars)
init = tf.global_variables_initializer()
two_frozen_saver = tf.train.Saver()
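Note that the scope argument of tf.get_collection() is interpreted as a regular expression, so "hidden[34]|new_logits" selects the variables of hidden layers 3 and 4 plus the new output layer. A small sketch (not in the original notebook) to verify which variables will now be trained:
for var in unfrozen_vars:
    print(var.op.name)  # expected: hidden3/..., hidden4/... and new_logits/... variables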
n_epochs = 1000
batch_size = 20
max_checks_without_progress = 20
checks_without_progress = 0
best_loss = np.infty
with tf.Session() as sess:
    init.run()
    four_frozen_saver.restore(sess, "./my_mnist_model_5_to_9_four_frozen")

    for epoch in range(n_epochs):
        rnd_idx = np.random.permutation(len(X_train2))
        for rnd_indices in np.array_split(rnd_idx, len(X_train2) // batch_size):
            X_batch, y_batch = X_train2[rnd_indices], y_train2[rnd_indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        loss_val, acc_val = sess.run([loss, accuracy], feed_dict={X: X_valid2, y: y_valid2})
        if loss_val < best_loss:
            save_path = two_frozen_saver.save(sess, "./my_mnist_model_5_to_9_two_frozen")
            best_loss = loss_val
            checks_without_progress = 0
        else:
            checks_without_progress += 1
            if checks_without_progress > max_checks_without_progress:
                print("Early stopping!")
                break
        print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(
            epoch, loss_val, best_loss, acc_val * 100))
with tf.Session() as sess:
    two_frozen_saver.restore(sess, "./my_mnist_model_5_to_9_two_frozen")
    acc_test = accuracy.eval(feed_dict={X: X_test2, y: y_test2})
    print("Final test accuracy: {:.2f}%".format(acc_test * 100))
INFO:tensorflow:Restoring parameters from ./my_mnist_model_5_to_9_four_frozen 0 Validation loss: 0.880485 Best loss: 0.880485 Accuracy: 86.00% 1 Validation loss: 1.388974 Best loss: 0.880485 Accuracy: 81.33% 2 Validation loss: 0.741543 Best loss: 0.741543 Accuracy: 86.67% 3 Validation loss: 1.030772 Best loss: 0.741543 Accuracy: 84.00% 4 Validation loss: 0.699438 Best loss: 0.699438 Accuracy: 87.33% 5 Validation loss: 0.743930 Best loss: 0.699438 Accuracy: 89.33% 6 Validation loss: 1.711346 Best loss: 0.699438 Accuracy: 82.67% 7 Validation loss: 1.437762 Best loss: 0.699438 Accuracy: 82.00% 8 Validation loss: 0.829231 Best loss: 0.699438 Accuracy: 86.67% 9 Validation loss: 1.033920 Best loss: 0.699438 Accuracy: 86.67% 10 Validation loss: 1.055709 Best loss: 0.699438 Accuracy: 87.33% 11 Validation loss: 0.971796 Best loss: 0.699438 Accuracy: 88.00% 12 Validation loss: 0.801815 Best loss: 0.699438 Accuracy: 86.00% 13 Validation loss: 0.726146 Best loss: 0.699438 Accuracy: 89.33% 14 Validation loss: 0.757217 Best loss: 0.699438 Accuracy: 88.67% 15 Validation loss: 0.791842 Best loss: 0.699438 Accuracy: 90.00% 16 Validation loss: 0.732507 Best loss: 0.699438 Accuracy: 90.67% 17 Validation loss: 0.737297 Best loss: 0.699438 Accuracy: 90.67% 18 Validation loss: 0.746715 Best loss: 0.699438 Accuracy: 90.00% 19 Validation loss: 0.747751 Best loss: 0.699438 Accuracy: 90.00% 20 Validation loss: 0.749325 Best loss: 0.699438 Accuracy: 90.00% 21 Validation loss: 0.751899 Best loss: 0.699438 Accuracy: 90.00% 22 Validation loss: 0.754314 Best loss: 0.699438 Accuracy: 90.00% 23 Validation loss: 0.757840 Best loss: 0.699438 Accuracy: 90.00% 24 Validation loss: 0.761543 Best loss: 0.699438 Accuracy: 90.00% Early stopping! INFO:tensorflow:Restoring parameters from ./my_mnist_model_5_to_9_two_frozen Final test accuracy: 84.37%
Let's check what accuracy we can get by unfreezing all layers:
learning_rate = 0.01
optimizer = tf.train.AdamOptimizer(learning_rate, name="Adam4")
training_op = optimizer.minimize(loss)
init = tf.global_variables_initializer()
no_frozen_saver = tf.train.Saver()
n_epochs = 1000
batch_size = 20
max_checks_without_progress = 20
checks_without_progress = 0
best_loss = np.infty
with tf.Session() as sess:
    init.run()
    two_frozen_saver.restore(sess, "./my_mnist_model_5_to_9_two_frozen")

    for epoch in range(n_epochs):
        rnd_idx = np.random.permutation(len(X_train2))
        for rnd_indices in np.array_split(rnd_idx, len(X_train2) // batch_size):
            X_batch, y_batch = X_train2[rnd_indices], y_train2[rnd_indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        loss_val, acc_val = sess.run([loss, accuracy], feed_dict={X: X_valid2, y: y_valid2})
        if loss_val < best_loss:
            save_path = no_frozen_saver.save(sess, "./my_mnist_model_5_to_9_no_frozen")
            best_loss = loss_val
            checks_without_progress = 0
        else:
            checks_without_progress += 1
            if checks_without_progress > max_checks_without_progress:
                print("Early stopping!")
                break
        print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(
            epoch, loss_val, best_loss, acc_val * 100))
with tf.Session() as sess:
    no_frozen_saver.restore(sess, "./my_mnist_model_5_to_9_no_frozen")
    acc_test = accuracy.eval(feed_dict={X: X_test2, y: y_test2})
    print("Final test accuracy: {:.2f}%".format(acc_test * 100))
INFO:tensorflow:Restoring parameters from ./my_mnist_model_5_to_9_two_frozen 0 Validation loss: 0.846005 Best loss: 0.846005 Accuracy: 83.33% 1 Validation loss: 0.694439 Best loss: 0.694439 Accuracy: 91.33% 2 Validation loss: 1.201433 Best loss: 0.694439 Accuracy: 85.33% 3 Validation loss: 1.975297 Best loss: 0.694439 Accuracy: 85.33% 4 Validation loss: 0.692805 Best loss: 0.692805 Accuracy: 95.33% 5 Validation loss: 1.090217 Best loss: 0.692805 Accuracy: 91.33% 6 Validation loss: 1.924300 Best loss: 0.692805 Accuracy: 90.67% 7 Validation loss: 4.019310 Best loss: 0.692805 Accuracy: 87.33% 8 Validation loss: 4.150792 Best loss: 0.692805 Accuracy: 78.00% 9 Validation loss: 4.522708 Best loss: 0.692805 Accuracy: 75.33% 10 Validation loss: 1.163385 Best loss: 0.692805 Accuracy: 90.00% 11 Validation loss: 0.655868 Best loss: 0.655868 Accuracy: 92.67% 12 Validation loss: 0.943888 Best loss: 0.655868 Accuracy: 92.67% 13 Validation loss: 0.529996 Best loss: 0.529996 Accuracy: 92.67% 14 Validation loss: 0.610578 Best loss: 0.529996 Accuracy: 94.67% 15 Validation loss: 3.899716 Best loss: 0.529996 Accuracy: 88.00% 16 Validation loss: 18.285717 Best loss: 0.529996 Accuracy: 86.67% 17 Validation loss: 23.169626 Best loss: 0.529996 Accuracy: 78.00% 18 Validation loss: 17.309252 Best loss: 0.529996 Accuracy: 90.00% 19 Validation loss: 44.261902 Best loss: 0.529996 Accuracy: 80.00% 20 Validation loss: 52.460327 Best loss: 0.529996 Accuracy: 80.00% 21 Validation loss: 26.318949 Best loss: 0.529996 Accuracy: 83.33% 22 Validation loss: 32.857723 Best loss: 0.529996 Accuracy: 90.67% 23 Validation loss: 53.359497 Best loss: 0.529996 Accuracy: 88.00% 24 Validation loss: 57.823742 Best loss: 0.529996 Accuracy: 88.00% 25 Validation loss: 37.154972 Best loss: 0.529996 Accuracy: 92.67% 26 Validation loss: 41.386772 Best loss: 0.529996 Accuracy: 90.00% 27 Validation loss: 43.486767 Best loss: 0.529996 Accuracy: 90.00% 28 Validation loss: 42.776855 Best loss: 0.529996 Accuracy: 88.67% 29 Validation loss: 43.368839 Best loss: 0.529996 Accuracy: 90.67% 30 Validation loss: 43.440975 Best loss: 0.529996 Accuracy: 90.00% 31 Validation loss: 42.889927 Best loss: 0.529996 Accuracy: 91.33% 32 Validation loss: 42.806690 Best loss: 0.529996 Accuracy: 90.67% 33 Validation loss: 42.784145 Best loss: 0.529996 Accuracy: 90.67% Early stopping! INFO:tensorflow:Restoring parameters from ./my_mnist_model_5_to_9_no_frozen Final test accuracy: 90.60%
Let's compare that to a DNN trained from scratch:
dnn_clf_5_to_9 = DNNClassifier(n_hidden_layers=4, random_state=42)
dnn_clf_5_to_9.fit(X_train2, y_train2, n_epochs=1000, X_valid=X_valid2, y_valid=y_valid2)
0 Validation loss: 0.803557 Best loss: 0.803557 Accuracy: 71.33% 1 Validation loss: 0.966741 Best loss: 0.803557 Accuracy: 85.33% 2 Validation loss: 1.158972 Best loss: 0.803557 Accuracy: 78.00% 3 Validation loss: 0.615960 Best loss: 0.615960 Accuracy: 88.00% 4 Validation loss: 0.612626 Best loss: 0.612626 Accuracy: 92.00% 5 Validation loss: 0.686420 Best loss: 0.612626 Accuracy: 89.33% 6 Validation loss: 0.805281 Best loss: 0.612626 Accuracy: 89.33% 7 Validation loss: 0.753108 Best loss: 0.612626 Accuracy: 88.67% 8 Validation loss: 1.051471 Best loss: 0.612626 Accuracy: 86.00% 9 Validation loss: 0.487089 Best loss: 0.487089 Accuracy: 93.33% 10 Validation loss: 1.191093 Best loss: 0.487089 Accuracy: 85.33% 11 Validation loss: 0.878905 Best loss: 0.487089 Accuracy: 88.67% 12 Validation loss: 0.768841 Best loss: 0.487089 Accuracy: 91.33% 13 Validation loss: 1.153907 Best loss: 0.487089 Accuracy: 90.67% 14 Validation loss: 0.985427 Best loss: 0.487089 Accuracy: 89.33% 15 Validation loss: 1.221879 Best loss: 0.487089 Accuracy: 85.33% 16 Validation loss: 0.961743 Best loss: 0.487089 Accuracy: 88.67% 17 Validation loss: 3.116057 Best loss: 0.487089 Accuracy: 84.00% 18 Validation loss: 0.686387 Best loss: 0.487089 Accuracy: 84.00% 19 Validation loss: 0.929801 Best loss: 0.487089 Accuracy: 88.00% 20 Validation loss: 1.137579 Best loss: 0.487089 Accuracy: 92.00% 21 Validation loss: 0.987261 Best loss: 0.487089 Accuracy: 91.33% 22 Validation loss: 2.030677 Best loss: 0.487089 Accuracy: 91.33% 23 Validation loss: 1.094184 Best loss: 0.487089 Accuracy: 92.00% 24 Validation loss: 1.332256 Best loss: 0.487089 Accuracy: 82.67% 25 Validation loss: 1.128633 Best loss: 0.487089 Accuracy: 85.33% 26 Validation loss: 0.866569 Best loss: 0.487089 Accuracy: 90.67% 27 Validation loss: 1.088500 Best loss: 0.487089 Accuracy: 89.33% 28 Validation loss: 1.146113 Best loss: 0.487089 Accuracy: 89.33% 29 Validation loss: 1.163180 Best loss: 0.487089 Accuracy: 89.33% 30 Validation loss: 1.154797 Best loss: 0.487089 Accuracy: 89.33% Early stopping!
DNNClassifier(activation=<function elu at 0x7fd9e8a620d0>, batch_norm_momentum=None, batch_size=20, dropout_rate=None, initializer=<function variance_scaling_initializer.<locals>._initializer at 0x7fd9d5e628c8>, learning_rate=0.01, n_hidden_layers=4, n_neurons=100, optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>, random_state=42)
y_pred = dnn_clf_5_to_9.predict(X_test2)
accuracy_score(y_test2, y_pred)
0.90413495165603786
Meh. How disappointing! ;) Transfer learning did not help much (if at all) in this task. At least we tried... Fortunately, the next exercise will give better results.
In this exercise you will build a DNN that compares two MNIST digit images and predicts whether they represent the same digit or not. Then you will reuse the lower layers of this network to train an MNIST classifier using very little training data.
Exercise: Start by building two DNNs (let's call them DNN A and B), both similar to the one you built earlier but without the output layer: each DNN should have five hidden layers of 100 neurons each, He initialization, and ELU activation. Next, add one more hidden layer with 10 units on top of both DNNs. You should use TensorFlow's concat()
function with axis=1
to concatenate the outputs of both DNNs along the horizontal axis, then feed the result to the hidden layer. Finally, add an output layer with a single neuron using the logistic activation function.
Warning! There was an error in the book for this exercise: there was no instruction to add a top hidden layer. Without it, the neural network generally fails to start learning. If you have the latest version of the book, this error has been fixed.
You could have two input placeholders, X1
and X2
, one for the images that should be fed to the first DNN, and the other for the images that should be fed to the second DNN. It would work fine. However, another option is to have a single input placeholder to hold both sets of images (each row will hold a pair of images), and use tf.unstack()
to split this tensor into two separate tensors, like this:
n_inputs = 28 * 28 # MNIST
reset_graph()
X = tf.placeholder(tf.float32, shape=(None, 2, n_inputs), name="X")
X1, X2 = tf.unstack(X, axis=1)
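For reference, the two-placeholder alternative described above would look like this (illustration only, not used in the rest of this exercise):
X1_alt = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X1")  # first image of each pair
X2_alt = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X2")  # second image of each pair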
We also need the labels placeholder. Each label will be 0 if the images represent different digits, or 1 if they represent the same digit:
y = tf.placeholder(tf.int32, shape=[None, 1])
Now let's feed these inputs through two separate DNNs:
dnn1 = dnn(X1, name="DNN_A")
dnn2 = dnn(X2, name="DNN_B")
And let's concatenate their outputs:
dnn_outputs = tf.concat([dnn1, dnn2], axis=1)
Each DNN outputs 100 activations (per instance), so the shape is [None, 100]
:
dnn1.shape
TensorShape([Dimension(None), Dimension(100)])
dnn2.shape
TensorShape([Dimension(None), Dimension(100)])
And of course the concatenated outputs have a shape of [None, 200]
:
dnn_outputs.shape
TensorShape([Dimension(None), Dimension(200)])
Now let's add an extra hidden layer with just 10 neurons, and the output layer with a single neuron:
hidden = tf.layers.dense(dnn_outputs, units=10, activation=tf.nn.elu, kernel_initializer=he_init)
logits = tf.layers.dense(hidden, units=1, kernel_initializer=he_init)
y_proba = tf.nn.sigmoid(logits)
The whole network predicts 1
if y_proba >= 0.5
(i.e. the network predicts that the images represent the same digit), or 0
otherwise. Instead, we compute logits >= 0
, which is equivalent but faster:
y_pred = tf.cast(tf.greater_equal(logits, 0), tf.int32)
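To see why the two are equivalent: the sigmoid function is monotonically increasing and sigmoid(0) = 0.5, so sigmoid(z) >= 0.5 exactly when z >= 0. A quick NumPy sanity check with made-up logit values:
logits_example = np.array([-2.0, -0.1, 0.0, 0.3, 5.0])  # hypothetical logits
pred_from_proba = (1 / (1 + np.exp(-logits_example)) >= 0.5).astype(np.int32)
pred_from_logits = (logits_example >= 0).astype(np.int32)
assert (pred_from_proba == pred_from_logits).all()  # identical predictions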
Now let's add the cost function:
y_as_float = tf.cast(y, tf.float32)
xentropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y_as_float, logits=logits)
loss = tf.reduce_mean(xentropy)
And we can now create the training operation using an optimizer:
learning_rate = 0.01
momentum = 0.95
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum, use_nesterov=True)
training_op = optimizer.minimize(loss)
We will also want to measure our classifier's accuracy:
y_pred_correct = tf.equal(y_pred, y)
accuracy = tf.reduce_mean(tf.cast(y_pred_correct, tf.float32))
And the usual init
and saver
:
init = tf.global_variables_initializer()
saver = tf.train.Saver()
Exercise: split the MNIST training set into two sets: split #1 should contain 55,000 images, and split #2 should contain 5,000 images. Create a function that generates a training batch where each instance is a pair of MNIST images picked from split #1. Half of the training instances should be pairs of images that belong to the same class, while the other half should be pairs of images from different classes. For each pair, the training label should be 1 if the images are from the same class, or 0 if they are from different classes.
The MNIST dataset returned by TensorFlow's input_data()
function is already split into 3 parts: a training set (55,000 instances), a validation set (5,000 instances) and a test set (10,000 instances). Let's use the first set to generate the training set composed image pairs, and we will use the second set for the second phase of the exercise (to train a regular MNIST classifier). We will use the third set as the test set for both phases.
function is already split into 3 parts: a training set (55,000 instances), a validation set (5,000 instances) and a test set (10,000 instances). Let's use the first set to generate the training set composed of image pairs, and we will use the second set for the second phase of the exercise (to train a regular MNIST classifier). We will use the third set as the test set for both phases.
X_train1 = mnist.train.images
y_train1 = mnist.train.labels
X_train2 = mnist.validation.images
y_train2 = mnist.validation.labels
X_test = mnist.test.images
y_test = mnist.test.labels
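As a quick sanity check, the splits should have the sizes mentioned above (55,000, 5,000 and 10,000 instances, each flattened to 784 pixels):
print(X_train1.shape, X_train2.shape, X_test.shape)
# Expected: (55000, 784) (5000, 784) (10000, 784)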
Let's write a function that generates pairs of images: 50% representing the same digit, and 50% representing different digits. There are many ways to implement this. In this implementation, we first decide how many "same" pairs (i.e. pairs of images representing the same digit) we will generate, and how many "different" pairs (i.e. pairs of images representing different digits). We could just use batch_size // 2
but we want to handle the case where batch_size is odd (granted, that might be overkill!). Then we draw random pairs of indices, keeping only "same" pairs until we have enough of them, and we do the same for "different" pairs. Finally we shuffle the batch and return it:
def generate_batch(images, labels, batch_size):
    size1 = batch_size // 2
    size2 = batch_size - size1
    # If batch_size is odd, randomly decide which half gets the extra instance
    if size1 != size2 and np.random.rand() > 0.5:
        size1, size2 = size2, size1
    X = []
    y = []
    # First the "same" pairs (label 1): two distinct images of the same digit
    while len(X) < size1:
        rnd_idx1, rnd_idx2 = np.random.randint(0, len(images), 2)
        if rnd_idx1 != rnd_idx2 and labels[rnd_idx1] == labels[rnd_idx2]:
            X.append(np.array([images[rnd_idx1], images[rnd_idx2]]))
            y.append([1])
    # Then the "different" pairs (label 0): two images of different digits
    while len(X) < batch_size:
        rnd_idx1, rnd_idx2 = np.random.randint(0, len(images), 2)
        if labels[rnd_idx1] != labels[rnd_idx2]:
            X.append(np.array([images[rnd_idx1], images[rnd_idx2]]))
            y.append([0])
    # Shuffle so "same" and "different" pairs are interleaved
    rnd_indices = np.random.permutation(batch_size)
    return np.array(X)[rnd_indices], np.array(y)[rnd_indices]
Let's test it by generating a small batch of 5 image pairs:
batch_size = 5
X_batch, y_batch = generate_batch(X_train1, y_train1, batch_size)
Each row in X_batch
contains a pair of images:
X_batch.shape, X_batch.dtype
((5, 2, 784), dtype('float32'))
Let's look at these pairs:
plt.figure(figsize=(3, 3 * batch_size))
plt.subplot(121)
plt.imshow(X_batch[:,0].reshape(28 * batch_size, 28), cmap="binary", interpolation="nearest")
plt.axis('off')
plt.subplot(122)
plt.imshow(X_batch[:,1].reshape(28 * batch_size, 28), cmap="binary", interpolation="nearest")
plt.axis('off')
plt.show()
And let's look at the labels (0 means "different", 1 means "same"):
y_batch
array([[1], [0], [0], [1], [0]])
Perfect!
Exercise: train the DNN on this training set. For each image pair, you can simultaneously feed the first image to DNN A and the second image to DNN B. The whole network will gradually learn to tell whether two images belong to the same class or not.
Let's generate a test set composed of many pairs of images pulled from the MNIST test set:
X_test1, y_test1 = generate_batch(X_test, y_test, batch_size=len(X_test))
And now, let's train the model. There's really nothing special about this step, except for the fact that we need a fairly large batch_size
, otherwise the model fails to learn anything and ends up with an accuracy of 50%:
n_epochs = 100
batch_size = 500
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(mnist.train.num_examples // batch_size):
            X_batch, y_batch = generate_batch(X_train1, y_train1, batch_size)
            loss_val, _ = sess.run([loss, training_op], feed_dict={X: X_batch, y: y_batch})
        print(epoch, "Train loss:", loss_val)
        if epoch % 5 == 0:
            acc_test = accuracy.eval(feed_dict={X: X_test1, y: y_test1})
            print(epoch, "Test accuracy:", acc_test)
    save_path = saver.save(sess, "./my_digit_comparison_model.ckpt")
0 Train loss: 0.492426 0 Test accuracy: 0.7861 1 Train loss: 0.334813 2 Train loss: 0.290434 3 Train loss: 0.253434 4 Train loss: 0.217843 5 Train loss: 0.17127 5 Test accuracy: 0.9185 6 Train loss: 0.207128 7 Train loss: 0.172275 8 Train loss: 0.166783 9 Train loss: 0.161094 10 Train loss: 0.125131 10 Test accuracy: 0.9425 11 Train loss: 0.159824 12 Train loss: 0.124752 13 Train loss: 0.112234 14 Train loss: 0.114502 15 Train loss: 0.0950093 15 Test accuracy: 0.9532 16 Train loss: 0.119296 17 Train loss: 0.0754429 18 Train loss: 0.112295 19 Train loss: 0.133708 20 Train loss: 0.113547 20 Test accuracy: 0.9596 21 Train loss: 0.0674082 22 Train loss: 0.0936297 23 Train loss: 0.0986469 24 Train loss: 0.111875 25 Train loss: 0.0735623 25 Test accuracy: 0.9675 26 Train loss: 0.0790324 27 Train loss: 0.0487644 28 Train loss: 0.0869071 29 Train loss: 0.0694422 30 Train loss: 0.060089 30 Test accuracy: 0.9663 31 Train loss: 0.103902 32 Train loss: 0.0535952 33 Train loss: 0.0310679 34 Train loss: 0.0536294 35 Train loss: 0.046265 35 Test accuracy: 0.9701 36 Train loss: 0.0679821 37 Train loss: 0.0326656 38 Train loss: 0.0357479 39 Train loss: 0.0333373 40 Train loss: 0.0415115 40 Test accuracy: 0.9719 41 Train loss: 0.0577977 42 Train loss: 0.0342781 43 Train loss: 0.0439651 44 Train loss: 0.0597254 45 Train loss: 0.0588695 45 Test accuracy: 0.9721 46 Train loss: 0.0556821 47 Train loss: 0.063956 48 Train loss: 0.0301285 49 Train loss: 0.0402678 50 Train loss: 0.0489125 50 Test accuracy: 0.9751 51 Train loss: 0.0394528 52 Train loss: 0.0233041 53 Train loss: 0.064878 54 Train loss: 0.0510189 55 Train loss: 0.0312619 55 Test accuracy: 0.9742 56 Train loss: 0.0244156 57 Train loss: 0.0409082 58 Train loss: 0.0346896 59 Train loss: 0.0455727 60 Train loss: 0.0488268 60 Test accuracy: 0.9751 61 Train loss: 0.0154253 62 Train loss: 0.0358874 63 Train loss: 0.0290555 64 Train loss: 0.0172143 65 Train loss: 0.0377991 65 Test accuracy: 0.9751 66 Train loss: 0.0360786 67 Train loss: 0.0240278 68 Train loss: 0.0314243 69 Train loss: 0.0412082 70 Train loss: 0.0439106 70 Test accuracy: 0.9763 71 Train loss: 0.0169656 72 Train loss: 0.0181306 73 Train loss: 0.0214228 74 Train loss: 0.0418301 75 Train loss: 0.0378622 75 Test accuracy: 0.9759 76 Train loss: 0.0199817 77 Train loss: 0.0145837 78 Train loss: 0.0199176 79 Train loss: 0.0226598 80 Train loss: 0.0119815 80 Test accuracy: 0.9779 81 Train loss: 0.0177832 82 Train loss: 0.00981572 83 Train loss: 0.0279094 84 Train loss: 0.0237818 85 Train loss: 0.0157778 85 Test accuracy: 0.978 86 Train loss: 0.00950592 87 Train loss: 0.0226222 88 Train loss: 0.0226599 89 Train loss: 0.0185005 90 Train loss: 0.0118967 90 Test accuracy: 0.976 91 Train loss: 0.0209059 92 Train loss: 0.0181153 93 Train loss: 0.0131697 94 Train loss: 0.017605 95 Train loss: 0.0193861 95 Test accuracy: 0.976 96 Train loss: 0.0156532 97 Train loss: 0.0136041 98 Train loss: 0.00743028 99 Train loss: 0.0267189
All right, we reach 97.6% accuracy on this digit comparison task. That's not too bad; this model knows a thing or two about comparing handwritten digits!
Let's see if some of that knowledge can be useful for the regular MNIST classification task.
Exercise: now create a new DNN by reusing and freezing the hidden layers of DNN A and adding a softmax output layer on top with 10 neurons. Train this network on split #2 and see if you can achieve high performance despite having only 500 images per class.
Let's create the model; it is pretty straightforward. There are many ways to freeze the lower layers, as explained in the book. In this example, we chose to use the tf.stop_gradient()
function. Note that we need one Saver
to restore the pretrained DNN A, and another Saver
to save the final model:
reset_graph()
n_inputs = 28 * 28 # MNIST
n_outputs = 10
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")
dnn_outputs = dnn(X, name="DNN_A")
frozen_outputs = tf.stop_gradient(dnn_outputs)  # freeze DNN A: no gradients flow below this point
logits = tf.layers.dense(frozen_outputs, n_outputs, kernel_initializer=he_init)
Y_proba = tf.nn.softmax(logits)
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum, use_nesterov=True)
training_op = optimizer.minimize(loss)
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
init = tf.global_variables_initializer()
dnn_A_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="DNN_A")
restore_saver = tf.train.Saver(var_list={var.op.name: var for var in dnn_A_vars})
saver = tf.train.Saver()
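Note that tf.stop_gradient() is just one of the freezing options discussed in the book. Another is to give the optimizer an explicit var_list, so it only ever updates the variables outside DNN A's scope. Here is a minimal sketch of that variant (illustration only; the rest of this notebook keeps the stop_gradient() approach, and training_op_alt is a hypothetical name):
# Alternative freezing strategy: only train the variables that are NOT part of DNN A
train_vars = [var for var in tf.trainable_variables()
              if not var.op.name.startswith("DNN_A")]
training_op_alt = optimizer.minimize(loss, var_list=train_vars)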
Now on to training! We first initialize all variables (including the variables in the new output layer), then we restore the pretrained DNN A. Next, we just train the model on the small MNIST dataset (containing just 5,000 images):
n_epochs = 100
batch_size = 50
with tf.Session() as sess:
    init.run()
    restore_saver.restore(sess, "./my_digit_comparison_model.ckpt")
    for epoch in range(n_epochs):
        rnd_idx = np.random.permutation(len(X_train2))
        for rnd_indices in np.array_split(rnd_idx, len(X_train2) // batch_size):
            X_batch, y_batch = X_train2[rnd_indices], y_train2[rnd_indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        if epoch % 10 == 0:
            acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
            print(epoch, "Test accuracy:", acc_test)
    save_path = saver.save(sess, "./my_mnist_model_final.ckpt")
INFO:tensorflow:Restoring parameters from ./my_digit_comparison_model.ckpt
0 Test accuracy: 0.9269
10 Test accuracy: 0.9675
20 Test accuracy: 0.9673
30 Test accuracy: 0.9673
40 Test accuracy: 0.9674
50 Test accuracy: 0.9673
60 Test accuracy: 0.9673
70 Test accuracy: 0.9673
80 Test accuracy: 0.9672
90 Test accuracy: 0.9673
Well, 96.7% accuracy: that's not the best MNIST model we have trained so far, but recall that we are only using a small training set (just 500 images per digit). Let's compare this result with the same DNN trained from scratch, without using transfer learning:
reset_graph()
n_inputs = 28 * 28 # MNIST
n_outputs = 10
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")
dnn_outputs = dnn(X, name="DNN_A")
logits = tf.layers.dense(dnn_outputs, n_outputs, kernel_initializer=he_init)
Y_proba = tf.nn.softmax(logits)
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum, use_nesterov=True)
training_op = optimizer.minimize(loss)
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
init = tf.global_variables_initializer()
dnn_A_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="DNN_A")
restore_saver = tf.train.Saver(var_list={var.op.name: var for var in dnn_A_vars})
saver = tf.train.Saver()
n_epochs = 150
batch_size = 50
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        rnd_idx = np.random.permutation(len(X_train2))
        for rnd_indices in np.array_split(rnd_idx, len(X_train2) // batch_size):
            X_batch, y_batch = X_train2[rnd_indices], y_train2[rnd_indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        if epoch % 10 == 0:
            acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
            print(epoch, "Test accuracy:", acc_test)
    save_path = saver.save(sess, "./my_mnist_model_final.ckpt")
0 Test accuracy: 0.8893
10 Test accuracy: 0.9402
20 Test accuracy: 0.9479
30 Test accuracy: 0.9474
40 Test accuracy: 0.9479
50 Test accuracy: 0.9475
60 Test accuracy: 0.9475
70 Test accuracy: 0.9475
80 Test accuracy: 0.9476
90 Test accuracy: 0.9476
100 Test accuracy: 0.9473
110 Test accuracy: 0.9472
120 Test accuracy: 0.9474
130 Test accuracy: 0.9474
140 Test accuracy: 0.9475
Only 94.8% accuracy... So transfer learning helped us reduce the error rate from 5.2% to 3.3% (that's over 36% error reduction). Moreover, the model using transfer learning reached over 96% accuracy in less than 10 epochs.
Bottom line: transfer learning does not always work (as we saw in exercise 9), but when it does it can make a big difference. So try it out!