#!/usr/bin/env python
# coding: utf-8

# **Chapter 14 – Recurrent Neural Networks**

# _This notebook contains all the sample code and solutions to the exercises in chapter 14._
# **Warning**: this is the code for the 1st edition of the book. Please visit https://github.com/ageron/handson-ml2 for the 2nd edition code, with up-to-date notebooks using the latest library versions. In particular, the 1st edition is based on TensorFlow 1, while the 2nd edition uses TensorFlow 2, which is much simpler to use. # # Setup # First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures: # In[1]: # To support both python 2 and python 3 from __future__ import division, print_function, unicode_literals # Common imports import numpy as np import os try: # %tensorflow_version only exists in Colab. get_ipython().run_line_magic('tensorflow_version', '1.x') except Exception: pass # to make this notebook's output stable across runs def reset_graph(seed=42): tf.reset_default_graph() tf.set_random_seed(seed) np.random.seed(seed) # To plot pretty figures get_ipython().run_line_magic('matplotlib', 'inline') import matplotlib import matplotlib.pyplot as plt plt.rcParams['axes.labelsize'] = 14 plt.rcParams['xtick.labelsize'] = 12 plt.rcParams['ytick.labelsize'] = 12 # Where to save the figures PROJECT_ROOT_DIR = "." CHAPTER_ID = "rnn" IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID) os.makedirs(IMAGES_PATH, exist_ok=True) def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300): path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension) print("Saving figure", fig_id) if tight_layout: plt.tight_layout() plt.savefig(path, format=fig_extension, dpi=resolution) # Then of course we will need TensorFlow: # In[2]: import tensorflow as tf # # Basic RNNs # ## Manual RNN # In[3]: reset_graph() n_inputs = 3 n_neurons = 5 X0 = tf.placeholder(tf.float32, [None, n_inputs]) X1 = tf.placeholder(tf.float32, [None, n_inputs]) Wx = tf.Variable(tf.random_normal(shape=[n_inputs, n_neurons],dtype=tf.float32)) Wy = tf.Variable(tf.random_normal(shape=[n_neurons,n_neurons],dtype=tf.float32)) b = tf.Variable(tf.zeros([1, n_neurons], dtype=tf.float32)) Y0 = tf.tanh(tf.matmul(X0, Wx) + b) Y1 = tf.tanh(tf.matmul(Y0, Wy) + tf.matmul(X1, Wx) + b) init = tf.global_variables_initializer() # In[4]: import numpy as np X0_batch = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 0, 1]]) # t = 0 X1_batch = np.array([[9, 8, 7], [0, 0, 0], [6, 5, 4], [3, 2, 1]]) # t = 1 with tf.Session() as sess: init.run() Y0_val, Y1_val = sess.run([Y0, Y1], feed_dict={X0: X0_batch, X1: X1_batch}) # In[5]: print(Y0_val) # In[6]: print(Y1_val) # ## Using `static_rnn()` # **Note**: `tf.contrib.rnn` was partially moved to the core API in TensorFlow 1.2. Most of the `*Cell` and `*Wrapper` classes are now available in `tf.nn.rnn_cell`, and the `tf.contrib.rnn.static_rnn()` function is available as `tf.nn.static_rnn()`. 
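# Before moving on to `static_rnn()`, here is a quick NumPy sanity check (not in the book) of the manual recurrence above: it recomputes Y0 and Y1 from the same batches, using freshly drawn weights, so the shapes match `Y0_val`/`Y1_val` even though the values differ.

# In[ ]:

def manual_rnn_numpy(X0, X1, Wx, Wy, b):
    # Y0 = tanh(X0 . Wx + b);  Y1 = tanh(Y0 . Wy + X1 . Wx + b)
    Y0 = np.tanh(X0.dot(Wx) + b)
    Y1 = np.tanh(Y0.dot(Wy) + X1.dot(Wx) + b)
    return Y0, Y1

rnd = np.random.RandomState(42)
Wx_np = rnd.randn(n_inputs, n_neurons).astype(np.float32)   # input-to-hidden weights
Wy_np = rnd.randn(n_neurons, n_neurons).astype(np.float32)  # hidden-to-hidden weights
b_np = np.zeros((1, n_neurons), dtype=np.float32)           # bias

Y0_np, Y1_np = manual_rnn_numpy(X0_batch, X1_batch, Wx_np, Wy_np, b_np)
print(Y0_np.shape, Y1_np.shape)  # (4, 5) (4, 5): one 5-unit state per instance, per time step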
# In[7]: n_inputs = 3 n_neurons = 5 # In[8]: reset_graph() X0 = tf.placeholder(tf.float32, [None, n_inputs]) X1 = tf.placeholder(tf.float32, [None, n_inputs]) basic_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons) output_seqs, states = tf.nn.static_rnn(basic_cell, [X0, X1], dtype=tf.float32) Y0, Y1 = output_seqs # In[9]: init = tf.global_variables_initializer() # In[10]: X0_batch = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 0, 1]]) X1_batch = np.array([[9, 8, 7], [0, 0, 0], [6, 5, 4], [3, 2, 1]]) with tf.Session() as sess: init.run() Y0_val, Y1_val = sess.run([Y0, Y1], feed_dict={X0: X0_batch, X1: X1_batch}) # In[11]: Y0_val # In[12]: Y1_val # In[13]: from datetime import datetime root_logdir = os.path.join(os.curdir, "tf_logs") def make_log_subdir(run_id=None): if run_id is None: run_id = datetime.utcnow().strftime("%Y%m%d%H%M%S") return "{}/run-{}/".format(root_logdir, run_id) def save_graph(graph=None, run_id=None): if graph is None: graph = tf.get_default_graph() logdir = make_log_subdir(run_id) file_writer = tf.summary.FileWriter(logdir, graph=graph) file_writer.close() return logdir # In[14]: save_graph() # In[15]: get_ipython().run_line_magic('load_ext', 'tensorboard') # In[16]: get_ipython().run_line_magic('tensorboard', '--logdir {root_logdir}') # ## Packing sequences # In[17]: n_steps = 2 n_inputs = 3 n_neurons = 5 # In[18]: reset_graph() X = tf.placeholder(tf.float32, [None, n_steps, n_inputs]) X_seqs = tf.unstack(tf.transpose(X, perm=[1, 0, 2])) basic_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons) output_seqs, states = tf.nn.static_rnn(basic_cell, X_seqs, dtype=tf.float32) outputs = tf.transpose(tf.stack(output_seqs), perm=[1, 0, 2]) # In[19]: init = tf.global_variables_initializer() # In[20]: X_batch = np.array([ # t = 0 t = 1 [[0, 1, 2], [9, 8, 7]], # instance 1 [[3, 4, 5], [0, 0, 0]], # instance 2 [[6, 7, 8], [6, 5, 4]], # instance 3 [[9, 0, 1], [3, 2, 1]], # instance 4 ]) with tf.Session() as sess: init.run() outputs_val = outputs.eval(feed_dict={X: X_batch}) # In[21]: print(outputs_val) # In[22]: print(np.transpose(outputs_val, axes=[1, 0, 2])[1]) # ## Using `dynamic_rnn()` # In[23]: n_steps = 2 n_inputs = 3 n_neurons = 5 # In[24]: reset_graph() X = tf.placeholder(tf.float32, [None, n_steps, n_inputs]) basic_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons) outputs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32) # In[25]: init = tf.global_variables_initializer() # In[26]: X_batch = np.array([ [[0, 1, 2], [9, 8, 7]], # instance 1 [[3, 4, 5], [0, 0, 0]], # instance 2 [[6, 7, 8], [6, 5, 4]], # instance 3 [[9, 0, 1], [3, 2, 1]], # instance 4 ]) with tf.Session() as sess: init.run() outputs_val = outputs.eval(feed_dict={X: X_batch}) # In[27]: print(outputs_val) # In[28]: save_graph() # In[29]: get_ipython().run_line_magic('tensorboard', '--logdir {root_logdir}') # ## Setting the sequence lengths # In[30]: n_steps = 2 n_inputs = 3 n_neurons = 5 reset_graph() X = tf.placeholder(tf.float32, [None, n_steps, n_inputs]) basic_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons) # In[31]: seq_length = tf.placeholder(tf.int32, [None]) outputs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32, sequence_length=seq_length) # In[32]: init = tf.global_variables_initializer() # In[33]: X_batch = np.array([ # step 0 step 1 [[0, 1, 2], [9, 8, 7]], # instance 1 [[3, 4, 5], [0, 0, 0]], # instance 2 (padded with zero vectors) [[6, 7, 8], [6, 5, 4]], # instance 3 [[9, 0, 1], [3, 2, 1]], # instance 4 ]) seq_length_batch = np.array([2, 1, 2, 2]) # 
In[34]: with tf.Session() as sess: init.run() outputs_val, states_val = sess.run( [outputs, states], feed_dict={X: X_batch, seq_length: seq_length_batch}) # In[35]: print(outputs_val) # In[36]: print(states_val) # ## Training a sequence classifier # Note: the book uses `tensorflow.contrib.layers.fully_connected()` rather than `tf.layers.dense()` (which did not exist when this chapter was written). It is now preferable to use `tf.layers.dense()`, because anything in the contrib module may change or be deleted without notice. The `dense()` function is almost identical to the `fully_connected()` function. The main differences relevant to this chapter are: # * several parameters are renamed: `scope` becomes `name`, `activation_fn` becomes `activation` (and similarly the `_fn` suffix is removed from other parameters such as `normalizer_fn`), `weights_initializer` becomes `kernel_initializer`, etc. # * the default `activation` is now `None` rather than `tf.nn.relu`. # In[37]: reset_graph() n_steps = 28 n_inputs = 28 n_neurons = 150 n_outputs = 10 learning_rate = 0.001 X = tf.placeholder(tf.float32, [None, n_steps, n_inputs]) y = tf.placeholder(tf.int32, [None]) basic_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons) outputs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32) logits = tf.layers.dense(states, n_outputs) xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits) loss = tf.reduce_mean(xentropy) optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) training_op = optimizer.minimize(loss) correct = tf.nn.in_top_k(logits, y, 1) accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) init = tf.global_variables_initializer() # **Warning**: `tf.examples.tutorials.mnist` is deprecated. We will use `tf.keras.datasets.mnist` instead. 
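# As a reminder, the classifier above treats each 28×28 MNIST image as a sequence of 28 rows, each row being one 28-dimensional input vector. A minimal shape check (a sketch, using a zero-filled fake image):

# In[ ]:

fake_image = np.zeros((28, 28), dtype=np.float32)  # one fake MNIST image
as_sequence = fake_image.reshape(1, 28, 28)        # [batch_size, n_steps, n_inputs]
print(as_sequence.shape)                           # (1, 28, 28): 28 time steps of 28 features each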
# In[38]: (X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data() X_train = X_train.astype(np.float32).reshape(-1, 28*28) / 255.0 X_test = X_test.astype(np.float32).reshape(-1, 28*28) / 255.0 y_train = y_train.astype(np.int32) y_test = y_test.astype(np.int32) X_valid, X_train = X_train[:5000], X_train[5000:] y_valid, y_train = y_train[:5000], y_train[5000:] # In[39]: def shuffle_batch(X, y, batch_size): rnd_idx = np.random.permutation(len(X)) n_batches = len(X) // batch_size for batch_idx in np.array_split(rnd_idx, n_batches): X_batch, y_batch = X[batch_idx], y[batch_idx] yield X_batch, y_batch # In[40]: X_test = X_test.reshape((-1, n_steps, n_inputs)) # In[41]: n_epochs = 100 batch_size = 150 with tf.Session() as sess: init.run() for epoch in range(n_epochs): for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size): X_batch = X_batch.reshape((-1, n_steps, n_inputs)) sess.run(training_op, feed_dict={X: X_batch, y: y_batch}) acc_batch = accuracy.eval(feed_dict={X: X_batch, y: y_batch}) acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test}) print(epoch, "Last batch accuracy:", acc_batch, "Test accuracy:", acc_test) # # Multi-layer RNN # In[42]: reset_graph() n_steps = 28 n_inputs = 28 n_outputs = 10 learning_rate = 0.001 X = tf.placeholder(tf.float32, [None, n_steps, n_inputs]) y = tf.placeholder(tf.int32, [None]) # In[43]: n_neurons = 100 n_layers = 3 layers = [tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons, activation=tf.nn.relu) for layer in range(n_layers)] multi_layer_cell = tf.nn.rnn_cell.MultiRNNCell(layers) outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32) # In[44]: states_concat = tf.concat(axis=1, values=states) logits = tf.layers.dense(states_concat, n_outputs) xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits) loss = tf.reduce_mean(xentropy) optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) training_op = optimizer.minimize(loss) correct = tf.nn.in_top_k(logits, y, 1) accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) init = tf.global_variables_initializer() # In[45]: n_epochs = 10 batch_size = 150 with tf.Session() as sess: init.run() for epoch in range(n_epochs): for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size): X_batch = X_batch.reshape((-1, n_steps, n_inputs)) sess.run(training_op, feed_dict={X: X_batch, y: y_batch}) acc_batch = accuracy.eval(feed_dict={X: X_batch, y: y_batch}) acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test}) print(epoch, "Last batch accuracy:", acc_batch, "Test accuracy:", acc_test) # # Time series # In[46]: t_min, t_max = 0, 30 resolution = 0.1 def time_series(t): return t * np.sin(t) / 3 + 2 * np.sin(t*5) def next_batch(batch_size, n_steps): t0 = np.random.rand(batch_size, 1) * (t_max - t_min - n_steps * resolution) Ts = t0 + np.arange(0., n_steps + 1) * resolution ys = time_series(Ts) return ys[:, :-1].reshape(-1, n_steps, 1), ys[:, 1:].reshape(-1, n_steps, 1) # In[47]: t = np.linspace(t_min, t_max, int((t_max - t_min) / resolution)) n_steps = 20 t_instance = np.linspace(12.2, 12.2 + resolution * (n_steps + 1), n_steps + 1) plt.figure(figsize=(11,4)) plt.subplot(121) plt.title("A time series (generated)", fontsize=14) plt.plot(t, time_series(t), label=r"$t . \sin(t) / 3 + 2 . 
\sin(5t)$") plt.plot(t_instance[:-1], time_series(t_instance[:-1]), "b-", linewidth=3, label="A training instance") plt.legend(loc="lower left", fontsize=14) plt.axis([0, 30, -17, 13]) plt.xlabel("Time") plt.ylabel("Value") plt.subplot(122) plt.title("A training instance", fontsize=14) plt.plot(t_instance[:-1], time_series(t_instance[:-1]), "bo", markersize=10, label="instance") plt.plot(t_instance[1:], time_series(t_instance[1:]), "w*", markersize=10, label="target") plt.legend(loc="upper left") plt.xlabel("Time") save_fig("time_series_plot") plt.show() # In[48]: X_batch, y_batch = next_batch(1, n_steps) # In[49]: np.c_[X_batch[0], y_batch[0]] # ## Using an `OuputProjectionWrapper` # Let's create the RNN. It will contain 100 recurrent neurons and we will unroll it over 20 time steps since each training instance will be 20 inputs long. Each input will contain only one feature (the value at that time). The targets are also sequences of 20 inputs, each containing a single value: # In[50]: reset_graph() n_steps = 20 n_inputs = 1 n_neurons = 100 n_outputs = 1 X = tf.placeholder(tf.float32, [None, n_steps, n_inputs]) y = tf.placeholder(tf.float32, [None, n_steps, n_outputs]) cell = tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons, activation=tf.nn.relu) outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32) # At each time step we now have an output vector of size 100. But what we actually want is a single output value at each time step. The simplest solution is to wrap the cell in an `OutputProjectionWrapper`. # In[51]: reset_graph() n_steps = 20 n_inputs = 1 n_neurons = 100 n_outputs = 1 X = tf.placeholder(tf.float32, [None, n_steps, n_inputs]) y = tf.placeholder(tf.float32, [None, n_steps, n_outputs]) # In[52]: cell = tf.contrib.rnn.OutputProjectionWrapper( tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons, activation=tf.nn.relu), output_size=n_outputs) # In[53]: outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32) # In[54]: learning_rate = 0.001 loss = tf.reduce_mean(tf.square(outputs - y)) # MSE optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) training_op = optimizer.minimize(loss) init = tf.global_variables_initializer() # In[55]: saver = tf.train.Saver() # In[56]: n_iterations = 1500 batch_size = 50 with tf.Session() as sess: init.run() for iteration in range(n_iterations): X_batch, y_batch = next_batch(batch_size, n_steps) sess.run(training_op, feed_dict={X: X_batch, y: y_batch}) if iteration % 100 == 0: mse = loss.eval(feed_dict={X: X_batch, y: y_batch}) print(iteration, "\tMSE:", mse) saver.save(sess, "./my_time_series_model") # not shown in the book # In[57]: with tf.Session() as sess: # not shown in the book saver.restore(sess, "./my_time_series_model") # not shown X_new = time_series(np.array(t_instance[:-1].reshape(-1, n_steps, n_inputs))) y_pred = sess.run(outputs, feed_dict={X: X_new}) # In[58]: y_pred # In[59]: plt.title("Testing the model", fontsize=14) plt.plot(t_instance[:-1], time_series(t_instance[:-1]), "bo", markersize=10, label="instance") plt.plot(t_instance[1:], time_series(t_instance[1:]), "w*", markersize=10, label="target") plt.plot(t_instance[1:], y_pred[0,:,0], "r.", markersize=10, label="prediction") plt.legend(loc="upper left") plt.xlabel("Time") save_fig("time_series_pred_plot") plt.show() # ## Without using an `OutputProjectionWrapper` # In[60]: reset_graph() n_steps = 20 n_inputs = 1 n_neurons = 100 X = tf.placeholder(tf.float32, [None, n_steps, n_inputs]) y = tf.placeholder(tf.float32, [None, n_steps, n_outputs]) # In[61]: cell 
= tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons, activation=tf.nn.relu) rnn_outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32) # In[62]: n_outputs = 1 learning_rate = 0.001 # In[63]: stacked_rnn_outputs = tf.reshape(rnn_outputs, [-1, n_neurons]) stacked_outputs = tf.layers.dense(stacked_rnn_outputs, n_outputs) outputs = tf.reshape(stacked_outputs, [-1, n_steps, n_outputs]) # In[64]: loss = tf.reduce_mean(tf.square(outputs - y)) optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) training_op = optimizer.minimize(loss) init = tf.global_variables_initializer() saver = tf.train.Saver() # In[65]: n_iterations = 1500 batch_size = 50 with tf.Session() as sess: init.run() for iteration in range(n_iterations): X_batch, y_batch = next_batch(batch_size, n_steps) sess.run(training_op, feed_dict={X: X_batch, y: y_batch}) if iteration % 100 == 0: mse = loss.eval(feed_dict={X: X_batch, y: y_batch}) print(iteration, "\tMSE:", mse) X_new = time_series(np.array(t_instance[:-1].reshape(-1, n_steps, n_inputs))) y_pred = sess.run(outputs, feed_dict={X: X_new}) saver.save(sess, "./my_time_series_model") # In[66]: y_pred # In[67]: plt.title("Testing the model", fontsize=14) plt.plot(t_instance[:-1], time_series(t_instance[:-1]), "bo", markersize=10, label="instance") plt.plot(t_instance[1:], time_series(t_instance[1:]), "w*", markersize=10, label="target") plt.plot(t_instance[1:], y_pred[0,:,0], "r.", markersize=10, label="prediction") plt.legend(loc="upper left") plt.xlabel("Time") plt.show() # ## Generating a creative new sequence # In[68]: with tf.Session() as sess: # not shown in the book saver.restore(sess, "./my_time_series_model") # not shown sequence = [0.] * n_steps for iteration in range(300): X_batch = np.array(sequence[-n_steps:]).reshape(1, n_steps, 1) y_pred = sess.run(outputs, feed_dict={X: X_batch}) sequence.append(y_pred[0, -1, 0]) # In[69]: plt.figure(figsize=(8,4)) plt.plot(np.arange(len(sequence)), sequence, "b-") plt.plot(t[:n_steps], sequence[:n_steps], "b-", linewidth=3) plt.xlabel("Time") plt.ylabel("Value") plt.show() # In[70]: with tf.Session() as sess: saver.restore(sess, "./my_time_series_model") sequence1 = [0. 
for i in range(n_steps)] for iteration in range(len(t) - n_steps): X_batch = np.array(sequence1[-n_steps:]).reshape(1, n_steps, 1) y_pred = sess.run(outputs, feed_dict={X: X_batch}) sequence1.append(y_pred[0, -1, 0]) sequence2 = [time_series(i * resolution + t_min + (t_max - t_min) / 3) for i in range(n_steps)] for iteration in range(len(t) - n_steps): X_batch = np.array(sequence2[-n_steps:]).reshape(1, n_steps, 1) y_pred = sess.run(outputs, feed_dict={X: X_batch}) sequence2.append(y_pred[0, -1, 0]) plt.figure(figsize=(11,4)) plt.subplot(121) plt.plot(t, sequence1, "b-") plt.plot(t[:n_steps], sequence1[:n_steps], "b-", linewidth=3) plt.xlabel("Time") plt.ylabel("Value") plt.subplot(122) plt.plot(t, sequence2, "b-") plt.plot(t[:n_steps], sequence2[:n_steps], "b-", linewidth=3) plt.xlabel("Time") save_fig("creative_sequence_plot") plt.show() # # Deep RNN # ## MultiRNNCell # In[71]: reset_graph() n_inputs = 2 n_steps = 5 X = tf.placeholder(tf.float32, [None, n_steps, n_inputs]) # In[72]: n_neurons = 100 n_layers = 3 layers = [tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons) for layer in range(n_layers)] multi_layer_cell = tf.nn.rnn_cell.MultiRNNCell(layers) outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32) # In[73]: init = tf.global_variables_initializer() # In[74]: X_batch = np.random.rand(2, n_steps, n_inputs) # In[75]: with tf.Session() as sess: init.run() outputs_val, states_val = sess.run([outputs, states], feed_dict={X: X_batch}) # In[76]: outputs_val.shape # ## Distributing a Deep RNN Across Multiple GPUs # Do **NOT** do this: # In[77]: with tf.device("/gpu:0"): # BAD! This is ignored. layer1 = tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons) with tf.device("/gpu:1"): # BAD! Ignored again. layer2 = tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons) # Instead, you need a `DeviceCellWrapper`: # In[78]: import tensorflow as tf class DeviceCellWrapper(tf.nn.rnn_cell.RNNCell): def __init__(self, device, cell): self._cell = cell self._device = device @property def state_size(self): return self._cell.state_size @property def output_size(self): return self._cell.output_size def __call__(self, inputs, state, scope=None): with tf.device(self._device): return self._cell(inputs, state, scope) # In[79]: reset_graph() n_inputs = 5 n_steps = 20 n_neurons = 100 X = tf.placeholder(tf.float32, shape=[None, n_steps, n_inputs]) # In[80]: devices = ["/cpu:0", "/cpu:0", "/cpu:0"] # replace with ["/gpu:0", "/gpu:1", "/gpu:2"] if you have 3 GPUs cells = [DeviceCellWrapper(dev, tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons)) for dev in devices] multi_layer_cell = tf.nn.rnn_cell.MultiRNNCell(cells) outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32) # Alternatively, since TensorFlow 1.1, you can use the `tf.contrib.rnn.DeviceWrapper` class (alias `tf.nn.rnn_cell.DeviceWrapper` since TF 1.2). # In[81]: init = tf.global_variables_initializer() # In[82]: with tf.Session() as sess: init.run() print(sess.run(outputs, feed_dict={X: np.random.rand(2, n_steps, n_inputs)})) # ## Dropout # In[83]: reset_graph() n_inputs = 1 n_neurons = 100 n_layers = 3 n_steps = 20 n_outputs = 1 # In[84]: X = tf.placeholder(tf.float32, [None, n_steps, n_inputs]) y = tf.placeholder(tf.float32, [None, n_steps, n_outputs]) # Note: the `input_keep_prob` parameter can be a placeholder, making it possible to set it to any value you want during training, and to 1.0 during testing (effectively turning dropout off).
This is a much more elegant solution than what was recommended in earlier versions of the book (i.e., writing your own wrapper class or having a separate model for training and testing). Thanks to Shen Cheng for bringing this to my attention. # In[85]: keep_prob = tf.placeholder_with_default(1.0, shape=()) cells = [tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons) for layer in range(n_layers)] cells_drop = [tf.nn.rnn_cell.DropoutWrapper(cell, input_keep_prob=keep_prob) for cell in cells] multi_layer_cell = tf.nn.rnn_cell.MultiRNNCell(cells_drop) rnn_outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32) # In[86]: learning_rate = 0.01 stacked_rnn_outputs = tf.reshape(rnn_outputs, [-1, n_neurons]) stacked_outputs = tf.layers.dense(stacked_rnn_outputs, n_outputs) outputs = tf.reshape(stacked_outputs, [-1, n_steps, n_outputs]) loss = tf.reduce_mean(tf.square(outputs - y)) optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) training_op = optimizer.minimize(loss) init = tf.global_variables_initializer() saver = tf.train.Saver() # In[87]: n_iterations = 1500 batch_size = 50 train_keep_prob = 0.5 with tf.Session() as sess: init.run() for iteration in range(n_iterations): X_batch, y_batch = next_batch(batch_size, n_steps) _, mse = sess.run([training_op, loss], feed_dict={X: X_batch, y: y_batch, keep_prob: train_keep_prob}) if iteration % 100 == 0: # not shown in the book print(iteration, "Training MSE:", mse) # not shown saver.save(sess, "./my_dropout_time_series_model") # In[88]: with tf.Session() as sess: saver.restore(sess, "./my_dropout_time_series_model") X_new = time_series(np.array(t_instance[:-1].reshape(-1, n_steps, n_inputs))) y_pred = sess.run(outputs, feed_dict={X: X_new}) # In[89]: plt.title("Testing the model", fontsize=14) plt.plot(t_instance[:-1], time_series(t_instance[:-1]), "bo", markersize=10, label="instance") plt.plot(t_instance[1:], time_series(t_instance[1:]), "w*", markersize=10, label="target") plt.plot(t_instance[1:], y_pred[0,:,0], "r.", markersize=10, label="prediction") plt.legend(loc="upper left") plt.xlabel("Time") plt.show() # Oops, it seems that Dropout does not help at all in this particular case. 
:/ # # LSTM # In[90]: reset_graph() lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=n_neurons) # In[91]: n_steps = 28 n_inputs = 28 n_neurons = 150 n_outputs = 10 n_layers = 3 learning_rate = 0.001 X = tf.placeholder(tf.float32, [None, n_steps, n_inputs]) y = tf.placeholder(tf.int32, [None]) lstm_cells = [tf.nn.rnn_cell.BasicLSTMCell(num_units=n_neurons) for layer in range(n_layers)] multi_cell = tf.nn.rnn_cell.MultiRNNCell(lstm_cells) outputs, states = tf.nn.dynamic_rnn(multi_cell, X, dtype=tf.float32) top_layer_h_state = states[-1][1] logits = tf.layers.dense(top_layer_h_state, n_outputs, name="softmax") xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits) loss = tf.reduce_mean(xentropy, name="loss") optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) training_op = optimizer.minimize(loss) correct = tf.nn.in_top_k(logits, y, 1) accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) init = tf.global_variables_initializer() # In[92]: states # In[93]: top_layer_h_state # In[94]: n_epochs = 10 batch_size = 150 with tf.Session() as sess: init.run() for epoch in range(n_epochs): for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size): X_batch = X_batch.reshape((-1, n_steps, n_inputs)) sess.run(training_op, feed_dict={X: X_batch, y: y_batch}) acc_batch = accuracy.eval(feed_dict={X: X_batch, y: y_batch}) acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test}) print(epoch, "Last batch accuracy:", acc_batch, "Test accuracy:", acc_test) # In[95]: lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=n_neurons, use_peepholes=True) # In[96]: gru_cell = tf.nn.rnn_cell.GRUCell(num_units=n_neurons) # # Embeddings # This section is based on TensorFlow's [Word2Vec tutorial](https://www.tensorflow.org/versions/r0.11/tutorials/word2vec/index.html). # ## Fetch the data # In[97]: import urllib.request import errno import os import zipfile WORDS_PATH = "datasets/words" WORDS_URL = 'http://mattmahoney.net/dc/text8.zip' def mkdir_p(path): """Create directories, ok if they already exist. This is for python 2 support. 
In python >=3.2, simply use: >>> os.makedirs(path, exist_ok=True) """ try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def fetch_words_data(words_url=WORDS_URL, words_path=WORDS_PATH): os.makedirs(words_path, exist_ok=True) zip_path = os.path.join(words_path, "words.zip") if not os.path.exists(zip_path): urllib.request.urlretrieve(words_url, zip_path) with zipfile.ZipFile(zip_path) as f: data = f.read(f.namelist()[0]) return data.decode("ascii").split() # In[98]: words = fetch_words_data() # In[99]: words[:5] # ## Build the dictionary # In[100]: from collections import Counter vocabulary_size = 50000 vocabulary = [("UNK", None)] + Counter(words).most_common(vocabulary_size - 1) vocabulary = np.array([word for word, _ in vocabulary]) dictionary = {word: code for code, word in enumerate(vocabulary)} data = np.array([dictionary.get(word, 0) for word in words]) # In[101]: " ".join(words[:9]), data[:9] # In[102]: " ".join([vocabulary[word_index] for word_index in [5241, 3081, 12, 6, 195, 2, 3134, 46, 59]]) # In[103]: words[24], data[24] # ## Generate batches # In[104]: from collections import deque def generate_batch(batch_size, num_skips, skip_window): global data_index assert batch_size % num_skips == 0 assert num_skips <= 2 * skip_window batch = np.ndarray(shape=[batch_size], dtype=np.int32) labels = np.ndarray(shape=[batch_size, 1], dtype=np.int32) span = 2 * skip_window + 1 # [ skip_window target skip_window ] buffer = deque(maxlen=span) for _ in range(span): buffer.append(data[data_index]) data_index = (data_index + 1) % len(data) for i in range(batch_size // num_skips): target = skip_window # target label at the center of the buffer targets_to_avoid = [ skip_window ] for j in range(num_skips): while target in targets_to_avoid: target = np.random.randint(0, span) targets_to_avoid.append(target) batch[i * num_skips + j] = buffer[skip_window] labels[i * num_skips + j, 0] = buffer[target] buffer.append(data[data_index]) data_index = (data_index + 1) % len(data) return batch, labels # In[105]: np.random.seed(42) # In[106]: data_index = 0 batch, labels = generate_batch(8, 2, 1) # In[107]: batch, [vocabulary[word] for word in batch] # In[108]: labels, [vocabulary[word] for word in labels[:, 0]] # ## Build the model # In[109]: batch_size = 128 embedding_size = 128 # Dimension of the embedding vector. skip_window = 1 # How many words to consider left and right. num_skips = 2 # How many times to reuse an input to generate a label. # We pick a random validation set to sample nearest neighbors. Here we limit the # validation samples to the words that have a low numeric ID, which by # construction are also the most frequent. valid_size = 16 # Random set of words to evaluate similarity on. valid_window = 100 # Only pick dev samples in the head of the distribution. valid_examples = np.random.choice(valid_window, valid_size, replace=False) num_sampled = 64 # Number of negative examples to sample. learning_rate = 0.01 # In[110]: reset_graph() # Input data. train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1]) valid_dataset = tf.constant(valid_examples, dtype=tf.int32) # In[111]: vocabulary_size = 50000 embedding_size = 150 # Look up embeddings for inputs. 
init_embeds = tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0) embeddings = tf.Variable(init_embeds) # In[112]: train_inputs = tf.placeholder(tf.int32, shape=[None]) embed = tf.nn.embedding_lookup(embeddings, train_inputs) # In[113]: # Construct the variables for the NCE loss nce_weights = tf.Variable( tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / np.sqrt(embedding_size))) nce_biases = tf.Variable(tf.zeros([vocabulary_size])) # Compute the average NCE loss for the batch. # tf.nn.nce_loss automatically draws a new sample of the negative labels each # time we evaluate the loss. loss = tf.reduce_mean( tf.nn.nce_loss(nce_weights, nce_biases, train_labels, embed, num_sampled, vocabulary_size)) # Construct the Adam optimizer optimizer = tf.train.AdamOptimizer(learning_rate) training_op = optimizer.minimize(loss) # Compute the cosine similarity between minibatch examples and all embeddings. norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), axis=1, keepdims=True)) normalized_embeddings = embeddings / norm valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset) similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True) # Add variable initializer. init = tf.global_variables_initializer() # ## Train the model # In[114]: num_steps = 10001 with tf.Session() as session: init.run() average_loss = 0 for step in range(num_steps): print("\rIteration: {}".format(step), end="\t") batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window) feed_dict = {train_inputs : batch_inputs, train_labels : batch_labels} # We perform one update step by evaluating the training op (including it # in the list of returned values for session.run()) _, loss_val = session.run([training_op, loss], feed_dict=feed_dict) average_loss += loss_val if step % 2000 == 0: if step > 0: average_loss /= 2000 # The average loss is an estimate of the loss over the last 2000 batches.
print("Average loss at step ", step, ": ", average_loss) average_loss = 0 # Note that this is expensive (~20% slowdown if computed every 500 steps) if step % 10000 == 0: sim = similarity.eval() for i in range(valid_size): valid_word = vocabulary[valid_examples[i]] top_k = 8 # number of nearest neighbors nearest = (-sim[i, :]).argsort()[1:top_k+1] log_str = "Nearest to %s:" % valid_word for k in range(top_k): close_word = vocabulary[nearest[k]] log_str = "%s %s," % (log_str, close_word) print(log_str) final_embeddings = normalized_embeddings.eval() # Let's save the final embeddings (of course you can use a TensorFlow `Saver` if you prefer): # In[115]: np.save("./my_final_embeddings.npy", final_embeddings) # ## Plot the embeddings # In[116]: def plot_with_labels(low_dim_embs, labels): assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings" plt.figure(figsize=(18, 18)) #in inches for i, label in enumerate(labels): x, y = low_dim_embs[i,:] plt.scatter(x, y) plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom') # In[117]: from sklearn.manifold import TSNE tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000) plot_only = 500 low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only,:]) labels = [vocabulary[i] for i in range(plot_only)] plot_with_labels(low_dim_embs, labels) # # Machine Translation # The `basic_rnn_seq2seq()` function creates a simple Encoder/Decoder model: it first runs an RNN to encode `encoder_inputs` into a state vector, then runs a decoder initialized with the last encoder state on `decoder_inputs`. Encoder and decoder use the same RNN cell type but they don't share parameters. # In[118]: import tensorflow as tf reset_graph() n_steps = 50 n_neurons = 200 n_layers = 3 num_encoder_symbols = 20000 num_decoder_symbols = 20000 embedding_size = 150 learning_rate = 0.01 X = tf.placeholder(tf.int32, [None, n_steps]) # English sentences Y = tf.placeholder(tf.int32, [None, n_steps]) # French translations W = tf.placeholder(tf.float32, [None, n_steps - 1, 1]) Y_input = Y[:, :-1] Y_target = Y[:, 1:] encoder_inputs = tf.unstack(tf.transpose(X)) # list of 1D tensors decoder_inputs = tf.unstack(tf.transpose(Y_input)) # list of 1D tensors lstm_cells = [tf.nn.rnn_cell.BasicLSTMCell(num_units=n_neurons) for layer in range(n_layers)] cell = tf.nn.rnn_cell.MultiRNNCell(lstm_cells) output_seqs, states = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq( encoder_inputs, decoder_inputs, cell, num_encoder_symbols, num_decoder_symbols, embedding_size) logits = tf.transpose(tf.unstack(output_seqs), perm=[1, 0, 2]) # In[119]: logits_flat = tf.reshape(logits, [-1, num_decoder_symbols]) Y_target_flat = tf.reshape(Y_target, [-1]) W_flat = tf.reshape(W, [-1]) xentropy = W_flat * tf.nn.sparse_softmax_cross_entropy_with_logits(labels=Y_target_flat, logits=logits_flat) loss = tf.reduce_mean(xentropy) optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) training_op = optimizer.minimize(loss) init = tf.global_variables_initializer() # # Exercise solutions # ## 1. to 6. # See Appendix A. # ## 7. Embedded Reber Grammars # First we need to build a function that generates strings based on a grammar. The grammar will be represented as a list of possible transitions for each state. A transition specifies the string to output (or a grammar to generate it) and the next state. 
# In[120]: np.random.seed(42) default_reber_grammar = [ [("B", 1)], # (state 0) =B=>(state 1) [("T", 2), ("P", 3)], # (state 1) =T=>(state 2) or =P=>(state 3) [("S", 2), ("X", 4)], # (state 2) =S=>(state 2) or =X=>(state 4) [("T", 3), ("V", 5)], # and so on... [("X", 3), ("S", 6)], [("P", 4), ("V", 6)], [("E", None)]] # (state 6) =E=>(terminal state) embedded_reber_grammar = [ [("B", 1)], [("T", 2), ("P", 3)], [(default_reber_grammar, 4)], [(default_reber_grammar, 5)], [("T", 6)], [("P", 6)], [("E", None)]] def generate_string(grammar): state = 0 output = [] while state is not None: index = np.random.randint(len(grammar[state])) production, state = grammar[state][index] if isinstance(production, list): production = generate_string(grammar=production) output.append(production) return "".join(output) # Let's generate a few strings based on the default Reber grammar: # In[121]: for _ in range(25): print(generate_string(default_reber_grammar), end=" ") # Looks good. Now let's generate a few strings based on the embedded Reber grammar: # In[122]: for _ in range(25): print(generate_string(embedded_reber_grammar), end=" ") # Okay, now we need a function to generate strings that do not respect the grammar. We could generate a random string, but the task would be a bit too easy, so instead we will generate a string that respects the grammar, and we will corrupt it by changing just one character: # In[123]: def generate_corrupted_string(grammar, chars="BEPSTVX"): good_string = generate_string(grammar) index = np.random.randint(len(good_string)) good_char = good_string[index] bad_char = np.random.choice(sorted(set(chars) - set(good_char))) return good_string[:index] + bad_char + good_string[index + 1:] # Let's look at a few corrupted strings: # In[124]: for _ in range(25): print(generate_corrupted_string(embedded_reber_grammar), end=" ") # It's not possible to feed a string directly to an RNN: we need to convert it to a sequence of vectors first. Each vector will represent a single letter, using a one-hot encoding. For example, the letter "B" will be represented as the vector `[1, 0, 0, 0, 0, 0, 0]`, the letter "E" will be represented as `[0, 1, 0, 0, 0, 0, 0]` and so on. Let's write a function that converts a string to a sequence of such one-hot vectors. Note that if the string is shorter than `n_steps`, it will be padded with zero vectors (later, we will tell TensorFlow how long each string actually is using the `sequence_length` parameter). # In[125]: def string_to_one_hot_vectors(string, n_steps, chars="BEPSTVX"): char_to_index = {char: index for index, char in enumerate(chars)} output = np.zeros((n_steps, len(chars)), dtype=np.int32) for index, char in enumerate(string): output[index, char_to_index[char]] = 1.
return output # In[126]: string_to_one_hot_vectors("BTBTXSETE", 12) # We can now generate the dataset, with 50% good strings and 50% bad strings: # In[127]: def generate_dataset(size): good_strings = [generate_string(embedded_reber_grammar) for _ in range(size // 2)] bad_strings = [generate_corrupted_string(embedded_reber_grammar) for _ in range(size - size // 2)] all_strings = good_strings + bad_strings n_steps = max([len(string) for string in all_strings]) X = np.array([string_to_one_hot_vectors(string, n_steps) for string in all_strings]) seq_length = np.array([len(string) for string in all_strings]) y = np.array([[1] for _ in range(len(good_strings))] + [[0] for _ in range(len(bad_strings))]) rnd_idx = np.random.permutation(size) return X[rnd_idx], seq_length[rnd_idx], y[rnd_idx] # In[128]: X_train, l_train, y_train = generate_dataset(10000) # Let's take a look at the first training instance: # In[129]: X_train[0] # It's padded with a lot of zeros because the longest string in the dataset is that long. How long is this particular string? # In[130]: l_train[0] # What class is it? # In[131]: y_train[0] # Perfect! We are ready to create the RNN to identify good strings. We build a sequence classifier very similar to the one we built earlier to classify MNIST images, with two main differences: # * First, the input strings have variable length, so we need to specify the `sequence_length` when calling the `dynamic_rnn()` function. # * Second, this is a binary classifier, so we only need one output neuron that will output, for each input string, the estimated log-odds (the logit) that it is a good string. For multiclass classification, we used `sparse_softmax_cross_entropy_with_logits()` but for binary classification we use `sigmoid_cross_entropy_with_logits()` (see the quick NumPy check below).
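# As a quick reminder of what `sigmoid_cross_entropy_with_logits()` computes, here is the same quantity in NumPy (a sketch using the textbook formula; TensorFlow's implementation is a numerically stable equivalent):

# In[ ]:

def sigmoid_xentropy(labels, logits):
    # elementwise -[y*log(p) + (1-y)*log(1-p)] with p = sigmoid(logits)
    p = 1 / (1 + np.exp(-logits))
    return -(labels * np.log(p) + (1 - labels) * np.log(1 - p))

print(sigmoid_xentropy(np.array([1., 0.]), np.array([2., -1.])))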
# # In[132]: reset_graph() possible_chars = "BEPSTVX" n_inputs = len(possible_chars) n_neurons = 30 n_outputs = 1 learning_rate = 0.02 momentum = 0.95 X = tf.placeholder(tf.float32, [None, None, n_inputs], name="X") seq_length = tf.placeholder(tf.int32, [None], name="seq_length") y = tf.placeholder(tf.float32, [None, 1], name="y") gru_cell = tf.nn.rnn_cell.GRUCell(num_units=n_neurons) outputs, states = tf.nn.dynamic_rnn(gru_cell, X, dtype=tf.float32, sequence_length=seq_length) logits = tf.layers.dense(states, n_outputs, name="logits") y_pred = tf.cast(tf.greater(logits, 0.), tf.float32, name="y_pred") y_proba = tf.nn.sigmoid(logits, name="y_proba") xentropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits) loss = tf.reduce_mean(xentropy, name="loss") optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=momentum, use_nesterov=True) training_op = optimizer.minimize(loss) correct = tf.equal(y_pred, y, name="correct") accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy") init = tf.global_variables_initializer() saver = tf.train.Saver() # Now let's generate a validation set so we can track progress during training: # In[133]: X_val, l_val, y_val = generate_dataset(5000) # In[134]: n_epochs = 50 batch_size = 50 with tf.Session() as sess: init.run() for epoch in range(n_epochs): X_batches = np.array_split(X_train, len(X_train) // batch_size) l_batches = np.array_split(l_train, len(l_train) // batch_size) y_batches = np.array_split(y_train, len(y_train) // batch_size) for X_batch, l_batch, y_batch in zip(X_batches, l_batches, y_batches): loss_val, _ = sess.run( [loss, training_op], feed_dict={X: X_batch, seq_length: l_batch, y: y_batch}) acc_train = accuracy.eval(feed_dict={X: X_batch, seq_length: l_batch, y: y_batch}) acc_val = accuracy.eval(feed_dict={X: X_val, seq_length: l_val, y: y_val}) print("{:4d} Train loss: {:.4f}, accuracy: {:.2f}% Validation accuracy: {:.2f}%".format( epoch, loss_val, 100 * acc_train, 100 * acc_val)) saver.save(sess, "./my_reber_classifier") # Now let's test our RNN on two tricky strings: the first one is bad while the second one is good. They only differ by the second to last character. If the RNN gets this right, it shows that it managed to notice the pattern that the second letter should always be equal to the second to last letter. That requires a fairly long short-term memory (which is the reason why we used a GRU cell). # In[135]: test_strings = [ "BPBTSSSSSSSXXTTVPXVPXTTTTTVVETE", "BPBTSSSSSSSXXTTVPXVPXTTTTTVVEPE"] l_test = np.array([len(s) for s in test_strings]) max_length = l_test.max() X_test = [string_to_one_hot_vectors(s, n_steps=max_length) for s in test_strings] with tf.Session() as sess: saver.restore(sess, "./my_reber_classifier") y_proba_val = y_proba.eval(feed_dict={X: X_test, seq_length: l_test}) print() print("Estimated probability that these are Reber strings:") for index, string in enumerate(test_strings): print("{}: {:.2f}%".format(string, 100 * y_proba_val[index][0])) # Ta-da! It worked fine. The RNN found the correct answers with high confidence. :) # ## 8. and 9. # Coming soon... # In[ ]:
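# Appendix to exercise 7: a quick check (not in the book) that the two tricky test strings above really differ only in their second-to-last character:

# In[ ]:

s1 = "BPBTSSSSSSSXXTTVPXVPXTTTTTVVETE"
s2 = "BPBTSSSSSSSXXTTVPXVPXTTTTTVVEPE"
diff_positions = [i for i, (a, b) in enumerate(zip(s1, s2)) if a != b]
print(diff_positions, "out of", len(s1), "characters")  # a single differing index, near the end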