#!/usr/bin/env python
# coding: utf-8

# # Image Captioning with RNNs
# In this exercise you will implement a vanilla recurrent neural network and use it to train a model that can generate novel captions for images.

# In[1]:

# As usual, a bit of setup
from __future__ import print_function
import time, os, json
import numpy as np
import matplotlib.pyplot as plt

from cs231n.gradient_check import eval_numerical_gradient, eval_numerical_gradient_array
from cs231n.rnn_layers import *
from cs231n.captioning_solver import CaptioningSolver
from cs231n.classifiers.rnn import CaptioningRNN
from cs231n.coco_utils import load_coco_data, sample_coco_minibatch, decode_captions
from cs231n.image_utils import image_from_url

get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams['figure.figsize'] = (10.0, 8.0)  # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

def rel_error(x, y):
    """ returns relative error """
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))


# ## Install h5py
# The COCO dataset we will be using is stored in HDF5 format. To load HDF5 files, we will need to install the `h5py` Python package. From the command line, run:
# `pip install h5py`
# If you receive a permissions error, you may need to run the command as root:
# ```sudo pip install h5py```
# 
# You can also run commands directly from the Jupyter notebook by prefixing the command with the "!" character:

# In[2]:

get_ipython().system('pip install h5py')


# # Microsoft COCO
# For this exercise we will use the 2014 release of the [Microsoft COCO dataset](http://mscoco.org/), which has become the standard testbed for image captioning. The dataset consists of 80,000 training images and 40,000 validation images, each annotated with 5 captions written by workers on Amazon Mechanical Turk.
# 
# You should have already downloaded the data by changing to the `cs231n/datasets` directory and running the script `get_assignment3_data.sh`. If you haven't yet done so, run that script now. Warning: the COCO data download is ~1GB.
# 
# We have preprocessed the data and extracted features for you already. For all images we have extracted features from the fc7 layer of the VGG-16 network pretrained on ImageNet; these features are stored in the files `train2014_vgg16_fc7.h5` and `val2014_vgg16_fc7.h5` respectively. To cut down on processing time and memory requirements, we have reduced the dimensionality of the features from 4096 to 512; these features can be found in the files `train2014_vgg16_fc7_pca.h5` and `val2014_vgg16_fc7_pca.h5`.
# 
# The raw images take up a lot of space (nearly 20GB) so we have not included them in the download. However all images are taken from Flickr, and the URLs of the training and validation images are stored in the files `train2014_urls.txt` and `val2014_urls.txt` respectively. This allows you to download images on the fly for visualization. Since images are downloaded on the fly, **you must be connected to the internet to view images**.
# 
# Dealing with strings is inefficient, so we will work with an encoded version of the captions. Each word is assigned an integer ID, allowing us to represent a caption as a sequence of integers. The mapping between integer IDs and words is in the file `coco2014_vocab.json`, and you can use the function `decode_captions` from the file `cs231n/coco_utils.py` to convert numpy arrays of integer IDs back into strings.
# 
# There are a couple of special tokens that we add to the vocabulary. We prepend a special `<START>` token and append an `<END>` token to the beginning and end of each caption respectively. Rare words are replaced with a special `<UNK>` token (for "unknown"). In addition, since we want to train with minibatches containing captions of different lengths, we pad short captions with a special `<NULL>` token after the `<END>` token and don't compute loss or gradient for `<NULL>` tokens. Since they are a bit of a pain, we have taken care of all implementation details around special tokens for you.
# 
# You can load all of the MS-COCO data (captions, features, URLs, and vocabulary) using the `load_coco_data` function from the file `cs231n/coco_utils.py`. Run the following cell to do so:

# In[4]:

# Load COCO data from disk; this returns a dictionary.
# We'll work with dimensionality-reduced features for this notebook, but feel
# free to experiment with the original features by changing the flag below.
data = load_coco_data(pca_features=True)

# Print out all the keys and values from the data dictionary
for k, v in data.items():
    if type(v) == np.ndarray:
        print(k, type(v), v.shape, v.dtype)
    else:
        print(k, type(v), len(v))


# ## Look at the data
# It is always a good idea to look at examples from the dataset before working with it.
# 
# You can use the `sample_coco_minibatch` function from the file `cs231n/coco_utils.py` to sample minibatches of data from the data structure returned by `load_coco_data`. Run the following to sample a small minibatch of training data and show the images and their captions. Running it multiple times and looking at the results helps you get a sense of the dataset.
# 
# Note that we decode the captions using the `decode_captions` function and that we download the images on the fly using their Flickr URLs, so **you must be connected to the internet to view images**.

# In[5]:

# Sample a minibatch and show the images and captions
batch_size = 3

captions, features, urls = sample_coco_minibatch(data, batch_size=batch_size)
for i, (caption, url) in enumerate(zip(captions, urls)):
    plt.imshow(image_from_url(url))
    plt.axis('off')
    caption_str = decode_captions(caption, data['idx_to_word'])
    plt.title(caption_str)
    plt.show()


# # Recurrent Neural Networks
# As discussed in lecture, we will use recurrent neural network (RNN) language models for image captioning. The file `cs231n/rnn_layers.py` contains implementations of the different layer types that are needed for recurrent neural networks, and the file `cs231n/classifiers/rnn.py` uses these layers to implement an image captioning model.
# 
# We will first implement different types of RNN layers in `cs231n/rnn_layers.py`.

# # Vanilla RNN: step forward
# Open the file `cs231n/rnn_layers.py`. This file implements the forward and backward passes for different types of layers that are commonly used in recurrent neural networks.
# 
# First implement the function `rnn_step_forward`, which implements the forward pass for a single timestep of a vanilla recurrent neural network. After doing so, run the following to check your implementation. You should see errors less than 1e-8.

# In[6]:

N, D, H = 3, 10, 4

x = np.linspace(-0.4, 0.7, num=N*D).reshape(N, D)
prev_h = np.linspace(-0.2, 0.5, num=N*H).reshape(N, H)
Wx = np.linspace(-0.1, 0.9, num=D*H).reshape(D, H)
Wh = np.linspace(-0.3, 0.7, num=H*H).reshape(H, H)
b = np.linspace(-0.2, 0.4, num=H)

next_h, _ = rnn_step_forward(x, prev_h, Wx, Wh, b)
expected_next_h = np.asarray([
  [-0.58172089, -0.50182032, -0.41232771, -0.31410098],
  [ 0.66854692,  0.79562378,  0.87755553,  0.92795967],
  [ 0.97934501,  0.99144213,  0.99646691,  0.99854353]])

print('next_h error: ', rel_error(expected_next_h, next_h))


# # Vanilla RNN: step backward
# In the file `cs231n/rnn_layers.py`, implement the `rnn_step_backward` function. After doing so, run the following to numerically gradient check your implementation. You should see errors less than `1e-8`.
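# 
# Before the gradient check below, here is one possible (hedged) sketch of how a step forward/backward pair could look, assuming the step formula `next_h = tanh(x Wx + prev_h Wh + b)` and a particular cache layout. Your implementation in `cs231n/rnn_layers.py` may organize its cache differently; the gradient-check cell that follows is what actually validates your code.

# In[ ]:

def rnn_step_forward_sketch(x, prev_h, Wx, Wh, b):
    """Sketch of a vanilla RNN step: next_h = tanh(x Wx + prev_h Wh + b)."""
    a = x.dot(Wx) + prev_h.dot(Wh) + b
    next_h = np.tanh(a)
    cache = (x, prev_h, Wx, Wh, next_h)  # assumed cache layout
    return next_h, cache

def rnn_step_backward_sketch(dnext_h, cache):
    """Sketch of the matching backward pass through the tanh nonlinearity."""
    x, prev_h, Wx, Wh, next_h = cache
    da = dnext_h * (1 - next_h ** 2)  # d/da tanh(a) = 1 - tanh(a)^2
    dx = da.dot(Wx.T)
    dprev_h = da.dot(Wh.T)
    dWx = x.T.dot(da)
    dWh = prev_h.T.dot(da)
    db = da.sum(axis=0)
    return dx, dprev_h, dWx, dWh, db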
# In[7]:

from cs231n.rnn_layers import rnn_step_forward, rnn_step_backward
np.random.seed(231)
N, D, H = 4, 5, 6
x = np.random.randn(N, D)
h = np.random.randn(N, H)
Wx = np.random.randn(D, H)
Wh = np.random.randn(H, H)
b = np.random.randn(H)

out, cache = rnn_step_forward(x, h, Wx, Wh, b)

dnext_h = np.random.randn(*out.shape)

fx = lambda x: rnn_step_forward(x, h, Wx, Wh, b)[0]
fh = lambda prev_h: rnn_step_forward(x, prev_h, Wx, Wh, b)[0]
fWx = lambda Wx: rnn_step_forward(x, h, Wx, Wh, b)[0]
fWh = lambda Wh: rnn_step_forward(x, h, Wx, Wh, b)[0]
fb = lambda b: rnn_step_forward(x, h, Wx, Wh, b)[0]

dx_num = eval_numerical_gradient_array(fx, x, dnext_h)
dprev_h_num = eval_numerical_gradient_array(fh, h, dnext_h)
dWx_num = eval_numerical_gradient_array(fWx, Wx, dnext_h)
dWh_num = eval_numerical_gradient_array(fWh, Wh, dnext_h)
db_num = eval_numerical_gradient_array(fb, b, dnext_h)

dx, dprev_h, dWx, dWh, db = rnn_step_backward(dnext_h, cache)

print('dx error: ', rel_error(dx_num, dx))
print('dprev_h error: ', rel_error(dprev_h_num, dprev_h))
print('dWx error: ', rel_error(dWx_num, dWx))
print('dWh error: ', rel_error(dWh_num, dWh))
print('db error: ', rel_error(db_num, db))


# # Vanilla RNN: forward
# Now that you have implemented the forward and backward passes for a single timestep of a vanilla RNN, you will combine these pieces to implement an RNN that processes an entire sequence of data.
# 
# In the file `cs231n/rnn_layers.py`, implement the function `rnn_forward`. This should be implemented using the `rnn_step_forward` function that you defined above. After doing so, run the following to check your implementation. You should see errors less than `1e-7`.

# In[8]:

N, T, D, H = 2, 3, 4, 5

x = np.linspace(-0.1, 0.3, num=N*T*D).reshape(N, T, D)
h0 = np.linspace(-0.3, 0.1, num=N*H).reshape(N, H)
Wx = np.linspace(-0.2, 0.4, num=D*H).reshape(D, H)
Wh = np.linspace(-0.4, 0.1, num=H*H).reshape(H, H)
b = np.linspace(-0.7, 0.1, num=H)

h, _ = rnn_forward(x, h0, Wx, Wh, b)
expected_h = np.asarray([
  [
    [-0.42070749, -0.27279261, -0.11074945,  0.05740409,  0.22236251],
    [-0.39525808, -0.22554661, -0.0409454,   0.14649412,  0.32397316],
    [-0.42305111, -0.24223728, -0.04287027,  0.15997045,  0.35014525],
  ],
  [
    [-0.55857474, -0.39065825, -0.19198182,  0.02378408,  0.23735671],
    [-0.27150199, -0.07088804,  0.13562939,  0.33099728,  0.50158768],
    [-0.51014825, -0.30524429, -0.06755202,  0.17806392,  0.40333043]]])
print('h error: ', rel_error(expected_h, h))


# # Vanilla RNN: backward
# In the file `cs231n/rnn_layers.py`, implement the backward pass for a vanilla RNN in the function `rnn_backward`. This should run back-propagation over the entire sequence, calling into the `rnn_step_backward` function that you defined above. You should see errors less than 5e-7.
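# 
# Building on the step sketches above, here is a hedged sketch of how the full-sequence forward and backward passes might be structured; it assumes the same per-step cache layout and is not necessarily how your `rnn_forward`/`rnn_backward` need to look. The gradient check in the next cell validates your actual implementation.

# In[ ]:

def rnn_forward_sketch(x, h0, Wx, Wh, b):
    """Sketch: run the step forward over all T timesteps of x with shape (N, T, D)."""
    N, T, D = x.shape
    H = h0.shape[1]
    h = np.zeros((N, T, H))
    prev_h = h0
    caches = []
    for t in range(T):
        prev_h, cache = rnn_step_forward_sketch(x[:, t, :], prev_h, Wx, Wh, b)
        h[:, t, :] = prev_h
        caches.append(cache)
    return h, caches

def rnn_backward_sketch(dh, caches):
    """Sketch: walk backward through time, accumulating parameter gradients.
    dh has shape (N, T, H) and holds upstream gradients on every hidden state."""
    N, T, H = dh.shape
    D = caches[0][0].shape[1]
    dx = np.zeros((N, T, D))
    dprev_h = np.zeros((N, H))
    dWx, dWh = np.zeros((D, H)), np.zeros((H, H))
    db = np.zeros(H)
    for t in reversed(range(T)):
        # Gradient flowing into step t comes from the loss at t plus step t+1.
        dx[:, t, :], dprev_h, dWx_t, dWh_t, db_t = rnn_step_backward_sketch(
            dh[:, t, :] + dprev_h, caches[t])
        dWx += dWx_t
        dWh += dWh_t
        db += db_t
    dh0 = dprev_h
    return dx, dh0, dWx, dWh, db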
# In[9]:

np.random.seed(231)

N, D, T, H = 2, 3, 10, 5

x = np.random.randn(N, T, D)
h0 = np.random.randn(N, H)
Wx = np.random.randn(D, H)
Wh = np.random.randn(H, H)
b = np.random.randn(H)

out, cache = rnn_forward(x, h0, Wx, Wh, b)

dout = np.random.randn(*out.shape)

dx, dh0, dWx, dWh, db = rnn_backward(dout, cache)

fx = lambda x: rnn_forward(x, h0, Wx, Wh, b)[0]
fh0 = lambda h0: rnn_forward(x, h0, Wx, Wh, b)[0]
fWx = lambda Wx: rnn_forward(x, h0, Wx, Wh, b)[0]
fWh = lambda Wh: rnn_forward(x, h0, Wx, Wh, b)[0]
fb = lambda b: rnn_forward(x, h0, Wx, Wh, b)[0]

dx_num = eval_numerical_gradient_array(fx, x, dout)
dh0_num = eval_numerical_gradient_array(fh0, h0, dout)
dWx_num = eval_numerical_gradient_array(fWx, Wx, dout)
dWh_num = eval_numerical_gradient_array(fWh, Wh, dout)
db_num = eval_numerical_gradient_array(fb, b, dout)

print('dx error: ', rel_error(dx_num, dx))
print('dh0 error: ', rel_error(dh0_num, dh0))
print('dWx error: ', rel_error(dWx_num, dWx))
print('dWh error: ', rel_error(dWh_num, dWh))
print('db error: ', rel_error(db_num, db))


# # Word embedding: forward
# In deep learning systems, we commonly represent words using vectors. Each word of the vocabulary will be associated with a vector, and these vectors will be learned jointly with the rest of the system.
# 
# In the file `cs231n/rnn_layers.py`, implement the function `word_embedding_forward` to convert words (represented by integers) into vectors. Run the following to check your implementation. You should see an error around `1e-8`.

# In[10]:

N, T, V, D = 2, 4, 5, 3

x = np.asarray([[0, 3, 1, 2], [2, 1, 0, 3]])
W = np.linspace(0, 1, num=V*D).reshape(V, D)

out, _ = word_embedding_forward(x, W)
expected_out = np.asarray([
 [[ 0.,          0.07142857,  0.14285714],
  [ 0.64285714,  0.71428571,  0.78571429],
  [ 0.21428571,  0.28571429,  0.35714286],
  [ 0.42857143,  0.5,         0.57142857]],
 [[ 0.42857143,  0.5,         0.57142857],
  [ 0.21428571,  0.28571429,  0.35714286],
  [ 0.,          0.07142857,  0.14285714],
  [ 0.64285714,  0.71428571,  0.78571429]]])

print('out error: ', rel_error(expected_out, out))


# # Word embedding: backward
# Implement the backward pass for the word embedding function in the function `word_embedding_backward`. After doing so, run the following to numerically gradient check your implementation. You should see errors less than `1e-11`.

# In[11]:

np.random.seed(231)

N, T, V, D = 50, 3, 5, 6
x = np.random.randint(V, size=(N, T))
W = np.random.randn(V, D)

out, cache = word_embedding_forward(x, W)
dout = np.random.randn(*out.shape)
dW = word_embedding_backward(dout, cache)

f = lambda W: word_embedding_forward(x, W)[0]
dW_num = eval_numerical_gradient_array(f, W, dout)

print('dW error: ', rel_error(dW, dW_num))


# # Temporal Affine layer
# At every timestep we use an affine function to transform the RNN hidden vector at that timestep into scores for each word in the vocabulary. Because this is very similar to the affine layer that you implemented in assignment 2, we have provided this function for you in the `temporal_affine_forward` and `temporal_affine_backward` functions in the file `cs231n/rnn_layers.py`. Run the following to perform numeric gradient checking on the implementation. You should see errors less than 1e-9.
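# 
# For reference, these are minimal, hedged sketches of the word-embedding lookup described above and of what a temporal affine transform computes; the provided implementations in `cs231n/rnn_layers.py` may differ in details such as cache contents. The gradient check in the next cell uses the provided functions, not these sketches.

# In[ ]:

def word_embedding_forward_sketch(x, W):
    """Sketch: look up the embedding vector for every integer ID in x with shape (N, T)."""
    out = W[x]            # shape (N, T, D)
    cache = (x, W.shape)  # assumed cache layout
    return out, cache

def word_embedding_backward_sketch(dout, cache):
    """Sketch: scatter-add upstream gradients back into the embedding matrix."""
    x, W_shape = cache
    dW = np.zeros(W_shape)
    np.add.at(dW, x, dout)  # words used multiple times accumulate gradients
    return dW

def temporal_affine_forward_sketch(x, w, b):
    """Sketch: apply the same affine transform to the hidden vector at every timestep."""
    N, T, D = x.shape
    M = b.shape[0]
    out = x.reshape(N * T, D).dot(w).reshape(N, T, M) + b
    return out, (x, w, b)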
# In[12]:

np.random.seed(231)

# Gradient check for temporal affine layer
N, T, D, M = 2, 3, 4, 5
x = np.random.randn(N, T, D)
w = np.random.randn(D, M)
b = np.random.randn(M)

out, cache = temporal_affine_forward(x, w, b)

dout = np.random.randn(*out.shape)

fx = lambda x: temporal_affine_forward(x, w, b)[0]
fw = lambda w: temporal_affine_forward(x, w, b)[0]
fb = lambda b: temporal_affine_forward(x, w, b)[0]

dx_num = eval_numerical_gradient_array(fx, x, dout)
dw_num = eval_numerical_gradient_array(fw, w, dout)
db_num = eval_numerical_gradient_array(fb, b, dout)

dx, dw, db = temporal_affine_backward(dout, cache)

print('dx error: ', rel_error(dx_num, dx))
print('dw error: ', rel_error(dw_num, dw))
print('db error: ', rel_error(db_num, db))


# # Temporal Softmax loss
# In an RNN language model, at every timestep we produce a score for each word in the vocabulary. We know the ground-truth word at each timestep, so we use a softmax loss function to compute loss and gradient at each timestep. We sum the losses over time and average them over the minibatch.
# 
# However there is one wrinkle: since we operate over minibatches and different captions may have different lengths, we append `<NULL>` tokens to the end of each caption so they all have the same length. We don't want these `<NULL>` tokens to count toward the loss or gradient, so in addition to scores and ground-truth labels our loss function also accepts a `mask` array that tells it which elements of the scores count towards the loss.
# 
# Since this is very similar to the softmax loss function you implemented in assignment 1, we have implemented this loss function for you; look at the `temporal_softmax_loss` function in the file `cs231n/rnn_layers.py`.
# 
# Run the following cell to sanity check the loss and perform numeric gradient checking on the function. You should see an error for dx less than 1e-7.

# In[14]:

# Sanity check for temporal softmax loss
from cs231n.rnn_layers import temporal_softmax_loss

N, T, V = 100, 1, 10

def check_loss(N, T, V, p):
    x = 0.001 * np.random.randn(N, T, V)
    y = np.random.randint(V, size=(N, T))
    mask = np.random.rand(N, T) <= p
    print(temporal_softmax_loss(x, y, mask)[0])

check_loss(100, 1, 10, 1.0)    # Should be about 2.3
check_loss(100, 10, 10, 1.0)   # Should be about 23
check_loss(5000, 10, 10, 0.1)  # Should be about 2.3

# Gradient check for temporal softmax loss
N, T, V = 7, 8, 9

x = np.random.randn(N, T, V)
y = np.random.randint(V, size=(N, T))
mask = (np.random.rand(N, T) > 0.5)

loss, dx = temporal_softmax_loss(x, y, mask, verbose=False)

dx_num = eval_numerical_gradient(lambda x: temporal_softmax_loss(x, y, mask)[0], x, verbose=False)

print('dx error: ', rel_error(dx, dx_num))


# # RNN for image captioning
# Now that you have implemented the necessary layers, you can combine them to build an image captioning model. Open the file `cs231n/classifiers/rnn.py` and look at the `CaptioningRNN` class.
# 
# Implement the forward and backward pass of the model in the `loss` function. For now you only need to implement the case where `cell_type='rnn'` for vanilla RNNs; you will implement the LSTM case later. After doing so, run the following to check your forward pass using a small test case; you should see error less than `1e-10`.
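# 
# Before the test cell below, it may help to see the overall forward flow as a hedged outline. The parameter names here (`W_proj`, `b_proj`, `W_embed`, `Wx`, `Wh`, `b`, `W_vocab`, `b_vocab`) and the exact input/target caption split are assumptions about how `CaptioningRNN` organizes its weights, not a prescription; the backward pass chains the corresponding `*_backward` layers in reverse, seeded by `dscores`.

# In[ ]:

def captioning_rnn_loss_sketch(params, features, captions, null_idx):
    """Hedged outline of a vanilla-RNN captioning forward pass and loss."""
    # Use captions[:, :-1] as RNN inputs and captions[:, 1:] as targets.
    captions_in, captions_out = captions[:, :-1], captions[:, 1:]
    mask = (captions_out != null_idx)  # don't count <NULL> targets in the loss

    # (1) Project image features to the initial hidden state h0.
    h0 = features.dot(params['W_proj']) + params['b_proj']
    # (2) Embed the input words.
    x, _ = word_embedding_forward(captions_in, params['W_embed'])
    # (3) Run the vanilla RNN over the whole caption.
    h, _ = rnn_forward(x, h0, params['Wx'], params['Wh'], params['b'])
    # (4) Compute vocabulary scores at every timestep.
    scores, _ = temporal_affine_forward(h, params['W_vocab'], params['b_vocab'])
    # (5) Masked softmax loss over the target words; dscores would seed the backward pass.
    loss, dscores = temporal_softmax_loss(scores, captions_out, mask)
    return loss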
# In[16]:

N, D, W, H = 10, 20, 30, 40
word_to_idx = {'<NULL>': 0, 'cat': 2, 'dog': 3}
V = len(word_to_idx)
T = 13

model = CaptioningRNN(word_to_idx,
          input_dim=D,
          wordvec_dim=W,
          hidden_dim=H,
          cell_type='rnn',
          dtype=np.float64)

# Set all model parameters to fixed values
for k, v in model.params.items():
    model.params[k] = np.linspace(-1.4, 1.3, num=v.size).reshape(*v.shape)

features = np.linspace(-1.5, 0.3, num=(N * D)).reshape(N, D)
captions = (np.arange(N * T) % V).reshape(N, T)

loss, grads = model.loss(features, captions)
expected_loss = 9.83235591003

print('loss: ', loss)
print('expected loss: ', expected_loss)
print('difference: ', abs(loss - expected_loss))


# Run the following cell to perform numeric gradient checking on the `CaptioningRNN` class; you should see errors around `5e-6` or less.

# In[17]:

np.random.seed(231)

batch_size = 2
timesteps = 3
input_dim = 4
wordvec_dim = 5
hidden_dim = 6
word_to_idx = {'<NULL>': 0, 'cat': 2, 'dog': 3}
vocab_size = len(word_to_idx)

captions = np.random.randint(vocab_size, size=(batch_size, timesteps))
features = np.random.randn(batch_size, input_dim)

model = CaptioningRNN(word_to_idx,
          input_dim=input_dim,
          wordvec_dim=wordvec_dim,
          hidden_dim=hidden_dim,
          cell_type='rnn',
          dtype=np.float64,
        )

loss, grads = model.loss(features, captions)

for param_name in sorted(grads):
    f = lambda _: model.loss(features, captions)[0]
    param_grad_num = eval_numerical_gradient(f, model.params[param_name], verbose=False, h=1e-6)
    e = rel_error(param_grad_num, grads[param_name])
    print('%s relative error: %e' % (param_name, e))


# # Overfit small data
# Similar to the `Solver` class that we used to train image classification models on the previous assignment, on this assignment we use a `CaptioningSolver` class to train image captioning models. Open the file `cs231n/captioning_solver.py` and read through the `CaptioningSolver` class; it should look very familiar.
# 
# Once you have familiarized yourself with the API, run the following to make sure your model overfits a small sample of 50 training examples. You should see losses of less than 0.1.

# In[18]:

np.random.seed(231)

small_data = load_coco_data(max_train=50)

small_rnn_model = CaptioningRNN(
          cell_type='rnn',
          word_to_idx=data['word_to_idx'],
          input_dim=data['train_features'].shape[1],
          hidden_dim=512,
          wordvec_dim=256,
        )

small_rnn_solver = CaptioningSolver(small_rnn_model, small_data,
           update_rule='adam',
           num_epochs=50,
           batch_size=25,
           optim_config={
             'learning_rate': 5e-3,
           },
           lr_decay=0.95,
           verbose=True, print_every=10,
         )

small_rnn_solver.train()

# Plot the training losses
plt.plot(small_rnn_solver.loss_history)
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('Training loss history')
plt.show()


# # Test-time sampling
# Unlike classification models, image captioning models behave very differently at training time and at test time. At training time, we have access to the ground-truth caption, so we feed ground-truth words as input to the RNN at each timestep. At test time, we sample from the distribution over the vocabulary at each timestep, and feed the sample as input to the RNN at the next timestep.
# 
# In the file `cs231n/classifiers/rnn.py`, implement the `sample` method for test-time sampling. After doing so, run the following to sample from your overfitted model on both training and validation data. The samples on training data should be very good; the samples on validation data probably won't make sense.
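# 
# As a rough guide before the sampling cell below, greedy test-time generation can be structured as in this hedged sketch. It is not the required implementation: the parameter names match the earlier outline and are assumptions, `start_idx` stands for the ID of the `<START>` token, and the sketch always takes the argmax word, whereas you could also sample from the softmax distribution at each step.

# In[ ]:

def sample_greedy_sketch(params, features, start_idx, max_length=30):
    """Hedged sketch of greedy caption sampling with a vanilla RNN."""
    N = features.shape[0]
    captions = np.zeros((N, max_length), dtype=np.int32)

    # Initial hidden state comes from the image features; the first input word is <START>.
    h = features.dot(params['W_proj']) + params['b_proj']
    word = np.full(N, start_idx, dtype=np.int32)

    for t in range(max_length):
        x = params['W_embed'][word]  # embed the previous word, shape (N, D)
        h, _ = rnn_step_forward(x, h, params['Wx'], params['Wh'], params['b'])
        scores = h.dot(params['W_vocab']) + params['b_vocab']
        word = scores.argmax(axis=1)  # greedy choice of the next word
        captions[:, t] = word
    return captions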
# In[19]:

for split in ['train', 'val']:
    minibatch = sample_coco_minibatch(small_data, split=split, batch_size=2)
    gt_captions, features, urls = minibatch
    gt_captions = decode_captions(gt_captions, data['idx_to_word'])

    sample_captions = small_rnn_model.sample(features)
    sample_captions = decode_captions(sample_captions, data['idx_to_word'])

    for gt_caption, sample_caption, url in zip(gt_captions, sample_captions, urls):
        plt.imshow(image_from_url(url))
        plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
        plt.axis('off')
        plt.show()