#!/usr/bin/env python
# coding: utf-8

# # Vanilla LSTM with CuPy
#
# This is a port of [Vanilla LSTM with numpy](http://blog.varunajayasiri.com/numpy_lstm.html) that shows how to run numpy-based machine learning code on a GPU using [CuPy](https://github.com/cupy/cupy). All of the content below is essentially copied from that article: [Vanilla LSTM with numpy](http://blog.varunajayasiri.com/numpy_lstm.html).
#
# It is inspired by [Minimal character-level language model with a Vanilla Recurrent Neural Network](https://gist.github.com/karpathy/d4dee566867f8291f086), in Python/numpy, by [Andrej Karpathy](https://github.com/karpathy).
#
# The model usually reaches an error of about 45 after 5000 iterations when tested with the [100,000 character sample from Shakespeare](http://cs.stanford.edu/people/karpathy/char-rnn/shakespear.txt). However, it sometimes gets stuck in a local minimum; reinitialize the weights if this happens.
#
# You need to place the input text file as `input.txt` in the same folder as the Python code.

# ## Download data

# In[1]:

get_ipython().run_cell_magic('bash', '', 'curl -L http://cs.stanford.edu/people/karpathy/char-rnn/shakespear.txt -o input.txt\n')

# ## Imports

# In[2]:

import numpy as np
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
from IPython import display

np.random.seed(2017)

# The above imports are taken from the original article as-is. In this article, however, you can run almost all of the computation on the GPU by just replacing `np` with `cp`; `cp` is simply another name for `cupy`.

# In[3]:

import cupy as cp

cp.random.seed(2017)

# To show how the computation time changes when switching between CPU and GPU, let's use `xp` instead of `np` and switch the package it refers to between `numpy` and `cupy`.

# In[4]:

# If you use CPU:
# xp = np

# If you use GPU: almost 2 times faster
xp = cp

# **Because CuPy is highly compatible with NumPy, the same NumPy-based code is easily converted to CuPy by just replacing `numpy` with `cupy`.**
#
# **Note:**
#
# If `H_size` is larger than about 1000, training on the GPU (that is, with CuPy) is several times faster than with NumPy. When the arrays are small, however, the speed gain is often small.

# ## Read and process data

# In[5]:

data = open('input.txt', 'r').read()

# #### Process data and calculate indexes

# In[6]:

chars = list(set(data))
data_size, X_size = len(data), len(chars)
print("data has %d characters, %d unique" % (data_size, X_size))
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = {i: ch for i, ch in enumerate(chars)}

# ## Parameters

# In[7]:

H_size = 100  # Size of the hidden layer
T_steps = 25  # Number of time steps (length of the sequence) used for training
learning_rate = 1e-1  # Learning rate
weight_sd = 0.1  # Standard deviation of weights for initialization
z_size = H_size + X_size  # Size of the concatenate(H, X) vector

# ## Activation Functions and Derivatives
#
# ### Sigmoid
#
# $$
# \begin{eqnarray}
# \sigma(x) &=& \frac{1}{1 + e^{-x}} \\
# \frac{d \sigma(x)}{d x} &=& \sigma(x) \cdot (1 - \sigma(x))
# \end{eqnarray}
# $$
#
# ### Tanh
#
# $$
# \frac{d \tanh(x)}{dx} = 1 - \tanh^2(x)
# $$

# In[8]:

def sigmoid(x):
    return 1 / (1 + xp.exp(-x))


def dsigmoid(y):
    # Derivative expressed in terms of the sigmoid *output* y = sigmoid(x)
    return y * (1 - y)


def tanh(x):
    return xp.tanh(x)


def dtanh(y):
    # Derivative expressed in terms of the tanh *output* y = tanh(x)
    return 1 - y * y
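# `dsigmoid` and `dtanh` are written in terms of the activation *outputs* rather than their inputs, which is what makes the backward pass cheap. The short check below is not part of the original article; it is a minimal sanity-check sketch (the helper name `check_activation_derivatives` is ours) that compares the analytical derivatives against central finite differences.

def check_activation_derivatives(eps=1e-3):
    # Compare dsigmoid/dtanh (which take the activation output) against
    # numerical derivatives of sigmoid/tanh from central differences.
    x = xp.linspace(-3, 3, 7, dtype=xp.float32).reshape(-1, 1)
    num_dsigmoid = (sigmoid(x + eps) - sigmoid(x - eps)) / (2 * eps)
    num_dtanh = (tanh(x + eps) - tanh(x - eps)) / (2 * eps)
    assert xp.allclose(dsigmoid(sigmoid(x)), num_dsigmoid, atol=1e-3)
    assert xp.allclose(dtanh(tanh(x)), num_dtanh, atol=1e-3)


check_activation_derivatives()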
# ## Initialize weights
#
# We use random weights drawn from a normal distribution with mean 0 and standard deviation `weight_sd` for the `tanh` activation function, and mean 0.5 and standard deviation `weight_sd` for the `sigmoid` activation functions.
#
# Biases are initialized to zeros.
#
# Formulae for the LSTM are shown below.

# In[9]:

W_f = (xp.random.randn(H_size, z_size) * weight_sd + 0.5).astype(xp.float32)
b_f = xp.zeros((H_size, 1), dtype=xp.float32)

W_i = (xp.random.randn(H_size, z_size) * weight_sd + 0.5).astype(xp.float32)
b_i = xp.zeros((H_size, 1), dtype=xp.float32)

W_C = (xp.random.randn(H_size, z_size) * weight_sd).astype(xp.float32)
b_C = xp.zeros((H_size, 1), dtype=xp.float32)

W_o = (xp.random.randn(H_size, z_size) * weight_sd + 0.5).astype(xp.float32)
b_o = xp.zeros((H_size, 1), dtype=xp.float32)

# For the final layer to predict the next character
W_y = (xp.random.randn(X_size, H_size) * weight_sd).astype(xp.float32)
b_y = xp.zeros((X_size, 1), dtype=xp.float32)

# ## Gradients

# In[10]:

dW_f = xp.zeros_like(W_f, dtype=xp.float32)
dW_i = xp.zeros_like(W_i, dtype=xp.float32)
dW_C = xp.zeros_like(W_C, dtype=xp.float32)

dW_o = xp.zeros_like(W_o, dtype=xp.float32)
dW_y = xp.zeros_like(W_y, dtype=xp.float32)

db_f = xp.zeros_like(b_f, dtype=xp.float32)
db_i = xp.zeros_like(b_i, dtype=xp.float32)
db_C = xp.zeros_like(b_C, dtype=xp.float32)

db_o = xp.zeros_like(b_o, dtype=xp.float32)
db_y = xp.zeros_like(b_y, dtype=xp.float32)

# ### Forward pass
#
# ![](http://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-chain.png)
#
# Image taken from [Understanding LSTM Networks](http://colah.github.io/posts/2015-08-Understanding-LSTMs/). Please read the article for a good explanation of LSTMs.
#
# ### Concatenation of $h_{t-1}$ and $x_t$
#
# $$
# z = [h_{t-1}, x_t]
# $$
#
# ### LSTM functions
#
# $$
# \begin{eqnarray}
# f_t &=& \sigma(W_f \cdot z + b_f) \\
# i_t &=& \sigma(W_i \cdot z + b_i) \\
# \bar{C}_t &=& \tanh(W_C \cdot z + b_C) \\
# C_t &=& f_t \ast C_{t-1} + i_t \ast \bar{C}_t \\
# o_t &=& \sigma(W_o \cdot z + b_o) \\
# h_t &=& o_t \ast \tanh(C_t)
# \end{eqnarray}
# $$
#
# ### Logits
#
# $$
# y_t = W_y \cdot h_t + b_y
# $$
#
# ### Softmax
#
# $$
# \hat{p}_t = {\rm softmax}(y_t)
# $$
#
# $\hat{p}_t$ is `p` in the code and $p_t$ is `targets`.
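# Before implementing the forward pass, the short sketch below (not part of the original article; the `*_example` names are ours) shows how a character index becomes the one-hot input $x_t$ and why the concatenated vector $z = [h_{t-1}, x_t]$ has size `z_size`.

example_idx = char_to_idx[data[0]]  # index of the first character in the corpus
x_example = xp.zeros((X_size, 1), dtype=xp.float32)
x_example[example_idx] = 1  # one-hot encoding of that character

h_example = xp.zeros((H_size, 1), dtype=xp.float32)
z_example = xp.concatenate((h_example, x_example))
assert z_example.shape == (z_size, 1)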
# In[11]:

def forward(x, h_prev, C_prev):
    assert x.shape == (X_size, 1)
    assert h_prev.shape == (H_size, 1)
    assert C_prev.shape == (H_size, 1)

    z = xp.concatenate((h_prev, x))
    f = sigmoid(xp.dot(W_f, z) + b_f)
    i = sigmoid(xp.dot(W_i, z) + b_i)
    C_bar = tanh(xp.dot(W_C, z) + b_C)

    C = f * C_prev + i * C_bar
    o = sigmoid(xp.dot(W_o, z) + b_o)
    h = o * tanh(C)

    y = xp.dot(W_y, h) + b_y
    y -= y.max()  # Subtract the max logit for a numerically stable softmax
    p = xp.exp(y) / xp.sum(xp.exp(y))

    return z, f, i, C_bar, C, o, h, y, p

# ## Backward pass
#
# ### Loss
#
# $$
# \mathcal{L} = - \sum_{j} p_{t,j} \log \hat{p}_{t,j}
# $$
#
# ### Gradients
#
# $$
# \begin{eqnarray}
# dy_t &=& \hat{p}_t - p_t \\
# dh_t &=& dh'_{t+1} + W_y^T \cdot dy_t \\
# do_t &=& dh_t \ast \tanh(C_t) \\
# dC_t &=& dC'_{t+1} + dh_t \ast o_t \ast (1 - \tanh^2(C_t)) \\
# d\bar{C}_t &=& dC_t \ast i_t \\
# di_t &=& dC_t \ast \bar{C}_t \\
# df_t &=& dC_t \ast C_{t-1} \\
# dz_t &=& W_f^T \cdot df_t + W_i^T \cdot di_t + W_C^T \cdot d\bar{C}_t + W_o^T \cdot do_t \\
# [dh'_t, dx_t] &=& dz_t \\
# dC'_t &=& f_t \ast dC_t
# \end{eqnarray}
# $$
#
# - `target` is the target character index $p_t$
# - `dh_next` is $dh'_{t+1}$ (size $H \times 1$)
# - `dC_next` is $dC'_{t+1}$ (size $H \times 1$)
# - `C_prev` is $C_{t-1}$ (size $H \times 1$)
# - Returns $dh'_t$ and $dC'_t$, which are passed to the previous time step as `dh_next` and `dC_next`

# In[12]:

def backward(target, dh_next, dC_next, C_prev,
             z, f, i, C_bar, C, o, h, y, p):

    global dW_f, dW_i, dW_C, dW_o, dW_y
    global db_f, db_i, db_C, db_o, db_y

    assert z.shape == (X_size + H_size, 1)
    assert y.shape == (X_size, 1)
    assert p.shape == (X_size, 1)

    for param in [dh_next, dC_next, C_prev, f, i, C_bar, C, o, h]:
        assert param.shape == (H_size, 1)

    dy = xp.copy(p)
    dy[target] -= 1

    dW_y += xp.dot(dy, h.T)
    db_y += dy

    dh = xp.dot(W_y.T, dy)
    dh += dh_next
    do = dh * tanh(C)
    do = dsigmoid(o) * do
    dW_o += xp.dot(do, z.T)
    db_o += do

    dC = xp.copy(dC_next)
    dC += dh * o * dtanh(tanh(C))
    dC_bar = dC * i
    dC_bar = dC_bar * dtanh(C_bar)
    dW_C += xp.dot(dC_bar, z.T)
    db_C += dC_bar

    di = dC * C_bar
    di = dsigmoid(i) * di
    dW_i += xp.dot(di, z.T)
    db_i += di

    df = dC * C_prev
    df = dsigmoid(f) * df
    dW_f += xp.dot(df, z.T)
    db_f += df

    dz = xp.dot(W_f.T, df) \
         + xp.dot(W_i.T, di) \
         + xp.dot(W_C.T, dC_bar) \
         + xp.dot(W_o.T, do)

    dh_prev = dz[:H_size, :]
    dC_prev = f * dC

    return dh_prev, dC_prev
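# With both passes defined, the sketch below (not part of the original article; the underscore-prefixed names are ours) runs a single forward and backward step on a dummy one-hot input with zero initial states and checks the shapes of the returned gradients. It accumulates into the global gradient buffers, but `forward_backward()` resets those before real training, so the results are not affected.

_x = xp.zeros((X_size, 1), dtype=xp.float32)
_x[0] = 1  # dummy one-hot input
_h0 = xp.zeros((H_size, 1), dtype=xp.float32)
_C0 = xp.zeros((H_size, 1), dtype=xp.float32)

_z, _f, _i, _C_bar, _C, _o, _h, _y, _p = forward(_x, _h0, _C0)
_dh, _dC = backward(target=0, dh_next=xp.zeros_like(_h0), dC_next=xp.zeros_like(_C0),
                    C_prev=_C0, z=_z, f=_f, i=_i, C_bar=_C_bar,
                    C=_C, o=_o, h=_h, y=_y, p=_p)
assert _dh.shape == (H_size, 1) and _dC.shape == (H_size, 1)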
# ## Forward Backward Pass
#
# Calculate and store the values in the forward pass. Accumulate gradients in the backward pass and clip them to avoid exploding gradients.
#
# - `inputs`, `targets` are lists of integers (character indexes)
# - `h_prev` is the initial `h` before the first time step (size $H \times 1$)
# - `C_prev` is the initial `C` before the first time step (size $H \times 1$)
# - Returns the loss, and the final $h_T$ and $C_T$

# In[13]:

def forward_backward(inputs, targets, h_prev, C_prev):
    # To store the values for each time step
    x_s, z_s, f_s, i_s = {}, {}, {}, {}
    C_bar_s, C_s, o_s, h_s = {}, {}, {}, {}
    y_s, p_s = {}, {}

    # Values at t - 1
    h_s[-1] = xp.copy(h_prev)
    C_s[-1] = xp.copy(C_prev)

    loss = 0
    # Loop through time steps
    assert len(inputs) == T_steps
    for t in range(len(inputs)):
        x_s[t] = xp.zeros((X_size, 1), dtype=xp.float32)
        x_s[t][inputs[t]] = 1  # Input character

        z_s[t], f_s[t], i_s[t], C_bar_s[t], C_s[t], o_s[t], h_s[t], y_s[t], p_s[t] \
            = forward(x_s[t], h_s[t - 1], C_s[t - 1])  # Forward pass

        loss += -xp.log(p_s[t][targets[t], 0])  # Loss at time t

    for dparam in [dW_f, dW_i, dW_C, dW_o, dW_y, db_f, db_i, db_C, db_o, db_y]:
        dparam.fill(0)

    dh_next = xp.zeros_like(h_s[0], dtype=xp.float32)  # dh from the next character
    dC_next = xp.zeros_like(C_s[0], dtype=xp.float32)  # dC from the next character

    for t in reversed(range(len(inputs))):
        # Backward pass
        dh_next, dC_next = backward(target=targets[t], dh_next=dh_next,
                                    dC_next=dC_next, C_prev=C_s[t - 1],
                                    z=z_s[t], f=f_s[t], i=i_s[t], C_bar=C_bar_s[t],
                                    C=C_s[t], o=o_s[t], h=h_s[t], y=y_s[t], p=p_s[t])

    # Clip gradients to mitigate exploding gradients
    for dparam in [dW_f, dW_i, dW_C, dW_o, dW_y, db_f, db_i, db_C, db_o, db_y]:
        xp.clip(dparam, -1, 1, out=dparam)

    return loss, h_s[len(inputs) - 1], C_s[len(inputs) - 1]

# ## Sample the next character

# In[14]:

def sample(h_prev, C_prev, first_char_idx, sentence_length):
    x = xp.zeros((X_size, 1), dtype=xp.float32)
    x[first_char_idx] = 1

    h = h_prev
    C = C_prev

    indexes = []

    for t in range(sentence_length):
        _, _, _, _, C, _, h, _, p = forward(x, h, C)
        assert xp.all(p.ravel() >= 0)
        idx = xp.random.choice(xp.arange(X_size), size=(1,), p=p.ravel())[0]
        x = xp.zeros((X_size, 1), dtype=xp.float32)
        x[idx] = 1
        indexes.append(int(idx))

    return indexes

# ## Training (Adagrad)
#
# $$
# w = w - \eta \frac{dw}{\sqrt{\sum dw^2_{\tau} + \epsilon}}
# $$

# In[15]:

def update_status(inputs, h_prev, C_prev):
    # Initialized later
    global plot_iter, plot_loss
    global smooth_loss

    # Get predictions for 200 letters with the current model
    display.clear_output(wait=True)
    sample_idx = sample(h_prev, C_prev, inputs[0], 200)
    txt = ''.join(idx_to_char[idx] for idx in sample_idx)

    # Clear and plot
    plt.clf()
    plt.plot(plot_iter, plot_loss)
    display.display(plt.gcf())

    # Print prediction and loss
    print("----\n %s \n----" % (txt, ))
    print("iter %d, loss %f" % (iteration, smooth_loss))

# Memory variables for Adagrad

# In[16]:

mW_f = xp.zeros_like(W_f, dtype=xp.float32)
mW_i = xp.zeros_like(W_i, dtype=xp.float32)
mW_C = xp.zeros_like(W_C, dtype=xp.float32)

mW_o = xp.zeros_like(W_o, dtype=xp.float32)
mW_y = xp.zeros_like(W_y, dtype=xp.float32)

mb_f = xp.zeros_like(b_f, dtype=xp.float32)
mb_i = xp.zeros_like(b_i, dtype=xp.float32)
mb_C = xp.zeros_like(b_C, dtype=xp.float32)

mb_o = xp.zeros_like(b_o, dtype=xp.float32)
mb_y = xp.zeros_like(b_y, dtype=xp.float32)

# In[17]:

# Exponential average of the loss
# Initialize to the error of a random model
smooth_loss = (-xp.log(1.0 / X_size) * T_steps).astype(xp.float32)

iteration, p = 0, 0

# For the graph
plot_iter = np.zeros((0))
plot_loss = np.zeros((0))
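# The training loop below applies this Adagrad rule to every weight matrix and bias. As a standalone illustration (toy variable names, not part of the original article), the update for a single parameter looks like this:

toy_w = xp.ones((3, 1), dtype=xp.float32)        # a toy parameter
toy_dw = xp.full((3, 1), 0.5, dtype=xp.float32)  # a pretend gradient for it
toy_mem = xp.zeros_like(toy_w)                   # running sum of squared gradients

toy_mem += toy_dw * toy_dw                                    # accumulate squared gradient
toy_w += -(learning_rate * toy_dw / xp.sqrt(toy_mem + 1e-8))  # per-element scaled step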
# In[18]:

# This cell was run with the `%%time` magic in the original notebook.
while True:
    # Try catch for interruption
    try:
        # Reset
        if p + T_steps >= len(data) or iteration == 0:
            g_h_prev = xp.zeros((H_size, 1), dtype=xp.float32)
            g_C_prev = xp.zeros((H_size, 1), dtype=xp.float32)
            p = 0

        inputs = [char_to_idx[ch] for ch in data[p: p + T_steps]]
        targets = [char_to_idx[ch] for ch in data[p + 1: p + T_steps + 1]]

        loss, g_h_prev, g_C_prev = forward_backward(inputs, targets, g_h_prev, g_C_prev)
        smooth_loss = smooth_loss * 0.999 + loss * 0.001

        # Print every hundred steps
        if iteration % 100 == 0:
            update_status(inputs, g_h_prev, g_C_prev)

        # Update weights
        for param, dparam, mem in zip([W_f, W_i, W_C, W_o, W_y, b_f, b_i, b_C, b_o, b_y],
                                      [dW_f, dW_i, dW_C, dW_o, dW_y, db_f, db_i, db_C, db_o, db_y],
                                      [mW_f, mW_i, mW_C, mW_o, mW_y, mb_f, mb_i, mb_C, mb_o, mb_y]):
            mem += dparam * dparam  # Accumulate the sum of squared gradients
            param += -(learning_rate * dparam / xp.sqrt(mem + 1e-8))

        plot_iter = np.append(plot_iter, [iteration])
        if isinstance(loss, cp.ndarray):
            loss = loss.get()  # Move the loss to the host for plotting
        plot_loss = np.append(plot_loss, [loss])

        p += T_steps
        iteration += 1
        if iteration > 6000:
            break
    except KeyboardInterrupt:
        update_status(inputs, g_h_prev, g_C_prev)
        break

# ## Gradient Check
#
# Approximate the numerical gradients by perturbing the parameters and re-running the model. Check that the approximated (numerical) gradients are equal to the analytical gradients computed by backpropagation.
#
# Try this on `num_checks` individual parameters picked randomly for each weight matrix and bias vector.

# In[19]:

from random import uniform


def gradient_check(inputs, targets, h_prev, C_prev):
    global W_f, W_i, W_C, W_o, W_y, b_f, b_i, b_C, b_o, b_y
    global dW_f, dW_i, dW_C, dW_o, dW_y, db_f, db_i, db_C, db_o, db_y

    num_checks = 10  # Number of parameters to test
    delta = 1e-5  # The change to make on the parameter

    # To calculate the analytical (computed) gradients
    _, _, _ = forward_backward(inputs, targets, h_prev, C_prev)

    for param, dparam, name in zip([W_f, W_i, W_C, W_o, W_y, b_f, b_i, b_C, b_o, b_y],
                                   [dW_f, dW_i, dW_C, dW_o, dW_y, db_f, db_i, db_C, db_o, db_y],
                                   ['W_f', 'W_i', 'W_C', 'W_o', 'W_y', 'b_f', 'b_i', 'b_C', 'b_o', 'b_y']):
        assert param.shape == dparam.shape
        dparam_copy = xp.copy(dparam)  # Make a copy because this will get modified

        # Test num_checks times
        for i in range(num_checks):
            # Pick a random index
            rnd_idx = int(uniform(0, param.size))

            # Evaluate the cost at [x + delta] and [x - delta]
            old_val = xp.ravel(param)[rnd_idx]
            xp.ravel(param)[rnd_idx] = old_val + delta
            loss_plus_delta, _, _ = forward_backward(inputs, targets, h_prev, C_prev)
            xp.ravel(param)[rnd_idx] = old_val - delta
            loss_minus_delta, _, _ = forward_backward(inputs, targets, h_prev, C_prev)
            xp.ravel(param)[rnd_idx] = old_val  # Restore the original value

            grad_analytical = xp.ravel(dparam_copy)[rnd_idx]
            grad_numerical = (loss_plus_delta - loss_minus_delta) / (2 * delta)

            # Clip the numerical gradient because grad_analytical is clipped
            grad_numerical = xp.clip(grad_numerical, -1, 1)

            err_sum = abs(grad_numerical + grad_analytical) + 1e-09
            rel_error = abs(grad_analytical - grad_numerical) / err_sum

            # If the relative error is greater than 1e-06
            if rel_error > 1e-06:
                print('%s (%e, %e) => %e' % (name, grad_numerical, grad_analytical, rel_error))

# In[20]:

gradient_check(inputs, targets, g_h_prev, g_C_prev)
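# Once training and the gradient check have finished, a natural next step is to generate longer text with the existing `sample` function and, if needed, move the learned weights back to the host. The sketch below is not part of the original article; the output file name `W_f.npy` is only an example.

generated_idx = sample(g_h_prev, g_C_prev, inputs[0], 500)
print(''.join(idx_to_char[idx] for idx in generated_idx))

# cp.asnumpy also accepts NumPy arrays, so this works whether xp is cupy or numpy
W_f_host = cp.asnumpy(W_f)
np.save('W_f.npy', W_f_host)  # example file name, for illustration only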