#!/usr/bin/env python
# coding: utf-8

# # Vanilla LSTM with CuPy
#
# This is a port of [Vanilla LSTM with numpy](http://blog.varunajayasiri.com/numpy_lstm.html) that shows how to run numpy-based machine learning code on a GPU using [CuPy](https://github.com/cupy/cupy). All of the content below is essentially copied from that article: [Vanilla LSTM with numpy](http://blog.varunajayasiri.com/numpy_lstm.html).
#
# It is inspired by [Minimal character-level language model with a Vanilla Recurrent Neural Network](https://gist.github.com/karpathy/d4dee566867f8291f086), in Python/numpy, by [Andrej Karpathy](https://github.com/karpathy).
#
# The model usually reaches an error of about 45 after 5000 iterations when tested with the [100,000 character sample from Shakespeare](http://cs.stanford.edu/people/karpathy/char-rnn/shakespear.txt). However, it sometimes gets stuck in a local minimum; reinitialize the weights if this happens.
#
# You need to place the input text file as `input.txt` in the same folder as the Python code.

# ## Download data

# In[1]:

get_ipython().run_cell_magic('bash', '', 'curl -L http://cs.stanford.edu/people/karpathy/char-rnn/shakespear.txt -o input.txt\n')

# ## Imports

# In[2]:

import numpy as np
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
from IPython import display

np.random.seed(2017)

# The above imports are taken from the original article as-is. In this article, however, you can run almost all of the computation on the GPU by just replacing `np` with `cp`; `cp` is simply another name for `cupy`.

# In[3]:

import cupy as cp

cp.random.seed(2017)

# To show how the computation time changes when switching between CPU and GPU, let's use `xp` instead of `np` and switch the package it refers to between `numpy` and `cupy`.

# In[4]:

# If you use CPU:
# xp = np

# If you use GPU: almost 2 times faster
xp = cp

# **Because CuPy is highly compatible with NumPy, the same NumPy-based code is easily converted to CuPy by just replacing `numpy` with `cupy`.**
#
# **Note:**
#
# If `H_size` is larger than about 1000, training on the GPU (that is, with CuPy) is several times faster than with NumPy. When the arrays are small, however, the speed gain is often small.

# ## Read and process data

# In[5]:

data = open('input.txt', 'r').read()

# #### Process data and calculate indexes

# In[6]:

chars = list(set(data))
data_size, X_size = len(data), len(chars)
print("data has %d characters, %d unique" % (data_size, X_size))
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = {i: ch for i, ch in enumerate(chars)}

# ## Parameters

# In[7]:

H_size = 100  # Size of the hidden layer
T_steps = 25  # Number of time steps (length of the sequence) used for training
learning_rate = 1e-1  # Learning rate
weight_sd = 0.1  # Standard deviation of weights for initialization
z_size = H_size + X_size  # Size of the concatenate(H, X) vector

# ## Activation Functions and Derivatives
#
# ### Sigmoid
#
# $$
# \begin{eqnarray}
# \sigma(x) &=& \frac{1}{1 + e^{-x}} \\
# \frac{d \sigma(x)}{d x} &=& \sigma(x) \cdot (1 - \sigma(x))
# \end{eqnarray}
# $$
#
# ### Tanh
#
# $$
# \frac{d \tanh(x)}{dx} = 1 - \tanh^2(x)
# $$

# In[8]:

def sigmoid(x):
    return 1 / (1 + xp.exp(-x))


def dsigmoid(y):
    # Derivative expressed in terms of the sigmoid *output* y = sigmoid(x)
    return y * (1 - y)


def tanh(x):
    return xp.tanh(x)


def dtanh(y):
    # Derivative expressed in terms of the tanh *output* y = tanh(x)
    return 1 - y * y
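# `dsigmoid` and `dtanh` are written in terms of the activation *outputs* rather than their inputs, which is what makes the backward pass cheap. The short check below is not part of the original article; it is a minimal sanity-check sketch (the helper name `check_activation_derivatives` is ours) that compares the analytical derivatives against central finite differences.

def check_activation_derivatives(eps=1e-3):
    # Compare dsigmoid/dtanh (which take the activation output) against
    # numerical derivatives of sigmoid/tanh from central differences.
    x = xp.linspace(-3, 3, 7, dtype=xp.float32).reshape(-1, 1)
    num_dsigmoid = (sigmoid(x + eps) - sigmoid(x - eps)) / (2 * eps)
    num_dtanh = (tanh(x + eps) - tanh(x - eps)) / (2 * eps)
    assert xp.allclose(dsigmoid(sigmoid(x)), num_dsigmoid, atol=1e-3)
    assert xp.allclose(dtanh(tanh(x)), num_dtanh, atol=1e-3)


check_activation_derivatives()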
# ## Initialize weights
#
# We use random weights drawn from a normal distribution with mean 0 and standard deviation `weight_sd` for the `tanh` activation function, and mean 0.5 and standard deviation `weight_sd` for the `sigmoid` activation functions.
#
# Biases are initialized to zeros.
#
# Formulae for the LSTM are shown below.

# In[9]:

W_f = (xp.random.randn(H_size, z_size) * weight_sd + 0.5).astype(xp.float32)
b_f = xp.zeros((H_size, 1), dtype=xp.float32)

W_i = (xp.random.randn(H_size, z_size) * weight_sd + 0.5).astype(xp.float32)
b_i = xp.zeros((H_size, 1), dtype=xp.float32)

W_C = (xp.random.randn(H_size, z_size) * weight_sd).astype(xp.float32)
b_C = xp.zeros((H_size, 1), dtype=xp.float32)

W_o = (xp.random.randn(H_size, z_size) * weight_sd + 0.5).astype(xp.float32)
b_o = xp.zeros((H_size, 1), dtype=xp.float32)

# For the final layer to predict the next character
W_y = (xp.random.randn(X_size, H_size) * weight_sd).astype(xp.float32)
b_y = xp.zeros((X_size, 1), dtype=xp.float32)

# ## Gradients

# In[10]:

dW_f = xp.zeros_like(W_f, dtype=xp.float32)
dW_i = xp.zeros_like(W_i, dtype=xp.float32)
dW_C = xp.zeros_like(W_C, dtype=xp.float32)

dW_o = xp.zeros_like(W_o, dtype=xp.float32)
dW_y = xp.zeros_like(W_y, dtype=xp.float32)

db_f = xp.zeros_like(b_f, dtype=xp.float32)
db_i = xp.zeros_like(b_i, dtype=xp.float32)
db_C = xp.zeros_like(b_C, dtype=xp.float32)

db_o = xp.zeros_like(b_o, dtype=xp.float32)
db_y = xp.zeros_like(b_y, dtype=xp.float32)

# ### Forward pass
#
# ![](http://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-chain.png)
#
# Image taken from [Understanding LSTM Networks](http://colah.github.io/posts/2015-08-Understanding-LSTMs/). Please read the article for a good explanation of LSTMs.
#
# ### Concatenation of $h_{t-1}$ and $x_t$
#
# $$
# z = [h_{t-1}, x_t]
# $$
#
# ### LSTM functions
#
# $$
# \begin{eqnarray}
# f_t &=& \sigma(W_f \cdot z + b_f) \\
# i_t &=& \sigma(W_i \cdot z + b_i) \\
# \bar{C}_t &=& \tanh(W_C \cdot z + b_C) \\
# C_t &=& f_t \ast C_{t-1} + i_t \ast \bar{C}_t \\
# o_t &=& \sigma(W_o \cdot z + b_o) \\
# h_t &=& o_t \ast \tanh(C_t)
# \end{eqnarray}
# $$
#
# ### Logits
#
# $$
# y_t = W_y \cdot h_t + b_y
# $$
#
# ### Softmax
#
# $$
# \hat{p}_t = {\rm softmax}(y_t)
# $$
#
# $\hat{p}_t$ is `p` in the code and $p_t$ is `targets`.
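# Before implementing the forward pass, the short sketch below (not part of the original article; the `*_example` names are ours) shows how a character index becomes the one-hot input $x_t$ and why the concatenated vector $z = [h_{t-1}, x_t]$ has size `z_size`.

example_idx = char_to_idx[data[0]]  # index of the first character in the corpus
x_example = xp.zeros((X_size, 1), dtype=xp.float32)
x_example[example_idx] = 1  # one-hot encoding of that character

h_example = xp.zeros((H_size, 1), dtype=xp.float32)
z_example = xp.concatenate((h_example, x_example))
assert z_example.shape == (z_size, 1)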
# In[11]:

def forward(x, h_prev, C_prev):
    assert x.shape == (X_size, 1)
    assert h_prev.shape == (H_size, 1)
    assert C_prev.shape == (H_size, 1)

    z = xp.concatenate((h_prev, x))
    f = sigmoid(xp.dot(W_f, z) + b_f)
    i = sigmoid(xp.dot(W_i, z) + b_i)
    C_bar = tanh(xp.dot(W_C, z) + b_C)

    C = f * C_prev + i * C_bar
    o = sigmoid(xp.dot(W_o, z) + b_o)
    h = o * tanh(C)

    y = xp.dot(W_y, h) + b_y
    y -= y.max()  # Subtract the max logit for a numerically stable softmax
    p = xp.exp(y) / xp.sum(xp.exp(y))

    return z, f, i, C_bar, C, o, h, y, p

# ## Backward pass
#
# ### Loss
#
# $$
# \mathcal{L} = - \sum_{j} p_{t,j} \log \hat{p}_{t,j}
# $$
#
# ### Gradients
#
# $$
# \begin{eqnarray}
# dy_t &=& \hat{p}_t - p_t \\
# dh_t &=& dh'_{t+1} + W_y^T \cdot dy_t \\
# do_t &=& dh_t \ast \tanh(C_t) \\
# dC_t &=& dC'_{t+1} + dh_t \ast o_t \ast (1 - \tanh^2(C_t)) \\
# d\bar{C}_t &=& dC_t \ast i_t \\
# di_t &=& dC_t \ast \bar{C}_t \\
# df_t &=& dC_t \ast C_{t-1} \\
# dz_t &=& W_f^T \cdot df_t + W_i^T \cdot di_t + W_C^T \cdot d\bar{C}_t + W_o^T \cdot do_t \\
# [dh'_t, dx_t] &=& dz_t \\
# dC'_t &=& f_t \ast dC_t
# \end{eqnarray}
# $$
#
# - `target` is the target character index $p_t$
# - `dh_next` is $dh'_{t+1}$ (size $H \times 1$)
# - `dC_next` is $dC'_{t+1}$ (size $H \times 1$)
# - `C_prev` is $C_{t-1}$ (size $H \times 1$)
# - Returns $dh'_t$ and $dC'_t$, which are passed to the previous time step as `dh_next` and `dC_next`

# In[12]:

def backward(target, dh_next, dC_next, C_prev,
             z, f, i, C_bar, C, o, h, y, p):

    global dW_f, dW_i, dW_C, dW_o, dW_y
    global db_f, db_i, db_C, db_o, db_y

    assert z.shape == (X_size + H_size, 1)
    assert y.shape == (X_size, 1)
    assert p.shape == (X_size, 1)

    for param in [dh_next, dC_next, C_prev, f, i, C_bar, C, o, h]:
        assert param.shape == (H_size, 1)

    dy = xp.copy(p)
    dy[target] -= 1

    dW_y += xp.dot(dy, h.T)
    db_y += dy

    dh = xp.dot(W_y.T, dy)
    dh += dh_next
    do = dh * tanh(C)
    do = dsigmoid(o) * do
    dW_o += xp.dot(do, z.T)
    db_o += do

    dC = xp.copy(dC_next)
    dC += dh * o * dtanh(tanh(C))
    dC_bar = dC * i
    dC_bar = dC_bar * dtanh(C_bar)
    dW_C += xp.dot(dC_bar, z.T)
    db_C += dC_bar

    di = dC * C_bar
    di = dsigmoid(i) * di
    dW_i += xp.dot(di, z.T)
    db_i += di

    df = dC * C_prev
    df = dsigmoid(f) * df
    dW_f += xp.dot(df, z.T)
    db_f += df

    dz = xp.dot(W_f.T, df) \
         + xp.dot(W_i.T, di) \
         + xp.dot(W_C.T, dC_bar) \
         + xp.dot(W_o.T, do)

    dh_prev = dz[:H_size, :]
    dC_prev = f * dC

    return dh_prev, dC_prev
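# With both passes defined, the sketch below (not part of the original article; the underscore-prefixed names are ours) runs a single forward and backward step on a dummy one-hot input with zero initial states and checks the shapes of the returned gradients. It accumulates into the global gradient buffers, but `forward_backward()` resets those before real training, so the results are not affected.

_x = xp.zeros((X_size, 1), dtype=xp.float32)
_x[0] = 1  # dummy one-hot input
_h0 = xp.zeros((H_size, 1), dtype=xp.float32)
_C0 = xp.zeros((H_size, 1), dtype=xp.float32)

_z, _f, _i, _C_bar, _C, _o, _h, _y, _p = forward(_x, _h0, _C0)
_dh, _dC = backward(target=0, dh_next=xp.zeros_like(_h0), dC_next=xp.zeros_like(_C0),
                    C_prev=_C0, z=_z, f=_f, i=_i, C_bar=_C_bar,
                    C=_C, o=_o, h=_h, y=_y, p=_p)
assert _dh.shape == (H_size, 1) and _dC.shape == (H_size, 1)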
# ## Forward Backward Pass
#
# Calculate and store the values in the forward pass. Accumulate gradients in the backward pass and clip them to avoid exploding gradients.
#
# - `inputs`, `targets` are lists of integers (character indexes)
# - `h_prev` is the initial `h` before the first time step (size $H \times 1$)
# - `C_prev` is the initial `C` before the first time step (size $H \times 1$)
# - Returns the loss, and the final $h_T$ and $C_T$

# In[13]:

def forward_backward(inputs, targets, h_prev, C_prev):
    # To store the values for each time step
    x_s, z_s, f_s, i_s = {}, {}, {}, {}
    C_bar_s, C_s, o_s, h_s = {}, {}, {}, {}
    y_s, p_s = {}, {}

    # Values at t - 1
    h_s[-1] = xp.copy(h_prev)
    C_s[-1] = xp.copy(C_prev)

    loss = 0
    # Loop through time steps
    assert len(inputs) == T_steps
    for t in range(len(inputs)):
        x_s[t] = xp.zeros((X_size, 1), dtype=xp.float32)
        x_s[t][inputs[t]] = 1  # Input character

        z_s[t], f_s[t], i_s[t], C_bar_s[t], C_s[t], o_s[t], h_s[t], y_s[t], p_s[t] \
            = forward(x_s[t], h_s[t - 1], C_s[t - 1])  # Forward pass

        loss += -xp.log(p_s[t][targets[t], 0])  # Loss at time t

    for dparam in [dW_f, dW_i, dW_C, dW_o, dW_y, db_f, db_i, db_C, db_o, db_y]:
        dparam.fill(0)

    dh_next = xp.zeros_like(h_s[0], dtype=xp.float32)  # dh from the next character
    dC_next = xp.zeros_like(C_s[0], dtype=xp.float32)  # dC from the next character

    for t in reversed(range(len(inputs))):
        # Backward pass
        dh_next, dC_next = backward(target=targets[t], dh_next=dh_next,
                                    dC_next=dC_next, C_prev=C_s[t - 1],
                                    z=z_s[t], f=f_s[t], i=i_s[t], C_bar=C_bar_s[t],
                                    C=C_s[t], o=o_s[t], h=h_s[t], y=y_s[t], p=p_s[t])

    # Clip gradients to mitigate exploding gradients
    for dparam in [dW_f, dW_i, dW_C, dW_o, dW_y, db_f, db_i, db_C, db_o, db_y]:
        xp.clip(dparam, -1, 1, out=dparam)

    return loss, h_s[len(inputs) - 1], C_s[len(inputs) - 1]

# ## Sample the next character

# In[14]:

def sample(h_prev, C_prev, first_char_idx, sentence_length):
    x = xp.zeros((X_size, 1), dtype=xp.float32)
    x[first_char_idx] = 1

    h = h_prev
    C = C_prev

    indexes = []

    for t in range(sentence_length):
        _, _, _, _, C, _, h, _, p = forward(x, h, C)
        assert xp.all(p.ravel() >= 0)
        idx = xp.random.choice(xp.arange(X_size), size=(1,), p=p.ravel())[0]
        x = xp.zeros((X_size, 1), dtype=xp.float32)
        x[idx] = 1
        indexes.append(int(idx))

    return indexes

# ## Training (Adagrad)
#
# $$
# w = w - \eta \frac{dw}{\sqrt{\sum dw^2_{\tau} + \epsilon}}
# $$

# In[15]:

def update_status(inputs, h_prev, C_prev):
    # Initialized later
    global plot_iter, plot_loss
    global smooth_loss

    # Get predictions for 200 letters with the current model
    display.clear_output(wait=True)
    sample_idx = sample(h_prev, C_prev, inputs[0], 200)
    txt = ''.join(idx_to_char[idx] for idx in sample_idx)

    # Clear and plot
    plt.clf()
    plt.plot(plot_iter, plot_loss)
    display.display(plt.gcf())

    # Print prediction and loss
    print("----\n %s \n----" % (txt, ))
    print("iter %d, loss %f" % (iteration, smooth_loss))

# Memory variables for Adagrad

# In[16]:

mW_f = xp.zeros_like(W_f, dtype=xp.float32)
mW_i = xp.zeros_like(W_i, dtype=xp.float32)
mW_C = xp.zeros_like(W_C, dtype=xp.float32)

mW_o = xp.zeros_like(W_o, dtype=xp.float32)
mW_y = xp.zeros_like(W_y, dtype=xp.float32)

mb_f = xp.zeros_like(b_f, dtype=xp.float32)
mb_i = xp.zeros_like(b_i, dtype=xp.float32)
mb_C = xp.zeros_like(b_C, dtype=xp.float32)

mb_o = xp.zeros_like(b_o, dtype=xp.float32)
mb_y = xp.zeros_like(b_y, dtype=xp.float32)

# In[17]:

# Exponential average of the loss
# Initialize to the error of a random model
smooth_loss = (-xp.log(1.0 / X_size) * T_steps).astype(xp.float32)

iteration, p = 0, 0

# For the graph
plot_iter = np.zeros((0))
plot_loss = np.zeros((0))
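# The training loop below applies this Adagrad rule to every weight matrix and bias. As a standalone illustration (toy variable names, not part of the original article), the update for a single parameter looks like this:

toy_w = xp.ones((3, 1), dtype=xp.float32)        # a toy parameter
toy_dw = xp.full((3, 1), 0.5, dtype=xp.float32)  # a pretend gradient for it
toy_mem = xp.zeros_like(toy_w)                   # running sum of squared gradients

toy_mem += toy_dw * toy_dw                                    # accumulate squared gradient
toy_w += -(learning_rate * toy_dw / xp.sqrt(toy_mem + 1e-8))  # per-element scaled step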
# In[18]:

# This cell was run with the `%%time` magic in the original notebook.
while True:
    # Try catch for interruption
    try:
        # Reset
        if p + T_steps >= len(data) or iteration == 0:
            g_h_prev = xp.zeros((H_size, 1), dtype=xp.float32)
            g_C_prev = xp.zeros((H_size, 1), dtype=xp.float32)
            p = 0

        inputs = [char_to_idx[ch] for ch in data[p: p + T_steps]]
        targets = [char_to_idx[ch] for ch in data[p + 1: p + T_steps + 1]]

        loss, g_h_prev, g_C_prev = forward_backward(inputs, targets, g_h_prev, g_C_prev)
        smooth_loss = smooth_loss * 0.999 + loss * 0.001

        # Print every hundred steps
        if iteration % 100 == 0:
            update_status(inputs, g_h_prev, g_C_prev)

        # Update weights
        for param, dparam, mem in zip([W_f, W_i, W_C, W_o, W_y, b_f, b_i, b_C, b_o, b_y],
                                      [dW_f, dW_i, dW_C, dW_o, dW_y, db_f, db_i, db_C, db_o, db_y],
                                      [mW_f, mW_i, mW_C, mW_o, mW_y, mb_f, mb_i, mb_C, mb_o, mb_y]):
            mem += dparam * dparam  # Accumulate the sum of squared gradients
            param += -(learning_rate * dparam / xp.sqrt(mem + 1e-8))

        plot_iter = np.append(plot_iter, [iteration])
        if isinstance(loss, cp.ndarray):
            loss = loss.get()  # Move the loss to the host for plotting
        plot_loss = np.append(plot_loss, [loss])

        p += T_steps
        iteration += 1
        if iteration > 6000:
            break
    except KeyboardInterrupt:
        update_status(inputs, g_h_prev, g_C_prev)
        break

# ## Gradient Check
#
# Approximate the numerical gradients by perturbing the parameters and re-running the model. Check that the approximated (numerical) gradients are equal to the analytical gradients computed by backpropagation.
#
# Try this on `num_checks` individual parameters picked randomly for each weight matrix and bias vector.

# In[19]:

from random import uniform


def gradient_check(inputs, targets, h_prev, C_prev):
    global W_f, W_i, W_C, W_o, W_y, b_f, b_i, b_C, b_o, b_y
    global dW_f, dW_i, dW_C, dW_o, dW_y, db_f, db_i, db_C, db_o, db_y

    num_checks = 10  # Number of parameters to test
    delta = 1e-5  # The change to make on the parameter

    # To calculate the analytical (computed) gradients
    _, _, _ = forward_backward(inputs, targets, h_prev, C_prev)

    for param, dparam, name in zip([W_f, W_i, W_C, W_o, W_y, b_f, b_i, b_C, b_o, b_y],
                                   [dW_f, dW_i, dW_C, dW_o, dW_y, db_f, db_i, db_C, db_o, db_y],
                                   ['W_f', 'W_i', 'W_C', 'W_o', 'W_y', 'b_f', 'b_i', 'b_C', 'b_o', 'b_y']):
        assert param.shape == dparam.shape
        dparam_copy = xp.copy(dparam)  # Make a copy because this will get modified

        # Test num_checks times
        for i in range(num_checks):
            # Pick a random index
            rnd_idx = int(uniform(0, param.size))

            # Evaluate the cost at [x + delta] and [x - delta]
            old_val = xp.ravel(param)[rnd_idx]
            xp.ravel(param)[rnd_idx] = old_val + delta
            loss_plus_delta, _, _ = forward_backward(inputs, targets, h_prev, C_prev)
            xp.ravel(param)[rnd_idx] = old_val - delta
            loss_minus_delta, _, _ = forward_backward(inputs, targets, h_prev, C_prev)
            xp.ravel(param)[rnd_idx] = old_val  # Restore the original value

            grad_analytical = xp.ravel(dparam_copy)[rnd_idx]
            grad_numerical = (loss_plus_delta - loss_minus_delta) / (2 * delta)

            # Clip the numerical gradient because grad_analytical is clipped
            grad_numerical = xp.clip(grad_numerical, -1, 1)

            err_sum = abs(grad_numerical + grad_analytical) + 1e-09
            rel_error = abs(grad_analytical - grad_numerical) / err_sum

            # If the relative error is greater than 1e-06
            if rel_error > 1e-06:
                print('%s (%e, %e) => %e' % (name, grad_numerical, grad_analytical, rel_error))

# In[20]:

gradient_check(inputs, targets, g_h_prev, g_C_prev)
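# Once training and the gradient check have finished, a natural next step is to generate longer text with the existing `sample` function and, if needed, move the learned weights back to the host. The sketch below is not part of the original article; the output file name `W_f.npy` is only an example.

generated_idx = sample(g_h_prev, g_C_prev, inputs[0], 500)
print(''.join(idx_to_char[idx] for idx in generated_idx))

# cp.asnumpy also accepts NumPy arrays, so this works whether xp is cupy or numpy
W_f_host = cp.asnumpy(W_f)
np.save('W_f.npy', W_f_host)  # example file name, for illustration only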