#!/usr/bin/env python
# coding: utf-8
# # Data Driven Modeling
# ### (Theme of this semester: CODING AS LITERACY)
#
# ### PhD seminar series at Chair for Computer Aided Architectural Design (CAAD), ETH Zurich
# [Vahid Moosavi](https://vahidmoosavi.com/)
#
#
#
# # 16th Session
#
# 25 April 2017
#
# # Introduction to Recurrent Neural Nets
# ### To be discussed
# * **Key Concepts**
# * **Simple Experiments**
# * **Interesting applications**
# In[5]:
import warnings
warnings.filterwarnings("ignore")
import datetime
import pandas as pd
# import pandas.io.data
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import sys
import sompylib.sompy as SOM  # from pandas import Series, DataFrame
from ipywidgets import interact, HTML, FloatSlider
get_ipython().run_line_magic('matplotlib', 'inline')
# # Feed Forward Networks
# - **Assumption of independent data**
#
# ![](Images/feedforward_rumelhart.png)
#
#
#
#
# # Recurrent Neural Networks
# - **When the current decision depends on the network's state, i.e. on the history of previous inputs**
# - **Similar in spirit to (Hidden) Markov Chains, but not exactly the same approach: here we do not build a global state model**
#
# ![](Images/srn_elman.png)
#
# # Unrolled RNN
# - **Unrolling in time: multiple copies of the same cell**
#
# ![](Images/RNN-unrolled.png)
# source: http://colah.github.io/posts/2015-08-Understanding-LSTMs/
#
#
#
# ## Basic self referential element of RNN cell
# # $$ h_t = \tanh ( W_{hh} h_{t-1} + W_{xh} x_t )$$
# In[6]:
# http://karpathy.github.io/2015/05/21/rnn-effectiveness/
class RNN:
    def setup(self, hidden_size, input_size):
        Wxh = np.random.randn(hidden_size, input_size)*0.01  # input to hidden
        Whh = np.random.randn(hidden_size, hidden_size)*0.01  # hidden to hidden
        Why = np.random.randn(input_size, hidden_size)*0.01  # hidden to output
        self.W_hh = Whh
        self.h = np.zeros((hidden_size))
        self.W_xh = Wxh
        self.W_hy = Why
    def step(self, x):
        # update the hidden state: h_t = tanh(W_hh h_{t-1} + W_xh x_t)
        self.h = np.tanh(np.dot(self.W_hh, self.h) + np.dot(self.W_xh, x))
        # compute the output vector
        y = np.dot(self.W_hy, self.h)
        return y
# In[7]:
rnn1 = RNN()
rnn2 = RNN()
hidden_size = 10
input_size = 4
rnn1.setup(hidden_size,input_size)
rnn2.setup(hidden_size,input_size)
# In[8]:
# In every time step the hidden states get updated
# two time steps, feeding the output back in as the next input
x = np.asarray([1,0,0,1])
y1 = rnn1.step(x)
y2 = rnn2.step(y1)
print x, y2
x = y2
y1 = rnn1.step(x)
y2 = rnn2.step(y1)
print x, y2
# ## Then, we need to update the matrices via gradient descent
# ### Simple examples at the end
#
#
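# ### A minimal illustrative sketch of such a gradient-descent update
# - **Added here for illustration; not the seminar's code.** For clarity it uses numerical (finite-difference) gradients on the toy RNN above; a practical implementation uses backpropagation through time, as TensorFlow does in the examples below.
# In[ ]:
def sequence_loss(Wxh, Whh, Why, xs, targets):
    # run the toy RNN over a sequence and accumulate a squared-error loss
    h = np.zeros(Whh.shape[0])
    loss = 0.0
    for x_t, t in zip(xs, targets):
        h = np.tanh(np.dot(Whh, h) + np.dot(Wxh, x_t))
        y = np.dot(Why, h)
        loss += np.sum((y - t) ** 2)
    return loss

def num_grad(loss_fn, M, eps=1e-5):
    # centred finite-difference gradient of loss_fn with respect to matrix M
    g = np.zeros_like(M)
    for idx in np.ndindex(*M.shape):
        old = M[idx]
        M[idx] = old + eps
        lp = loss_fn()
        M[idx] = old - eps
        lm = loss_fn()
        M[idx] = old
        g[idx] = (lp - lm) / (2.0 * eps)
    return g

np.random.seed(0)
hidden, inp = 5, 3
Wxh = np.random.randn(hidden, inp) * 0.1
Whh = np.random.randn(hidden, hidden) * 0.1
Why = np.random.randn(inp, hidden) * 0.1
xs = [np.random.randn(inp) for _ in range(4)]       # toy input sequence
targets = [np.random.randn(inp) for _ in range(4)]  # toy target sequence
loss_fn = lambda: sequence_loss(Wxh, Whh, Why, xs, targets)

lr = 0.1
for step in range(50):
    for M in (Wxh, Whh, Why):
        M -= lr * num_grad(loss_fn, M)   # one gradient-descent step per matrix
    if step % 10 == 0:
        print('step %d, loss %.4f' % (step, loss_fn()))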
# # However, for a long time few believed in RNNs
# - **Memory problem**
# - **Vanishing and Exploding Gradients**
#
#
#
# # Long Short-Term Memory (LSTM)
# - **Learn what to remember and what to forget!**
# - **[A nice lecture on LSTM](https://www.youtube.com/watch?v=56TYLaQN4N8)**
# - **[A nice blog post](http://colah.github.io/posts/2015-08-Understanding-LSTMs/)**
# ![](Images/lstm_memorycell.png)
# image from: http://deeplearning.net/tutorial/lstm.html
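# ### A minimal numpy sketch of a single LSTM step
# - **Added for illustration; not the seminar's code.** The standard gate equations: the forget, input and output gates decide what to erase from, write to, and read out of the cell state.
# In[ ]:
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_step(x_t, h_prev, c_prev, W, b):
    # W has shape (4*hidden, input+hidden); b has shape (4*hidden,)
    hidden = h_prev.shape[0]
    z = np.dot(W, np.concatenate([x_t, h_prev])) + b
    f = sigmoid(z[0*hidden:1*hidden])   # forget gate
    i = sigmoid(z[1*hidden:2*hidden])   # input gate
    o = sigmoid(z[2*hidden:3*hidden])   # output gate
    g = np.tanh(z[3*hidden:4*hidden])   # candidate cell update
    c = f * c_prev + i * g              # new cell state (the "memory")
    h = o * np.tanh(c)                  # new hidden state
    return h, c

hidden, inp = 8, 4
W = np.random.randn(4 * hidden, inp + hidden) * 0.1
b = np.zeros(4 * hidden)
h, c = np.zeros(hidden), np.zeros(hidden)
for t in range(5):                      # run a few steps on random inputs
    h, c = lstm_step(np.random.randn(inp), h, c, W, b)
print(np.round(h, 3))                   # the hidden state after a few steps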
#
#
# ## Deep RNN
# - **Each cell unfolds horizontally in time**
# - **We stack cells vertically**
#
# ![](Images/deep_RNN.png)
#
# ## Different Architectures
# ![](Images/RNN_Archs.jpeg)
#
#
# # Important concept: in principle, RNNs can learn any "program" from data
#
# # Some examples using TensorFlow
# ## number of 1s in a binary number
# - **Architecture**: We are only interested in the last output (many to one)
# In[9]:
# Source code from the blog post at http://monik.in/a-noobs-guide-to-implementing-rnn-lstm-using-tensorflow/
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import random
from random import shuffle
import tensorflow as tf

tf.reset_default_graph()

pow2 = 16
# all 16-bit binary strings, shuffled, one bit per time step
train_input = ['{0:016b}'.format(i) for i in range(2**pow2)]
shuffle(train_input)
train_input = [map(int, i) for i in train_input]
ti = []
for i in train_input:
    temp_list = []
    for j in i:
        temp_list.append([j])
    ti.append(np.array(temp_list))
train_input = ti

# the target is a one-hot vector over the possible counts of ones, 0..16
train_output = []
for i in train_input:
    count = 0
    for j in i:
        if j[0] == 1:
            count += 1
    temp_list = ([0]*(pow2+1))
    temp_list[count] = 1
    train_output.append(temp_list)
# In[10]:
print train_input[0]
print train_output[0]
# In[11]:
NUM_EXAMPLES = 10000
test_input = train_input[NUM_EXAMPLES:]
test_output = train_output[NUM_EXAMPLES:]
train_input = train_input[:NUM_EXAMPLES]
train_output = train_output[:NUM_EXAMPLES]
tf.reset_default_graph()
print "test and training data loaded"

data = tf.placeholder(tf.float32, [None, pow2, 1])  # number of examples, sequence length, dimension of each input
target = tf.placeholder(tf.float32, [None, (pow2+1)])

num_hidden = 24
num_layers = 2
cell = tf.nn.rnn_cell.LSTMCell(num_hidden, state_is_tuple=True)
cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=True)
val, _ = tf.nn.dynamic_rnn(cell, data, dtype=tf.float32)
val = tf.transpose(val, [1, 0, 2])
# keep only the output of the last time step (many to one)
last = tf.gather(val, int(val.get_shape()[0]) - 1)

weight = tf.Variable(tf.truncated_normal([num_hidden, int(target.get_shape()[1])]))
bias = tf.Variable(tf.constant(0.1, shape=[target.get_shape()[1]]))
prediction = tf.nn.softmax(tf.matmul(last, weight) + bias)
cross_entropy = -tf.reduce_sum(target * tf.log(tf.clip_by_value(prediction, 1e-10, 1.0)))
optimizer = tf.train.AdamOptimizer()
minimize = optimizer.minimize(cross_entropy)
mistakes = tf.not_equal(tf.argmax(target, 1), tf.argmax(prediction, 1))
error = tf.reduce_mean(tf.cast(mistakes, tf.float32))

init_op = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init_op)

batch_size = 200
no_of_batches = int(len(train_input) / batch_size)
epoch = 200
for i in range(epoch):
    ptr = 0
    for j in range(no_of_batches):
        inp, out = train_input[ptr:ptr+batch_size], train_output[ptr:ptr+batch_size]
        ptr += batch_size
        sess.run(minimize, {data: inp, target: out})
    if i % 100 == 0:
        incorrect = sess.run(error, {data: inp, target: out})
        print "Epoch {} error: {}".format(i, incorrect*100)
incorrect = sess.run(error, {data: test_input, target: test_output})
print('Epoch {:2d} error {:3.1f}%'.format(i + 1, 100 * incorrect))
# In[12]:
# a random 16-bit input: compare the true count of ones with the predicted count
tt = np.random.randint(0, 2, size=pow2)[np.newaxis, :, np.newaxis]
p = sess.run(tf.argmax(prediction, 1), {data: tt})
print np.sum(tt), p[0]
# In[13]:
sess.close()
# ## Summing Two numbers
# - **Architecture**: We are interested in all the outputs in time (many to many)
# In[14]:
# https://gist.github.com/nivwusquorum/b18ce332bde37e156034e5d3f60f8a23
import numpy as np
import random
import tensorflow as tf
import tensorflow.contrib.layers as layers
tf.reset_default_graph()
# map_fn = tf.python.functional_ops.map_fn
map_fn = tf.map_fn
################################################################################
## DATASET GENERATION ##
## ##
## The problem we are trying to solve is adding two binary numbers. The ##
## numbers are reversed, so that the state of RNN can add the numbers ##
## perfectly provided it can learn to store carry in the state. Timestep t ##
## corresponds to bit len(number) - t. ##
################################################################################
def as_bytes(num, final_size):
    # little-endian bit list: the least-significant bit comes first
    res = []
    for _ in range(final_size):
        res.append(num % 2)
        num //= 2
    return res

def generate_example(num_bits):
    a = random.randint(0, 2**(num_bits - 1) - 1)
    b = random.randint(0, 2**(num_bits - 1) - 1)
    res = a + b
    return (as_bytes(a,  num_bits),
            as_bytes(b,  num_bits),
            as_bytes(res, num_bits))

def generate_batch(num_bits, batch_size):
    """Generates an instance of the problem.

    Returns
    -------
    x: np.array
        the two numbers to be added, represented by bits.
        shape: b, i, n
        where:
            b is the bit index from the end
            i is the example index in the batch
            n is 0 or 1, for the first and second summand respectively
    y: np.array
        the result of the addition
        shape: b, i, n
        where:
            b is the bit index from the end
            i is the example index in the batch
            n is always 0
    """
    x = np.empty((num_bits, batch_size, 2))
    y = np.empty((num_bits, batch_size, 1))
    for i in range(batch_size):
        a, b, r = generate_example(num_bits)
        x[:, i, 0] = a
        x[:, i, 1] = b
        y[:, i, 0] = r
    return x, y
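# Quick sanity check (an added sketch, not part of the original gist): decode one
# generated example back to integers and confirm that the bit-reversed encoding
# really represents a correct sum (bit 0 is the least-significant bit).
def _from_bits(bits):
    return sum(int(b) << k for k, b in enumerate(bits))

_a, _b, _r = generate_example(8)
assert _from_bits(_a) + _from_bits(_b) == _from_bits(_r)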
################################################################################
## GRAPH DEFINITION ##
################################################################################
INPUT_SIZE = 2 # 2 bits per timestep
RNN_HIDDEN = 20
OUTPUT_SIZE = 1 # 1 bit per timestep
TINY = 1e-6 # to avoid NaNs in logs
LEARNING_RATE = 0.01
USE_LSTM = True
inputs = tf.placeholder(tf.float32, (None, None, INPUT_SIZE)) # (time, batch, in)
outputs = tf.placeholder(tf.float32, (None, None, OUTPUT_SIZE)) # (time, batch, out)
if USE_LSTM:
    num_layers = 2
    cell = tf.nn.rnn_cell.BasicLSTMCell(RNN_HIDDEN, state_is_tuple=True)
    cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=True)
else:
    cell = tf.nn.rnn_cell.BasicRNNCell(RNN_HIDDEN)
# Create initial state. Here it is just a constant tensor filled with zeros,
# but in principle it could be a learnable parameter. This is a bit tricky
# to do for LSTM's tuple state, but can be achieved by creating two vector
# Variables, which are then tiled along batch dimension and grouped into tuple.
batch_size = tf.shape(inputs)[1]
initial_state = cell.zero_state(batch_size, tf.float32)
# Given inputs (time, batch, input_size) outputs a tuple
# - outputs: (time, batch, output_size) [do not mistake with OUTPUT_SIZE]
# - states: (time, batch, hidden_size)
rnn_outputs, rnn_states = tf.nn.dynamic_rnn(cell, inputs, initial_state=initial_state, time_major=True)
# project output from rnn output size to OUTPUT_SIZE. Sometimes it is worth adding
# an extra layer here.
final_projection = lambda x: layers.linear(x, num_outputs=OUTPUT_SIZE, activation_fn=tf.nn.sigmoid)
# apply projection to every timestep.
predicted_outputs = map_fn(final_projection, rnn_outputs)
# compute elementwise cross entropy.
error = -(outputs * tf.log(predicted_outputs + TINY) + (1.0 - outputs) * tf.log(1.0 - predicted_outputs + TINY))
error = tf.reduce_mean(error)
# optimize
train_fn = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(error)
# assuming that absolute difference between output and correct answer is 0.5
# or less we can round it to the correct output.
accuracy = tf.reduce_mean(tf.cast(tf.abs(outputs - predicted_outputs) < 0.5, tf.float32))
################################################################################
## TRAINING LOOP ##
################################################################################
NUM_BITS = 10
ITERATIONS_PER_EPOCH = 100
BATCH_SIZE = 16
valid_x, valid_y = generate_batch(num_bits=NUM_BITS, batch_size=100)
session = tf.Session()
# For some reason it is our job to do this:
session.run(tf.global_variables_initializer())
for epoch in range(200):
    epoch_error = 0
    for _ in range(ITERATIONS_PER_EPOCH):
        # here train_fn is what triggers backprop. error and accuracy on their
        # own do not trigger the backprop.
        x, y = generate_batch(num_bits=NUM_BITS, batch_size=BATCH_SIZE)
        epoch_error += session.run([error, train_fn], {
            inputs: x,
            outputs: y,
        })[0]
    epoch_error /= ITERATIONS_PER_EPOCH
    valid_accuracy = session.run(accuracy, {
        inputs: valid_x,
        outputs: valid_y,
    })
    if epoch % 10 == 0:
        print("Epoch %d, train error: %.2f, valid accuracy: %.1f %%" % (epoch, epoch_error, valid_accuracy * 100.0))
#
# In[15]:
preds_valid = session.run(predicted_outputs, {
    inputs: valid_x,
    outputs: valid_y,
})
# In[16]:
i = 20
print (np.around(preds_valid)[:,i,0])
print (valid_y[:,i,0])
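# The bit sequences can also be turned back into integers to check the sum directly
# (an added sketch; remember that bit 0 is the least-significant bit).
# In[ ]:
def bits_to_int(bits):
    return sum(int(round(b)) << k for k, b in enumerate(bits))

print('predicted: %d   target: %d' % (bits_to_int(np.around(preds_valid)[:, i, 0]),
                                      bits_to_int(valid_y[:, i, 0])))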
# In[17]:
session.close()
# ## MNIST RNN (Image Classification)
#
# - **Architecture**:
# - We treat each image as a sequence of its rows (or columns) of pixels (many to one)
# - We are interested in predicting a class only at the end
# In[18]:
# From: https://github.com/aymericdamien/TensorFlow-Examples/blob/master/notebooks/3_NeuralNetworks/recurrent_network.ipynb
'''
A Recurrent Neural Network (LSTM) implementation example using TensorFlow library.
This example is using the MNIST database of handwritten digits (http://yann.lecun.com/exdb/mnist/)
Long Short Term Memory paper: http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
Author: Aymeric Damien
Project: https://github.com/aymericdamien/TensorFlow-Examples/
'''
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf
from tensorflow.python.ops import rnn, rnn_cell
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
# ''' To classify images using a recurrent neural network, we consider every image row as a sequence of pixels. Because the MNIST image shape is 28*28 px, we will then handle 28 sequences of 28 steps for every sample. '''
# In[19]:
# Parameters
learning_rate = 0.001
training_iters = 100000
batch_size = 128
display_step = 10
tf.reset_default_graph()
# Network Parameters
n_input = 28 # MNIST data input (img shape: 28*28)
n_steps = 28 # timesteps
n_hidden = 128 # hidden layer num of features
n_classes = 10 # MNIST total classes (0-9 digits)
# tf Graph input
x = tf.placeholder(tf.float32, [None, n_steps, n_input])
y = tf.placeholder(tf.float32, [None, n_classes])
# Define weights
weights = {
    'out': tf.Variable(tf.random_normal([n_hidden, n_classes]))
}
biases = {
    'out': tf.Variable(tf.random_normal([n_classes]))
}
# Define a two-layer LSTM and run it over the 28 row-sequences of each image
num_layers = 2
lstm_cell = tf.nn.rnn_cell.LSTMCell(n_hidden, state_is_tuple=True)
lstm_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_layers, state_is_tuple=True)
val, _ = tf.nn.dynamic_rnn(lstm_cell, x, dtype=tf.float32)
val = tf.transpose(val, [1, 0, 2])
# keep only the output of the last time step (many to one)
last = tf.gather(val, int(val.get_shape()[0]) - 1)
pred = tf.matmul(last, weights['out']) + biases['out']
# Define loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=pred))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
# Evaluate model
correct_pred = tf.equal(tf.argmax(pred,1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
# Launch the graph
sess1 = tf.Session()
# For some reason it is our job to do this:
sess1.run(tf.global_variables_initializer())
step = 1
# Keep training until reach max iterations
while step * batch_size < training_iters:
    batch_x, batch_y = mnist.train.next_batch(batch_size)
    # Reshape data to get 28 sequences of 28 elements
    batch_x = batch_x.reshape((batch_size, n_steps, n_input))
    # Run optimization op (backprop)
    sess1.run(optimizer, feed_dict={x: batch_x, y: batch_y})
    if step % display_step == 0:
        # Calculate batch accuracy
        acc = sess1.run(accuracy, feed_dict={x: batch_x, y: batch_y})
        # Calculate batch loss
        loss = sess1.run(cost, feed_dict={x: batch_x, y: batch_y})
        print("Iter " + str(step*batch_size) + ", Minibatch Loss= " + \
              "{:.6f}".format(loss) + ", Training Accuracy= " + \
              "{:.5f}".format(acc))
    step += 1
print("Optimization Finished!")
# Calculate accuracy for 128 mnist test images
test_len = 128
test_data = mnist.test.images[:test_len].reshape((-1, n_steps, n_input))
test_label = mnist.test.labels[:test_len]
print("Testing Accuracy:", \
sess1.run(accuracy, feed_dict={x: test_data, y: test_label}))
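# As a quick illustration (an added sketch), classify a single held-out digit and
# compare the predicted class with the true label.
# In[ ]:
one_img = mnist.test.images[0].reshape((1, n_steps, n_input))
pred_class = sess1.run(tf.argmax(pred, 1), feed_dict={x: one_img})
true_class = np.argmax(mnist.test.labels[0])
print('predicted: %d   true: %d' % (pred_class[0], true_class))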
# ## Next steps hopefully by the next week
#
# - **Try this! Polygon example: learn to draw convex polygons as a sequence of two-dimensional points**
# - **Sentiment analysis**
# - **Text and time series**
# - **Multi-dimensional time series prediction**
# # Other applications!
# - handwriting generation
# - https://www.cs.toronto.edu/~graves/handwriting.cgi?text=Vahid+Moosavi&style=&bias=0.15&samples=3
# - https://nbviewer.jupyter.org/github/greydanus/scribe/blob/master/sample.ipynb
# - http://greydanus.github.io/2016/08/21/handwriting/
# - Image captioning
# - http://cs.stanford.edu/people/karpathy/deepimagesent/
# - Seq2Seq models for natural language translation
#
# - Learning and Translation in general
# - http://binds.cs.umass.edu/papers/1995_Siegelmann_Science.pdf
#
# - Extensions to LSTM: Neural Turing Machine and differentiable-neural-computers
# - https://arxiv.org/abs/1410.5401
# - https://deepmind.com/blog/differentiable-neural-computers/
#
#
# ## Two useful sources on RNN
# - http://www.deeplearningbook.org/contents/rnn.html
# - Awesome RNN: https://github.com/kjw0612/awesome-rnn