#!/usr/bin/env python
# coding: utf-8

# # Named Entity Recognition (NER) with TensorFlow
# Slides: https://docs.google.com/presentation/d/1eUEOTSeUnR2Sz1uDF4e3YvBaxQ1kLUjog9qkhgBEMV0/edit?usp=sharing

# In[1]:

import tensorflow as tf
import pandas as pd
import numpy as np
import csv


# ### Data Preparation
#
# The data is from https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/data; download ner_dataset.csv from the ZIP archive.

# In[2]:

validation_sentence = 'While speaking on Channels Television on Thursday April 5 2018 Adesina said the fund is not just to intensify the military fight against Boko Haram but to fight other forms of insecurity in the country'
validation_tags = ['O', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'B-TIM', 'I-TIM', 'I-TIM', 'I-TIM', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


# Below we parse the file to load the sentences and tags into separate lists. We also keep only sentences of at most `max_length` (50) words, which comfortably covers the 35-word validation sentence above.

# In[3]:

sentences = []
tags = []
max_length = 50

with open('data/ner_dataset.csv', 'rb') as csvfile:
    ner_data = csv.reader(csvfile, delimiter=',')
    sentence = []
    tag = []
    for row in ner_data:
        if row[3] == 'Tag':  # skip the header row
            continue
        sentence.append(row[1])
        tag.append(row[3].upper())
        if row[1] == '.':  # a full stop ends the current sentence
            if len(sentence) <= max_length:
                sentences.append(sentence)
                tags.append(tag)
            sentence = []
            tag = []


# Below are sample entries of `sentences` and `tags`

# In[4]:

print sentences[:2]
print
print tags[:2]


# We'll need to create a vocabulary from our sentences, i.e. a set of unique words. We'll do the same for the tags.

# In[54]:

unique_tags = list(set(t for tagset in tags for t in tagset))
vocabulary = list(set(word for sentence in sentences for word in sentence))


# In[48]:

print unique_tags


# In[7]:

print vocabulary[:10]
print 'Number of words in vocabulary', len(vocabulary)


# A simple 70/30 train/test split:

# In[8]:

train_sentences = sentences[:int(.7 * len(sentences))]
train_tags = tags[:int(.7 * len(tags))]

test_sentences = sentences[int(.7 * len(sentences)):]
test_tags = tags[int(.7 * len(tags)):]


# In[9]:

len(train_sentences), len(test_sentences), len(sentences)


# ### Model Architecture
# A simple LSTM network with a softmax at the end.
#
# Important NOTE: if you want to run the network using a one-hot encoding of the words, make sure `batch_size` is set to something low. Higher values might freeze your computer; I tried on my Core i5, 8GB RAM laptop and it wasn't pleasant. So stick with the default `batch_size` of 8, or lower.
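# For a rough sense of why, here's a back-of-the-envelope estimate. The vocabulary size below is an assumed round figure (print `len(vocabulary)` for the real one). A single one-hot batch is `batch_size * max_length * |vocabulary|` numbers, and the `np.eye(len(vocabulary))` trick used in the training loop below materialises a full `|vocabulary| x |vocabulary|` identity matrix every time it is called, which is heavier still.

# In[ ]:

assumed_vocab_size = 30000  # assumption for illustration; use len(vocabulary) for the real figure

values_per_batch = 8 * max_length * assumed_vocab_size  # batch_size * max_length * |vocabulary|
print 'Values in one one-hot batch:', values_per_batch
print 'Approximate MB per batch (8-byte floats):', values_per_batch * 8 / (1024 * 1024)
print 'Approximate GB for one np.eye(len(vocabulary)) call:', assumed_vocab_size ** 2 * 8 / (1024 ** 3)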
# In[10]:

# Parameters
learning_rate = 0.001
batch_size = 8
target_size = len(unique_tags)
display_size = 50

# Network Parameters
n_features = len(vocabulary)
n_units = 64

tf.reset_default_graph()

# tf Graph input
X = tf.placeholder('float', [None, max_length, n_features], name='X')
Y = tf.placeholder('float', [None, max_length, target_size], name='Y')

# Define weights
weights = {
    'out': tf.Variable(tf.random_normal([n_units, target_size]))
}
biases = {
    'out': tf.Variable(tf.random_normal([target_size]))
}


# In[11]:

cell = tf.contrib.rnn.LSTMCell(n_units, state_is_tuple=True)
output, _ = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

# output has shape (batch_size, max_length, n_units); flatten it so every time step
# gets its own row before the softmax layer
output = tf.reshape(output, [-1, n_units])
prediction = tf.matmul(output, weights['out']) + biases['out']

# prediction has shape (batch_size * max_length, target_size), so flatten the labels to match
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=prediction,
                                            labels=tf.reshape(Y, [-1, target_size])))
minimize = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cross_entropy)


# In[12]:

init = tf.global_variables_initializer()
num_batches = len(train_sentences) // batch_size
epoch = 1
print 'Number of batches:', num_batches


# In[13]:

len(train_sentences)


# ### Run graph using one-hot encoding of words

# In[39]:

with tf.Session() as sess:
    sess.run(init)
    for i in range(epoch):
        ptr = 0  # position in the training set; reset once per epoch so every batch sees new sentences
        for j in range(num_batches):
            batch_X = []
            batch_Y = []
            sequence_length = []  # actual (unpadded) lengths; this graph has no sequence_lengths placeholder, so they aren't fed
            for _ in range(batch_size):
                x, y = (train_sentences[ptr: ptr + 1], train_tags[ptr: ptr + 1])

                # one-hot encode each word (np.eye builds a |vocabulary| x |vocabulary|
                # identity matrix on every call, which is where most of the memory goes)
                x_one_hot = []
                for s in x[0]:
                    x_one_hot.append(np.eye(len(vocabulary))[vocabulary.index(s)])
                sequence_length.append(len(x_one_hot))
                # pad the sentence to max_length with zero vectors
                for remainder in range(max_length - len(x_one_hot)):
                    x_one_hot.append([0] * len(vocabulary))
                batch_X.append(x_one_hot)

                # one-hot encode the tags, padding with the 'O' tag
                y_one_hot = []
                for t in y[0]:
                    y_one_hot.append(np.eye(target_size)[unique_tags.index(t)])
                for remainder in range(max_length - len(y_one_hot)):
                    y_one_hot.append(np.eye(target_size)[unique_tags.index('O')])
                batch_Y.append(y_one_hot)

                ptr += 1

            _, entropy, preds = sess.run([minimize, cross_entropy, prediction],
                                         {X: np.array(batch_X).reshape(batch_size, max_length, len(vocabulary)),
                                          Y: np.array(batch_Y).reshape(batch_size, max_length, target_size)})
            if j % display_size == 0:
                print 'Loss at batch {0}'.format(j), entropy
        print "Epoch ", str(i)


# ### Word Embeddings
# We'll use Google's pretrained word2vec vectors, which you can grab from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit.
# To load the word embeddings, we'll need another tool, `gensim`.

# In[15]:

from gensim.models import word2vec, KeyedVectors


# Load the word vectors like so. This operation takes a good while on my laptop (Core i5).
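# If loading the full 3-million-word model is too slow or memory-hungry, `load_word2vec_format` also accepts a `limit` argument that reads only the first N (most frequent) vectors; the commented-out call below is an optional alternative to the full load in the next cell.

# In[ ]:

# Optional: load a 500,000-word subset instead of the full model
# w2v = KeyedVectors.load_word2vec_format('/Users/h/Projects/Machine-Learning/GoogleNews-vectors-negative300.bin.gz',
#                                         binary=True, limit=500000)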
# In[16]:

w2v = KeyedVectors.load_word2vec_format('/Users/h/Projects/Machine-Learning/GoogleNews-vectors-negative300.bin.gz', binary=True)


# Below is how `boy` is represented according to the embedding

# In[17]:

w2v.word_vec('boy')


# ### Run graph with words represented as word2vec
# Mostly the same architecture as before, except `n_features` is now the dimension of the vector returned by word2vec (300). We also switch to a bidirectional LSTM, pass the tags as sparse integer labels, and mask out the loss on padded positions.

# In[151]:

# Parameters
learning_rate = 1e-5
batch_size = 32
target_size = len(unique_tags)
display_size = 50

# Network Parameters
n_features = 300  # dimension of the vector returned by word2vec
n_units = 128

tf.reset_default_graph()

# tf Graph input
X = tf.placeholder('float', [None, max_length, n_features], name='X')
Y = tf.placeholder('int32', [None], name='Y')
sequence_lengths = tf.placeholder('int32', [None])

# Define weights
weights = {
    'out': tf.Variable(tf.truncated_normal([2 * n_units, target_size]))
}
biases = {
    'out': tf.Variable(tf.constant(0.1, shape=[target_size]))
}


# In[152]:

with tf.variable_scope('forward'):
    lstm_fw_cell = tf.contrib.rnn.LSTMCell(n_units, state_is_tuple=True)
with tf.variable_scope('backward'):
    lstm_bw_cell = tf.contrib.rnn.LSTMCell(n_units, state_is_tuple=True)

(output_fw, output_bw), states = tf.nn.bidirectional_dynamic_rnn(cell_fw=lstm_fw_cell,
                                                                 cell_bw=lstm_bw_cell,
                                                                 inputs=X,
                                                                 sequence_length=sequence_lengths,
                                                                 dtype=tf.float32)

# concatenate the forward and backward outputs at each time step
output = tf.concat([output_fw, output_bw], axis=2)
output = tf.reshape(output, [-1, 2 * n_units])

prediction = tf.matmul(output, weights['out']) + biases['out']
flattened_prediction = tf.reshape(prediction, [-1, target_size])

losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=flattened_prediction, labels=Y)

# mask out the loss on padded positions so they don't contribute to training
# (pass max_length explicitly so the mask always matches the padded width)
mask = tf.sequence_mask(sequence_lengths, max_length)
losses = tf.boolean_mask(tf.reshape(losses, [-1, max_length]), mask)
loss = tf.reduce_mean(losses)

minimize = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)


# In[153]:

init = tf.global_variables_initializer()
num_batches = len(train_sentences) // batch_size
epoch = 3
print 'Number of batches:', num_batches


# In[ ]:

with tf.Session() as sess:
    sess.run(init)
    for i in range(epoch):
        ptr = 0
        for j in range(num_batches):
            batch_X = []
            batch_Y = []
            sequence_length = []
            for _ in range(batch_size):
                x, y = (train_sentences[ptr: ptr + 1], train_tags[ptr: ptr + 1])

                x_word_vector = []
                sequence_length.append(len(x[0]))
                for s in x[0]:
                    try:
                        x_word_vector.append(w2v.word_vec(s))
                    except KeyError:
                        # if the word isn't in word2vec, use zeroes
                        x_word_vector.append([0] * n_features)
                # pad the sentence remainder with zeroes
                for remainder in range(max_length - len(x_word_vector)):
                    x_word_vector.append([0] * n_features)
                batch_X.append(x_word_vector)

                # tags are passed as indices (sparse labels), padded with 0;
                # the padded positions are masked out of the loss anyway
                y_word_vector = []
                for t in y[0]:
                    y_word_vector.append(unique_tags.index(t))
                for remainder in range(max_length - len(y_word_vector)):
                    y_word_vector.append(0)
                batch_Y.append(y_word_vector)

                ptr += 1

            _, entropy, preds = sess.run([minimize, loss, prediction],
                                         {X: np.array(batch_X).reshape(batch_size, max_length, n_features),
                                          Y: np.array(batch_Y).reshape(-1),
                                          sequence_lengths: np.array(sequence_length)})
            if j % display_size == 0:
                print 'Loss at batch {0}'.format(j), entropy
        print "Epoch ", str(i)


# The obvious benefit of using word2vec is that the network runs faster and converges more quickly. It runs faster because we've reduced the feature representation from an outrageous dimension (the size of the vocabulary, in the thousands) to only 300, the dimension of the vectors returned by word2vec.
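# One caveat before predicting: the trained weights live only inside the training session above and are gone once that `with` block exits, so the prediction cell below, which calls `sess.run(init)`, starts from freshly initialised weights as written. Below is a minimal sketch of persisting the weights with `tf.train.Saver`; the checkpoint path is just an illustrative name, and you'd build the saver before running the training session.

# In[ ]:

checkpoint_path = 'models/ner_bilstm.ckpt'  # illustrative path; any writable location works

saver = tf.train.Saver()  # built against the variables of the current graph

# If you rerun the notebook with this cell placed before the training session:
#   - after the epoch loop inside the training session: saver.save(sess, checkpoint_path)
#   - in the prediction session, instead of sess.run(init): saver.restore(sess, checkpoint_path)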
# ### Prediction

# In[ ]:

valid_words = validation_sentence.split(' ')

with tf.Session() as sess:
    sess.run(init)  # NOTE: this starts from fresh random weights; restore a saved checkpoint here instead to predict with the trained model (see the tf.train.Saver sketch above)

    valid_X = []
    for word in valid_words:
        try:
            valid_X.append(w2v.word_vec(word))
        except KeyError:
            # if the word isn't in word2vec, use zeroes
            valid_X.append([0] * n_features)
    # pad the sentence remainder with zeroes
    for remainder in range(max_length - len(valid_X)):
        valid_X.append([0] * n_features)

    valid_Y = []
    for t in validation_tags:
        valid_Y.append(unique_tags.index(t))
    for remainder in range(max_length - len(valid_Y)):
        valid_Y.append(0)

    preds = sess.run([prediction],
                     {X: np.array(valid_X).reshape(1, max_length, n_features),
                      Y: np.array(valid_Y),
                      sequence_lengths: [len(valid_words)]})

preds = np.array(preds).reshape(max_length, target_size)
for i, p in enumerate(preds):
    if i < len(valid_words):  # only the first 35 positions are real words; the rest is padding
        print 'Word:', valid_words[i]
        print 'Actual:', validation_tags[i]
        print 'Predicted:', unique_tags[np.argmax(p)]
        print


# ### Things to try
# - Add dropout (a sketch is included at the end of this notebook)
# - Replace the softmax with a linear-chain CRF
# - Try other word representations; GloVe?
# - Tune the batch size, learning rate, number of units in the LSTM cell, number of epochs
# - Try GRU cells
# - Add MOAR layers!!!
# - Use longer sentences
#
# More importantly, train on a better dataset. Like I mentioned, NER is domain specific. Our validation sentence contains details perhaps specific to Nigeria:
# - the name Adesina and
# - Channels Television
#
# ### Resources
# - Sequence Tagging with Tensorflow https://guillaumegenthial.github.io/sequence-tagging-with-tensorflow.html
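# As a starting point for the first suggestion above, here is a minimal sketch of adding dropout with `tf.contrib.rnn.DropoutWrapper`, assuming the bidirectional graph from earlier in the notebook. `keep_prob` is a new placeholder (my naming) so dropout can be turned off at prediction time by feeding 1.0.

# In[ ]:

keep_prob = tf.placeholder('float', [], name='keep_prob')

# Use these in place of the plain LSTMCells when (re)building the graph
lstm_fw_cell = tf.contrib.rnn.DropoutWrapper(
    tf.contrib.rnn.LSTMCell(n_units, state_is_tuple=True),
    output_keep_prob=keep_prob)
lstm_bw_cell = tf.contrib.rnn.DropoutWrapper(
    tf.contrib.rnn.LSTMCell(n_units, state_is_tuple=True),
    output_keep_prob=keep_prob)

# Then feed {keep_prob: 0.5} (for example) in the training sess.run calls
# and {keep_prob: 1.0} when predicting.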