#!/usr/bin/env python
# coding: utf-8

# # Named Entity Recognition (NER) with TensorFlow
# Slides: https://docs.google.com/presentation/d/1eUEOTSeUnR2Sz1uDF4e3YvBaxQ1kLUjog9qkhgBEMV0/edit?usp=sharing

# In[1]:

import tensorflow as tf
import pandas as pd
import numpy as np
import csv


# ### Data Preparation
#
# The data is from https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/data; download ner_dataset.csv from the ZIP archive.

# In[2]:

validation_sentence = 'While speaking on Channels Television on Thursday April 5 2018 Adesina said the fund is not just to intensify the military fight against Boko Haram but to fight other forms of insecurity in the country'
validation_tags = ['O', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'B-TIM', 'I-TIM', 'I-TIM', 'I-TIM', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


# Below we parse the file to load the sentences and tags into separate lists. We also keep only sentences of at most `max_length` (50) words, which comfortably covers the 35-word validation sentence above.

# In[3]:

sentences = []
tags = []
max_length = 50

with open('data/ner_dataset.csv', 'rb') as csvfile:
    ner_data = csv.reader(csvfile, delimiter=',')
    sentence = []
    tag = []
    for row in ner_data:
        if row[3] == 'Tag':  # skip the header row
            continue
        sentence.append(row[1])
        tag.append(row[3].upper())
        if row[1] == '.':  # a full stop ends the current sentence
            if len(sentence) <= max_length:
                sentences.append(sentence)
                tags.append(tag)
            sentence = []
            tag = []


# Below are sample entries of `sentences` and `tags`

# In[4]:

print sentences[:2]
print
print tags[:2]


# We'll need to create a vocabulary from our sentences, i.e. a set of unique words. We'll do the same for the tags.

# In[54]:

unique_tags = list(set(t for tagset in tags for t in tagset))
vocabulary = list(set(word for sentence in sentences for word in sentence))


# In[48]:

print unique_tags


# In[7]:

print vocabulary[:10]
print 'Number of words in vocabulary', len(vocabulary)


# A simple 70/30 train/test split:

# In[8]:

train_sentences = sentences[:int(.7 * len(sentences))]
train_tags = tags[:int(.7 * len(tags))]

test_sentences = sentences[int(.7 * len(sentences)):]
test_tags = tags[int(.7 * len(tags)):]


# In[9]:

len(train_sentences), len(test_sentences), len(sentences)


# ### Model Architecture
# A simple LSTM network with a softmax at the end.
#
# Important NOTE: if you want to run the network using a one-hot encoding of the words, make sure `batch_size` is set to something low. Higher values might freeze your computer; I tried on my Core i5, 8GB RAM laptop and it wasn't pleasant. So stick with the default `batch_size` of 8, or lower.
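# For a rough sense of why, here's a back-of-the-envelope estimate. The vocabulary size below is an assumed round figure (print `len(vocabulary)` for the real one). A single one-hot batch is `batch_size * max_length * |vocabulary|` numbers, and the `np.eye(len(vocabulary))` trick used in the training loop below materialises a full `|vocabulary| x |vocabulary|` identity matrix every time it is called, which is heavier still.

# In[ ]:

assumed_vocab_size = 30000  # assumption for illustration; use len(vocabulary) for the real figure

values_per_batch = 8 * max_length * assumed_vocab_size  # batch_size * max_length * |vocabulary|
print 'Values in one one-hot batch:', values_per_batch
print 'Approximate MB per batch (8-byte floats):', values_per_batch * 8 / (1024 * 1024)
print 'Approximate GB for one np.eye(len(vocabulary)) call:', assumed_vocab_size ** 2 * 8 / (1024 ** 3)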
# In[10]:

# Parameters
learning_rate = 0.001
batch_size = 8
target_size = len(unique_tags)
display_size = 50

# Network Parameters
n_features = len(vocabulary)
n_units = 64

tf.reset_default_graph()

# tf Graph input
X = tf.placeholder('float', [None, max_length, n_features], name='X')
Y = tf.placeholder('float', [None, max_length, target_size], name='Y')

# Define weights
weights = {
    'out': tf.Variable(tf.random_normal([n_units, target_size]))
}
biases = {
    'out': tf.Variable(tf.random_normal([target_size]))
}


# In[11]:

cell = tf.contrib.rnn.LSTMCell(n_units, state_is_tuple=True)
output, _ = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

# output has shape (batch_size, max_length, n_units); flatten it so every time step
# gets its own row before the softmax layer
output = tf.reshape(output, [-1, n_units])
prediction = tf.matmul(output, weights['out']) + biases['out']

# prediction has shape (batch_size * max_length, target_size), so flatten the labels to match
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=prediction,
                                            labels=tf.reshape(Y, [-1, target_size])))
minimize = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cross_entropy)


# In[12]:

init = tf.global_variables_initializer()
num_batches = len(train_sentences) // batch_size
epoch = 1
print 'Number of batches:', num_batches


# In[13]:

len(train_sentences)


# ### Run graph using one-hot encoding of words

# In[39]:

with tf.Session() as sess:
    sess.run(init)
    for i in range(epoch):
        ptr = 0  # position in the training set; reset once per epoch so every batch sees new sentences
        for j in range(num_batches):
            batch_X = []
            batch_Y = []
            sequence_length = []  # actual (unpadded) lengths; this graph has no sequence_lengths placeholder, so they aren't fed
            for _ in range(batch_size):
                x, y = (train_sentences[ptr: ptr + 1], train_tags[ptr: ptr + 1])

                # one-hot encode each word (np.eye builds a |vocabulary| x |vocabulary|
                # identity matrix on every call, which is where most of the memory goes)
                x_one_hot = []
                for s in x[0]:
                    x_one_hot.append(np.eye(len(vocabulary))[vocabulary.index(s)])
                sequence_length.append(len(x_one_hot))
                # pad the sentence to max_length with zero vectors
                for remainder in range(max_length - len(x_one_hot)):
                    x_one_hot.append([0] * len(vocabulary))
                batch_X.append(x_one_hot)

                # one-hot encode the tags, padding with the 'O' tag
                y_one_hot = []
                for t in y[0]:
                    y_one_hot.append(np.eye(target_size)[unique_tags.index(t)])
                for remainder in range(max_length - len(y_one_hot)):
                    y_one_hot.append(np.eye(target_size)[unique_tags.index('O')])
                batch_Y.append(y_one_hot)

                ptr += 1

            _, entropy, preds = sess.run([minimize, cross_entropy, prediction],
                                         {X: np.array(batch_X).reshape(batch_size, max_length, len(vocabulary)),
                                          Y: np.array(batch_Y).reshape(batch_size, max_length, target_size)})
            if j % display_size == 0:
                print 'Loss at batch {0}'.format(j), entropy
        print "Epoch ", str(i)


# ### Word Embeddings
# We'll use Google's pretrained word2vec vectors, which you can grab from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit.
# To load the word embeddings, we'll need another tool, `gensim`.

# In[15]:

from gensim.models import word2vec, KeyedVectors


# Load the word vectors like so. This operation takes a good while on my laptop (Core i5).
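# If loading the full 3-million-word model is too slow or memory-hungry, `load_word2vec_format` also accepts a `limit` argument that reads only the first N (most frequent) vectors; the commented-out call below is an optional alternative to the full load in the next cell.

# In[ ]:

# Optional: load a 500,000-word subset instead of the full model
# w2v = KeyedVectors.load_word2vec_format('/Users/h/Projects/Machine-Learning/GoogleNews-vectors-negative300.bin.gz',
#                                         binary=True, limit=500000)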
# In[16]:

w2v = KeyedVectors.load_word2vec_format('/Users/h/Projects/Machine-Learning/GoogleNews-vectors-negative300.bin.gz', binary=True)


# Below is how `boy` is represented according to the embedding

# In[17]:

w2v.word_vec('boy')


# ### Run graph with words represented as word2vec
# Mostly the same architecture as before, except `n_features` is now the dimension of the vector returned by word2vec (300). We also switch to a bidirectional LSTM, pass the tags as sparse integer labels, and mask out the loss on padded positions.

# In[151]:

# Parameters
learning_rate = 1e-5
batch_size = 32
target_size = len(unique_tags)
display_size = 50

# Network Parameters
n_features = 300  # dimension of the vector returned by word2vec
n_units = 128

tf.reset_default_graph()

# tf Graph input
X = tf.placeholder('float', [None, max_length, n_features], name='X')
Y = tf.placeholder('int32', [None], name='Y')
sequence_lengths = tf.placeholder('int32', [None])

# Define weights
weights = {
    'out': tf.Variable(tf.truncated_normal([2 * n_units, target_size]))
}
biases = {
    'out': tf.Variable(tf.constant(0.1, shape=[target_size]))
}


# In[152]:

with tf.variable_scope('forward'):
    lstm_fw_cell = tf.contrib.rnn.LSTMCell(n_units, state_is_tuple=True)
with tf.variable_scope('backward'):
    lstm_bw_cell = tf.contrib.rnn.LSTMCell(n_units, state_is_tuple=True)

(output_fw, output_bw), states = tf.nn.bidirectional_dynamic_rnn(cell_fw=lstm_fw_cell,
                                                                 cell_bw=lstm_bw_cell,
                                                                 inputs=X,
                                                                 sequence_length=sequence_lengths,
                                                                 dtype=tf.float32)

# concatenate the forward and backward outputs at each time step
output = tf.concat([output_fw, output_bw], axis=2)
output = tf.reshape(output, [-1, 2 * n_units])

prediction = tf.matmul(output, weights['out']) + biases['out']
flattened_prediction = tf.reshape(prediction, [-1, target_size])

losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=flattened_prediction, labels=Y)

# mask out the loss on padded positions so they don't contribute to training
# (pass max_length explicitly so the mask always matches the padded width)
mask = tf.sequence_mask(sequence_lengths, max_length)
losses = tf.boolean_mask(tf.reshape(losses, [-1, max_length]), mask)
loss = tf.reduce_mean(losses)

minimize = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)


# In[153]:

init = tf.global_variables_initializer()
num_batches = len(train_sentences) // batch_size
epoch = 3
print 'Number of batches:', num_batches


# In[ ]:

with tf.Session() as sess:
    sess.run(init)
    for i in range(epoch):
        ptr = 0
        for j in range(num_batches):
            batch_X = []
            batch_Y = []
            sequence_length = []
            for _ in range(batch_size):
                x, y = (train_sentences[ptr: ptr + 1], train_tags[ptr: ptr + 1])

                x_word_vector = []
                sequence_length.append(len(x[0]))
                for s in x[0]:
                    try:
                        x_word_vector.append(w2v.word_vec(s))
                    except KeyError:
                        # if the word isn't in word2vec, use zeroes
                        x_word_vector.append([0] * n_features)
                # pad the sentence remainder with zeroes
                for remainder in range(max_length - len(x_word_vector)):
                    x_word_vector.append([0] * n_features)
                batch_X.append(x_word_vector)

                # tags are passed as indices (sparse labels), padded with 0;
                # the padded positions are masked out of the loss anyway
                y_word_vector = []
                for t in y[0]:
                    y_word_vector.append(unique_tags.index(t))
                for remainder in range(max_length - len(y_word_vector)):
                    y_word_vector.append(0)
                batch_Y.append(y_word_vector)

                ptr += 1

            _, entropy, preds = sess.run([minimize, loss, prediction],
                                         {X: np.array(batch_X).reshape(batch_size, max_length, n_features),
                                          Y: np.array(batch_Y).reshape(-1),
                                          sequence_lengths: np.array(sequence_length)})
            if j % display_size == 0:
                print 'Loss at batch {0}'.format(j), entropy
        print "Epoch ", str(i)


# The obvious benefit of using word2vec is that the network runs faster and converges more quickly. It runs faster because we've reduced the feature representation from an outrageous dimension (the size of the vocabulary, in the thousands) to only 300, the dimension of the vectors returned by word2vec.
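# One caveat before predicting: the trained weights live only inside the training session above and are gone once that `with` block exits, so the prediction cell below, which calls `sess.run(init)`, starts from freshly initialised weights as written. Below is a minimal sketch of persisting the weights with `tf.train.Saver`; the checkpoint path is just an illustrative name, and you'd build the saver before running the training session.

# In[ ]:

checkpoint_path = 'models/ner_bilstm.ckpt'  # illustrative path; any writable location works

saver = tf.train.Saver()  # built against the variables of the current graph

# If you rerun the notebook with this cell placed before the training session:
#   - after the epoch loop inside the training session: saver.save(sess, checkpoint_path)
#   - in the prediction session, instead of sess.run(init): saver.restore(sess, checkpoint_path)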
# ### Prediction

# In[ ]:

valid_words = validation_sentence.split(' ')

with tf.Session() as sess:
    sess.run(init)  # NOTE: this starts from fresh random weights; restore a saved checkpoint here instead to predict with the trained model (see the tf.train.Saver sketch above)

    valid_X = []
    for word in valid_words:
        try:
            valid_X.append(w2v.word_vec(word))
        except KeyError:
            # if the word isn't in word2vec, use zeroes
            valid_X.append([0] * n_features)
    # pad the sentence remainder with zeroes
    for remainder in range(max_length - len(valid_X)):
        valid_X.append([0] * n_features)

    valid_Y = []
    for t in validation_tags:
        valid_Y.append(unique_tags.index(t))
    for remainder in range(max_length - len(valid_Y)):
        valid_Y.append(0)

    preds = sess.run([prediction],
                     {X: np.array(valid_X).reshape(1, max_length, n_features),
                      Y: np.array(valid_Y),
                      sequence_lengths: [len(valid_words)]})

preds = np.array(preds).reshape(max_length, target_size)
for i, p in enumerate(preds):
    if i < len(valid_words):  # only the first 35 positions are real words; the rest is padding
        print 'Word:', valid_words[i]
        print 'Actual:', validation_tags[i]
        print 'Predicted:', unique_tags[np.argmax(p)]
        print


# ### Things to try
# - Add dropout (a sketch is included at the end of this notebook)
# - Replace the softmax with a linear-chain CRF
# - Try other word representations; GloVe?
# - Tune the batch size, learning rate, number of units in the LSTM cell, number of epochs
# - Try GRU cells
# - Add MOAR layers!!!
# - Use longer sentences
#
# More importantly, train on a better dataset. Like I mentioned, NER is domain specific. Our validation sentence contains details perhaps specific to Nigeria:
# - the name Adesina and
# - Channels Television
#
# ### Resources
# - Sequence Tagging with Tensorflow https://guillaumegenthial.github.io/sequence-tagging-with-tensorflow.html
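# As a starting point for the first suggestion above, here is a minimal sketch of adding dropout with `tf.contrib.rnn.DropoutWrapper`, assuming the bidirectional graph from earlier in the notebook. `keep_prob` is a new placeholder (my naming) so dropout can be turned off at prediction time by feeding 1.0.

# In[ ]:

keep_prob = tf.placeholder('float', [], name='keep_prob')

# Use these in place of the plain LSTMCells when (re)building the graph
lstm_fw_cell = tf.contrib.rnn.DropoutWrapper(
    tf.contrib.rnn.LSTMCell(n_units, state_is_tuple=True),
    output_keep_prob=keep_prob)
lstm_bw_cell = tf.contrib.rnn.DropoutWrapper(
    tf.contrib.rnn.LSTMCell(n_units, state_is_tuple=True),
    output_keep_prob=keep_prob)

# Then feed {keep_prob: 0.5} (for example) in the training sess.run calls
# and {keep_prob: 1.0} when predicting.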