import tensorflow as tf
import pandas as pd
import numpy as np
import csv
The data comes from https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/data; download ner_dataset.csv from the ZIP archive.
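Since pandas is already imported, a quick way to peek at the file is to load it into a DataFrame. This is only a sketch: the encoding='latin1' argument is an assumption about how the CSV is encoded, and the path should match wherever you saved the file (the same path is used in the parsing code below).
df = pd.read_csv('data/ner_dataset.csv', encoding='latin1')  # encoding is an assumption
print df.head()           # the word is in column 1 and the tag in the column named 'Tag'
print df['Tag'].unique()  # the raw tag set before we upper-case it below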
validation_sentence = 'While speaking on Channels Television on Thursday April 5 2018 Adesina said the fund is not just to intensify the military fight against Boko Haram but to fight other forms of insecurity in the country'
validation_tags = ['O', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'B-TIM', 'I-TIM', 'I-TIM', 'I-TIM', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
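As a quick sanity check, the tag list should line up one-to-one with the 35 tokens of the validation sentence:
assert len(validation_sentence.split(' ')) == len(validation_tags) == 35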
Below we parse the file to load sentences and their tags into separate lists. We also keep only sentences of at most max_length (50) words, which comfortably covers the 35-word validation sentence above.
sentences = []
tags = []
max_length = 50
with open('data/ner_dataset.csv', 'rb') as csvfile:
    ner_data = csv.reader(csvfile, delimiter=',')
    sentence = []
    tag = []
    for row in ner_data:
        # skip the header row
        if row[3] == 'Tag':
            continue
        sentence.append(row[1])
        tag.append(row[3].upper())
        # a '.' token marks the end of a sentence
        if row[1] == '.':
            if len(sentence) <= max_length:
                sentences.append(sentence)
                tags.append(tag)
            sentence = []
            tag = []
Below are sample entries from sentences and tags:
print sentences[:2]
print
print tags[:2]
We'll need to create a vocabulary from our sentences, i.e. a set of unique words. We'll do the same for the tags.
unique_tags = list(set(t for tagset in tags for t in tagset))
vocabulary = list(set(word for sentence in sentences for word in sentence))
print unique_tags
print vocabulary[:10]
print 'Number of words in vocabulary', len(vocabulary)
split = int(.7 * len(sentences))
train_sentences = sentences[:split]
train_tags = tags[:split]
test_sentences = sentences[split:]
test_tags = tags[split:]
len(train_sentences), len(test_sentences), len(sentences)
Simple LSTM network with a softmax at the end
Important NOTE: if you want to run the network on a one-hot encoding of the words, make sure batch_size is set to something low. Higher values might freeze your computer; I tried it on my Core i5, 8 GB RAM laptop and it wasn't pleasant. So stick with the default batch_size of 8, or lower.
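To get a rough sense of why one-hot inputs are heavy, here is a back-of-the-envelope estimate of a single batch's size. It's only a sketch: it assumes a batch_size of 8 (the default used below) and float64 values, since np.eye produces float64 rows.
# rough size of one one-hot encoded batch: batch_size of 8, 8 bytes per float64 value
approx_bytes = 8 * max_length * len(vocabulary) * 8
print 'Approximate one-hot batch size: {0:.1f} MB'.format(approx_bytes / 1e6)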
# Parameters
learning_rate = 0.001
batch_size = 8
target_size = len(unique_tags)
display_size = 50
# Network Parameters
n_features = len(vocabulary)
sequence_length = 10
n_units = 64
tf.reset_default_graph()
# tf Graph input
X = tf.placeholder('float', [None, max_length, n_features], name='X')
Y = tf.placeholder('float', [None, max_length, target_size], name='Y')
# actual (unpadded) length of each sentence in the batch
sequence_lengths = tf.placeholder('int32', [None])
# Define weights
weights = {
    'out': tf.Variable(tf.random_normal([n_units, target_size]))
}
biases = {
    'out': tf.Variable(tf.random_normal([target_size]))
}
cell = tf.contrib.rnn.LSTMCell(n_units, state_is_tuple=True)
output, _ = tf.nn.dynamic_rnn(cell, X, sequence_length=sequence_lengths, dtype=tf.float32)
# flatten the outputs so the softmax layer can be applied to every timestep at once
output = tf.reshape(output, [-1, n_units])
prediction = tf.matmul(output, weights['out']) + biases['out']
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=tf.reshape(Y, [-1, target_size])))
minimize = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cross_entropy)
init = tf.global_variables_initializer()
num_batches = len(train_sentences) // batch_size
epoch = 1
print 'Number of batches:', num_batches
len(train_sentences)
with tf.Session() as sess:
    sess.run(init)
    for i in range(epoch):
        ptr = 0
        for j in range(num_batches):
            batch_X = []
            batch_Y = []
            sequence_length = []
            for _ in range(batch_size):
                x, y = (train_sentences[ptr: ptr + 1],
                        train_tags[ptr: ptr + 1])
                x_one_hot = []
                for s in x[0]:
                    x_one_hot.append(np.eye(len(vocabulary))[vocabulary.index(s)])
                sequence_length.append(len(x_one_hot))
                # pad the sentence remainder with zero vectors
                for remainder in range(max_length - len(x_one_hot)):
                    x_one_hot.append([0] * len(vocabulary))
                batch_X.append(x_one_hot)
                y_one_hot = []
                for t in y[0]:
                    y_one_hot.append(np.eye(target_size)[unique_tags.index(t)])
                # pad the tag remainder with the 'O' (outside) tag
                for remainder in range(max_length - len(y_one_hot)):
                    y_one_hot.append(np.eye(target_size)[unique_tags.index('O')])
                batch_Y.append(y_one_hot)
                ptr += 1
            _, entropy, preds = sess.run([minimize, cross_entropy, prediction],
                                         {X: np.array(batch_X).reshape(batch_size, max_length, len(vocabulary)),
                                          Y: np.array(batch_Y).reshape(batch_size, max_length, target_size),
                                          sequence_lengths: np.array(sequence_length)})
            if j % display_size == 0:
                print 'Loss at batch {0}'.format(j), entropy
        print 'Epoch', str(i)
We'll use Google's pretrained word2vec embeddings, which you can grab from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit. To load the word embeddings, we'll need another tool: gensim.
from gensim.models import word2vec, KeyedVectors
Load the word vectors like so. This operation takes a good while on my laptop (Core i5).
w2v = KeyedVectors.load_word2vec_format('/Users/h/Projects/Machine-Learning/GoogleNews-vectors-negative300.bin.gz', binary=True)
Below is how boy is represented in the embedding:
w2v.word_vec('boy')
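The returned object is a 300-dimensional NumPy vector, which is why n_features is set to 300 below; a quick check:
print w2v.word_vec('boy').shape  # (300,)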
The architecture below is similar to the previous one, except that it uses a bidirectional LSTM and n_features is now the dimension of the vector returned by word2vec.
# Parameters
learning_rate = 1e-5
batch_size = 32
target_size = len(unique_tags)
display_size = 50
# Network Parameters
n_features = 300 # dimension of the vector returned by word2vec
sequence_length = max_length
n_units = 128
tf.reset_default_graph()
# tf Graph input
X = tf.placeholder('float', [None, max_length, n_features], name='X')
Y = tf.placeholder('int32', [None], name='Y')
sequence_lengths = tf.placeholder('int32', [None])
# Define weights
weights = {
    'out': tf.Variable(tf.truncated_normal([2 * n_units, target_size]))
}
biases = {
    'out': tf.Variable(tf.constant(0.1, shape=[target_size]))
}
with tf.variable_scope('forward'):
    lstm_fw_cell = tf.contrib.rnn.LSTMCell(n_units, state_is_tuple=True)
with tf.variable_scope('backward'):
    lstm_bw_cell = tf.contrib.rnn.LSTMCell(n_units, state_is_tuple=True)
(output_fw, output_bw), states = tf.nn.bidirectional_dynamic_rnn(cell_fw=lstm_fw_cell,
                                                                 cell_bw=lstm_bw_cell,
                                                                 inputs=X,
                                                                 sequence_length=sequence_lengths,
                                                                 dtype=tf.float32)
output = tf.concat([output_fw, output_bw], axis=2)
output = tf.reshape(output, [-1, 2 * n_units])
prediction = tf.matmul(output, weights['out']) + biases['out']
flattened_prediction = tf.reshape(prediction, [-1, target_size])
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=flattened_prediction, labels=Y)
# mask out the loss contributed by the padded positions of each sentence
mask = tf.sequence_mask(sequence_lengths)
losses = tf.boolean_mask(tf.reshape(losses, [-1, max_length]), mask)
loss = tf.reduce_mean(losses)
minimize = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
init = tf.global_variables_initializer()
num_batches = len(train_sentences) // batch_size
epoch = 3
print 'Number of batches:', num_batches
saver = tf.train.Saver()  # so the trained weights can be reused for validation below
with tf.Session() as sess:
    sess.run(init)
    for i in range(epoch):
        ptr = 0
        for j in range(num_batches):
            batch_X = []
            batch_Y = []
            sequence_length = []
            for _ in range(batch_size):
                x, y = (train_sentences[ptr: ptr + 1],
                        train_tags[ptr: ptr + 1])
                x_word_vector = []
                sequence_length.append(len(x[0]))
                for s in x[0]:
                    try:
                        x_word_vector.append(w2v.word_vec(s))
                    except KeyError:
                        # if the word isn't in word2vec, use zeroes
                        x_word_vector.append([0] * n_features)
                # pad the sentence remainder with zeroes
                for remainder in range(max_length - len(x_word_vector)):
                    x_word_vector.append([0] * n_features)
                batch_X.append(x_word_vector)
                y_word_vector = []
                for t in y[0]:
                    y_word_vector.append(unique_tags.index(t))
                # pad the tag remainder; these positions are masked out of the loss anyway
                for remainder in range(max_length - len(y_word_vector)):
                    y_word_vector.append(0)
                batch_Y.append(y_word_vector)
                ptr += 1
            _, entropy, preds = sess.run([minimize, loss, prediction],
                                         {X: np.array(batch_X).reshape(batch_size, max_length, n_features),
                                          Y: np.array(batch_Y).reshape(-1),
                                          sequence_lengths: np.array(sequence_length)})
            if j % display_size == 0:
                print 'Loss at batch {0}'.format(j), entropy
        print 'Epoch', str(i)
    saver.save(sess, './bilstm_ner.ckpt')  # checkpoint path is arbitrary
An obvious benefit of using word2vec is that the network runs faster and converges more quickly. It runs faster because we've reduced the feature representation from an outrageous dimension equal to the length of the vocabulary (thousands of features) to only 300, the dimension of the vector returned by word2vec.
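To put rough numbers on that, here is a quick comparison of the per-batch input size for the two representations. It's only an estimate, assuming float32 values (4 bytes each) as fed to the float placeholders.
one_hot_floats = batch_size * max_length * len(vocabulary)   # one-hot representation
word2vec_floats = batch_size * max_length * n_features       # 300-d word2vec representation
print 'one-hot batch: {0:.1f} MB'.format(one_hot_floats * 4 / 1e6)
print 'word2vec batch: {0:.1f} MB'.format(word2vec_floats * 4 / 1e6)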
with tf.Session() as sess:
    # restore the weights learned above instead of re-initializing them,
    # otherwise we'd be evaluating an untrained network
    saver.restore(sess, './bilstm_ner.ckpt')
    valid_words = validation_sentence.split(' ')
    valid_X = []
    for word in valid_words:
        try:
            valid_X.append(w2v.word_vec(word))
        except KeyError:
            # if the word isn't in word2vec, use zeroes
            valid_X.append([0] * n_features)
    # pad the sentence remainder with zeroes
    for remainder in range(max_length - len(valid_X)):
        valid_X.append([0] * n_features)
    valid_Y = []
    for t in validation_tags:
        valid_Y.append(unique_tags.index(t))
    for remainder in range(max_length - len(valid_Y)):
        valid_Y.append(0)
    preds = sess.run([prediction], {X: np.array(valid_X).reshape(1, max_length, n_features),
                                    Y: np.array(valid_Y),
                                    sequence_lengths: [len(valid_words)]})
preds = np.array(preds).reshape(max_length, target_size)
for i, p in enumerate(preds):
    if i < len(valid_words):
        print 'Word:', valid_words[i]
        print 'Actual:', validation_tags[i]
        print 'Predicted:', unique_tags[np.argmax(p)]
        print
More importantly, train on a better dataset. As I mentioned, NER is domain-specific. Our validation sentence contains details that are perhaps specific to Nigeria (see the quick check after this list):
- the name Adesina, and
- Channels Television
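A related quick check is whether those tokens even exist in the pretrained word2vec vocabulary; any word missing there was fed to the network as a zero vector, which makes it even harder to tag. This sketch assumes the pre-4.0 gensim KeyedVectors API used above, where the model exposes a vocab dict.
for token in ['Adesina', 'Channels', 'Boko', 'Haram']:
    print token, 'in word2vec vocabulary:', token in w2v.vocab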