Simple example of many-to-many classification (a simple POS tagger) using an LSTM.
- tf.data
- padding technique via a user-defined function (pad_seq)
- tf.nn.embedding_lookup for getting the vector of a token (e.g. word, character)
- tf.contrib.seq2seq.sequence_loss
- tf.sequence_mask
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import string
%matplotlib inline
slim = tf.contrib.slim
print(tf.__version__)
1.8.0
sentences = [['I', 'feel', 'hungry'],
             ['tensorflow', 'is', 'very', 'difficult'],
             ['tensorflow', 'is', 'a', 'framework', 'for', 'deep', 'learning'],
             ['tensorflow', 'is', 'very', 'fast', 'changing']]
pos = [['pronoun', 'verb', 'adjective'],
       ['noun', 'verb', 'adverb', 'adjective'],
       ['noun', 'verb', 'determiner', 'noun', 'preposition', 'adjective', 'noun'],
       ['noun', 'verb', 'adverb', 'adjective', 'verb']]
# word dic
word_list = []
for elm in sentences:
    word_list += elm
word_list = list(set(word_list))
word_list.sort()
word_list = ['<pad>'] + word_list
word_dic = {word : idx for idx, word in enumerate(word_list)}
print(word_dic)
{'<pad>': 0, 'I': 1, 'a': 2, 'changing': 3, 'deep': 4, 'difficult': 5, 'fast': 6, 'feel': 7, 'for': 8, 'framework': 9, 'hungry': 10, 'is': 11, 'learning': 12, 'tensorflow': 13, 'very': 14}
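The model below will represent each token as a one-hot vector by looking up rows of an identity matrix with tf.nn.embedding_lookup. A minimal sketch of that idea, using the indices of 'I feel hungry' from word_dic above (run in its own throwaway session; the demo_ names are illustrative, not part of the model):
# one-hot lookup sketch: rows of an identity matrix act as fixed one-hot embeddings
demo_one_hot = tf.eye(len(word_dic), dtype = tf.float32)               # shape (15, 15)
demo_ids = tf.constant([[1, 7, 10]])                                   # 'I', 'feel', 'hungry'
demo_vectors = tf.nn.embedding_lookup(params = demo_one_hot, ids = demo_ids)
with tf.Session() as demo_sess:
    print(demo_sess.run(demo_vectors).shape)                           # (1, 3, 15)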
# pos dic
pos_list = []
for elm in pos:
    pos_list += elm
pos_list = list(set(pos_list))
pos_list.sort()
pos_list = ['<pad>'] + pos_list
print(pos_list)
pos_dic = {pos : idx for idx, pos in enumerate(pos_list)}
pos_dic
['<pad>', 'adjective', 'adverb', 'determiner', 'noun', 'preposition', 'pronoun', 'verb']
{'<pad>': 0, 'adjective': 1, 'adverb': 2, 'determiner': 3, 'noun': 4, 'preposition': 5, 'pronoun': 6, 'verb': 7}
pos_idx_to_dic = {elm[1] : elm[0] for elm in pos_dic.items()}
pos_idx_to_dic
{0: '<pad>', 1: 'adjective', 2: 'adverb', 3: 'determiner', 4: 'noun', 5: 'preposition', 6: 'pronoun', 7: 'verb'}
def pad_seq(sequences, max_len, dic):
    seq_len, seq_indices = [], []
    for seq in sequences:
        seq_len.append(len(seq))
        seq_idx = [dic.get(token) for token in seq]
        seq_idx += (max_len - len(seq_idx)) * [dic.get('<pad>')] # pad with idx 0, the meaningless '<pad>' token
        seq_indices.append(seq_idx)
    return seq_len, seq_indices
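Note that pad_seq only pads; it does not truncate. If a sequence were longer than max_len, the padding term would be an empty list and the row would come out too long. That is safe here because the longest sentence has 7 tokens and max_length is 10, but a defensive variant could look like the following sketch (pad_seq_safe is a hypothetical helper, not part of the original notebook):
def pad_seq_safe(sequences, max_len, dic):
    # same contract as pad_seq, but truncates sequences longer than max_len
    # and caps the reported length accordingly
    seq_len, seq_indices = [], []
    for seq in sequences:
        seq_idx = [dic.get(token) for token in seq[:max_len]]
        seq_len.append(len(seq_idx))
        seq_idx += (max_len - len(seq_idx)) * [dic.get('<pad>')]
        seq_indices.append(seq_idx)
    return seq_len, seq_indices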
max_length = 10
X_length, X_indices = pad_seq(sequences = sentences, max_len = max_length, dic = word_dic)
print(X_length, np.shape(X_indices))
[3, 4, 7, 5] (4, 10)
y = [elm + ['<pad>'] * (max_length - len(elm)) for elm in pos]
y = [list(map(lambda el : pos_dic.get(el), elm)) for elm in y]
print(np.shape(y))
(4, 10)
y
[[6, 7, 1, 0, 0, 0, 0, 0, 0, 0], [4, 7, 2, 1, 0, 0, 0, 0, 0, 0], [4, 7, 3, 4, 5, 1, 4, 0, 0, 0], [4, 7, 2, 1, 7, 0, 0, 0, 0, 0]]
class SimPosLSTM:
    def __init__(self, X_length, X_indices, y, n_of_classes, hidden_dim, max_len, word_dic):
        # Data pipeline
        with tf.variable_scope('input_layer'):
            self._X_length = X_length
            self._X_indices = X_indices
            self._y = y
            one_hot = tf.eye(len(word_dic), dtype = tf.float32)
            self._one_hot = tf.get_variable(name = 'one_hot_embedding', initializer = one_hot,
                                            trainable = False) # because the embedding vectors will not be trained
            self._X_batch = tf.nn.embedding_lookup(params = self._one_hot, ids = self._X_indices)

        # LSTM cell (many to many)
        with tf.variable_scope('lstm_cell'):
            lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_units = hidden_dim,
                                                     activation = tf.nn.tanh)
            score_cell = tf.contrib.rnn.OutputProjectionWrapper(cell = lstm_cell, output_size = n_of_classes)
            self._outputs, _ = tf.nn.dynamic_rnn(cell = score_cell, inputs = self._X_batch,
                                                 sequence_length = self._X_length,
                                                 dtype = tf.float32)

        with tf.variable_scope('seq2seq_loss'):
            masks = tf.sequence_mask(lengths = self._X_length, maxlen = max_len, dtype = tf.float32)
            self.seq2seq_loss = tf.contrib.seq2seq.sequence_loss(logits = self._outputs, targets = self._y,
                                                                 weights = masks)

        with tf.variable_scope('prediction'):
            self._prediction = tf.argmax(input = self._outputs,
                                         axis = 2, output_type = tf.int32)

    def predict(self, sess, X_length, X_indices):
        feed_prediction = {self._X_length : X_length, self._X_indices : X_indices}
        return sess.run(self._prediction, feed_dict = feed_prediction)
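For intuition, sequence_loss with these arguments averages per-timestep cross-entropy over the non-pad positions: tf.sequence_mask produces 1.0 at real tokens and 0.0 at padding, and the masked losses are summed and divided by the total weight. A rough manual equivalent, as a sketch (sequence_loss also adds a tiny epsilon to the denominator):
def manual_sequence_loss(logits, targets, weights):
    # logits : (batch, time, n_of_classes), targets/weights : (batch, time)
    xent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels = targets,
                                                          logits = logits)
    # zero out loss at '<pad>' positions, then average over real tokens
    return tf.reduce_sum(xent * weights) / tf.reduce_sum(weights)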
# hyper-parameters
lr = .003
epochs = 100
batch_size = 2
total_step = int(np.shape(X_indices)[0] / batch_size)
print(total_step)
2
## create data pipeline with tf.data
tr_dataset = tf.data.Dataset.from_tensor_slices((X_length, X_indices, y))
tr_dataset = tr_dataset.shuffle(buffer_size = 20)
tr_dataset = tr_dataset.batch(batch_size = batch_size)
tr_iterator = tr_dataset.make_initializable_iterator()
print(tr_dataset)
<BatchDataset shapes: ((?,), (?, 10), (?, 10)), types: (tf.int32, tf.int32, tf.int32)>
X_length_mb, X_indices_mb, y_mb = tr_iterator.get_next()
sim_pos_lstm = SimPosLSTM(X_length = X_length_mb, X_indices = X_indices_mb, y = y_mb,
                          n_of_classes = 8, hidden_dim = 16, max_len = max_length, word_dic = word_dic)
## create training op
opt = tf.train.AdamOptimizer(learning_rate = lr)
training_op = opt.minimize(loss = sim_pos_lstm.seq2seq_loss)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
tr_loss_hist = []
for epoch in range(epochs):
    avg_tr_loss = 0
    tr_step = 0
    sess.run(tr_iterator.initializer)
    try:
        while True:
            _, tr_loss = sess.run(fetches = [training_op, sim_pos_lstm.seq2seq_loss])
            avg_tr_loss += tr_loss
            tr_step += 1
    except tf.errors.OutOfRangeError:
        pass
    avg_tr_loss /= tr_step
    tr_loss_hist.append(avg_tr_loss)
    if (epoch + 1) % 10 == 0:
        print('epoch : {:3}, tr_loss : {:.3f}'.format(epoch + 1, avg_tr_loss))
epoch :  10, tr_loss : 1.943
epoch :  20, tr_loss : 1.743
epoch :  30, tr_loss : 1.473
epoch :  40, tr_loss : 1.163
epoch :  50, tr_loss : 0.826
epoch :  60, tr_loss : 0.569
epoch :  70, tr_loss : 0.413
epoch :  80, tr_loss : 0.300
epoch :  90, tr_loss : 0.212
epoch : 100, tr_loss : 0.160
yhat = sim_pos_lstm.predict(sess = sess, X_length = X_length, X_indices = X_indices)
yhat
array([[6, 7, 1, 0, 0, 0, 0, 0, 0, 0],
       [4, 7, 2, 1, 0, 0, 0, 0, 0, 0],
       [4, 7, 3, 4, 5, 1, 4, 0, 0, 0],
       [4, 7, 2, 1, 7, 0, 0, 0, 0, 0]], dtype=int32)
y
[[6, 7, 1, 0, 0, 0, 0, 0, 0, 0], [4, 7, 2, 1, 0, 0, 0, 0, 0, 0], [4, 7, 3, 4, 5, 1, 4, 0, 0, 0], [4, 7, 2, 1, 7, 0, 0, 0, 0, 0]]
yhat = [list(map(lambda elm : pos_idx_to_dic.get(elm), row)) for row in yhat]
for elm in yhat:
    print(elm)
['pronoun', 'verb', 'adjective', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['noun', 'verb', 'adverb', 'adjective', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['noun', 'verb', 'determiner', 'noun', 'preposition', 'adjective', 'noun', '<pad>', '<pad>', '<pad>']
['noun', 'verb', 'adverb', 'adjective', 'verb', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
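The predicted tags match the labels exactly on this tiny training set. To quantify that while ignoring '<pad>' positions, one could compute a masked token-level accuracy like the following sketch (yhat_idx recomputes the integer predictions, since yhat was remapped to tag names above):
yhat_idx = sim_pos_lstm.predict(sess = sess, X_length = X_length, X_indices = X_indices)
mask = np.arange(max_length) < np.asarray(X_length)[:, None]   # True at real tokens only
accuracy = (yhat_idx == np.asarray(y))[mask].mean()
print('masked token accuracy : {:.2%}'.format(accuracy))       # 100.00% here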