In this notebook I explore a basic sequence model, i.e. using an RNN (and its variants) to build an N:1 sequence model. Here N:1 means the model takes a sequence (a natural-language text) as input and makes a single prediction based on the training data. In the NLP literature these models are called RNN acceptors, but Andrej Karpathy popularized the N:1 (or N:N, or 1:N) way of describing RNNs in his famous blog post, The Unreasonable Effectiveness of Recurrent Neural Networks.
This was tested on tensorflow-gpu==1.13.1; the CuDNN layers used below require a GPU.
import tensorflow as tf
from keras import backend
import logging
import numpy as np
import pandas as pd
import random
pd.set_option('display.max_colwidth', -1)
#To have reproducibility: set all the seeds, make sure multithreading is off, and if possible don't use the GPU.
tf.set_random_seed(7)
np.random.seed(7)
random.seed(7)  # the `random` module is used for data generation below, so seed it as well
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
backend.set_session(tf.Session(graph=tf.get_default_graph(), config=session_conf))
# Synthetic Training and Test data
def generate_data(hour, minute, sentence=''):
special = [15,30]
suffix = ""
#print(hour, minute)
dictionary = {1:"one", 2:"two", 3:"three", 4:"four", 5:"five", 6:"six", 7:"seven", 8:"eight", 9:"nine", 10:"ten", 11:"eleven", 12:"twelve", 13:"thirteen",
14:"fourteen", 16:"sixteen", 17:"seventeen", 18:"eighteen", 19:"nineteen", 20:"twenty", 30:"thirty",
40:"forty", 50:"fifty"}
result = ""
if minute == 15:
result= "quarter past"
elif minute == 30:
result= "half past"
elif minute == 0:
pass
else:
if minute in dictionary:
result = dictionary[minute] + " minutes past"
else:
minute1 = int(str(minute // 10 ) + "0")
minute2 = minute % 10
result = dictionary[minute1] + ' ' + dictionary[minute2] + " minutes past"
if hour == 0:
suffix = "mid night"
elif hour >= 1 and hour <= 11:
suffix = "morning"
elif hour == 12:
suffix = "noon"
elif hour > 12 and hour <=16:
suffix = "after noon"
elif hour > 16 and hour <=19:
suffix = "evening"
    elif hour >= 20 and hour <= 23:  # >= so that hour 20 doesn't fall through with an empty suffix
suffix = "night"
save_hour = hour
if hour > 12:
hour = hour - 12
if hour > 0:
        # Let's introduce some variation in how the hours and suffixes are combined, just for randomness
if hour % 2 == 0:
result = result + " " + dictionary[hour]+ " in the " + suffix
else:
result = result + " " + dictionary[hour]+ " " + suffix
else:
result = result + " " + suffix
if sentence != '':
result = sentence.replace('#@#', result)
return save_hour, minute, result
# Random sentence templates to drop our time components into, forming proper English sentences
sentence=[
'The murder happened exactly #@#',
'#@#, was the time on the clock when I entered the house',
'Time flies its #@# now',
'Really was it #@# twice in a row?'
]
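As a quick sanity check (assuming the cells above have been run), the generator can be called directly with a fixed time and template; the exact wording follows the even/odd-hour branching above.
# Illustrative spot-check of generate_data (not part of the original pipeline)
print(generate_data(14, 45, sentence[0]))  # e.g. (14, 45, 'The murder happened exactly forty five minutes past two in the after noon')
print(generate_data(0, 15, sentence[2]))   # e.g. (0, 15, 'Time flies its quarter past mid night now')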
def train():
data = []
i = 0
while i < 200000:
hour = random.randint(0,23)
minute = random.randint(0,59)
sent = random.randint(0,3)
hour, minute, result = generate_data(hour, minute, sentence[sent])
inwords = result
data.append({"inwords":inwords, "hour": hour, "minute":minute})
i += 1
df = pd.DataFrame(data)
#df.columns = ['inwords', 'hour', 'minute']
return df
def test():
data = []
i = 0
while i < 20000:
hour = random.randint(10,15)
minute = random.randint(0,59)
sent = random.randint(0,3)
hour, minute, result = generate_data(hour, minute, sentence[sent])
inwords = result
data.append({"inwords":inwords, "hour": hour, "minute":minute})
i += 1
df = pd.DataFrame(data)
#df.columns = ['inwords', 'hour', 'minute']
return df
train_data_raw = train()
test_data_raw = test()
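A quick look at the generated frames (an extra sanity check, not in the original flow): the training set should cover all 24 hours, while the test set is deliberately restricted to hours 10 through 15.
print(train_data_raw.shape, test_data_raw.shape)  # expect (200000, 3) and (20000, 3)
print(sorted(train_data_raw['hour'].unique()))     # expect 0..23
print(sorted(test_data_raw['hour'].unique()))      # expect 10..15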
# import os
# from google.colab import drive
# drive.mount('/content/drive')
# print(os.listdir("/content/drive/My Drive"))
train_data_raw.head()
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
vocab_size = 5000 # based on words in the entire corpus
max_len = 25 # based on word count in phrases
train_phrases = list(train_data_raw['inwords'].values)
test_phrases = list(test_data_raw['inwords'].values)
train_target = pd.get_dummies(train_data_raw['hour'].values)
#Vocabulary-indexing of the train and test phrases; the "filters" argument lists the characters the tokenizer strips, so keep it limited to punctuation we don't intend to keep
tokenizer = Tokenizer(num_words=vocab_size, lower=True, filters=',?.\n\t')
tokenizer.fit_on_texts(train_phrases + test_phrases)
encoded_train_phrases = tokenizer.texts_to_sequences(train_phrases)
encoded_test_phrases = tokenizer.texts_to_sequences(test_phrases)
#Watch for a POST padding, as opposed to the default PRE padding
X_train_words = sequence.pad_sequences(encoded_train_phrases, maxlen=max_len, padding='post')
X_test_words = sequence.pad_sequences(encoded_test_phrases, maxlen=max_len, padding='post')
print (X_train_words.shape)
print (X_test_words.shape)
print (train_target.shape)
print ('Done Tokenizing and indexing phrases based on the vocabulary learned from the entire Train and Test corpus')
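To make the tokenization concrete, here is a small illustrative check (assuming the cells above have been run) of how one phrase maps to word indices and is then post-padded to max_len:
print(train_phrases[0])
print(encoded_train_phrases[0])  # word indices from the learned vocabulary
print(X_train_words[0])          # same indices, zero-padded at the end up to max_len=25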
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Input, Embedding, Dropout, CuDNNLSTM, CuDNNGRU, Flatten, TimeDistributed, RepeatVector
from keras.layers import Bidirectional
from keras.models import Model
print("Building layers")
print('starting to stitch and compile model')
# Embedding layer for text inputs
input_words = Input((max_len,))
x_words = Embedding(vocab_size, 300, input_length=max_len)(input_words)
x_words = Bidirectional(CuDNNLSTM(128))(x_words)
x_words = Dropout(0.2)(x_words)
x_words = Dense(32, activation="relu")(x_words)
predictions = Dense(24, activation="softmax")(x_words)
model = Model(inputs=input_words, outputs=predictions)
model.compile(optimizer='rmsprop' ,loss='categorical_crossentropy', metrics=['accuracy'])
print(model.summary())
early_stop = EarlyStopping(monitor = "val_loss", mode="min", patience = 3, verbose=1)
#fit the model
nb_epoch = 10
history = model.fit(X_train_words, train_target, epochs=nb_epoch, verbose=1, batch_size = 256, callbacks=[early_stop], validation_split = 0.2, shuffle=True)
train_loss = np.mean(history.history['loss'])
val_loss = np.mean(history.history['val_loss'])
print('Train loss: %f' % train_loss)
print('Validation loss: %f' % val_loss)
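If matplotlib is available in the environment (an assumption, it isn't imported above), the per-epoch curves are more informative than the mean losses printed here:
import matplotlib.pyplot as plt
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='val loss')
plt.xlabel('epoch')
plt.ylabel('categorical cross-entropy')
plt.legend()
plt.show()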
pred_test = model.predict(X_test_words, batch_size=128, verbose = 0)
print (pred_test.shape)
max_pred = np.argmax(pred_test, axis=1)
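A note on decoding: the argmax index can be read directly as the predicted hour only because pd.get_dummies orders its columns 0, 1, ..., 23. A quick check of the frame built earlier confirms this:
print(list(train_target.columns))  # expect [0, 1, ..., 23], matching the argmax indices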
submission = pd.DataFrame({'Inwords':test_data_raw['inwords'],'Predicted': max_pred, 'Truth': test_data_raw['hour']})
submission = submission[['Inwords', 'Truth','Predicted']]
submission.head()
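Since the truth labels are available for the synthetic test set, a simple accuracy check (not in the original notebook) summarizes how well the model does on the held-out data:
test_accuracy = (submission['Predicted'] == submission['Truth']).mean()
print('Test accuracy: %.4f' % test_accuracy)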
unseen = ["Let's say, we meet three morning tomorrow ?"]
# Don't re-fit the tokenizer on unseen text: fit_on_texts rebuilds the word index from the updated
# counts, which can change the indices the model was trained on. Just encode with the existing vocabulary.
encoded_unseen_phrases = tokenizer.texts_to_sequences(unseen)
X_unseen_words = sequence.pad_sequences(encoded_unseen_phrases, maxlen=max_len, padding='post')
pred_unseen = model.predict(X_unseen_words, batch_size=128, verbose = 0)
max_pred_unseen = np.argmax(pred_unseen, axis=1)
print(max_pred_unseen)