In this notebook I explore the basic sequence model, i.e. using an RNN (and its variants) to build an N:1 sequence model. Here N:1 means the model takes a sequence (a natural-language text) as input and makes a single prediction based on the training data; in this notebook the input is an English sentence describing a time and the output is the hour (one of 24 classes). In NLP literature these models are called RNN acceptors, but Andrej Karpathy popularized the notion of describing RNNs as N:1 (or N:N, or 1:N) in his famous blog post, The Unreasonable Effectiveness of Recurrent Neural Networks.

This was tested with tensorflow-gpu==1.13.1

In [1]:
import tensorflow as tf
from keras import backend
import logging
import numpy as np
import pandas as pd
import random 
pd.set_option('display.max_colwidth', -1)


#For reproducibility: set all the seeds, turn off multithreading, and if possible avoid the GPU.
tf.set_random_seed(7)
np.random.seed(7)
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
backend.set_session(tf.Session(graph=tf.get_default_graph(), config=session_conf))
Using TensorFlow backend.
In [0]:
# Synthetic Training and Test data

def generate_data(hour, minute, sentence=''):

    suffix = ""

    dictionary = {1:"one", 2:"two", 3:"three", 4:"four", 5:"five", 6:"six", 7:"seven", 8:"eight", 9:"nine", 10:"ten", 11:"eleven", 12:"twelve", 13:"thirteen", 
                  14:"fourteen", 16:"sixteen", 17:"seventeen", 18:"eighteen", 19:"nineteen", 20:"twenty", 30:"thirty",
                  40:"forty", 50:"fifty"}
    result = ""
    if minute == 15:
        result= "quarter past"
    elif minute == 30:
        result= "half past"    
    elif minute == 0:
        pass
    else:

        if minute in dictionary:
            result = dictionary[minute] + " minutes past"
        else:
            minute1 = (minute // 10) * 10   # tens component, e.g. 31 -> 30
            minute2 = minute % 10           # ones component, e.g. 31 ->  1
            result = dictionary[minute1] + ' ' + dictionary[minute2] + " minutes past"

    if hour == 0:
        suffix = "mid night"
    elif hour >= 1 and hour <= 11:
        suffix = "morning"
    elif hour == 12:
        suffix = "noon"
    elif hour > 12 and hour <=16:   
        suffix = "after noon"
    elif hour > 16 and hour <=19:   
        suffix = "evening"
    elif hour >= 20 and hour <= 23:
        suffix = "night"

    save_hour = hour            
    if hour > 12:
        hour = hour - 12
    
    if hour > 0:
        # Let's introduce some variation in how hours and suffixes are combined, just for randomness
        if hour % 2 == 0:
           result = result + " " + dictionary[hour]+ " in the " + suffix  
        else:    
           result = result + " " + dictionary[hour]+ " " + suffix          
    else:
        result = result + " " + suffix  
          
    if sentence != '':
        result = sentence.replace('#@#', result)
    
    return save_hour, minute, result
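# Quick trace of the function above (hypothetical call, not part of the original run):
# generate_data(16, 31, 'Time flies its #@# now')
# -> (16, 31, 'Time flies its thirty one minutes past four in the after noon now')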

  
  
# Random sentence templates to slot our time phrases into, forming proper English sentences (#@# marks the slot)
sentence=[
    'The murder happened exactly #@#',
    '#@#, was the time on the clock when I entered the house',
    'Time flies its #@# now',
    'Really was it #@# twice in a row?'
]


def make_dataset(n_rows, hour_range):
    """Generate n_rows random examples of times expressed in words."""
    data = []
    for _ in range(n_rows):
        hour = random.randint(*hour_range)
        minute = random.randint(0, 59)
        sent = random.randint(0, 3)
        hour, minute, result = generate_data(hour, minute, sentence[sent])
        data.append({"inwords": result, "hour": hour, "minute": minute})
    return pd.DataFrame(data)


# Training data covers all 24 hours; test data is deliberately restricted to hours 10-15
train_data_raw = make_dataset(200000, (0, 23))
test_data_raw = make_dataset(20000, (10, 15))
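# Sanity check (hypothetical, not part of the original run):
# train_data_raw.shape  ->  (200000, 3)   columns: hour, inwords, minute
# test_data_raw.shape   ->  (20000, 3)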

  
In [0]:
# import os
# from google.colab import drive
# drive.mount('/content/drive')
# print(os.listdir("/content/drive/My Drive"))
In [3]:
train_data_raw.head()
Out[3]:
   hour  inwords                                                                                      minute
0    16  Time flies its thirty one minutes past four in the after noon now                                31
1     0  Time flies its nine minutes past mid night now                                                    9
2    13  fifty five minutes past one after noon, was the time on the clock when I entered the house       55
3     8  four minutes past eight in the morning, was the time on the clock when I entered the house        4
4    16  ten minutes past four in the after noon, was the time on the clock when I entered the house      10
In [5]:
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


vocab_size = 5000  # based on words in the entire corpus
max_len = 25      # based on word count in phrases

train_phrases  = list(train_data_raw['inwords'].values) 
test_phrases   = list(test_data_raw['inwords'].values) 
train_target   = pd.get_dummies(train_data_raw['hour'].values)

#Vocabulary indexing of the train and test phrases; make sure the "filters" param doesn't strip punctuation we intend to keep

tokenizer = Tokenizer(num_words=vocab_size, lower=True, filters=',?.\n\t')
tokenizer.fit_on_texts(train_phrases + test_phrases)
encoded_train_phrases = tokenizer.texts_to_sequences(train_phrases)
encoded_test_phrases  = tokenizer.texts_to_sequences(test_phrases)


#Note the POST padding here, as opposed to the default PRE padding
X_train_words = sequence.pad_sequences(encoded_train_phrases, maxlen=max_len,  padding='post')
X_test_words  = sequence.pad_sequences(encoded_test_phrases,  maxlen=max_len,  padding='post')


print (X_train_words.shape)
print (X_test_words.shape)
print (train_target.shape)

print ('Done Tokenizing and indexing phrases based on the vocabulary learned from the entire Train and Test corpus')
(200000, 25)
(20000, 25)
(200000, 24)
Done Tokenizing and indexing phrases based on the vocabulary learned from the entire Train and Test corpus
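To see what the tokenizer produces, encode a single phrase (a hypothetical sketch; the exact integer ids depend on the fitted word index):

encoded = tokenizer.texts_to_sequences(['quarter past ten in the morning'])
# e.g. [[31, 5, 22, 3, 2, 10]] -- one id per word; frequent words get small ids
padded = sequence.pad_sequences(encoded, maxlen=max_len, padding='post')
# shape (1, 25): the six word ids followed by nineteen trailing zeros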
In [6]:
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Input, Embedding, Dropout, CuDNNLSTM, CuDNNGRU, Flatten, TimeDistributed, RepeatVector
from keras.layers import Bidirectional
from keras.models import Model



print("Building layers")        

print('Starting to stitch and compile the model')

# Embedding layer for text inputs
input_words = Input((max_len,))
x_words = Embedding(vocab_size, 300, input_length=max_len)(input_words)
# The bidirectional LSTM returns only its final state: this is the N:1 "acceptor" step
x_words = Bidirectional(CuDNNLSTM(128))(x_words)
x_words = Dropout(0.2)(x_words)
x_words = Dense(32, activation="relu")(x_words)
# 24-way softmax, one class per hour of the day
predictions = Dense(24, activation="softmax")(x_words)
model = Model(inputs=input_words, outputs=predictions)
model.compile(optimizer='rmsprop' ,loss='categorical_crossentropy', metrics=['accuracy'])
print(model.summary())
Building layers
Starting to stitch and compile the model
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_1 (InputLayer)         (None, 25)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 25, 300)           1500000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               440320    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                8224      
_________________________________________________________________
dense_2 (Dense)              (None, 24)                792       
=================================================================
Total params: 1,949,336
Trainable params: 1,949,336
Non-trainable params: 0
_________________________________________________________________
None
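Note: CuDNNLSTM runs only on a GPU. If none is available, Keras's plain LSTM layer is a drop-in replacement for this model (slower, but the rest of the cell is unchanged):

# CPU-compatible variant of the recurrent layer
# from keras.layers import LSTM
# x_words = Bidirectional(LSTM(128))(x_words)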
In [7]:
early_stop = EarlyStopping(monitor = "val_loss", mode="min", patience = 3, verbose=1)
#fit the model
nb_epoch = 10
history = model.fit(X_train_words, train_target, epochs=nb_epoch, verbose=1, batch_size = 256, callbacks=[early_stop], validation_split = 0.2, shuffle=True)
train_loss = np.mean(history.history['loss'])
val_loss = np.mean(history.history['val_loss'])
print('Mean train loss across epochs: %f' % train_loss)
print('Mean validation loss across epochs: %f' % val_loss)
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
Train on 160000 samples, validate on 40000 samples
Epoch 1/10
160000/160000 [==============================] - 11s 69us/step - loss: 0.3757 - acc: 0.8870 - val_loss: 0.0012 - val_acc: 1.0000
Epoch 2/10
160000/160000 [==============================] - 9s 54us/step - loss: 0.0039 - acc: 0.9991 - val_loss: 8.9929e-06 - val_acc: 1.0000
Epoch 3/10
160000/160000 [==============================] - 9s 54us/step - loss: 4.1068e-04 - acc: 0.9999 - val_loss: 2.3119e-07 - val_acc: 1.0000
Epoch 4/10
160000/160000 [==============================] - 9s 54us/step - loss: 4.9124e-06 - acc: 1.0000 - val_loss: 1.2814e-07 - val_acc: 1.0000
Epoch 5/10
160000/160000 [==============================] - 8s 53us/step - loss: 3.2206e-07 - acc: 1.0000 - val_loss: 1.2507e-07 - val_acc: 1.0000
Epoch 6/10
160000/160000 [==============================] - 8s 52us/step - loss: 1.2597e-07 - acc: 1.0000 - val_loss: 1.1973e-07 - val_acc: 1.0000
Epoch 7/10
160000/160000 [==============================] - 8s 52us/step - loss: 1.2542e-07 - acc: 1.0000 - val_loss: 1.1957e-07 - val_acc: 1.0000
Epoch 8/10
160000/160000 [==============================] - 8s 52us/step - loss: 1.2197e-07 - acc: 1.0000 - val_loss: 1.1935e-07 - val_acc: 1.0000
Epoch 9/10
160000/160000 [==============================] - 8s 53us/step - loss: 1.2094e-07 - acc: 1.0000 - val_loss: 1.1927e-07 - val_acc: 1.0000
Epoch 10/10
160000/160000 [==============================] - 8s 52us/step - loss: 1.2126e-07 - acc: 1.0000 - val_loss: 1.1924e-07 - val_acc: 1.0000
Mean train loss across epochs: 0.037999
Mean validation loss across epochs: 0.000120
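Since val_loss kept improving, the EarlyStopping callback never fired and all 10 epochs ran. On Keras 2.2.3 and later the callback can also roll back to the best weights (an optional tweak, not used in the original run):

# early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=3,
#                            verbose=1, restore_best_weights=True)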
In [8]:
pred_test = model.predict(X_test_words, batch_size=128, verbose = 0)
print (pred_test.shape) 
max_pred = np.argmax(pred_test, axis=1)  # argmax already yields integer class ids
submission = pd.DataFrame({'Inwords':test_data_raw['inwords'],'Predicted': max_pred, 'Truth': test_data_raw['hour']})
submission = submission[['Inwords', 'Truth','Predicted']]
(20000, 24)
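The argmax index maps directly to the hour because pd.get_dummies sorts its columns in ascending order (0 through 23). A more explicit version of that mapping (a sketch, not part of the original run):

# hour_classes = train_target.columns.values              # array([0, 1, ..., 23])
# pred_hours = hour_classes[np.argmax(pred_test, axis=1)]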
In [9]:
submission.head()
Out[9]:
   Inwords                                                                                       Truth  Predicted
0  Time flies its forty four minutes past ten in the morning now                                    10         10
1  The murder happened exactly thirteen minutes past one after noon                                 13         13
2  twenty nine minutes past one after noon, was the time on the clock when I entered the house      13         13
3  Time flies its twenty five minutes past three after noon now                                     15         15
4  Really was it thirty nine minutes past two in the after noon twice in a row?                     14         14
In [10]:
unseen = ["Lets say, we meet three morning tommorrow ?"]
# Don't refit the tokenizer on unseen text: that would mutate the learned word index.
# Out-of-vocabulary words are simply dropped here (no oov_token was configured).
encoded_unseen_phrases = tokenizer.texts_to_sequences(unseen)
X_unseen_words = sequence.pad_sequences(encoded_unseen_phrases, maxlen=max_len, padding='post')
pred_unseen = model.predict(X_unseen_words, batch_size=128, verbose=0)
max_pred_unseen = np.argmax(pred_unseen, axis=1)
print(max_pred_unseen)
[3]