Data Description
# Mounting Google Drive
# Exposes the Drive contents under /content/drive (needs the google.colab runtime)
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
# Setting the current working directory
import os
# Use the absolute path under the mount point: a relative 'drive/My Drive/...'
# only resolves while the cwd is still /content, so re-running this cell after
# the first chdir would raise FileNotFoundError.
os.chdir('/content/drive/My Drive/Great Learning/NLP')
# Import packages
import pandas as pd, numpy as np
import tensorflow as tf
# Compare the major version as an integer: a plain string comparison is
# lexicographic and would wrongly reject e.g. '10.0' (since '1' < '2').
assert int(tf.__version__.split('.')[0]) >= 2
from itertools import islice
# Keras
from keras.layers import Dense, Embedding, LSTM, Dropout, MaxPooling1D, Conv1D
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.preprocessing import sequence
from keras.datasets import imdb
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Suppress warnings
import warnings; warnings.filterwarnings('ignore')
# Single seed shared by NumPy, TensorFlow and the train/validation/test splits below
random_state = 42
np.random.seed(random_state)
tf.random.set_seed(random_state)
Using TensorFlow backend.
# Vocabulary cap passed to the loader and fixed length every review is padded to
vocab_size = 10000
maxlen = 300

# Load the IMDB reviews and left-pad ('pre') both official partitions to maxlen
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)
x_train, x_test = (
    pad_sequences(part, maxlen=maxlen, padding='pre') for part in (x_train, x_test)
)

# Pool the two partitions, then carve out 80/20 train/test and a further
# 80/20 train/validation split, both shuffled with the shared seed
X = np.concatenate((x_train, x_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state, shuffle=True
)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size=0.2, random_state=random_state, shuffle=True
)

# Report shape and number of distinct token ids for each split
rule = '---' * 20
for label, split in (('training', x_train), ('validation', x_valid), ('test', x_test)):
    n_rows, n_cols = split.shape
    print(rule, f'\nNumber of rows in {label} dataset: {n_rows}')
    print(f'Number of columns in {label} dataset: {n_cols}')
    print(f'Number of unique words in {label} dataset: {len(np.unique(split))}')
print(rule, f'\nUnique Categories: {np.unique(y_train), np.unique(y_valid), np.unique(y_test)}')
------------------------------------------------------------ Number of rows in training dataset: 32000 Number of columns in training dataset: 300 Number of unique words in training dataset: 9999 ------------------------------------------------------------ Number of rows in validation dataset: 8000 Number of columns in validation dataset: 300 Number of unique words in validation dataset: 9984 ------------------------------------------------------------ Number of rows in test dataset: 10000 Number of columns in test dataset: 300 Number of unique words in test dataset: 9995 ------------------------------------------------------------ Unique Categories: (array([0, 1]), array([0, 1]), array([0, 1]))
def decode_review(x, y):
    """Print an encoded review as text together with its label.

    Args:
        x: Iterable of integer token ids (one padded review).
        y: Sentiment label of the review; printed as-is.

    Returns:
        A ``(w2i, i2w)`` tuple: the word-to-id dictionary (with the special
        tokens added) and its inverse id-to-word dictionary.
    """
    # Shift every index by 3 so that ids 0-2 are free for the special tokens.
    # NOTE(review): presumably mirrors the offset imdb.load_data applies - confirm.
    w2i = {word: rank + 3 for word, rank in imdb.get_word_index().items()}
    w2i.update({'<PAD>': 0, '<START>': 1, '<UNK>': 2})

    # Invert the mapping to translate ids back into words
    i2w = {idx: word for word, idx in w2i.items()}

    tokens = [i2w[idx] for idx in x]
    ws = ' '.join(tokens)
    print(f'Review: {ws}')
    print(f'Actual Sentiment: {y}')
    return w2i, i2w
# Decode the first training review and keep both lookup tables
w2i, i2w = decode_review(x_train[0], y_train[0])
# Show the first 50 (id, word) pairs of the id-to-word dictionary
divider = '---' * 30
print(divider, '\n', list(islice(i2w.items(), 50)))
Review: <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <START> the only possible way to enjoy this flick is to bang your head against the wall allow some internal <UNK> of the brain let a bunch of your brain cells die and once you are officially mentally retarded perhaps then you might enjoy this film br br the only saving grace was the story between <UNK> and stephanie govinda was excellent in the role of the cab driver and so was the brit girl perhaps if they would have created the whole movie on their <UNK> in india and how they eventually fall in love would have made it a much more enjoyable film br br the only reason i gave it a 3 rating is because of <UNK> and his ability as an actor when it comes to comedy br br <UNK> <UNK> and anil kapoor were wasted needlessly plus the scene at <UNK> of the re union was just too much to <UNK> being an international <UNK> in the post 9 11 world anil kapoor would have got himself shot much before he even reached the sky bridge to <UNK> his true love but then again the point of the movie was to defy logic gravity physics and throw an egg on the face of the general audience br br watch it at your own peril at least i know i have been <UNK> for life Actual Sentiment: 0 ------------------------------------------------------------------------------------------ [(34704, 'fawn'), (52009, 'tsukino'), (52010, 'nunnery'), (16819, 'sonja'), (63954, 'vani'), (1411, 'woods'), (16118, 'spiders'), (2348, 'hanging'), (2292, 'woody'), (52011, 'trawling'), (52012, "hold's"), (11310, 'comically'), (40833, 'localized'), (30571, 'disobeying'), (52013, "'royale"), (40834, 
"harpo's"), (52014, 'canet'), (19316, 'aileen'), (52015, 'acurately'), (52016, "diplomat's"), (25245, 'rickman'), (6749, 'arranged'), (52017, 'rumbustious'), (52018, 'familiarness'), (52019, "spider'"), (68807, 'hahahah'), (52020, "wood'"), (40836, 'transvestism'), (34705, "hangin'"), (2341, 'bringing'), (40837, 'seamier'), (34706, 'wooded'), (52021, 'bravora'), (16820, 'grueling'), (1639, 'wooden'), (16821, 'wednesday'), (52022, "'prix"), (34707, 'altagracia'), (52023, 'circuitry'), (11588, 'crotch'), (57769, 'busybody'), (52024, "tart'n'tangy"), (14132, 'burgade'), (52026, 'thrace'), (11041, "tom's"), (52028, 'snuggles'), (29117, 'francesco'), (52030, 'complainers'), (52128, 'templarios'), (40838, '272')]
We can think of the Embedding layer as a dictionary that maps the index assigned to a word to a word vector. This layer is very flexible and can be used in a few ways:
The Keras Embedding layer doesn't require us to one-hot encode our words; instead, we have to give each word a unique integer as an id. For the IMDB dataset we've loaded, this has already been done, but if it hadn't been, we could use sklearn's LabelEncoder.
# Model
# Embedding -> dropout -> stacked Conv1D / max-pooling -> LSTM -> single sigmoid unit
model = Sequential()
# Maps each of the vocab_size token ids to a 256-dimensional vector
model.add(Embedding(vocab_size, 256, input_length = maxlen))
model.add(Dropout(0.25))
# 'same' padding keeps the sequence length; only the pooling layers shorten it
model.add(Conv1D(256, 5, padding = 'same', activation = 'relu', strides = 1))
model.add(Conv1D(128, 5, padding = 'same', activation = 'relu', strides = 1))
model.add(MaxPooling1D(pool_size = 2))
model.add(Conv1D(64, 5, padding = 'same', activation = 'relu', strides = 1))
model.add(MaxPooling1D(pool_size = 2))
model.add(LSTM(75))
# One sigmoid output paired with binary cross-entropy for the 0/1 sentiment label
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# appended a stray "None" line to the output
model.summary()
# Adding callbacks
# Stop at the first epoch whose val_loss does not improve (patience = 0)
es = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 0)
# Keep only the model with the lowest val_loss on disk
mc = ModelCheckpoint('imdb_model.h5', monitor = 'val_loss', mode = 'min', save_best_only = True, verbose = 1)
Model: "sequential_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_1 (Embedding) (None, 300, 256) 2560000 _________________________________________________________________ dropout_1 (Dropout) (None, 300, 256) 0 _________________________________________________________________ conv1d_1 (Conv1D) (None, 300, 256) 327936 _________________________________________________________________ conv1d_2 (Conv1D) (None, 300, 128) 163968 _________________________________________________________________ max_pooling1d_1 (MaxPooling1 (None, 150, 128) 0 _________________________________________________________________ conv1d_3 (Conv1D) (None, 150, 64) 41024 _________________________________________________________________ max_pooling1d_2 (MaxPooling1 (None, 75, 64) 0 _________________________________________________________________ lstm_1 (LSTM) (None, 75) 42000 _________________________________________________________________ dense_1 (Dense) (None, 1) 76 ================================================================= Total params: 3,135,004 Trainable params: 3,135,004 Non-trainable params: 0 _________________________________________________________________ None
# Fit the model
model.fit(x_train, y_train, validation_data = (x_valid, y_valid), epochs = 3, batch_size = 64, verbose = True, callbacks = [es, mc])
# Evaluate the model
# After fit() the in-memory weights are those of the last epoch run, which is
# not necessarily the best one (early stopping fires *after* a worse epoch).
# Reload the lowest-val_loss checkpoint written by ModelCheckpoint so the
# reported test score belongs to the model that was actually saved.
model.load_weights('imdb_model.h5')
scores = model.evaluate(x_test, y_test, batch_size = 64)
print('Test accuracy: %.2f%%' % (scores[1]*100))
Train on 32000 samples, validate on 8000 samples Epoch 1/3 32000/32000 [==============================] - 77s 2ms/step - loss: 0.3472 - accuracy: 0.8342 - val_loss: 0.2467 - val_accuracy: 0.8984 Epoch 00001: val_loss improved from inf to 0.24669, saving model to imdb_model.h5 Epoch 2/3 32000/32000 [==============================] - 75s 2ms/step - loss: 0.1824 - accuracy: 0.9311 - val_loss: 0.2559 - val_accuracy: 0.8997 Epoch 00002: val_loss did not improve from 0.24669 Epoch 00002: early stopping 10000/10000 [==============================] - 2s 190us/step Test accuracy: 90.14%
# Threshold the sigmoid probabilities at 0.5 to obtain hard 0/1 labels. This is
# what predict_classes did for a single sigmoid output, and it keeps working on
# releases where predict_classes has been removed. Shape stays (n_samples, 1).
y_pred = (model.predict(x_test) > 0.5).astype('int32')
# classification_report expects (y_true, y_pred); with the arguments swapped,
# precision and recall are exchanged and 'support' counts predictions instead
# of true labels.
print(f'Classification Report:\n{classification_report(y_test, y_pred.ravel())}')
Classification Report: precision recall f1-score support 0 0.92 0.89 0.90 5086 1 0.89 0.92 0.90 4914 accuracy 0.90 10000 macro avg 0.90 0.90 0.90 10000 weighted avg 0.90 0.90 0.90 10000
# Pick one random test review and trace it through every layer of the network.
# Draw the index from the actual test-set size instead of a hard-coded 10000 so
# the cell keeps working if the split proportions change.
sample_x_test = x_test[np.random.randint(len(x_test))]
sample_batch = sample_x_test.reshape(1, -1)  # add the leading batch dimension
for layer in model.layers:
    # Sub-model running from the network input up to and including this layer
    model_layer = Model(inputs = model.input, outputs = layer.output)
    output = model_layer.predict(sample_batch)
    print('\n','--'*20, layer.name, 'layer', '--'*20, '\n')
    print(output)
---------------------------------------- embedding_1 layer ---------------------------------------- [[[ 4.74077724e-02 -1.45893563e-02 -1.92809459e-02 ... 1.59389190e-02 -3.90756801e-02 -6.46728724e-02] [ 4.74077724e-02 -1.45893563e-02 -1.92809459e-02 ... 1.59389190e-02 -3.90756801e-02 -6.46728724e-02] [ 4.74077724e-02 -1.45893563e-02 -1.92809459e-02 ... 1.59389190e-02 -3.90756801e-02 -6.46728724e-02] ... [-5.12011871e-02 2.73237063e-04 -3.15764773e-05 ... 4.48421352e-02 2.12928746e-02 -1.26087647e-02] [ 6.66740909e-02 1.52700637e-02 -7.01705664e-02 ... -9.86870304e-02 4.93544117e-02 -3.51153836e-02] [-3.40692252e-02 -4.36996408e-02 4.43636142e-02 ... 1.14621185e-02 2.80509088e-02 -2.31574550e-02]]] ---------------------------------------- dropout_1 layer ---------------------------------------- [[[ 4.74077724e-02 -1.45893563e-02 -1.92809459e-02 ... 1.59389190e-02 -3.90756801e-02 -6.46728724e-02] [ 4.74077724e-02 -1.45893563e-02 -1.92809459e-02 ... 1.59389190e-02 -3.90756801e-02 -6.46728724e-02] [ 4.74077724e-02 -1.45893563e-02 -1.92809459e-02 ... 1.59389190e-02 -3.90756801e-02 -6.46728724e-02] ... [-5.12011871e-02 2.73237063e-04 -3.15764773e-05 ... 4.48421352e-02 2.12928746e-02 -1.26087647e-02] [ 6.66740909e-02 1.52700637e-02 -7.01705664e-02 ... -9.86870304e-02 4.93544117e-02 -3.51153836e-02] [-3.40692252e-02 -4.36996408e-02 4.43636142e-02 ... 1.14621185e-02 2.80509088e-02 -2.31574550e-02]]] ---------------------------------------- conv1d_1 layer ---------------------------------------- [[[0. 0. 0. ... 0. 0. 0. ] [0. 0. 0. ... 0. 0. 0. ] [0. 0. 0. ... 0. 0. 0. ] ... [0. 0.03142974 0. ... 0. 0. 0. ] [0. 0. 0. ... 0. 0.04410822 0. ] [0.00265946 0. 0. ... 0. 0. 0. ]]] ---------------------------------------- conv1d_2 layer ---------------------------------------- [[[0. 0. 0. ... 0. 0. 0.00992592] [0. 0. 0. ... 0. 0. 0.00992592] [0. 0. 0. ... 0. 0. 0.00992592] ... [0. 0. 0. ... 0. 0.00156896 0.05785441] [0. 0. 0. ... 0. 0. 0.04214466] [0. 0. 0. ... 0. 
0.02336704 0.0316269 ]]] ---------------------------------------- max_pooling1d_1 layer ---------------------------------------- [[[0. 0. 0. ... 0. 0. 0.00992592] [0. 0. 0. ... 0. 0. 0.00992592] [0. 0. 0. ... 0. 0. 0.00992592] ... [0. 0. 0. ... 0. 0.00145619 0. ] [0. 0. 0. ... 0. 0.00156896 0.05785441] [0. 0. 0. ... 0. 0.02336704 0.04214466]]] ---------------------------------------- conv1d_3 layer ---------------------------------------- [[[0. 0. 0. ... 0. 0. 0. ] [0. 0. 0. ... 0. 0. 0. ] [0. 0. 0. ... 0. 0. 0. ] ... [0.06621806 0. 0. ... 0. 0. 0. ] [0.03402259 0. 0. ... 0. 0. 0. ] [0.02756308 0. 0. ... 0. 0.00653153 0. ]]] ---------------------------------------- max_pooling1d_2 layer ---------------------------------------- [[[0. 0. 0. ... 0. 0. 0. ] [0. 0. 0. ... 0. 0. 0. ] [0. 0. 0. ... 0. 0. 0. ] ... [0.09239879 0. 0. ... 0. 0.14975896 0. ] [0.10864462 0. 0. ... 0. 0. 0. ] [0.03402259 0. 0. ... 0. 0.00653153 0. ]]] ---------------------------------------- lstm_1 layer ---------------------------------------- [[ 0.50351334 -0.02411361 -0.5454778 -0.7051705 0.7910843 0.60205024 -0.5404606 0.52277875 -0.7547373 0.50682384 0.7195332 -0.53433776 -0.74312335 -0.02459037 -0.11192464 -0.7976935 0.15080199 0.57083166 0.01432618 -0.7756157 -0.5437074 -0.6041164 0.02231176 -0.55837494 0.20796183 -0.75535905 -0.6613373 0.7095603 0.32622436 -0.52310455 0.11562354 0.5646972 -0.6707359 -0.5060398 0.7623417 -0.6992541 -0.04636298 0.43525052 -0.7030687 -0.02964382 0.0145278 -0.7140254 -0.1343891 -0.38830882 -0.00243724 -0.40578288 -0.01659107 0.00169612 0.08863628 0.6849975 -0.62493294 0.6887381 0.00755857 -0.48001266 -0.5722563 -0.571908 0.70052683 0.07025653 -0.08210693 -0.58302814 -0.00641777 0.7430728 -0.1640142 0.6192563 -0.21381459 0.03986506 -0.2063878 0.00244516 0.0595973 0.24460196 -0.68142074 -0.64712524 0.27605036 -0.6704749 0.03478413]] ---------------------------------------- dense_1 layer ---------------------------------------- [[0.00859277]]
# Show one test review with its true label, followed by the model's prediction
review_idx = 10
decode_review(x_test[review_idx], y_test[review_idx])
print(f'Predicted sentiment: {y_pred[review_idx][0]}')
Review: <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <START> this movie was great and i was waiting for it for a long time when it finally came out i was really happy and looked forward to a 10 out of 10 it was great and lived up to my potential the performances were great on the part of the adults and most of the kids the only bad performance was by milo himself there was one problem that i encountered with this and others like it movie all of the characters i wanted to live were getting killed overall i give this movie an excellent 9 out of 10 maybe we should <UNK> better people to kill next time though ok Actual Sentiment: 1 Predicted sentiment: 1