Notebook

Part 1: Recurrent Neural Network¶

Importing packages¶

In [0]:

import re
import numpy as np
import pandas as pd 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.datasets import imdb

from keras.utils.np_utils import to_categorical

import warnings
warnings.filterwarnings('ignore')
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

Preparing Dataset¶

In [4]:

max_features = 1000
maxlen = 80  # cut texts after this number of words (among top max_features most common words)
batch_size = 32

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Loading data...
Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
17465344/17464789 [==============================] - 1s 0us/step
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 80)
x_test shape: (25000, 80)

Visualize the data¶

In [5]:

INDEX_FROM=3   # word index offset

word_to_id = imdb.get_word_index()
word_to_id = {k:(v+INDEX_FROM) for k,v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2

id_to_word = {value:key for key,value in word_to_id.items()}
print(' '.join(id_to_word[id] for id in x_train[0] ))

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json
1646592/1641221 [==============================] - 1s 0us/step
that played the <UNK> of <UNK> and paul they were just brilliant children are often left out of the <UNK> <UNK> i think because the stars that play them all <UNK> up are such a big <UNK> for the whole film but these children are amazing and should be <UNK> for what they have done don't you think the whole story was so <UNK> because it was true and was <UNK> life after all that was <UNK> with us all

Building a Model¶

In [2]:

print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 8))
model.add(LSTM(16, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Build model...
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:66: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-7601b68ed35e> in <module>()
      1 print('Build model...')
      2 model = Sequential()
----> 3 model.add(Embedding(max_features, 8))
      4 model.add(LSTM(16, dropout=0.2, recurrent_dropout=0.2))
      5 model.add(Dense(1, activation='sigmoid'))

NameError: name 'max_features' is not defined

Model Training¶

In [0]:

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Write the training input and output, batch size, and testing input and output

model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:793: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3657: The name tf.log is deprecated. Please use tf.math.log instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 25000 samples, validate on 25000 samples
Epoch 1/1
25000/25000 [==============================] - 52s 2ms/step - loss: 0.5450 - acc: 0.7235 - val_loss: 0.4540 - val_acc: 0.7929

Out[0]:

<keras.callbacks.History at 0x7fc2daaf4c18>

Testing¶

In [0]:

score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

24960/25000 [============================>.] - ETA: 0sTest score: 0.49805993225097656
Test accuracy: 0.75544

Prediction¶

In [0]:

prediction = model.predict(x_test[2:3])
print('Prediction value:',prediction[0])
print('Test Label:',y_test[2:3])
print(' '.join(id_to_word[id] for id in x_test[25] ))

Prediction value: [0.82189775]
Test Label: [1]
<UNK> that should be <UNK> viewing for all <UNK> <UNK> has its <UNK> as well but for other than <UNK> reason <UNK> today is a <UNK> example of the left in full <UNK> <UNK> <UNK> <UNK> <UNK> and given the times <UNK> <UNK> <UNK> the <UNK> 7 such <UNK> <UNK> seemed not that great a <UNK> from the truth but <UNK> years later the <UNK> has <UNK> and <UNK> <UNK> with it's <UNK> <UNK> <UNK> is a pretty silly <UNK>

Other RNN Layers¶

keras.layers.RNN(cell, return_sequences=False)
keras.layers.SimpleRNN(units, activation='tanh')
keras.layers.GRU(units, activation='tanh', recurrent_activation='hard_sigmoid')
keras.layers.ConvLSTM2D(filters, kernel_size, strides=(1, 1), padding='valid', )
keras.layers.SimpleRNNCell(units, activation='tanh')
keras.layers.GRUCell(units, activation='tanh', recurrent_activation='hard_sigmoid')
keras.layers.LSTMCell(units, activation='tanh', recurrent_activation='hard_sigmoid')
keras.layers.CuDNNGRU(units, kernel_initializer='glorot_uniform')
keras.layers.CuDNNLSTM(units, kernel_initializer='glorot_uniform')

Part 2: RNN Design Choices¶

Influence of number of nodes¶

LSTM with 8 nodes¶

In [0]:

model = Sequential()
model.add(Embedding(max_features, 8))
model.add(LSTM(8, dropout=0.0, recurrent_dropout=0.0))
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))

score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_3 (Embedding)      (None, None, 8)           8000      
_________________________________________________________________
lstm_3 (LSTM)                (None, 8)                 544       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 9         
=================================================================
Total params: 8,553
Trainable params: 8,553
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 25000 samples
Epoch 1/1
25000/25000 [==============================] - 69s - loss: 0.5377 - acc: 0.7186 - val_loss: 0.4416 - val_acc: 0.7936
24960/25000 [============================>.] - ETA: 0sTest score: 0.441626269493103
Test accuracy: 0.79364

LSTM with 16 nodes¶

In [0]:

#not tried
# Write your code here 
model = Sequential()
model.add(Embedding(max_features, 8))
model.add(LSTM(16, dropout=0.0, recurrent_dropout=0.0))
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))

score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)
# Use the same layer design from the above cell 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_4 (Embedding)      (None, None, 8)           8000      
_________________________________________________________________
lstm_4 (LSTM)                (None, 16)                1600      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 17        
=================================================================
Total params: 9,617
Trainable params: 9,617
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 25000 samples
Epoch 1/1
25000/25000 [==============================] - 70s - loss: 0.5149 - acc: 0.7333 - val_loss: 0.4129 - val_acc: 0.8124
24960/25000 [============================>.] - ETA: 0sTest score: 0.4128888432312012
Test accuracy: 0.81236

Influence of Embedding¶

In [0]:

model = Sequential()
model.add(Embedding(max_features, 4))
model.add(LSTM(16, dropout=0.0, recurrent_dropout=0.0))
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))

score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_5 (Embedding)      (None, None, 4)           4000      
_________________________________________________________________
lstm_5 (LSTM)                (None, 16)                1344      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 17        
=================================================================
Total params: 5,361
Trainable params: 5,361
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 25000 samples
Epoch 1/1
25000/25000 [==============================] - 66s - loss: 0.5176 - acc: 0.7263 - val_loss: 0.4116 - val_acc: 0.8124
24960/25000 [============================>.] - ETA: 0sTest score: 0.41163202223777773
Test accuracy: 0.81236

Influence of Dropout¶

Dropout with probability 0.5¶

In [0]:

model = Sequential()
model.add(Embedding(max_features, 32))
model.add(LSTM(8, dropout=0.5, recurrent_dropout=0.5))
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))

score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_6 (Embedding)      (None, None, 32)          32000     
_________________________________________________________________
lstm_6 (LSTM)                (None, 8)                 1312      
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 9         
=================================================================
Total params: 33,321
Trainable params: 33,321
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 25000 samples
Epoch 1/1
25000/25000 [==============================] - 74s - loss: 0.6050 - acc: 0.6698 - val_loss: 0.5219 - val_acc: 0.7405
24896/25000 [============================>.] - ETA: 0sTest score: 0.521893192024231
Test accuracy: 0.74052

Dropout with probability 0.9¶

In [0]:

# Write your code here 

# Use the same model design from the above cell 

Multilayered RNNs¶

RNN with 2 layer LSTM¶

In [0]:

model = Sequential()
model.add(Embedding(max_features, 8))
model.add(LSTM(8, dropout=0.0, recurrent_dropout=0.0, return_sequences=True))
model.add(LSTM(8, dropout=0.0, recurrent_dropout=0.0))
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))

score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_7 (Embedding)      (None, None, 8)           8000      
_________________________________________________________________
lstm_7 (LSTM)                (None, None, 8)           544       
_________________________________________________________________
lstm_8 (LSTM)                (None, 8)                 544       
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 9         
=================================================================
Total params: 9,097
Trainable params: 9,097
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 25000 samples
Epoch 1/1
25000/25000 [==============================] - 104s - loss: 0.5088 - acc: 0.7404 - val_loss: 0.4145 - val_acc: 0.8114
24928/25000 [============================>.] - ETA: 0sTest score: 0.4145219009399414
Test accuracy: 0.81136

RNN with 3 layer LSTM¶

In [0]:

# Write your code here 
model = Sequential()
model.add(Embedding(max_features, 8))
model.add(LSTM(8, dropout=0.0, recurrent_dropout=0.0, return_sequences=True))
model.add(LSTM(8, dropout=0.0, recurrent_dropout=0.0))
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))

score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)
# Use the same node design from the above cell 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_8 (Embedding)      (None, None, 8)           8000      
_________________________________________________________________
lstm_9 (LSTM)                (None, None, 8)           544       
_________________________________________________________________
lstm_10 (LSTM)               (None, 8)                 544       
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 9         
=================================================================
Total params: 9,097
Trainable params: 9,097
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 25000 samples
Epoch 1/1
25000/25000 [==============================] - 107s - loss: 0.5471 - acc: 0.7128 - val_loss: 0.4698 - val_acc: 0.7706
24960/25000 [============================>.] - ETA: 0sTest score: 0.4697502194404602
Test accuracy: 0.7706

What are your findings?¶

Part 3: Recurrent Neural Network with Custom Dataset¶

In [0]:

# Credits to Peter Nagy

Load data¶

In [0]:

data = pd.read_csv('Senti.csv')
# Keeping only the neccessary columns
data = data[['text','sentiment']]

Visualize data¶

In [0]:

data.head(10)

Out[0]:

	text	sentiment
0	I love this car	Positive
1	This view is amazing	Positive
2	I feel great this morning	Positive
3	I am so excited about the concert	Positive
4	He is my best friend	Positive
5	I do not like this car	Negative
6	This view is horrible	Negative
7	I feel tired this morning	Negative
8	I am not looking forward to the concert	Negative
9	He is my enemy	Negative

Format data¶

In [0]:

data = data[data.sentiment != "Neutral"]
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

print('Number of positive samples:',data[ data['sentiment'] == 'Positive'].size)
print('Number of negative samples:',data[ data['sentiment'] == 'Negative'].size)

for idx,row in data.iterrows():
    row[0] = row[0].replace('rt',' ')

max_fatures = 2000
tokenizer = Tokenizer(nb_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

Number of positive samples: 10
Number of negative samples: 10

Training set¶

In [0]:

Y = pd.get_dummies(data['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print('Shape of training samples:',X_train.shape,Y_train.shape)
print('Shape of testing samples:',X_test.shape,Y_test.shape)

Shape of training samples: (6, 8) (6, 2)
Shape of testing samples: (4, 8) (4, 2)

Design a model¶

In [0]:

model = Sequential()
model.add(Embedding(max_fatures, 128 ,input_length = X.shape[1], dropout=0.2))
model.add(LSTM(128))
model.add(Dense(2, activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_2 (Embedding)      (None, 8, 128)            256000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 258       
=================================================================
Total params: 387,842
Trainable params: 387,842
Non-trainable params: 0
_________________________________________________________________
None

Training¶

In [0]:

batch_size = 32
model.fit(X_train, Y_train, epochs = 5, batch_size=batch_size, verbose = 2)

Epoch 1/5
0s - loss: 0.6946 - acc: 0.3333
Epoch 2/5
0s - loss: 0.6864 - acc: 0.6667
Epoch 3/5
0s - loss: 0.6782 - acc: 0.6667
Epoch 4/5
0s - loss: 0.6698 - acc: 0.6667
Epoch 5/5
0s - loss: 0.6607 - acc: 0.6667

Out[0]:

<keras.callbacks.History at 0x7f887cbf3390>

Validation¶

In [0]:

score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("Score: %.2f" % (score))
print("Accuracy: %.2f" % (acc))

Score: 0.73
Accuracy: 0.25

Formatting Test Example¶

In [0]:

text = 'He is my enemy'
tester = np.array([text])
tester = pd.DataFrame(tester)
tester.columns = ['text']

tester['text'] = tester['text'].apply(lambda x: x.lower())
tester['text'] = tester['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

max_fatures = 2000
test = tokenizer.texts_to_sequences(tester['text'].values)
test = pad_sequences(test)

if X.shape[1]>test.shape[1]:
    test = np.pad(test[0], (X.shape[1]-test.shape[1],0), 'constant')
    
test = np.array([test])

prediction = model.predict(test)
print('Prediction value:',prediction[0])

Prediction value: [0.53419375 0.46580625]

In [0]: