Classifying IMDB Reviews with a Recurrent Neural Network

The IMDB Review Dataset

In [1]:
from tensorflow.keras.datasets import imdb

(train_input, train_target), (test_input, test_target) = imdb.load_data(
    num_words=500)
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
17465344/17464789 [==============================] - 0s 0us/step
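`num_words=500` keeps only the 500 most frequent words in the vocabulary; every other word is replaced by a single out-of-vocabulary token, which keeps this example's models small.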
In [2]:
print(train_input.shape, test_input.shape)
(25000,) (25000,)
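Both are 1-D object arrays of length 25,000: each element is a Python list of integer token ids, and the lists have different lengths, so NumPy cannot stack them into a 2-D array.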
In [3]:
print(len(train_input[0]))
218
In [4]:
print(len(train_input[1]))
189
In [5]:
print(train_input[0])
[1, 14, 22, 16, 43, 2, 2, 2, 2, 65, 458, 2, 66, 2, 4, 173, 36, 256, 5, 25, 100, 43, 2, 112, 50, 2, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 2, 2, 17, 2, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2, 19, 14, 22, 4, 2, 2, 469, 4, 22, 71, 87, 12, 16, 43, 2, 38, 76, 15, 13, 2, 4, 22, 17, 2, 17, 12, 16, 2, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2, 2, 16, 480, 66, 2, 33, 4, 130, 12, 16, 38, 2, 5, 25, 124, 51, 36, 135, 48, 25, 2, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 2, 5, 2, 36, 71, 43, 2, 476, 26, 400, 317, 46, 7, 4, 2, 2, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 2, 88, 12, 16, 283, 5, 16, 2, 113, 103, 32, 15, 16, 2, 19, 178, 32]
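The loader reserves the first few ids: 0 is padding, 1 marks the start of a review, and 2 stands for any word outside the top-500 vocabulary, which is why 2 appears so often here; real words start at id 3. As a minimal sketch, a review can be decoded back to text with imdb.get_word_index(), whose indices must be shifted by 3 to match the encoded data:

word_index = imdb.get_word_index()
# shift indices by 3 and add the three reserved tokens
index_to_word = {i + 3: w for w, i in word_index.items()}
index_to_word.update({0: '<pad>', 1: '<start>', 2: '<unk>'})
print(' '.join(index_to_word.get(i, '<unk>') for i in train_input[0]))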
In [6]:
print(train_target[:20])
[1 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 1]
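In the targets, 0 marks a negative review and 1 a positive one; the IMDB dataset is balanced, with 12,500 reviews of each class in both splits.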
In [7]:
from sklearn.model_selection import train_test_split

train_input, val_input, train_target, val_target = train_test_split(
    train_input, train_target, test_size=0.2, random_state=42)
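This holds out 20% of the 25,000 training reviews, leaving 20,000 samples for training and 5,000 for validation.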
In [8]:
import numpy as np

lengths = np.array([len(x) for x in train_input])
In [9]:
print(np.mean(lengths), np.median(lengths))
239.00925 178.0
In [10]:
import matplotlib.pyplot as plt

plt.hist(lengths)
plt.xlabel('length')
plt.ylabel('frequency')
plt.show()
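The distribution is strongly right-skewed: most reviews are short while a few run very long, which is why the median (178) sits well below the mean (about 239). Truncating at 100 tokens bounds the input size while keeping much of each review; a quick check of how many reviews fit entirely within that limit, reusing the lengths array from above:

print(np.mean(lengths <= 100))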
In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

train_seq = pad_sequences(train_input, maxlen=100)
In [12]:
print(train_seq.shape)
(20000, 100)
In [13]:
print(train_seq[0])
[ 10   4  20   9   2 364 352   5  45   6   2   2  33 269   8   2 142   2
   5   2  17  73  17 204   5   2  19  55   2   2  92  66 104  14  20  93
  76   2 151  33   4  58  12 188   2 151  12 215  69 224 142  73 237   6
   2   7   2   2 188   2 103  14  31  10  10 451   7   2   5   2  80  91
   2  30   2  34  14  20 151  50  26 131  49   2  84  46  50  37  80  79
   6   2  46   7  14  20  10  10 470 158]
In [14]:
print(train_input[0][-10:])
[6, 2, 46, 7, 14, 20, 10, 10, 470, 158]
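The last ten tokens of the padded sequence match the last ten tokens of the original review: by default pad_sequences truncates from the front (truncating='pre'), keeping the end of long reviews.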
In [15]:
print(train_seq[5])
[  0   0   0   0   1   2 195  19  49   2   2 190   4   2 352   2 183  10
  10  13  82  79   4   2  36  71 269   8   2  25  19  49   7   4   2   2
   2   2   2  10  10  48  25  40   2  11   2   2  40   2   2   5   4   2
   2  95  14 238  56 129   2  10  10  21   2  94 364 352   2   2  11 190
  24 484   2   7  94 205 405  10  10  87   2  34  49   2   7   2   2   2
   2   2 290   2  46  48  64  18   4   2]
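This review is shorter than 100 tokens, so zeros are prepended (padding='pre', also the default), which lets the recurrent layer see the real tokens last. Both behaviors can be flipped if desired; a sketch:

train_seq_post = pad_sequences(train_input, maxlen=100,
                               padding='post', truncating='post')
print(train_seq_post[5][-10:])  # the zeros now sit at the end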
In [16]:
val_seq = pad_sequences(val_input, maxlen=100)

Building a Recurrent Neural Network

In [17]:
from tensorflow import keras

model = keras.Sequential()

model.add(keras.layers.SimpleRNN(8, input_shape=(100, 500)))
model.add(keras.layers.Dense(1, activation='sigmoid'))
In [18]:
train_oh = keras.utils.to_categorical(train_seq)
In [19]:
print(train_oh.shape)
(20000, 100, 500)
In [20]:
print(train_oh[0][0][:12])
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
In [21]:
print(np.sum(train_oh[0][0]))
1.0
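Each token id has been expanded into a 500-dimensional one-hot vector, with exactly one element set to 1. This is expensive: 20,000 × 100 × 500 float32 values come to roughly 4 GB, which can be confirmed directly:

print(train_oh.nbytes / 1024**3, 'GiB')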
In [22]:
val_oh = keras.utils.to_categorical(val_seq)
In [23]:
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
simple_rnn (SimpleRNN)       (None, 8)                 4072      
_________________________________________________________________
dense (Dense)                (None, 1)                 9         
=================================================================
Total params: 4,081
Trainable params: 4,081
Non-trainable params: 0
_________________________________________________________________
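The parameter count checks out: the SimpleRNN layer has 500 × 8 input weights plus 8 × 8 recurrent weights plus 8 biases, or 4,000 + 64 + 8 = 4,072; the Dense layer adds 8 weights and 1 bias for 9 more, giving 4,081 in total.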

Training the Recurrent Neural Network

In [24]:
rmsprop = keras.optimizers.RMSprop(learning_rate=1e-4)
model.compile(optimizer=rmsprop, loss='binary_crossentropy', 
              metrics=['accuracy'])

checkpoint_cb = keras.callbacks.ModelCheckpoint('best-simplernn-model.h5', 
                                                save_best_only=True)
early_stopping_cb = keras.callbacks.EarlyStopping(patience=3,
                                                  restore_best_weights=True)

history = model.fit(train_oh, train_target, epochs=100, batch_size=64,
                    validation_data=(val_oh, val_target),
                    callbacks=[checkpoint_cb, early_stopping_cb])
Epoch 1/100
313/313 [==============================] - 26s 71ms/step - loss: 0.7021 - accuracy: 0.5017 - val_loss: 0.6963 - val_accuracy: 0.5162
Epoch 2/100
313/313 [==============================] - 22s 70ms/step - loss: 0.6906 - accuracy: 0.5313 - val_loss: 0.6849 - val_accuracy: 0.5554
Epoch 3/100
313/313 [==============================] - 22s 70ms/step - loss: 0.6791 - accuracy: 0.5749 - val_loss: 0.6765 - val_accuracy: 0.5822
Epoch 4/100
313/313 [==============================] - 22s 70ms/step - loss: 0.6695 - accuracy: 0.6061 - val_loss: 0.6674 - val_accuracy: 0.6148
Epoch 5/100
313/313 [==============================] - 22s 70ms/step - loss: 0.6603 - accuracy: 0.6300 - val_loss: 0.6608 - val_accuracy: 0.6298
Epoch 6/100
313/313 [==============================] - 22s 69ms/step - loss: 0.6508 - accuracy: 0.6544 - val_loss: 0.6498 - val_accuracy: 0.6508
Epoch 7/100
313/313 [==============================] - 22s 69ms/step - loss: 0.6409 - accuracy: 0.6694 - val_loss: 0.6412 - val_accuracy: 0.6624
Epoch 8/100
313/313 [==============================] - 22s 69ms/step - loss: 0.6305 - accuracy: 0.6857 - val_loss: 0.6347 - val_accuracy: 0.6718
Epoch 9/100
313/313 [==============================] - 22s 69ms/step - loss: 0.6204 - accuracy: 0.6966 - val_loss: 0.6268 - val_accuracy: 0.6770
Epoch 10/100
313/313 [==============================] - 22s 70ms/step - loss: 0.6092 - accuracy: 0.7097 - val_loss: 0.6126 - val_accuracy: 0.6972
Epoch 11/100
313/313 [==============================] - 22s 70ms/step - loss: 0.5979 - accuracy: 0.7216 - val_loss: 0.6030 - val_accuracy: 0.7086
Epoch 12/100
313/313 [==============================] - 22s 69ms/step - loss: 0.5862 - accuracy: 0.7291 - val_loss: 0.5933 - val_accuracy: 0.7120
Epoch 13/100
313/313 [==============================] - 22s 69ms/step - loss: 0.5742 - accuracy: 0.7383 - val_loss: 0.5830 - val_accuracy: 0.7222
Epoch 14/100
313/313 [==============================] - 22s 69ms/step - loss: 0.5617 - accuracy: 0.7473 - val_loss: 0.5724 - val_accuracy: 0.7304
Epoch 15/100
313/313 [==============================] - 22s 69ms/step - loss: 0.5490 - accuracy: 0.7534 - val_loss: 0.5613 - val_accuracy: 0.7344
Epoch 16/100
313/313 [==============================] - 22s 70ms/step - loss: 0.5371 - accuracy: 0.7585 - val_loss: 0.5515 - val_accuracy: 0.7430
Epoch 17/100
313/313 [==============================] - 22s 69ms/step - loss: 0.5252 - accuracy: 0.7627 - val_loss: 0.5412 - val_accuracy: 0.7498
Epoch 18/100
313/313 [==============================] - 22s 69ms/step - loss: 0.5141 - accuracy: 0.7692 - val_loss: 0.5334 - val_accuracy: 0.7470
Epoch 19/100
313/313 [==============================] - 22s 70ms/step - loss: 0.5042 - accuracy: 0.7717 - val_loss: 0.5270 - val_accuracy: 0.7528
Epoch 20/100
313/313 [==============================] - 22s 69ms/step - loss: 0.4959 - accuracy: 0.7741 - val_loss: 0.5187 - val_accuracy: 0.7566
Epoch 21/100
313/313 [==============================] - 22s 69ms/step - loss: 0.4879 - accuracy: 0.7786 - val_loss: 0.5141 - val_accuracy: 0.7586
Epoch 22/100
313/313 [==============================] - 22s 70ms/step - loss: 0.4816 - accuracy: 0.7813 - val_loss: 0.5056 - val_accuracy: 0.7614
Epoch 23/100
313/313 [==============================] - 22s 69ms/step - loss: 0.4761 - accuracy: 0.7830 - val_loss: 0.5036 - val_accuracy: 0.7618
Epoch 24/100
313/313 [==============================] - 22s 70ms/step - loss: 0.4710 - accuracy: 0.7865 - val_loss: 0.5003 - val_accuracy: 0.7636
Epoch 25/100
313/313 [==============================] - 22s 70ms/step - loss: 0.4659 - accuracy: 0.7896 - val_loss: 0.4949 - val_accuracy: 0.7666
Epoch 26/100
313/313 [==============================] - 22s 70ms/step - loss: 0.4614 - accuracy: 0.7910 - val_loss: 0.4967 - val_accuracy: 0.7666
Epoch 27/100
313/313 [==============================] - 22s 70ms/step - loss: 0.4582 - accuracy: 0.7937 - val_loss: 0.4902 - val_accuracy: 0.7706
Epoch 28/100
313/313 [==============================] - 22s 70ms/step - loss: 0.4540 - accuracy: 0.7952 - val_loss: 0.4890 - val_accuracy: 0.7712
Epoch 29/100
313/313 [==============================] - 22s 71ms/step - loss: 0.4501 - accuracy: 0.7971 - val_loss: 0.4885 - val_accuracy: 0.7700
Epoch 30/100
313/313 [==============================] - 22s 71ms/step - loss: 0.4475 - accuracy: 0.7992 - val_loss: 0.4875 - val_accuracy: 0.7702
Epoch 31/100
313/313 [==============================] - 22s 70ms/step - loss: 0.4446 - accuracy: 0.8001 - val_loss: 0.4858 - val_accuracy: 0.7712
Epoch 32/100
313/313 [==============================] - 22s 70ms/step - loss: 0.4418 - accuracy: 0.8016 - val_loss: 0.4841 - val_accuracy: 0.7746
Epoch 33/100
313/313 [==============================] - 22s 71ms/step - loss: 0.4386 - accuracy: 0.8053 - val_loss: 0.4817 - val_accuracy: 0.7744
Epoch 34/100
313/313 [==============================] - 22s 71ms/step - loss: 0.4361 - accuracy: 0.8055 - val_loss: 0.4823 - val_accuracy: 0.7768
Epoch 35/100
313/313 [==============================] - 22s 70ms/step - loss: 0.4334 - accuracy: 0.8063 - val_loss: 0.4835 - val_accuracy: 0.7768
Epoch 36/100
313/313 [==============================] - 22s 70ms/step - loss: 0.4311 - accuracy: 0.8091 - val_loss: 0.4790 - val_accuracy: 0.7774
Epoch 37/100
313/313 [==============================] - 22s 71ms/step - loss: 0.4285 - accuracy: 0.8097 - val_loss: 0.4788 - val_accuracy: 0.7792
Epoch 38/100
313/313 [==============================] - 22s 71ms/step - loss: 0.4256 - accuracy: 0.8117 - val_loss: 0.4784 - val_accuracy: 0.7786
Epoch 39/100
313/313 [==============================] - 22s 70ms/step - loss: 0.4238 - accuracy: 0.8123 - val_loss: 0.4765 - val_accuracy: 0.7790
Epoch 40/100
313/313 [==============================] - 22s 70ms/step - loss: 0.4219 - accuracy: 0.8136 - val_loss: 0.4765 - val_accuracy: 0.7796
Epoch 41/100
313/313 [==============================] - 22s 71ms/step - loss: 0.4190 - accuracy: 0.8144 - val_loss: 0.4797 - val_accuracy: 0.7804
Epoch 42/100
313/313 [==============================] - 22s 70ms/step - loss: 0.4175 - accuracy: 0.8159 - val_loss: 0.4766 - val_accuracy: 0.7804
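Training stops at epoch 42: the validation loss bottomed out around epoch 39 (0.4765) and failed to improve for the next three epochs, which triggers EarlyStopping with patience=3; restore_best_weights=True then rolls the model back to that best epoch.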
In [25]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'val'])
plt.show()
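Training and validation loss fall together throughout, with only a modest gap at the end, so the early-stopped model is not badly overfitting.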

Using Word Embeddings
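One-hot vectors treat every pair of words as equally different and blow up the input size. An Embedding layer instead learns a dense 16-dimensional vector per word, and it consumes integer token ids directly, so train_seq can be fed to the model without the one-hot step.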

In [26]:
model2 = keras.Sequential()

model2.add(keras.layers.Embedding(500, 16, input_length=100))
model2.add(keras.layers.SimpleRNN(8))
model2.add(keras.layers.Dense(1, activation='sigmoid'))

model2.summary()
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding (Embedding)        (None, 100, 16)           8000      
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 8)                 200       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 9         
=================================================================
Total params: 8,209
Trainable params: 8,209
Non-trainable params: 0
_________________________________________________________________
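Again the counts check out: the Embedding table holds 500 × 16 = 8,000 parameters; the SimpleRNN layer has 16 × 8 input weights plus 8 × 8 recurrent weights plus 8 biases, or 200; the Dense layer adds 9, for 8,209 in total. This model has about twice as many parameters as the one-hot version, yet its input array has 500 times fewer elements.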
In [27]:
rmsprop = keras.optimizers.RMSprop(learning_rate=1e-4)
model2.compile(optimizer=rmsprop, loss='binary_crossentropy', 
               metrics=['accuracy'])

checkpoint_cb = keras.callbacks.ModelCheckpoint('best-embedding-model.h5', 
                                                save_best_only=True)
early_stopping_cb = keras.callbacks.EarlyStopping(patience=3,
                                                  restore_best_weights=True)

history = model2.fit(train_seq, train_target, epochs=100, batch_size=64,
                     validation_data=(val_seq, val_target),
                     callbacks=[checkpoint_cb, early_stopping_cb])
Epoch 1/100
313/313 [==============================] - 29s 91ms/step - loss: 0.6812 - accuracy: 0.5728 - val_loss: 0.6668 - val_accuracy: 0.6236
Epoch 2/100
313/313 [==============================] - 29s 92ms/step - loss: 0.6519 - accuracy: 0.6693 - val_loss: 0.6423 - val_accuracy: 0.6848
Epoch 3/100
313/313 [==============================] - 29s 93ms/step - loss: 0.6258 - accuracy: 0.7117 - val_loss: 0.6199 - val_accuracy: 0.7064
Epoch 4/100
313/313 [==============================] - 28s 91ms/step - loss: 0.6009 - accuracy: 0.7358 - val_loss: 0.5992 - val_accuracy: 0.7300
Epoch 5/100
313/313 [==============================] - 28s 91ms/step - loss: 0.5772 - accuracy: 0.7519 - val_loss: 0.5774 - val_accuracy: 0.7450
Epoch 6/100
313/313 [==============================] - 29s 91ms/step - loss: 0.5557 - accuracy: 0.7645 - val_loss: 0.5580 - val_accuracy: 0.7534
Epoch 7/100
313/313 [==============================] - 29s 91ms/step - loss: 0.5349 - accuracy: 0.7746 - val_loss: 0.5414 - val_accuracy: 0.7554
Epoch 8/100
313/313 [==============================] - 29s 92ms/step - loss: 0.5165 - accuracy: 0.7829 - val_loss: 0.5261 - val_accuracy: 0.7668
Epoch 9/100
313/313 [==============================] - 29s 92ms/step - loss: 0.5002 - accuracy: 0.7901 - val_loss: 0.5151 - val_accuracy: 0.7670
Epoch 10/100
313/313 [==============================] - 29s 92ms/step - loss: 0.4861 - accuracy: 0.7940 - val_loss: 0.5028 - val_accuracy: 0.7704
Epoch 11/100
313/313 [==============================] - 29s 91ms/step - loss: 0.4733 - accuracy: 0.7991 - val_loss: 0.4924 - val_accuracy: 0.7768
Epoch 12/100
313/313 [==============================] - 29s 93ms/step - loss: 0.4626 - accuracy: 0.8027 - val_loss: 0.4938 - val_accuracy: 0.7760
Epoch 13/100
313/313 [==============================] - 28s 90ms/step - loss: 0.4540 - accuracy: 0.8051 - val_loss: 0.4798 - val_accuracy: 0.7820
Epoch 14/100
313/313 [==============================] - 29s 92ms/step - loss: 0.4460 - accuracy: 0.8081 - val_loss: 0.4790 - val_accuracy: 0.7794
Epoch 15/100
313/313 [==============================] - 28s 91ms/step - loss: 0.4395 - accuracy: 0.8099 - val_loss: 0.4732 - val_accuracy: 0.7834
Epoch 16/100
313/313 [==============================] - 28s 89ms/step - loss: 0.4346 - accuracy: 0.8144 - val_loss: 0.4701 - val_accuracy: 0.7834
Epoch 17/100
313/313 [==============================] - 27s 88ms/step - loss: 0.4296 - accuracy: 0.8153 - val_loss: 0.4673 - val_accuracy: 0.7850
Epoch 18/100
313/313 [==============================] - 27s 87ms/step - loss: 0.4244 - accuracy: 0.8176 - val_loss: 0.4690 - val_accuracy: 0.7830
Epoch 19/100
313/313 [==============================] - 27s 88ms/step - loss: 0.4209 - accuracy: 0.8185 - val_loss: 0.4654 - val_accuracy: 0.7848
Epoch 20/100
313/313 [==============================] - 27s 88ms/step - loss: 0.4180 - accuracy: 0.8190 - val_loss: 0.4645 - val_accuracy: 0.7862
Epoch 21/100
313/313 [==============================] - 28s 89ms/step - loss: 0.4150 - accuracy: 0.8213 - val_loss: 0.4633 - val_accuracy: 0.7868
Epoch 22/100
313/313 [==============================] - 28s 88ms/step - loss: 0.4110 - accuracy: 0.8217 - val_loss: 0.4667 - val_accuracy: 0.7798
Epoch 23/100
313/313 [==============================] - 28s 89ms/step - loss: 0.4089 - accuracy: 0.8231 - val_loss: 0.4656 - val_accuracy: 0.7820
Epoch 24/100
313/313 [==============================] - 28s 89ms/step - loss: 0.4060 - accuracy: 0.8234 - val_loss: 0.4643 - val_accuracy: 0.7878
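Early stopping fires again at epoch 24: the best validation loss (0.4633) came at epoch 21, and three epochs pass without improvement. The embedding model matches the one-hot model's validation performance while training on far smaller inputs.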
In [28]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'val'])
plt.show()
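The loss curves tell the same story as before. As a natural next step (not run above), the best checkpoint could be restored and scored on the padded test set; a sketch, assuming the file name used in the ModelCheckpoint callback:

# pad the test set the same way as the training data
test_seq = pad_sequences(test_input, maxlen=100)

best_model = keras.models.load_model('best-embedding-model.h5')
best_model.evaluate(test_seq, test_target)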