from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.datasets import imdb
Kerasの提供するデータセットを利用:
http://keras.io/datasets/#imdb-movie-reviews-sentiment-classification
# Vocabulary cap: keep only the `max_features` most frequent words.
max_features = 20000

# Load the IMDB sentiment dataset, holding out 20% as the test split.
# Words outside the top `max_features` are dropped by the loader.
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2)

# Single-argument print() calls behave identically on Python 2 and 3.
print('train sequences: {0}'.format(len(X_train)))
print('test sequences: {0}'.format(len(X_test)))
Downloading data from https://s3.amazonaws.com/text-datasets/imdb.pkl 33218560/33213513 [==============================] - 11s train sequences: 20000 test sequences: 5000
# Peek at the data:
# each review is encoded as a sequence of word indices, where indices are
# assigned by frequency rank — e.g. the 3rd most frequent word gets index 3.
X_train[0][:10]
[1, 20, 28, 716, 48, 495, 79, 27, 493, 8]
# Truncate/pad every review to a fixed length of 100 tokens so the
# batches have a uniform shape for the LSTM.
maxlen = 100
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

# Single-argument print() calls behave identically on Python 2 and 3.
print('X_train shape: {0}'.format(X_train.shape))
print('X_test shape: {0}'.format(X_test.shape))
X_train shape: (20000, 100) X_test shape: (5000, 100)
# Binary sentiment classifier: Embedding -> LSTM -> Dropout -> sigmoid.
model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))  # word index -> 128-d dense vector
model.add(LSTM(128))  # 128-unit recurrent layer
model.add(Dropout(0.5))  # regularize before the output layer
model.add(Dense(1))
model.add(Activation('sigmoid'))  # output: probability of positive sentiment
# NOTE(review): `class_mode` is a Keras 0.x-era compile argument that later
# Keras versions removed — confirm against the pinned Keras version before upgrading.
model.compile(loss='binary_crossentropy', optimizer='adam', class_mode="binary")
/Users/amacbee/Dropbox/github/keras-conversational/lib/python2.7/site-packages/theano/scan_module/scan_perform_ext.py:133: RuntimeWarning: numpy.ndarray size changed, may indicate binary incompatibility from scan_perform.scan_perform import *
batch_size = 32
# Train for 3 epochs, reporting accuracy and validating on the held-out split.
# Pass the `batch_size` variable (the original duplicated the literal 32,
# letting the constant and the call drift apart).
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=3,
          validation_data=(X_test, y_test), show_accuracy=True)
Train on 20000 samples, validate on 5000 samples Epoch 1/3 20000/20000 [==============================] - 301s - loss: 0.4845 - acc: 0.7713 - val_loss: 0.4116 - val_acc: 0.8266 Epoch 2/3 20000/20000 [==============================] - 299s - loss: 0.2720 - acc: 0.8931 - val_loss: 0.3630 - val_acc: 0.8420 Epoch 3/3 20000/20000 [==============================] - 315s - loss: 0.1756 - acc: 0.9372 - val_loss: 0.4106 - val_acc: 0.8370
<keras.callbacks.History at 0x10bdc7f90>
# Final held-out evaluation; with show_accuracy=True this returns [loss, accuracy].
print(model.evaluate(X_test, y_test, batch_size=32, show_accuracy=True))
5000/5000 [==============================] - 12s [0.41059308743695982, 0.83699999999999997]