In [5]:
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.datasets import imdb

IMDB Movie reviews sentiment classification - Keras (Backend: Theano)

IMDB Movie reviews Data

Kerasの提供するデータセットを利用:
http://keras.io/datasets/#imdb-movie-reviews-sentiment-classification

In [6]:
max_features = 20000
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2)

print 'train sequences: {0}'.format(len(X_train))
print 'test sequences: {0}'.format(len(X_test))
Downloading data from https://s3.amazonaws.com/text-datasets/imdb.pkl
33218560/33213513 [==============================] - 11s    
train sequences: 20000
test sequences: 5000
In [8]:
# A look at the data
# Each review is encoded as a sequence of word indices.
# Indices are assigned by frequency rank, so e.g. the 3rd most
# frequent word in the corpus gets index 3.

X_train[0][:10]
Out[8]:
[1, 20, 28, 716, 48, 495, 79, 27, 493, 8]
In [10]:
# テキストは最長100におさめる
maxlen = 100

X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

print 'X_train shape: {0}'.format(X_train.shape)
print 'X_test shape: {0}'.format(X_test.shape)
X_train shape: (20000, 100)
X_test shape: (5000, 100)

Implementing

In [11]:
# Sentiment classifier: Embedding -> LSTM -> Dropout -> sigmoid output.
model = Sequential()
# Map each of the max_features word indices to a 128-dim dense vector;
# input_length=maxlen matches the padded sequence length above.
model.add(Embedding(max_features, 128, input_length=maxlen))
# Single LSTM layer with 128 units over the embedded sequence.
model.add(LSTM(128))
# Dropout for regularization (50% of units dropped during training).
model.add(Dropout(0.5))
# One output unit squashed to [0, 1] — probability of a positive review.
model.add(Dense(1))
model.add(Activation('sigmoid'))

Training

In [12]:
# Old Keras 0.x API: class_mode="binary" tells compile() the targets are
# binary labels, pairing with the binary_crossentropy loss.
model.compile(loss='binary_crossentropy', optimizer='adam', class_mode="binary")
/Users/amacbee/Dropbox/github/keras-conversational/lib/python2.7/site-packages/theano/scan_module/scan_perform_ext.py:133: RuntimeWarning: numpy.ndarray size changed, may indicate binary incompatibility
  from scan_perform.scan_perform import *
In [13]:
batch_size = 32
# Fix: pass the batch_size constant instead of repeating the literal 32 —
# previously changing the constant above would silently not affect training.
# show_accuracy=True (old Keras 0.x API) reports accuracy alongside the loss.
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=3,
          validation_data=(X_test, y_test), show_accuracy=True)
Train on 20000 samples, validate on 5000 samples
Epoch 1/3
20000/20000 [==============================] - 301s - loss: 0.4845 - acc: 0.7713 - val_loss: 0.4116 - val_acc: 0.8266
Epoch 2/3
20000/20000 [==============================] - 299s - loss: 0.2720 - acc: 0.8931 - val_loss: 0.3630 - val_acc: 0.8420
Epoch 3/3
20000/20000 [==============================] - 315s - loss: 0.1756 - acc: 0.9372 - val_loss: 0.4106 - val_acc: 0.8370
Out[13]:
<keras.callbacks.History at 0x10bdc7f90>

Evaluating

In [14]:
# Evaluate on the held-out test set; with show_accuracy=True the old
# Keras API returns [loss, accuracy].
score = model.evaluate(X_test, y_test, batch_size=32, show_accuracy=True)
print(score)
5000/5000 [==============================] - 12s    
[0.41059308743695982, 0.83699999999999997]