from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.datasets import imdb
Kerasの提供するデータセットを利用:
http://keras.io/datasets/#imdb-movie-reviews-sentiment-classification
# Vocabulary cap: keep only the `max_features` most frequent words.
max_features = 20000

# Load the IMDB sentiment dataset, holding out 20% as the test split.
# Words outside the top `max_features` are dropped by the loader.
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2)

# Single-argument print() calls behave identically on Python 2 and 3.
print('train sequences: {0}'.format(len(X_train)))
print('test sequences: {0}'.format(len(X_test)))
Downloading data from https://s3.amazonaws.com/text-datasets/imdb.pkl 33218560/33213513 [==============================] - 11s train sequences: 20000 test sequences: 5000
# Peek at the data:
# each review is encoded as a sequence of word indices, where indices are
# assigned by frequency rank — e.g. the 3rd most frequent word gets index 3.
X_train[0][:10]
[1, 20, 28, 716, 48, 495, 79, 27, 493, 8]
# Truncate/pad every review to a fixed length of 100 tokens so the
# batches have a uniform shape for the LSTM.
maxlen = 100
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

# Single-argument print() calls behave identically on Python 2 and 3.
print('X_train shape: {0}'.format(X_train.shape))
print('X_test shape: {0}'.format(X_test.shape))
X_train shape: (20000, 100) X_test shape: (5000, 100)
# Binary sentiment classifier: Embedding -> LSTM -> Dropout -> sigmoid.
model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))  # word index -> 128-d dense vector
model.add(LSTM(128))  # 128-unit recurrent layer
model.add(Dropout(0.5))  # regularize before the output layer
model.add(Dense(1))
model.add(Activation('sigmoid'))  # output: probability of positive sentiment
# NOTE(review): `class_mode` is a Keras 0.x-era compile argument that later
# Keras versions removed — confirm against the pinned Keras version before upgrading.
model.compile(loss='binary_crossentropy', optimizer='adam', class_mode="binary")
/Users/amacbee/Dropbox/github/keras-conversational/lib/python2.7/site-packages/theano/scan_module/scan_perform_ext.py:133: RuntimeWarning: numpy.ndarray size changed, may indicate binary incompatibility from scan_perform.scan_perform import *
batch_size = 32
# Train for 3 epochs, reporting accuracy and validating on the held-out split.
# Pass the `batch_size` variable (the original duplicated the literal 32,
# letting the constant and the call drift apart).
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=3,
          validation_data=(X_test, y_test), show_accuracy=True)
Train on 20000 samples, validate on 5000 samples Epoch 1/3 20000/20000 [==============================] - 301s - loss: 0.4845 - acc: 0.7713 - val_loss: 0.4116 - val_acc: 0.8266 Epoch 2/3 20000/20000 [==============================] - 299s - loss: 0.2720 - acc: 0.8931 - val_loss: 0.3630 - val_acc: 0.8420 Epoch 3/3 20000/20000 [==============================] - 315s - loss: 0.1756 - acc: 0.9372 - val_loss: 0.4106 - val_acc: 0.8370
<keras.callbacks.History at 0x10bdc7f90>
# Final held-out evaluation; with show_accuracy=True this returns [loss, accuracy].
print(model.evaluate(X_test, y_test, batch_size=32, show_accuracy=True))
5000/5000 [==============================] - 12s [0.41059308743695982, 0.83699999999999997]