#!/usr/bin/env python
# coding: utf-8

# In[5]:


from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.datasets import imdb


# # IMDB Movie reviews sentiment classification - Keras (Backend: TensorFlow)

# ## IMDB Movie reviews Data
# 
# Uses the dataset provided by Keras:  
# http://keras.io/datasets/#imdb-movie-reviews-sentiment-classification

# In[6]:


# Vocabulary cap passed to the loader as `nb_words` (per the Keras docs this
# keeps only the most frequent words) and reused below as the Embedding input size.
max_features = 20000
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2)

# Parenthesized single-argument print: identical output on Python 2 and Python 3.
print('train sequences: {0}'.format(len(X_train)))
print('test sequences: {0}'.format(len(X_test)))


# In[8]:


# A look at the data.
# Each review is encoded as a sequence of word indices.
# Indices are assigned by frequency, so e.g. the 3rd most frequent word gets index 3.
# NOTE(review): as a plain script this bare expression is evaluated and discarded;
# it only displays a value when run in a notebook/REPL.

X_train[0][:10]


# In[10]:


# Limit every review to at most 100 entries.
maxlen = 100

# Bring both splits to a common length of `maxlen` (per the Keras docs,
# pad_sequences pads shorter sequences and truncates longer ones).
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

# Parenthesized single-argument print: identical output on Python 2 and Python 3.
print('X_train shape: {0}'.format(X_train.shape))
print('X_test shape: {0}'.format(X_test.shape))


# ## Implementing

# In[11]:


# Network: word-index embedding -> LSTM -> dropout -> single sigmoid output unit.
# Layers are listed in stacking order and appended one by one.
model = Sequential()
for layer in (
    Embedding(max_features, 128, input_length=maxlen),
    LSTM(128),
    Dropout(0.5),
    Dense(1),
    Activation('sigmoid'),
):
    model.add(layer)


# ## Training

# In[12]:


# Binary cross-entropy matches the single sigmoid output of the model above.
model.compile(loss='binary_crossentropy', optimizer='adam', class_mode="binary")


# In[13]:


# Mini-batch size; passed to fit() by name instead of repeating the literal.
batch_size = 32
# Train for 3 epochs, reporting metrics on the test split after each epoch.
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=3, validation_data=(X_test, y_test), show_accuracy=True)


# ## Evaluating

# In[14]:


# Score the trained model on the test split, reusing the module-level
# `batch_size` defined for training rather than a duplicated literal.
print(model.evaluate(X_test, y_test, batch_size=batch_size, show_accuracy=True))


#