This notebook was run using my default environment.
This notebook plays with the Amazon review data from http://jmcauley.ucsd.edu/data/amazon/. I have only downloaded the electronics reviews, not the metadata; maybe in the future.
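For reference, each line of the gzipped file is a Python-literal dict. The two fields this notebook uses are reviewText and overall (the 1-5 star rating); here is a sketch of a record, with made-up values and most fields omitted:

{'reviewerID': 'A123...', 'asin': '000...', 'reviewText': 'Works great, battery lasts all day.', 'overall': 5.0}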
import gzip
import json

def parse(path):
    # Each line of the gzip file is a Python dict literal; eval it and
    # re-serialize it as strict JSON.
    g = gzip.open(path, 'r')
    for l in g:
        yield json.dumps(eval(l))

path = './reviews_Electronics_5.json.gz'
f = open("output.strict", 'w')
for l in parse(path):
    f.write(l + '\n')
from nltk import word_tokenize

# l still holds the last JSON line yielded by parse(), so json.loads is
# all that is needed here (eval was already applied inside parse).
review_dict = json.loads(l)
review_text = review_dict['reviewText']
tok_review = word_tokenize(review_text)
len(tok_review)
(The cell above was cut short with a KeyboardInterrupt partway through the write loop; converting the full file takes a long time.)
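If the goal is just to sanity-check the conversion rather than process everything, one option (my addition, not in the original run) is to cap the number of lines with itertools.islice:

from itertools import islice

with open("output.strict", 'w') as out:
    # Convert only the first 1,000 reviews.
    for l in islice(parse(path), 1000):
        out.write(l + '\n')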
from keras.preprocessing.text import Tokenizer
import gzip

MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 40000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

def generator_review_parse(path):
    # Stream review texts one at a time so the tokenizer never needs the
    # whole corpus in memory.
    g = gzip.open(path, 'r')
    for l in g:
        review_dict = eval(l)
        yield review_dict['reviewText']

tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)

path = './reviews_Electronics_5.json.gz'
tokenizer.fit_on_texts(generator_review_parse(path))
sequences = tokenizer.texts_to_sequences_generator(generator_review_parse(path))

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
(The Keras import prints: Using Theano backend.)
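Fitting the tokenizer takes a full pass over the corpus, so it is worth persisting it. A minimal sketch using pickle (the filename is made up, and treating Tokenizer objects as safely picklable is an assumption on my part, not a documented guarantee):

import pickle

# Hypothetical cache file; re-fitting the tokenizer would mean another
# full pass over the gzipped corpus.
with open('tokenizer.pkl', 'wb') as fh:
    pickle.dump(tokenizer, fh)

# Later: reload instead of re-fitting.
# with open('tokenizer.pkl', 'rb') as fh:
#     tokenizer = pickle.load(fh)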
import os
import numpy as np

GLOVE_DIR = '/home/dan-laptop/github/ulysses/glove.6B/'

embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    # Each line is: word v1 v2 ... v100
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))
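As a quick sanity check on the loaded vectors (my addition, not in the original notebook), cosine similarity between related words should come out noticeably higher than between unrelated ones:

def cosine(a, b):
    # Cosine similarity between two embedding vectors.
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# The related pair should score higher than the unrelated pair.
print(cosine(embeddings_index['camera'], embeddings_index['lens']))
print(cosine(embeddings_index['camera'], embeddings_index['banana']))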
# The embedding dim has to be 100 here because the GloVe file loaded
# above is the 100-dimensional one (glove.6B.100d.txt).
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in the embedding index stay all-zeros.
        embedding_matrix[i] = embedding_vector
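A quick check (again my addition) of how much of the review vocabulary is actually covered by GloVe:

covered = sum(1 for w in word_index if w in embeddings_index)
print('%d of %d tokens have a GloVe vector.' % (covered, len(word_index)))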
from keras.layers import Embedding

# trainable=False keeps the GloVe vectors frozen during training.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
def generator_modelData(path, batch_size=1, token_model=tokenizer):
    # Keras expects a training generator to yield batches forever, so
    # loop over the file repeatedly.
    while True:
        g = gzip.open(path, 'r')
        count = 0
        for l in g:
            if count == 0:
                reviews, scores = [], []
            review_dict = eval(l)
            # texts_to_sequences expects a list of texts; wrapping the
            # review in a list yields one word-index sequence back.
            temp_r = token_model.texts_to_sequences([review_dict['reviewText']])[0]
            if len(temp_r) == 0:
                continue
            # Left-pad (or truncate) to MAX_SEQUENCE_LENGTH.
            temp_review = np.zeros((MAX_SEQUENCE_LENGTH,))
            if len(temp_r) > MAX_SEQUENCE_LENGTH:
                temp_review[:] = temp_r[:MAX_SEQUENCE_LENGTH]
            else:
                temp_review[-len(temp_r):] = temp_r
            reviews.append(np.reshape(temp_review, (1, MAX_SEQUENCE_LENGTH)))
            # One-hot encode the 1-5 star rating.
            temp_score = np.zeros((5,))
            temp_score[int(review_dict['overall']) - 1] = 1
            scores.append(np.reshape(temp_score, (1, 5)))
            count += 1
            if count == batch_size:
                # Stack into (batch_size, ...) arrays for fit_generator.
                yield (np.vstack(reviews), np.vstack(scores))
                count = 0

#test = next(generator_modelData(path))
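Uncommenting that test line is worthwhile; a batch should come back as a (1, 1000) input array and a (1, 5) one-hot target (this check is my addition):

X, y = next(generator_modelData(path))
print(X.shape, y.shape)  # expect (1, 1000) and (1, 5)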
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D
from keras.models import Model

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(35)(x)  # the sequence is 35 steps long by here, so this is global max pooling
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(5, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
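model.summary() (my addition) is a quick way to confirm that the three conv/pool stages reduce the 1000-step sequence down to a single 128-filter vector before the dense layers:

model.summary()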
# happy learning!
trials_per_epoch = 5000
# Note: the validation generator re-reads the same file from the start,
# so this is really validating on a prefix of the training data.
model.fit_generator(generator_modelData(path), trials_per_epoch, nb_epoch=5,
                    validation_data=generator_modelData(path), nb_val_samples=1280)

model.save_weights('./amazon_ratings_convnet.h5')
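To use the saved weights later, rebuild the same architecture and load them back in with model.load_weights. A minimal sketch of scoring a new review (predict_stars is a hypothetical helper name, and the example review text is made up):

from keras.preprocessing.sequence import pad_sequences

model.load_weights('./amazon_ratings_convnet.h5')

def predict_stars(text, token_model=tokenizer):
    # Hypothetical helper: tokenize, left-pad to the model's input
    # length, and return the most probable star rating (1-5).
    seq = token_model.texts_to_sequences([text])[0]
    X = pad_sequences([seq], maxlen=MAX_SEQUENCE_LENGTH)
    probs = model.predict(X)[0]
    return int(np.argmax(probs)) + 1

print(predict_stars("This charger died after two days."))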