#!/usr/bin/env python
# coding: utf-8

# This notebook was run using my default environment.
#
# This notebook plays with the Amazon review data - http://jmcauley.ucsd.edu/data/amazon/. I have only downloaded the electronics reviews, and I have not downloaded the metadata. Maybe in the future.

# In[ ]:

import gzip
import json


def parse(path):
    # decompress the raw review dump and re-emit each record as strict JSON
    g = gzip.open(path, 'r')
    for l in g:
        yield json.dumps(eval(l))


path = './reviews_Electronics_5.json.gz'

f = open("output.strict", 'w')
for l in parse(path):
    f.write(l + '\n')
f.close()

from nltk import word_tokenize

# l is already a JSON string produced by parse(), so it can be loaded directly
review_dict = json.loads(l)
review_text = review_dict['reviewText']
tok_review = word_tokenize(review_text)
len(tok_review)


# In[ ]:

from keras.preprocessing.text import Tokenizer
import gzip

MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 40000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2


def generator_review_parse(path):
    # stream the review texts one at a time so the whole corpus never sits in memory
    g = gzip.open(path, 'r')
    for l in g:
        review_dict = eval(l)
        yield review_dict['reviewText']


tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
#tokenizer.fit_on_texts(all_text)
path = './reviews_Electronics_5.json.gz'
tokenizer.fit_on_texts(generator_review_parse(path))
sequences = tokenizer.texts_to_sequences_generator(generator_review_parse(path))

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))


# In[ ]:

import os
import numpy as np

GLOVE_DIR = '/home/dan-laptop/github/ulysses/glove.6B/'

# build a word -> GloVe vector lookup from the pre-trained 100d embeddings
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))


# In[ ]:

# not sure why, but I had to reduce the embedding dim to 100 here...
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector


# In[ ]:

from keras.layers import Embedding

# embedding layer initialised with the GloVe vectors and frozen during training
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)


# In[ ]:

def generator_modelData(path, batch_size=1, token_model=tokenizer):
    # yields (reviews, scores) batches: left-padded word-index sequences and
    # one-hot encoded 1-5 star ratings
    g = gzip.open(path, 'r')
    count = 0
    for l in g:
        if count == 0:
            reviews, scores = [], []
        review_dict = eval(l)
        temp_review = np.zeros((MAX_SEQUENCE_LENGTH,))
        # texts_to_sequences expects a list of texts; wrap the review string so it
        # is tokenised into word indices rather than single characters
        temp_r = token_model.texts_to_sequences([review_dict['reviewText']])[0]
        if len(temp_r) > MAX_SEQUENCE_LENGTH:
            temp_review = temp_r[:MAX_SEQUENCE_LENGTH]
        elif len(temp_r) == 0:
            continue
        else:
            temp_review[-len(temp_r):] = np.squeeze(temp_r)
        temp_review = np.reshape(temp_review, (1, 1000))
        temp_score = np.zeros((5))
        temp_score[int(review_dict['overall']) - 1] = 1
        if len(temp_score) == 0:
            continue
        scores.append(np.reshape(temp_score, (1, 5)))
        reviews.append(temp_review)
        count += 1
        if count == batch_size:
            # stack the per-review arrays into (batch_size, 1000) and (batch_size, 5)
            yield (np.vstack(reviews), np.vstack(scores))
            count = 0

#test = next(generator_modelData(path))


# In[ ]:

from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D
from keras.models import Model

# 1D convnet over the GloVe-embedded review text, predicting the star rating
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(35)(x)  # global max pooling
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(5, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# happy learning!
trials_per_epoch = 5000
model.fit_generator(generator_modelData(path), trials_per_epoch, nb_epoch=5,
                    validation_data=generator_modelData(path), nb_val_samples=1280)

model.save_weights('./amazon_ratings_convnet.h5')


# In[ ]:
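
# A quick sanity check (a sketch, not something the original notebook ran): reload the
# saved weights into the model defined above and score one made-up review. The
# `sample_review` string is a hypothetical example, not drawn from the dataset.

model.load_weights('./amazon_ratings_convnet.h5')

sample_review = "Great battery life and the screen is sharp, but the speakers are weak."

# mirror the left-padding used in generator_modelData
seq = tokenizer.texts_to_sequences([sample_review])[0][:MAX_SEQUENCE_LENGTH]
padded = np.zeros((1, MAX_SEQUENCE_LENGTH))
padded[0, -len(seq):] = seq

probs = model.predict(padded)[0]
print('Predicted rating: %d stars' % (np.argmax(probs) + 1))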