#!/usr/bin/env python
# coding: utf-8

# # Instructions
# 1. Go to https://colab.research.google.com and choose the "Upload" option to upload this notebook file.
# 1. In the Edit menu, choose "Notebook Settings" and set the "Hardware Accelerator" dropdown to GPU.
# 1. Read through the code in the following sections:
#     * [IMDB Dataset](#scrollTo=NPa7eLiaaof0)
#     * [Define Model](#scrollTo=ihsQ5xEoaog6)
#     * [Train Model](#scrollTo=OlXYR7KNaohE)
#     * [Assess Model](#scrollTo=LkS3AAQraohK)
# 1. Complete at least one of these exercises. Remember to keep notes about what you do!
#     * [Exercise Option #1 - Standard Difficulty](#scrollTo=VU4-GCUxaohS)
#     * [Exercise Option #2 - Advanced Difficulty](#scrollTo=VU4-GCUxaohS)

# ## Documentation/Sources
# * [https://radimrehurek.com/gensim/models/word2vec.html](https://radimrehurek.com/gensim/models/word2vec.html) for more information about how to use gensim word2vec in general
# * [https://codekansas.github.io/blog/2016/gensim.html](https://codekansas.github.io/blog/2016/gensim.html) for information about using word2vec to create embedding layers for neural networks (_blog post has since been removed_)
# * [https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/](https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/) for information on sequence classification with Keras
# * [https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html](https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html) for using pre-trained embeddings with Keras (though the syntax they use for the model layers is different from most other tutorials)
# * [https://keras.io/](https://keras.io/) for the Keras API documentation

# In[ ]:


# upgrade tensorflow to tensorflow 2
get_ipython().run_line_magic('tensorflow_version', '2.x')
# display matplotlib plots inline
get_ipython().run_line_magic('matplotlib', 'inline')


# # IMDB Dataset
# The [IMDB dataset](https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification) consists of movie reviews that have been marked as positive or negative. (There is also a built-in dataset of [Reuters newswires](https://keras.io/datasets/#reuters-newswire-topics-classification) that have been classified by topic.)

# In[ ]:


from tensorflow.keras.datasets import imdb

(x_train, y_train), (x_test, y_test) = imdb.load_data()


# It looks like our labels consist of 0 or 1, which makes sense for positive and negative.

# In[ ]:


print(y_train[0:9])
print(max(y_train))
print(min(y_train))


# But x is a bit more trouble. The words have already been converted to numbers -- numbers that have nothing to do with the word embeddings we spent time learning!

# In[ ]:


x_train[0]


# Looking at the help page for imdb, it appears there is a way to get the words back. Phew.

# In[ ]:


help(imdb)


# In[ ]:


# invert the word-to-index mapping, shifting every index by the offset that
# load_data applies so that 0, 1, and 2 are free for the special tokens
imdb_offset = 3
imdb_map = dict((index + imdb_offset, word) for (word, index) in imdb.get_word_index().items())
imdb_map[0] = 'PADDING'
imdb_map[1] = 'START'
imdb_map[2] = 'UNKNOWN'


# The knowledge about the special initial indices and the offset came from [this Stack Overflow post](https://stackoverflow.com/questions/42821330/restore-original-text-from-keras-s-imdb-dataset), after I got gibberish the first time I tried to translate the review below. It looks coherent now!

# In[ ]:


' '.join([imdb_map[word_index] for word_index in x_train[0]])


# For this exercise, we're going to keep all inputs the same length (we'll see how to handle variable-length inputs later).
# This means we need to choose a maximum length for the reviews, cutting off longer ones and padding shorter ones. What should that length be? Let's look at our data.

# In[ ]:


# x_train and x_test are numpy object arrays of lists, so convert them to lists
# before concatenating -- `x_train + x_test` would add the two arrays elementwise
lengths = [len(review) for review in list(x_train) + list(x_test)]
print('Longest review: {} Shortest review: {}'.format(max(lengths), min(lengths)))


# Nearly 2500 words! Wow. Well, let's see how many reviews would get cut off at a particular cutoff.

# In[ ]:


cutoff = 500
print('{} reviews out of {} are over {}.'.format(
    sum([1 for length in lengths if length > cutoff]),
    len(lengths),
    cutoff))


# In[ ]:


from tensorflow.keras.preprocessing import sequence

# pad_sequences truncates reviews longer than cutoff and left-pads shorter ones with 0
x_train_padded = sequence.pad_sequences(x_train, maxlen=cutoff)
x_test_padded = sequence.pad_sequences(x_test, maxlen=cutoff)


# # Define Model

# In[ ]:


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, Dense, Flatten
from tensorflow import test
from tensorflow import device


# The embedding layer here learns a 100-dimensional vector for each word as part of training on the overall classification problem. That is usually what we want, unless we have a lot of untagged data that could be used to train word vectors but not a classification model.

# In[ ]:


not_pretrained_model = Sequential()
# +1 so that the largest word index is still a valid row in the embedding matrix
not_pretrained_model.add(Embedding(input_dim=len(imdb_map) + 1, output_dim=100, input_length=cutoff))
not_pretrained_model.add(Conv1D(filters=32, kernel_size=5, activation='relu'))
not_pretrained_model.add(Conv1D(filters=32, kernel_size=5, activation='relu'))
not_pretrained_model.add(Conv1D(filters=32, kernel_size=5, activation='relu'))
not_pretrained_model.add(Conv1D(filters=32, kernel_size=5, activation='relu'))
not_pretrained_model.add(Conv1D(filters=32, kernel_size=5, activation='relu'))
not_pretrained_model.add(Flatten())
not_pretrained_model.add(Dense(units=128, activation='relu'))
# one sigmoid unit, because at the end we want a single yes/no answer
not_pretrained_model.add(Dense(units=1, activation='sigmoid'))
not_pretrained_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])


# # Train Model

# In[ ]:


# Train using GPU acceleration
# (see https://colab.research.google.com/notebooks/gpu.ipynb#scrollTo=Y04m-jvKRDsJ)
device_name = test.gpu_device_name()
if device_name != '/device:GPU:0':
    print(
        '\n\nThis error most likely means that this notebook is not '
        'configured to use a GPU. Change this in Notebook Settings via the '
        'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
    raise SystemError('GPU device not found')

with device('/device:GPU:0'):
    not_pretrained_model.fit(x_train_padded, y_train, epochs=1, batch_size=64)


# # Assess Model

# In[ ]:


with device('/device:GPU:0'):
    not_pretrained_scores = not_pretrained_model.evaluate(x_test_padded, y_test)
print('loss: {} accuracy: {}'.format(*not_pretrained_scores))


# ## For any model that you try in these exercises, take notes about the performance you see and anything you notice about the differences between the models.

# ## Exercise Option #1 - Standard Difficulty
# Try changing different hyperparameters of the not_pretrained model, and keep notes on how the performance changes. A sketch of one possible variant appears below.

# ## Exercise Option #2 - Advanced Difficulty
# Build a model for the Reuters topic classification problem, using the not_pretrained model above as a reference. A sketch appears at the end of the notebook.
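# A minimal sketch of one possible answer to Exercise Option #1, reusing the data and
# imports from the cells above. The specific changes here -- a half-size embedding, two
# wider convolution layers with a smaller kernel, and a dropout layer -- are
# illustrative assumptions, not a recommended configuration; swap in your own changes
# and record how the accuracy moves.

# In[ ]:


from tensorflow.keras.layers import Dropout

variant_model = Sequential()
# hypothetical variant: 50-dimensional embedding instead of 100
variant_model.add(Embedding(input_dim=len(imdb_map) + 1, output_dim=50, input_length=cutoff))
# fewer but wider convolution layers, with a smaller kernel
variant_model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
variant_model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
variant_model.add(Flatten())
variant_model.add(Dropout(0.5))  # regularization the original model does not use
variant_model.add(Dense(units=128, activation='relu'))
variant_model.add(Dense(units=1, activation='sigmoid'))
variant_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])

with device('/device:GPU:0'):
    variant_model.fit(x_train_padded, y_train, epochs=1, batch_size=64)
    variant_scores = variant_model.evaluate(x_test_padded, y_test)
print('loss: {} accuracy: {}'.format(*variant_scores))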
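# A minimal sketch of a starting point for Exercise Option #2, assuming the built-in
# [Reuters dataset](https://keras.io/datasets/#reuters-newswire-topics-classification).
# Reuters is a 46-way topic classification problem, so the pieces that change from the
# not_pretrained model are the output layer (46 softmax units instead of one sigmoid
# unit) and the loss (sparse_categorical_crossentropy instead of binary_crossentropy).
# The cutoff and layer sizes below are untuned guesses to experiment with.

# In[ ]:


from tensorflow.keras.datasets import reuters

(r_x_train, r_y_train), (r_x_test, r_y_test) = reuters.load_data()
num_classes = max(r_y_train) + 1  # 46 topics

# Reuters newswires tend to be shorter than IMDB reviews, so a smaller cutoff is a
# plausible guess; check the length distribution the way we did for IMDB
reuters_cutoff = 250
r_x_train_padded = sequence.pad_sequences(r_x_train, maxlen=reuters_cutoff)
r_x_test_padded = sequence.pad_sequences(r_x_test, maxlen=reuters_cutoff)

# size the embedding from the largest word index actually present in the data
reuters_vocab_size = max(max(newswire) for newswire in list(r_x_train) + list(r_x_test)) + 1

reuters_model = Sequential()
reuters_model.add(Embedding(input_dim=reuters_vocab_size, output_dim=100, input_length=reuters_cutoff))
reuters_model.add(Conv1D(filters=32, kernel_size=5, activation='relu'))
reuters_model.add(Conv1D(filters=32, kernel_size=5, activation='relu'))
reuters_model.add(Flatten())
reuters_model.add(Dense(units=128, activation='relu'))
reuters_model.add(Dense(units=num_classes, activation='softmax'))  # one probability per topic
reuters_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

with device('/device:GPU:0'):
    reuters_model.fit(r_x_train_padded, r_y_train, epochs=1, batch_size=64)
    reuters_scores = reuters_model.evaluate(r_x_test_padded, r_y_test)
print('loss: {} accuracy: {}'.format(*reuters_scores))


# In[ ]:


# In[ ]: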