In [0]:
"""
We use following lines because we are running on Google Colab
If you are running notebook on a local computer, you don't need these
"""
from google.colab import drive
drive.mount('/content/gdrive')
import os
os.chdir('/content/gdrive/My Drive/finch/tensorflow2/text_classification/imdb/data')
In [0]:
!pip install tf-nightly-2.0-preview
Collecting tf-nightly-2.0-preview
  Downloading https://files.pythonhosted.org/packages/5c/49/3e023e9b87c0c2e403b36c1aced3e9ab1a750da81456ca0cf2bafd4ff09c/tf_nightly_2.0_preview-2.0.0.dev20190306-cp36-cp36m-manylinux1_x86_64.whl (79.7MB)
    100% |████████████████████████████████| 79.7MB 348kB/s 
Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (1.11.0)
Requirement already satisfied: protobuf>=3.6.1 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (3.6.1)
Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (1.1.0)
Collecting google-pasta>=0.1.2 (from tf-nightly-2.0-preview)
  Downloading https://files.pythonhosted.org/packages/8c/96/adbd4eafe72ce9b5ca6f168fbf109386e1b601f7c59926a11e9d7b7a5b44/google_pasta-0.1.4-py3-none-any.whl (51kB)
    100% |████████████████████████████████| 61kB 22.5MB/s 
Collecting tensorflow-estimator-2.0-preview (from tf-nightly-2.0-preview)
  Downloading https://files.pythonhosted.org/packages/d2/e0/2a8105005a9f250e46317e582f87a5175a25c63832dfd67e4ca15d0659bd/tensorflow_estimator_2.0_preview-1.14.0.dev2019030600-py2.py3-none-any.whl (351kB)
    100% |████████████████████████████████| 358kB 21.0MB/s 
Collecting tb-nightly<1.15.0a0,>=1.14.0a0 (from tf-nightly-2.0-preview)
  Downloading https://files.pythonhosted.org/packages/e5/5e/79a66b54ddec968c9acd648a63213b236a7bb9348979944c234c8959da4d/tb_nightly-1.14.0a20190306-py3-none-any.whl (3.0MB)
    100% |████████████████████████████████| 3.0MB 11.7MB/s 
Requirement already satisfied: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (1.0.9)
Requirement already satisfied: gast>=0.2.0 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (0.2.2)
Requirement already satisfied: absl-py>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (0.7.0)
Requirement already satisfied: wheel>=0.26 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (0.33.1)
Requirement already satisfied: grpcio>=1.8.6 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (1.15.0)
Requirement already satisfied: numpy<2.0,>=1.14.5 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (1.14.6)
Requirement already satisfied: astor>=0.6.0 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (0.7.1)
Requirement already satisfied: keras-applications>=1.0.6 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (1.0.7)
Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from protobuf>=3.6.1->tf-nightly-2.0-preview) (40.8.0)
Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<1.15.0a0,>=1.14.0a0->tf-nightly-2.0-preview) (0.14.1)
Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<1.15.0a0,>=1.14.0a0->tf-nightly-2.0-preview) (3.0.1)
Requirement already satisfied: h5py in /usr/local/lib/python3.6/dist-packages (from keras-applications>=1.0.6->tf-nightly-2.0-preview) (2.8.0)
Installing collected packages: google-pasta, tensorflow-estimator-2.0-preview, tb-nightly, tf-nightly-2.0-preview
Successfully installed google-pasta-0.1.4 tb-nightly-1.14.0a20190306 tensorflow-estimator-2.0-preview-1.14.0.dev2019030600 tf-nightly-2.0-preview-2.0.0.dev20190306
In [0]:
import tensorflow as tf
import numpy as np

from collections import Counter
from pathlib import Path
from tqdm import tqdm

print('TensorFlow Version:', tf.__version__)
TensorFlow Version: 2.0.0-dev20190306

Make Data

In [0]:
"""
sort texts (and labels) according to the length of text
"""
def sort_by_len(x, y):
    x, y = np.asarray(x), np.asarray(y)
    idx = sorted(range(len(x)), key=lambda i: len(x[i]))
    return x[idx], y[idx]
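
A quick sanity check on made-up data (a hypothetical example, just to illustrate the ordering):

toy_x = [[1, 2, 3], [1], [1, 2]]
toy_y = [0, 1, 0]
sx, sy = sort_by_len(toy_x, toy_y)
print(sx)  # texts ordered shortest to longest: [1], [1, 2], [1, 2, 3]
print(sy)  # labels permuted the same way: 1, 0, 0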
In [0]:
_word2idx = tf.keras.datasets.imdb.get_word_index()
word2idx = {w: i+3 for w, i in _word2idx.items()}
word2idx['<pad>'] = 0
word2idx['<start>'] = 1
word2idx['<unk>'] = 2
idx2word = {i: w for w, i in word2idx.items()}
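
The +3 offset matches the defaults of tf.keras.datasets.imdb.load_data (start_char=1, oov_char=2, index_from=3), so indices 0-2 are reserved for the special tokens. A minimal decoder (a hypothetical helper, not used below) would be:

def decode(seq):
    # map each index back to its word; unseen indices fall back to <unk>
    return ' '.join(idx2word.get(i, '<unk>') for i in seq)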
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
1646592/1641221 [==============================] - 0s 0us/step
In [0]:
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data()

x_train, y_train = sort_by_len(x_train, y_train)
x_test, y_test = sort_by_len(x_test, y_test)

def write_file(f_path, xs, ys):
    with open(f_path, 'w') as f:
        for x, y in zip(xs, ys):
            # drop the leading <start> token and write "label<TAB>text"
            f.write(str(y) + '\t' + ' '.join([idx2word[i] for i in x][1:]) + '\n')

write_file('../data/train.txt', x_train, y_train)
write_file('../data/test.txt', x_test, y_test)
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
17465344/17464789 [==============================] - 0s 0us/step
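
To verify the file format, one can peek at the first line of the generated file (a sketch; the exact text depends on the sorted data):

with open('../data/train.txt') as f:
    print(f.readline())  # expected format: "label<TAB>space-separated words"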

Make Vocabulary

In [0]:
counter = Counter()
with open('../data/train.txt') as f:
  for line in f:
    line = line.rstrip()
    label, words = line.split('\t')
    words = words.split(' ')
    counter.update(words)

words = ['<pad>'] + [w for w, freq in counter.most_common() if freq >= 10]
print('Vocab Size:', len(words))

Path('../vocab').mkdir(exist_ok=True)

with open('../vocab/word.txt', 'w') as f:
  for w in words:
    f.write(w+'\n')
Vocab Size: 20598
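
To sanity-check the freq >= 10 cutoff, a short sketch using the counter built above can measure how much of the training text the vocabulary still covers:

total = sum(counter.values())
covered = sum(freq for w, freq in counter.items() if freq >= 10)
print('Token coverage: %.2f%%' % (100 * covered / total))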

Make Pretrained Embedding

In [0]:
word2idx = {}
with open('../vocab/word.txt') as f:
  for i, line in enumerate(f):
    line = line.rstrip()
    word2idx[line] = i
In [0]:
embedding = np.zeros((len(word2idx)+1, 300)) # extra all-zero row at index len(word2idx) for unknown words

with open('../data/glove.840B.300d.txt') as f:
  count = 0
  for i, line in enumerate(f):
    if i % 100000 == 0:
      print('- At line {}'.format(i))
    line = line.rstrip()
    sp = line.split(' ')
    word, vec = sp[0], sp[1:]  # first field is the token, the rest is the 300-d vector
    if word in word2idx:  # only keep vectors for words in our vocabulary
      count += 1
      embedding[word2idx[word]] = np.asarray(vec, dtype='float32')
      
print("[%d / %d] words have found pre-trained values"%(count, len(word2idx)))
np.save('../vocab/word.npy', embedding)
print('Saved ../vocab/word.npy')
- At line 0
- At line 100000
- At line 200000
- At line 300000
- At line 400000
- At line 500000
- At line 600000
- At line 700000
- At line 800000
- At line 900000
- At line 1000000
- At line 1100000
- At line 1200000
- At line 1300000
- At line 1400000
- At line 1500000
- At line 1600000
- At line 1700000
- At line 1800000
- At line 1900000
- At line 2000000
- At line 2100000
[19487 / 20598] words have pre-trained GloVe vectors
Saved ../vocab/word.npy
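
A minimal sketch (assuming the files written above, and reusing the np and tf imports from the top of the notebook) of how the saved matrix could later be plugged into a tf.keras Embedding layer:

pretrained = np.load('../vocab/word.npy')
embed = tf.keras.layers.Embedding(
    input_dim=pretrained.shape[0],   # vocab size + 1 (the all-zero unknown row)
    output_dim=pretrained.shape[1],  # 300-dimensional GloVe vectors
    embeddings_initializer=tf.constant_initializer(pretrained))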