In [0]:
# Mount Google Drive, then switch into the project folder so the relative
# file names used by later cells (e.g. train.txt) resolve inside it.
import os
from google.colab import drive

drive.mount('/content/gdrive')
os.chdir(
  '/content/gdrive/My Drive/finch/framework/official_fasttext/text_classification/imdb'
)
In [0]:
def get_idx2word(_index_from=3):
  """Build an id -> token lookup for the Keras IMDB dataset.

  The word index returned by `tf.keras.datasets.imdb.get_word_index()` is
  shifted by `_index_from`, after which the ids 0, 1 and 2 are assigned to
  the special tokens '<pad>', '<start>' and '<unk>'.

  Args:
    _index_from: Offset added to every id of the raw word index.

  Returns:
    dict mapping integer id to token string.
  """
  raw_index = tf.keras.datasets.imdb.get_word_index()

  shifted = {}
  for word, rank in raw_index.items():
    shifted[word] = rank + _index_from

  # Special tokens are added last so they win over any colliding entry.
  shifted.update({'<pad>': 0, '<start>': 1, '<unk>': 2})

  return dict((idx, word) for word, idx in shifted.items())
In [3]:
import tensorflow as tf

def _write_fasttext_file(path, sequences, labels, idx2word):
  """Write one `__label__<y> <text>` line per example to `path`.

  Args:
    path: Destination file; it is overwritten if it already exists.
    sequences: Iterable of integer-id sequences. The first id of every
      sequence is dropped before the ids are turned into tokens.
    labels: Iterable of labels aligned with `sequences`.
    idx2word: Mapping from integer id to token string.
  """
  # Pin the encoding instead of relying on the platform default so the file
  # is always the UTF-8 text that the fastText CLI reads.
  with open(path, 'w', encoding='utf-8') as f:
    for x, y in zip(sequences, labels):
      # x[0] is skipped: with its default arguments load_data starts every
      # review with a start-of-sequence id, which carries no content.
      text = ' '.join(idx2word[i] for i in x[1:])
      f.write(f'__label__{y} {text}\n')


# Keep only the 20000 most frequent word ids; rarer words come back as the
# out-of-vocabulary id and are rendered as '<unk>' by idx2word.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=20000)
idx2word = get_idx2word()

_write_fasttext_file('train.txt', X_train, y_train, idx2word)
_write_fasttext_file('test.txt', X_test, y_test, idx2word)
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
17465344/17464789 [==============================] - 0s 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
1646592/1641221 [==============================] - 0s 0us/step
In [4]:
!wget https://github.com/facebookresearch/fastText/archive/v0.9.2.zip
!unzip -q v0.9.2.zip
os.chdir('/content/gdrive/My Drive/finch/framework/official_fasttext/text_classification/imdb/fastText-0.9.2')
!make
--2020-06-02 07:05:49--  https://github.com/facebookresearch/fastText/archive/v0.9.2.zip
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/facebookresearch/fastText/zip/v0.9.2 [following]
--2020-06-02 07:05:50--  https://codeload.github.com/facebookresearch/fastText/zip/v0.9.2
Resolving codeload.github.com (codeload.github.com)... 140.82.112.9
Connecting to codeload.github.com (codeload.github.com)|140.82.112.9|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘v0.9.2.zip’

v0.9.2.zip              [     <=>            ]   4.17M  2.61MB/s    in 1.6s    

2020-06-02 07:05:52 (2.61 MB/s) - ‘v0.9.2.zip’ saved [4369852]

c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/args.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/autotune.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/matrix.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/dictionary.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/loss.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/productquantizer.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/densematrix.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/quantmatrix.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/vector.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/model.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/utils.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/meter.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/fasttext.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG args.o autotune.o matrix.o dictionary.o loss.o productquantizer.o densematrix.o quantmatrix.o vector.o model.o utils.o meter.o fasttext.o src/main.cc -o fasttext
In [5]:
# Train a supervised fastText classifier on ../train.txt, writing the result
# under the output prefix `model` in the current directory (fastText-0.9.2).
# Autotune searches hyper-parameters for 7200 seconds; because -epoch is set
# explicitly it is held at 30 instead of being tuned (see the warning below).
# NOTE(review): autotune validates on ../test.txt, which is built from the
# same X_test/y_test that the final accuracy is computed on, so that accuracy
# is optimistically biased. A separate validation split would avoid this.
!./fasttext supervised -input ../train.txt -output model -epoch 30 -autotune-validation ../test.txt -autotune-duration 7200
Warning : epoch is manually set to a specific value. It will not be automatically optimized.
Progress:   8.5% Trials:    5 Best score:  0.880800 ETA:   1h49m46stcmalloc: large alloc 1770201088 bytes == 0x555690670000 @  0x7f807bcb0887 0x55564f52ffed 0x55564f53e72e 0x55564f54650c 0x55564f51222f 0x55564f54da84 0x55564f503887 0x7f807ad4db97 0x55564f503b4a
Progress:  22.9% Trials:   14 Best score:  0.893640 ETA:   1h32m29stcmalloc: large alloc 5651283968 bytes == 0x555690670000 @  0x7f807bcb0887 0x55564f52ffed 0x55564f53e72e 0x55564f54650c 0x55564f51222f 0x55564f54da84 0x55564f503887 0x7f807ad4db97 0x55564f503b4a
Progress:  28.9% Trials:   16 Best score:  0.895240 ETA:   1h25m20stcmalloc: large alloc 4408803328 bytes == 0x555690670000 @  0x7f807bcb0887 0x55564f52ffed 0x55564f53e72e 0x55564f54650c 0x55564f51222f 0x55564f54da84 0x55564f503887 0x7f807ad4db97 0x55564f503b4a
Progress:  32.1% Trials:   17 Best score:  0.895760 ETA:   1h21m30stcmalloc: large alloc 2505383936 bytes == 0x555797300000 @  0x7f807bcb0887 0x55564f52ffed 0x55564f53e72e 0x55564f54650c 0x55564f51222f 0x55564f54da84 0x55564f503887 0x7f807ad4db97 0x55564f503b4a
Progress:  44.7% Trials:   19 Best score:  0.895760 ETA:   1h 6m23stcmalloc: large alloc 3333087232 bytes == 0x5556da4e0000 @  0x7f807bcb0887 0x55564f52ffed 0x55564f53e72e 0x55564f54650c 0x55564f51222f 0x55564f54da84 0x55564f503887 0x7f807ad4db97 0x55564f503b4a
Progress:  47.9% Trials:   20 Best score:  0.895760 ETA:   1h 2m34stcmalloc: large alloc 2095931392 bytes == 0x5557a0f8e000 @  0x7f807bcb0887 0x55564f52ffed 0x55564f53e72e 0x55564f54650c 0x55564f51222f 0x55564f54da84 0x55564f503887 0x7f807ad4db97 0x55564f503b4a
Progress:  50.0% Trials:   22 Best score:  0.900280 ETA:   1h 0m 0stcmalloc: large alloc 4606476288 bytes == 0x555690650000 @  0x7f807bcb0887 0x55564f52ffed 0x55564f53e72e 0x55564f54650c 0x55564f51222f 0x55564f54da84 0x55564f503887 0x7f807ad4db97 0x55564f503b4a
Progress:  55.1% Trials:   23 Best score:  0.900280 ETA:   0h53m55stcmalloc: large alloc 6773522432 bytes == 0x55584b41e000 @  0x7f807bcb0887 0x55564f52ffed 0x55564f53e72e 0x55564f54650c 0x55564f51222f 0x55564f54da84 0x55564f503887 0x7f807ad4db97 0x55564f503b4a
Progress:  63.2% Trials:   25 Best score:  0.900280 ETA:   0h44m10stcmalloc: large alloc 7503577088 bytes == 0x55584b41e000 @  0x7f807bcb0887 0x55564f52ffed 0x55564f53e72e 0x55564f54650c 0x55564f51222f 0x55564f54da84 0x55564f503887 0x7f807ad4db97 0x55564f503b4a
Progress:  95.6% Trials:   31 Best score:  0.900280 ETA:   0h 5m15stcmalloc: large alloc 3170074624 bytes == 0x555690650000 @  0x7f807bcb0887 0x55564f52ffed 0x55564f53e72e 0x55564f54650c 0x55564f51222f 0x55564f54da84 0x55564f503887 0x7f807ad4db97 0x55564f503b4a
Progress: 100.0% Trials:   33 Best score:  0.900280 ETA:   0h 0m 0s
Training again with best arguments
Read 5M words
Number of words:  19998
Number of labels: 2
Progress: 100.0% words/sec/thread:  116092 lr:  0.000000 avg.loss:  0.079526 ETA:   0h 0m 0s
In [6]:
os.chdir('/content/gdrive/My Drive/finch/framework/official_fasttext/text_classification/imdb')
!git clone https://github.com/facebookresearch/fastText.git
os.chdir('/content/gdrive/My Drive/finch/framework/official_fasttext/text_classification/imdb/fastText')
!pip install .
Cloning into 'fastText'...
remote: Enumerating objects: 14, done.
remote: Counting objects: 100% (14/14), done.
remote: Compressing objects: 100% (12/12), done.
remote: Total 3840 (delta 1), reused 7 (delta 1), pack-reused 3826
Receiving objects: 100% (3840/3840), 8.21 MiB | 4.93 MiB/s, done.
Resolving deltas: 100% (2410/2410), done.
Checking out files: 100% (526/526), done.
Processing /content/gdrive/My Drive/finch/framework/official_fasttext/text_classification/imdb/fastText
Requirement already satisfied: pybind11>=2.2 in /usr/local/lib/python3.6/dist-packages (from fasttext==0.9.2) (2.5.0)
Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from fasttext==0.9.2) (47.1.1)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from fasttext==0.9.2) (1.18.4)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... done
  Created wheel for fasttext: filename=fasttext-0.9.2-cp36-cp36m-linux_x86_64.whl size=3009603 sha256=0fc119ffe4e2f1d2cf7ca1d15e7adb1a6795c64baa25ad486aebb1773a1e6236
  Stored in directory: /tmp/pip-ephem-wheel-cache-b3bj_ovz/wheels/bb/e4/28/6ef7d95ecd7710f3cef57c336a9ee2673283580cb21aa81362
Successfully built fasttext
Installing collected packages: fasttext
Successfully installed fasttext-0.9.2
In [7]:
import numpy as np

from fasttext import load_model

# Evaluate the trained model on the IMDB test split through the Python API.
os.chdir('/content/gdrive/My Drive/finch/framework/official_fasttext/text_classification/imdb')
classifier = load_model("fastText-0.9.2/model.bin")

# Drop the first id of every review (x[1:]) exactly as was done when
# train.txt / test.txt were written; otherwise each evaluated text begins
# with a '<start>' token that the model never saw during training.
texts = [' '.join(idx2word[i] for i in x[1:]) for x in X_test]

# predict() on a list returns (labels, probabilities); labels holds one
# sequence of predicted label strings per input text.
pred_labels = classifier.predict(texts)[0]
label2label = {'__label__0': 0, '__label__1': 1}
# Keep only the top-ranked label of each text and map it back to 0/1.
y_preds = [label2label[labels[0]] for labels in pred_labels]

print('Testing Accuracy: {:.3f}'.format((np.array(y_preds)==y_test).mean()))
Testing Accuracy: 0.901