In [0]:
# These steps are only needed because the notebook runs on Google Colab:
# mount Google Drive, then make the project folder on it the working directory.
import os

from google.colab import drive

drive.mount('/content/gdrive')
os.chdir('/content/gdrive/My Drive/finch/framework/official_fasttext/text_classification/imdb')
In [0]:
def get_idx2word(_index_from=3):
  """Build the id -> token lookup for the Keras IMDB dataset.

  Every id in the Keras word index is shifted up by `_index_from`, then the
  ids 0, 1 and 2 are assigned to the special tokens "<pad>", "<start>" and
  "<unk>" respectively.

  Args:
    _index_from: offset added to every id of the raw word index.

  Returns:
    dict mapping integer id -> token string.
  """
  word2idx = dict()
  for token, raw_id in tf.keras.datasets.imdb.get_word_index().items():
    word2idx[token] = raw_id + _index_from
  # Special tokens are registered after the vocabulary, so they win any id clash.
  word2idx.update({"<pad>": 0, "<start>": 1, "<unk>": 2})
  return {token_id: token for token, token_id in word2idx.items()}
In [0]:
import tensorflow as tf

# Load the IMDB reviews as sequences of integer word ids (num_words=20000),
# and build the id -> token lookup used to turn them back into text.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=20000)
idx2word = get_idx2word()

# Write the training set as one review per line in the form
# `__label__<y> <space-separated tokens>`.
# x[1:] skips the first id of every review (presumably the <start> marker,
# id 1 in get_idx2word's mapping -- confirm against the Keras loader).
# The encoding is pinned to UTF-8 so the file does not depend on the locale's
# default encoding if the vocabulary contains non-ASCII tokens.
with open('train.txt', 'w', encoding='utf-8') as f:
  for x, y in zip(X_train, y_train):
    text = ' '.join(idx2word[i] for i in x[1:])
    f.write(f'__label__{y} {text}\n')
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
17465344/17464789 [==============================] - 0s 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
1646592/1641221 [==============================] - 0s 0us/step
In [0]:
!wget https://github.com/facebookresearch/fastText/archive/v0.2.0.zip
!unzip -q v0.2.0.zip
os.chdir('/content/gdrive/My Drive/finch/framework/official_fasttext/text_classification/imdb/fastText-0.2.0')
!make
--2019-04-19 07:41:08--  https://github.com/facebookresearch/fastText/archive/v0.2.0.zip
Resolving github.com (github.com)... 192.30.253.112, 192.30.253.113
Connecting to github.com (github.com)|192.30.253.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/facebookresearch/fastText/zip/v0.2.0 [following]
--2019-04-19 07:41:08--  https://codeload.github.com/facebookresearch/fastText/zip/v0.2.0
Resolving codeload.github.com (codeload.github.com)... 192.30.253.121, 192.30.253.120
Connecting to codeload.github.com (codeload.github.com)|192.30.253.121|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘v0.2.0.zip’

v0.2.0.zip              [  <=>               ]   4.10M  13.2MB/s    in 0.3s    

2019-04-19 07:41:08 (13.2 MB/s) - ‘v0.2.0.zip’ saved [4304799]

c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/args.cc
c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/dictionary.cc
c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/productquantizer.cc
c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/matrix.cc
c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/qmatrix.cc
c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/vector.cc
c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/model.cc
c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/utils.cc
c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/meter.cc
c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/fasttext.cc
src/fasttext.cc: In member function ‘void fasttext::FastText::quantize(const fasttext::Args&)’:
src/fasttext.cc:302:45: warning: std::vector<int> fasttext::FastText::selectEmbeddings(int32_t) const’ is deprecated: selectEmbeddings is being deprecated. [-Wdeprecated-declarations]
     auto idx = selectEmbeddings(qargs.cutoff);
                                             ^
src/fasttext.cc:279:22: note: declared here
 std::vector<int32_t> FastText::selectEmbeddings(int32_t cutoff) const {
                      ^~~~~~~~
src/fasttext.cc: In member function ‘void fasttext::FastText::lazyComputeWordVectors()’:
src/fasttext.cc:531:40: warning: void fasttext::FastText::precomputeWordVectors(fasttext::Matrix&)’ is deprecated: precomputeWordVectors is being deprecated. [-Wdeprecated-declarations]
     precomputeWordVectors(*wordVectors_);
                                        ^
src/fasttext.cc:514:6: note: declared here
 void FastText::precomputeWordVectors(Matrix& wordVectors) {
      ^~~~~~~~
src/fasttext.cc: In member function ‘void fasttext::FastText::trainThread(int32_t)’:
src/fasttext.cc:650:41: warning: void fasttext::FastText::supervised(fasttext::Model&, fasttext::real, const std::vector<int>&, const std::vector<int>&)’ is deprecated: supervised is being deprecated. [-Wdeprecated-declarations]
       supervised(model, lr, line, labels);
                                         ^
src/fasttext.cc:338:6: note: declared here
 void FastText::supervised(
      ^~~~~~~~
src/fasttext.cc:653:27: warning: void fasttext::FastText::cbow(fasttext::Model&, fasttext::real, const std::vector<int>&)’ is deprecated: cbow is being deprecated. [-Wdeprecated-declarations]
       cbow(model, lr, line);
                           ^
src/fasttext.cc:355:6: note: declared here
 void FastText::cbow(Model& model, real lr, const std::vector<int32_t>& line) {
      ^~~~~~~~
src/fasttext.cc:656:31: warning: void fasttext::FastText::skipgram(fasttext::Model&, fasttext::real, const std::vector<int>&)’ is deprecated: skipgram is being deprecated. [-Wdeprecated-declarations]
       skipgram(model, lr, line);
                               ^
src/fasttext.cc:371:6: note: declared here
 void FastText::skipgram(
      ^~~~~~~~
c++ -pthread -std=c++0x -march=native -O3 -funroll-loops args.o dictionary.o productquantizer.o matrix.o qmatrix.o vector.o model.o utils.o meter.o fasttext.o src/main.cc -o fasttext
In [0]:
# Train a supervised fastText classifier on ../train.txt (30 epochs, -wordNgrams 2).
# `-output model` sets the output prefix; a later cell loads the result from
# fastText-0.2.0/model.bin.
!./fasttext supervised -input ../train.txt -output model -epoch 30 -wordNgrams 2
Read 5M words
Number of words:  19998
Number of labels: 2
Progress: 100.0% words/sec/thread:  210378 lr:  0.000000 loss:  0.118720 ETA:   0h 0m
In [0]:
os.chdir('/content/gdrive/My Drive/finch/framework/official_fasttext/text_classification/imdb')
!git clone https://github.com/facebookresearch/fastText.git
os.chdir('/content/gdrive/My Drive/finch/framework/official_fasttext/text_classification/imdb/fastText')
!pip install .
Cloning into 'fastText'...
remote: Enumerating objects: 71, done.
remote: Counting objects: 100% (71/71), done.
remote: Compressing objects: 100% (55/55), done.
remote: Total 3192 (delta 23), reused 37 (delta 5), pack-reused 3121
Receiving objects: 100% (3192/3192), 7.90 MiB | 8.27 MiB/s, done.
Resolving deltas: 100% (1989/1989), done.
Checking out files: 100% (508/508), done.
Processing /content/gdrive/My Drive/finch/framework/official_fasttext/text_classification/imdb/fastText
Requirement already satisfied: pybind11>=2.2 in /usr/local/lib/python3.6/dist-packages (from fasttext==0.8.22) (2.2.4)
Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from fasttext==0.8.22) (40.9.0)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from fasttext==0.8.22) (1.16.2)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... done
  Stored in directory: /tmp/pip-ephem-wheel-cache-hsiyy8bx/wheels/bb/e4/28/6ef7d95ecd7710f3cef57c336a9ee2673283580cb21aa81362
Successfully built fasttext
Installing collected packages: fasttext
Successfully installed fasttext-0.8.22
In [0]:
import numpy as np

from fastText import load_model

# Evaluate the trained model on the IMDB test split.
os.chdir('/content/gdrive/My Drive/finch/framework/official_fasttext/text_classification/imdb')
classifier = load_model("fastText-0.2.0/model.bin")

# Rebuild the test reviews exactly the way train.txt was written: the first id
# of every sequence is skipped (x[1:]), so the model sees test text in the same
# form as its training text.
texts = [' '.join(idx2word[i] for i in x[1:]) for x in X_test]

# predict() on a list of texts returns a pair; element 0 holds, per text, the
# predicted label(s), of which the first one is used.
pred_labels = classifier.predict(texts)[0]
label2label = {'__label__0': 0, '__label__1': 1}
y_preds = [label2label[labels[0]] for labels in pred_labels]

print('Testing Accuracy: {:.3f}'.format((np.array(y_preds) == y_test).mean()))
Testing Accuracy: 0.898