In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
import os
os.chdir('/content/gdrive/My Drive/finch/tensorflow2/text_classification/clue/main')
Mounted at /content/gdrive
In [2]:
import json

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
In [3]:
def get_vocab(f_path):
  word2idx = {}
  with open(f_path) as f:
    for i, line in enumerate(f):
      line = line.rstrip()
      word2idx[line] = i
  return word2idx


def get_data():
  x_train = []
  y_train = []
  x_test = []
  y_test = []

  with open('../data/train.txt') as f:
    for line in f:
      line = json.loads(line.rstrip())
      text, label = line['content'], line['label']
      x_train.append(''.join(list(text)))
      y_train.append(label2idx[line['label']])

  with open('../data/test.txt') as f:
    for line in f:
      line = json.loads(line.rstrip())
      text, label = line['content'], line['label']
      x_test.append(''.join(list(text)))
      y_test.append(label2idx[line['label']])

  return (x_train, y_train), (x_test, y_test)
In [4]:
label2idx = get_vocab('../vocab/label.txt')
(x_train, y_train), (x_test, y_test) = get_data()

count_model = CountVectorizer(binary = True,
                              ngram_range = (1,2),
                              tokenizer = lambda x: list(x))
count_model.fit(x_train)

tfidf_model = TfidfTransformer()
tfidf_model.fit(count_model.transform(x_train))
X_train_tfidf = tfidf_model.transform(count_model.transform(x_train))
X_test_tfidf = tfidf_model.transform(count_model.transform(x_test))
/usr/local/lib/python3.6/dist-packages/sklearn/feature_extraction/text.py:507: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'
  warnings.warn("The parameter 'token_pattern' will not be used"
In [5]:
lr_model = LogisticRegression(solver='lbfgs', max_iter=1000)
y_pred = lr_model.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
final_acc = (y_pred == y_test).mean()
print("Testing Accuracy: {:.3f}".format(final_acc))
print('\n'+classification_report(y_true = y_test,
                                 y_pred = y_pred,
                                 labels = list(label2idx.values()),
                                 target_names = list(label2idx.keys()),
                                 digits = 3,))
Testing Accuracy: 0.591

              precision    recall  f1-score   support

     sadness      0.566     0.840     0.676      1448
   happiness      0.640     0.715     0.675       978
        like      0.667     0.362     0.469       453
       anger      0.477     0.233     0.313       447
        fear      0.615     0.119     0.200        67
    surprise      0.769     0.097     0.172       103
     disgust      0.605     0.306     0.406       471

    accuracy                          0.591      3967
   macro avg      0.620     0.382     0.416      3967
weighted avg      0.597     0.591     0.558      3967