In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
import os
os.chdir('/content/gdrive/My Drive/finch/tensorflow2/text_classification/clue/main')
Mounted at /content/gdrive
In [2]:
import json

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
In [3]:
def get_vocab(f_path):
  word2idx = {}
  with open(f_path) as f:
    for i, line in enumerate(f):
      line = line.rstrip()
      word2idx[line] = i
  return word2idx


def get_data():
  x_train = []
  y_train = []
  x_test = []
  y_test = []

  with open('../data/train.txt') as f:
    for line in f:
      line = json.loads(line.rstrip())
      text, label = line['content'], line['label']
      x_train.append(''.join(list(text)))
      y_train.append(label2idx[line['label']])

  with open('../data/test.txt') as f:
    for line in f:
      line = json.loads(line.rstrip())
      text, label = line['content'], line['label']
      x_test.append(''.join(list(text)))
      y_test.append(label2idx[line['label']])

  return (x_train, y_train), (x_test, y_test)
In [4]:
label2idx = get_vocab('../vocab/label.txt')
(x_train, y_train), (x_test, y_test) = get_data()

count_model = CountVectorizer(binary = False,
                              ngram_range = (1,2),
                              tokenizer = lambda x: list(x))
count_model.fit(x_train)

tfidf_model = TfidfTransformer()
tfidf_model.fit(count_model.transform(x_train))
X_train_tfidf = tfidf_model.transform(count_model.transform(x_train))
X_test_tfidf = tfidf_model.transform(count_model.transform(x_test))
/usr/local/lib/python3.6/dist-packages/sklearn/feature_extraction/text.py:507: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'
  warnings.warn("The parameter 'token_pattern' will not be used"
In [5]:
lr_model = LogisticRegression(solver='lbfgs', max_iter=1000)
y_pred = lr_model.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
final_acc = (y_pred == y_test).mean()
print("Testing Accuracy: {:.3f}".format(final_acc))
print('\n'+classification_report(y_true = y_test,
                                 y_pred = y_pred,
                                 labels = list(label2idx.values()),
                                 target_names = list(label2idx.keys()),
                                 digits = 3,))
Testing Accuracy: 0.594

              precision    recall  f1-score   support

     sadness      0.573     0.832     0.679      1448
   happiness      0.643     0.707     0.673       978
        like      0.636     0.371     0.469       453
       anger      0.490     0.266     0.345       447
        fear      0.600     0.134     0.220        67
    surprise      0.917     0.107     0.191       103
     disgust      0.599     0.327     0.423       471

    accuracy                          0.594      3967
   macro avg      0.637     0.392     0.429      3967
weighted avg      0.601     0.594     0.565      3967