In [1]:
import os

from google.colab import drive

# Mount Google Drive into the Colab VM, then make the project's `main`
# folder the working directory so the relative '../data' and '../vocab'
# paths used by later cells resolve.
drive.mount('/content/gdrive')
os.chdir('/content/gdrive/My Drive/finch/tensorflow2/text_classification/clue/main')
Mounted at /content/gdrive
In [2]:
!pip install jieba
Requirement already satisfied: jieba in /usr/local/lib/python3.6/dist-packages (0.42.1)
In [3]:
import json
import jieba

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
In [4]:
def get_vocab(f_path):
  """Build a token -> index mapping from a one-entry-per-line text file.

  The index of an entry is its zero-based line number. Trailing whitespace
  (including the newline) is stripped from every line. If the same entry
  appears more than once, the last occurrence's index is kept.

  Args:
    f_path: path of the vocabulary file.

  Returns:
    dict mapping each stripped line to its line index.
  """
  # Read as UTF-8 explicitly instead of relying on the platform's default
  # encoding, so the mapping is identical on every machine.
  with open(f_path, encoding='utf-8') as f:
    return {line.rstrip(): i for i, line in enumerate(f)}


def get_data(train_path='../data/train.txt',
             test_path='../data/test.txt',
             label_map=None):
  """Load the train and test splits as parallel lists of texts and label ids.

  Each file is read as JSON Lines: one JSON object per line with a
  'content' field (the text) and a 'label' field (a key of the label map).
  Blank lines are skipped.

  Args:
    train_path: path of the training split.
    test_path: path of the test split.
    label_map: dict mapping label name -> integer id. When None, the
      module-level `label2idx` is used, so it must be defined before the
      call.

  Returns:
    ((x_train, y_train), (x_test, y_test)) where each x is a list of
    strings and each y is the list of corresponding label ids.

  Raises:
    KeyError: if a record lacks 'content'/'label' or its label is not in
      the label map.
    json.JSONDecodeError: if a non-blank line is not valid JSON.
  """
  if label_map is None:
    label_map = label2idx

  def read_split(path):
    """Read one JSON Lines file into (texts, label_ids)."""
    texts, label_ids = [], []
    # The corpus is non-ASCII text; decode as UTF-8 explicitly rather than
    # depending on the platform's default encoding.
    with open(path, encoding='utf-8') as f:
      for line in f:
        line = line.strip()
        if not line:
          continue
        record = json.loads(line)
        # ''.join is a no-op when 'content' is already a string; it is kept
        # so content given as a sequence of strings is still collapsed.
        texts.append(''.join(record['content']))
        label_ids.append(label_map[record['label']])
    return texts, label_ids

  return read_split(train_path), read_split(test_path)
In [5]:
# The label vocabulary must exist before get_data() runs, because
# get_data() looks labels up in the module-level `label2idx`.
label2idx = get_vocab('../vocab/label.txt')
(x_train, y_train), (x_test, y_test) = get_data()

# Binary bag-of-words over jieba word segments (unigrams only).
# `jieba.lcut` is passed directly instead of wrapping it in a lambda, and
# `token_pattern=None` tells scikit-learn the regex tokenizer is
# intentionally unused, which silences its "token_pattern will not be used"
# UserWarning.
count_model = CountVectorizer(binary = True,
                              ngram_range = (1,1),
                              tokenizer = jieba.lcut,
                              token_pattern = None)

# Segment each split exactly once: fit_transform learns the vocabulary and
# returns the training counts in a single pass over x_train.
X_train_counts = count_model.fit_transform(x_train)
X_test_counts = count_model.transform(x_test)

# IDF weights are learned from the training counts only and then applied
# to the test counts.
tfidf_model = TfidfTransformer()
X_train_tfidf = tfidf_model.fit_transform(X_train_counts)
X_test_tfidf = tfidf_model.transform(X_test_counts)
/usr/local/lib/python3.6/dist-packages/sklearn/feature_extraction/text.py:507: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'
  warnings.warn("The parameter 'token_pattern' will not be used"
Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.859 seconds.
Prefix dict has been built successfully.
In [6]:
# Fit a logistic-regression classifier on the TF-IDF training features,
# then predict label ids for the test features.
lr_model = LogisticRegression(solver='lbfgs', max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)
y_pred = lr_model.predict(X_test_tfidf)

# Accuracy is the fraction of test examples whose predicted id equals the
# true id (element-wise comparison of predictions against the y_test list).
final_acc = (y_pred == y_test).mean()
print("Testing Accuracy: {:.3f}".format(final_acc))

# Per-class precision / recall / F1; rows follow the order of label2idx,
# using its keys as the displayed class names.
report = classification_report(y_true = y_test,
                               y_pred = y_pred,
                               labels = list(label2idx.values()),
                               target_names = list(label2idx.keys()),
                               digits = 3,)
print('\n'+report)
Testing Accuracy: 0.577

              precision    recall  f1-score   support

     sadness      0.557     0.816     0.662      1448
   happiness      0.629     0.689     0.658       978
        like      0.650     0.364     0.467       453
       anger      0.436     0.235     0.305       447
        fear      0.562     0.134     0.217        67
    surprise      0.714     0.097     0.171       103
     disgust      0.581     0.306     0.401       471

    accuracy                          0.577      3967
   macro avg      0.590     0.377     0.411      3967
weighted avg      0.579     0.577     0.547      3967