In [1]:
# Mount Google Drive so the notebook can read data stored there (Colab only).
from google.colab import drive
drive.mount('/content/gdrive')
import os
# Change into the project folder so relative paths like ../data/train.txt resolve.
os.chdir('/content/gdrive/My Drive/finch/tensorflow2/text_classification/clue/main')
Mounted at /content/gdrive
In [2]:
!pip install jieba
Requirement already satisfied: jieba in /usr/local/lib/python3.6/dist-packages (0.42.1)
In [3]:
import json
import jieba

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
In [4]:
def get_vocab(f_path):
  """Read a vocabulary file (one token per line) into a token -> index dict.

  Args:
    f_path: path to a text file; line i (0-based) holds the token mapped to i.

  Returns:
    dict mapping each stripped line to its 0-based line number.
  """
  with open(f_path) as vocab_file:
    return {token.rstrip(): idx for idx, token in enumerate(vocab_file)}


def get_data():
  """Load the train and test splits from JSON-lines files on disk.

  Each line of ../data/{train,test}.txt is a JSON object with a 'content'
  string and a 'label' string. Labels are mapped to integer ids via the
  module-level `label2idx` dict (must be defined before calling).

  Returns:
    ((x_train, y_train), (x_test, y_test)) where x_* are lists of raw text
    strings and y_* are lists of integer label ids.
  """
  def _read_split(f_path):
    # One JSON record per line; collect texts and mapped label ids.
    texts, labels = [], []
    with open(f_path) as f:
      for line in f:
        record = json.loads(line.rstrip())
        # Original code did ''.join(list(text)), which is the identity for a
        # str — the content is appended unchanged.
        texts.append(record['content'])
        labels.append(label2idx[record['label']])
    return texts, labels

  x_train, y_train = _read_split('../data/train.txt')
  x_test, y_test = _read_split('../data/test.txt')
  return (x_train, y_train), (x_test, y_test)
In [5]:
# Build label mapping and load raw text for both splits.
label2idx = get_vocab('../vocab/label.txt')
(x_train, y_train), (x_test, y_test) = get_data()

# Bag-of-words counts over unigrams + bigrams, tokenized with jieba.
# Pass jieba.lcut directly — wrapping it in `lambda x: jieba.lcut(x)` added
# nothing but an extra call frame.
count_model = CountVectorizer(binary=False,
                              ngram_range=(1, 2),
                              tokenizer=jieba.lcut)
# fit_transform builds the vocabulary and vectorizes the training set in one
# pass; the original code tokenized/transformed x_train twice more afterwards.
X_train_counts = count_model.fit_transform(x_train)
X_test_counts = count_model.transform(x_test)

# Re-weight raw counts by tf-idf; fit the idf statistics on the training set
# only, then apply the same weighting to the test set.
tfidf_model = TfidfTransformer()
X_train_tfidf = tfidf_model.fit_transform(X_train_counts)
X_test_tfidf = tfidf_model.transform(X_test_counts)
/usr/local/lib/python3.6/dist-packages/sklearn/feature_extraction/text.py:507: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'
  warnings.warn("The parameter 'token_pattern' will not be used"
Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.936 seconds.
Prefix dict has been built successfully.
In [6]:
# Fit a multinomial logistic regression on the tf-idf features and evaluate
# on the held-out test split.
lr_model = LogisticRegression(solver='lbfgs', max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)
y_pred = lr_model.predict(X_test_tfidf)

# Overall accuracy: elementwise comparison of predicted vs. true label ids.
final_acc = (y_pred == y_test).mean()
print("Testing Accuracy: {:.3f}".format(final_acc))

# Per-class precision / recall / F1, keyed by human-readable label names.
report = classification_report(y_true=y_test,
                               y_pred=y_pred,
                               labels=list(label2idx.values()),
                               target_names=list(label2idx.keys()),
                               digits=3)
print('\n' + report)
Testing Accuracy: 0.578

              precision    recall  f1-score   support

     sadness      0.545     0.836     0.660      1448
   happiness      0.627     0.687     0.656       978
        like      0.665     0.347     0.456       453
       anger      0.492     0.219     0.303       447
        fear      0.667     0.119     0.203        67
    surprise      0.714     0.049     0.091       103
     disgust      0.636     0.297     0.405       471

    accuracy                          0.578      3967
   macro avg      0.621     0.365     0.396      3967
weighted avg      0.590     0.578     0.543      3967