In [0]:
import tensorflow as tf
import numpy as np
import time

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
In [0]:
def get_data():
  """Load the Keras IMDB reviews and decode them back into plain text.

  The word index returned by Keras is shifted up by 3 to make room for the
  reserved ids 0 (``<pad>``), 1 (``<start>``) and 2 (``<unk>``), then
  inverted so that integer sequences can be mapped back to words.

  Returns:
    ``(x_train, y_train), (x_test, y_test)`` where each ``x_*`` is a list of
    space-joined review strings (with the first token of every sequence
    dropped) and each ``y_*`` is the label array from ``load_data()``,
    passed through unchanged.

  Raises:
    KeyError: if a sequence contains an id missing from the inverted index.
  """
  raw_index = tf.keras.datasets.imdb.get_word_index()

  # Shift every id by 3, then register the reserved tokens on top.
  word2idx = {word: idx + 3 for word, idx in raw_index.items()}
  word2idx.update({'<pad>': 0, '<start>': 1, '<unk>': 2})
  idx2word = dict(zip(word2idx.values(), word2idx.keys()))

  def decode(document):
    # Decode the full sequence first, then drop the leading token
    # (presumably the <start> marker under Keras' default settings).
    tokens = [idx2word[i] for i in document]
    return ' '.join(tokens[1:])

  (X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data()
  x_train = [decode(document) for document in X_train]
  x_test = [decode(document) for document in X_test]
  return (x_train, y_train), (x_test, y_test)
In [0]:
# Decode the IMDB reviews into raw text plus their labels.
(x_train, y_train), (x_test, y_test) = get_data()

# Binary bag-of-words: each entry records presence rather than a count.
# fit_transform learns the vocabulary and vectorizes the training text in a
# single pass, instead of fitting and then re-transforming the corpus twice.
count_model = CountVectorizer(binary=True)
X_train_counts = count_model.fit_transform(x_train)

# IDF weights are learned from the training counts only; the test split is
# pushed through the already-fitted vectorizer and transformer.
tfidf_model = TfidfTransformer()
X_train_tfidf = tfidf_model.fit_transform(X_train_counts)
X_test_tfidf = tfidf_model.transform(count_model.transform(x_test))
In [4]:
# Fit a logistic-regression classifier on the TF-IDF training features.
lr_model = LogisticRegression(solver='lbfgs')
lr_model.fit(X_train_tfidf, y_train)

# Accuracy on the test split: fraction of predictions matching the labels.
y_pred = lr_model.predict(X_test_tfidf)
is_correct = y_pred == y_test
final_acc = is_correct.mean()
print("final testing accuracy: {:.4f}".format(final_acc))
final testing accuracy: 0.8882