Import necessary dependencies

In [1]:
import pandas as pd
import numpy as np
import text_normalizer as tn

Load and normalize data

In [2]:
dataset = pd.read_csv(r'movie_reviews.csv')

# take a peek at the data
print(dataset.head())
reviews = np.array(dataset['review'])
sentiments = np.array(dataset['sentiment'])

# build train and test datasets
train_reviews = reviews[:35000]
train_sentiments = sentiments[:35000]
test_reviews = reviews[35000:]
test_sentiments = sentiments[35000:]

# normalize datasets
norm_train_reviews = tn.normalize_corpus(train_reviews)
norm_test_reviews = tn.normalize_corpus(test_reviews)
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Build Text Classification Pipeline with The Best Model

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import warnings
warnings.filterwarnings("ignore")

# build BOW features on train reviews
cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0, ngram_range=(1,2))
cv_train_features = cv.fit_transform(norm_train_reviews)
# build Logistic Regression model
lr = LogisticRegression()
lr.fit(cv_train_features, train_sentiments)

# Build Text Classification Pipeline
lr_pipeline = make_pipeline(cv, lr)

# save the list of prediction classes (positive, negative)
classes = list(lr_pipeline.classes_)

Analyze Model Prediction Probabilities

In [4]:
lr_pipeline.predict(['the lord of the rings is an excellent movie', 
                     'i hated the recent movie on tv, it was so bad'])
Out[4]:
array(['positive', 'negative'], dtype=object)
In [5]:
pd.DataFrame(lr_pipeline.predict_proba(['the lord of the rings is an excellent movie', 
                     'i hated the recent movie on tv, it was so bad']), columns=classes)
Out[5]:
negative positive
0 0.169653 0.830347
1 0.730814 0.269186

Interpreting Model Decisions

In [6]:
from skater.core.local_interpretation.lime.lime_text import LimeTextExplainer

explainer = LimeTextExplainer(class_names=classes)
def interpret_classification_model_prediction(doc_index, norm_corpus, corpus, 
                                              prediction_labels, explainer_obj):
    # display model prediction and actual sentiments
    print("Test document index: {index}\nActual sentiment: {actual}\nPredicted sentiment: {predicted}"
      .format(index=doc_index, actual=prediction_labels[doc_index],
              predicted=lr_pipeline.predict([norm_corpus[doc_index]])))
    # display actual review content
    print("\nReview:", corpus[doc_index])
    # display prediction probabilities
    print("\nModel Prediction Probabilities:")
    for probs in zip(classes, lr_pipeline.predict_proba([norm_corpus[doc_index]])[0]):
        print(probs)
    # display model prediction interpretation
    exp = explainer.explain_instance(norm_corpus[doc_index], 
                                     lr_pipeline.predict_proba, num_features=10, 
                                     labels=[1])
    exp.show_in_notebook()
In [7]:
doc_index = 100 
interpret_classification_model_prediction(doc_index=doc_index, norm_corpus=norm_test_reviews,
                                         corpus=test_reviews, prediction_labels=test_sentiments,
                                         explainer_obj=explainer)
Test document index: 100
Actual sentiment: negative
Predicted sentiment: ['negative']

Review: Worst movie, (with the best reviews given it) I've ever seen. Over the top dialog, acting, and direction. more slasher flick than thriller.With all the great reviews this movie got I'm appalled that it turned out so silly. shame on you martin scorsese

Model Prediction Probabilities:
('negative', 0.8099323456145181)
('positive', 0.19006765438548187)