Notebook

Import necessary dependencies¶

In [9]:

import pandas as pd
import numpy as np
import text_normalizer as tn
import warnings

warnings.filterwarnings("ignore")

Load and normalize data¶

In [2]:

# Read the labelled movie reviews from the CSV next to the notebook.
dataset = pd.read_csv(r'movie_reviews.csv')

# quick look at the first few rows
print(dataset.head())

# pull the text and label columns out as NumPy arrays
reviews, sentiments = (np.array(dataset[column]) for column in ('review', 'sentiment'))

# positional split: the first 35,000 rows are for training, the remainder for testing
train_reviews, test_reviews = np.split(reviews, [35000])
train_sentiments, test_sentiments = np.split(sentiments, [35000])

# run the project's text normalizer over both partitions (train first, then test)
norm_train_reviews, norm_test_reviews = map(
    tn.normalize_corpus, (train_reviews, test_reviews)
)

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Extract features from positive and negative reviews¶

In [3]:

from sklearn.feature_extraction.text import TfidfVectorizer

# Both sentiment classes are vectorized with the same TF-IDF configuration:
# unigrams only, sub-linear term frequency, and terms kept only when they occur
# in between 5% and 95% of the documents of that class.
tfidf_params = dict(use_idf=True, min_df=0.05, max_df=0.95,
                    ngram_range=(1, 1), sublinear_tf=True)

# stitch the normalized train and test reviews back into one corpus, in the
# original row order, so that it lines up with `sentiments`
norm_reviews = norm_train_reviews + norm_test_reviews

# positive reviews -> their own vocabulary and TF-IDF matrix
positive_reviews = [doc for doc, label in zip(norm_reviews, sentiments)
                    if label == 'positive']
ptvf = TfidfVectorizer(**tfidf_params)
ptvf_features = ptvf.fit_transform(positive_reviews)

# negative reviews -> their own vocabulary and TF-IDF matrix
negative_reviews = [doc for doc, label in zip(norm_reviews, sentiments)
                    if label == 'negative']
ntvf = TfidfVectorizer(**tfidf_params)
ntvf_features = ntvf.fit_transform(negative_reviews)

# report (documents, terms) for each matrix
print(ptvf_features.shape, ntvf_features.shape)

(25000, 331) (25000, 331)

Topic Modeling on Reviews¶

In [4]:

import pyLDAvis
import pyLDAvis.sklearn
from sklearn.decomposition import NMF
import topic_model_utils as tmu

# number of topics to extract; shared by the topic-model cells that follow
total_topics = 10
# let pyLDAvis visualizations display inside the notebook
pyLDAvis.enable_notebook()

Display and visualize topics for positive reviews¶

In [5]:

# Fit an NMF topic model on the TF-IDF matrix of the positive reviews.
# `fit` returns the estimator itself, so construction and fitting are chained.
pos_nmf = NMF(
    n_components=total_topics,
    random_state=42,
    alpha=0.1,
    l1_ratio=0.2,
).fit(ptvf_features)

# per-topic term weights learned by the model, and the vocabulary they index
pos_weights = pos_nmf.components_
pos_feature_names = ptvf.get_feature_names()

# pair each topic with its terms and weights, then list the top 15 terms per
# topic (weights hidden)
pos_topics = tmu.get_topics_terms_weights(pos_weights, pos_feature_names)
tmu.print_topics_udf(
    topics=pos_topics,
    total_topics=total_topics,
    num_terms=15,
    display_weights=False,
)

Topic #1 without weights
['like', 'not', 'think', 'really', 'say', 'would', 'get', 'know', 'thing', 'much', 'bad', 'go', 'lot', 'could', 'even']

Topic #2 without weights
['movie', 'see', 'watch', 'great', 'good', 'one', 'not', 'time', 'ever', 'enjoy', 'recommend', 'make', 'acting', 'like', 'first']

Topic #3 without weights
['show', 'episode', 'series', 'tv', 'watch', 'dvd', 'first', 'see', 'time', 'one', 'good', 'year', 'remember', 'ever', 'would']

Topic #4 without weights
['performance', 'role', 'play', 'actor', 'cast', 'good', 'well', 'great', 'character', 'excellent', 'give', 'also', 'support', 'star', 'job']

Topic #5 without weights
['man', 'young', 'old', 'two', 'get', 'year', 'woman', 'take', 'go', 'come', 'find', 'back', 'girl', 'father', 'friend']

Topic #6 without weights
['film', 'see', 'one', 'scene', 'make', 'not', 'time', 'director', 'horror', 'music', 'many', 'cinema', 'release', 'work', 'use']

Topic #7 without weights
['story', 'tell', 'character', 'true', 'book', 'well', 'line', 'base', 'interesting', 'end', 'simple', 'read', 'beautiful', 'main', 'different']

Topic #8 without weights
['funny', 'comedy', 'laugh', 'humor', 'fun', 'moment', 'line', 'not', 'guy', 'get', 'make', 'lot', 'one', 'time', 'show']

Topic #9 without weights
['life', 'world', 'people', 'us', 'real', 'live', 'human', 'war', 'many', 'show', 'not', 'way', 'no', 'make', 'feel']

Topic #10 without weights
['love', 'fall', 'song', 'wonderful', 'beautiful', 'music', 'heart', 'girl', 'would', 'watch', 'great', 'favorite', 'always', 'family', 'woman']

In [10]:

# Build the pyLDAvis visualization for the positive-review topic model from the
# fitted NMF model, its TF-IDF matrix and the vectorizer that produced it.
# R=15 is forwarded to pyLDAvis (presumably the number of terms shown per
# topic, matching num_terms=15 in the printout above — confirm).
pyLDAvis.sklearn.prepare(pos_nmf, ptvf_features, ptvf, R=15)

Out[10]:

Display and visualize topics for negative reviews¶

In [7]:

# Fit an NMF topic model on the TF-IDF matrix of the negative reviews.
# The number of components comes from `total_topics` (rather than a literal)
# so the model always has exactly as many topics as are printed below and as
# the positive-review model uses.
neg_nmf = NMF(n_components=total_topics,
              random_state=42, alpha=0.1, l1_ratio=0.2)
neg_nmf.fit(ntvf_features)
# vocabulary terms and the per-topic term weights learned by the model
neg_feature_names = ntvf.get_feature_names()
neg_weights = neg_nmf.components_
# pair each topic with its terms and weights, then list the top 15 terms per
# topic (weights hidden)
neg_topics = tmu.get_topics_terms_weights(neg_weights, neg_feature_names)
tmu.print_topics_udf(topics=neg_topics,
                     total_topics=total_topics,
                     num_terms=15,
                     display_weights=False)

Topic #1 without weights
['get', 'go', 'kill', 'guy', 'scene', 'take', 'end', 'back', 'start', 'around', 'look', 'one', 'thing', 'come', 'first']

Topic #2 without weights
['bad', 'movie', 'ever', 'acting', 'see', 'terrible', 'one', 'plot', 'effect', 'awful', 'not', 'even', 'make', 'horrible', 'special']

Topic #3 without weights
['film', 'make', 'not', 'see', 'would', 'director', 'one', 'many', 'may', 'bad', 'however', 'horror', 'no', 'say', 'feel']

Topic #4 without weights
['character', 'story', 'book', 'plot', 'main', 'seem', 'no', 'interesting', 'not', 'movie', 'read', 'end', 'feel', 'nothing', 'original']

Topic #5 without weights
['movie', 'think', 'would', 'not', 'like', 'say', 'watch', 'could', 'see', 'really', 'people', 'good', 'know', 'want', 'make']

Topic #6 without weights
['funny', 'comedy', 'laugh', 'joke', 'try', 'not', 'stupid', 'suppose', 'moment', 'fun', 'even', 'black', 'guy', 'character', 'really']

Topic #7 without weights
['actor', 'play', 'good', 'cast', 'role', 'performance', 'script', 'much', 'great', 'star', 'act', 'look', 'well', 'give', 'director']

Topic #8 without weights
['man', 'woman', 'old', 'young', 'year', 'life', 'love', 'girl', 'child', 'play', 'sex', 'wife', 'family', 'boy', 'kid']

Topic #9 without weights
['show', 'tv', 'series', 'watch', 'not', 'original', 'people', 'like', 'every', 'new', 'kid', 'us', 'make', 'use', 'american']

Topic #10 without weights
['waste', 'time', 'money', 'watch', 'minute', 'hour', 'movie', 'spend', 'not', 'life', 'save', 'even', 'worth', 'back', 'crap']

In [11]:

# Build the pyLDAvis visualization for the negative-review topic model from the
# fitted NMF model, its TF-IDF matrix and the vectorizer that produced it.
# R=15 is forwarded to pyLDAvis (presumably the number of terms shown per
# topic, matching num_terms=15 in the printout above — confirm).
pyLDAvis.sklearn.prepare(neg_nmf, ntvf_features, ntvf, R=15)

Out[11]: