import pandas as pd
import numpy as np
import text_normalizer as tn
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Read the labelled review corpus and preview its first rows.
dataset = pd.read_csv(r'movie_reviews.csv')
print(dataset.head())

# Pull the review text and sentiment label columns out as NumPy arrays.
reviews = np.array(dataset['review'])
sentiments = np.array(dataset['sentiment'])

# Positional split: the first 35000 rows are used for training and
# everything after them for testing.
train_reviews, test_reviews = reviews[:35000], reviews[35000:]
train_sentiments, test_sentiments = sentiments[:35000], sentiments[35000:]

# Pass both partitions through the text_normalizer pipeline
# (training partition first, then the test partition).
norm_train_reviews, norm_test_reviews = (
    tn.normalize_corpus(partition)
    for partition in (train_reviews, test_reviews)
)
# Output:
#                                               review sentiment
# 0  One of the other reviewers has mentioned that ...  positive
# 1  A wonderful little production. <br /><br />The...  positive
# 2  I thought this was a wonderful way to spend ti...  positive
# 3  Basically there's a family where a little boy ...  negative
# 4  Petter Mattei's "Love in the Time of Money" is...  positive
from sklearn.feature_extraction.text import TfidfVectorizer

# Join the normalized train and test reviews back into one corpus.
# NOTE(review): assumes normalize_corpus returns order-preserving lists, so
# that `+` concatenates and the result lines up with `sentiments` -- confirm.
norm_reviews = norm_train_reviews+norm_test_reviews

# Split the corpus by sentiment label.
positive_reviews = [doc for doc, label in zip(norm_reviews, sentiments)
                    if label == 'positive']
negative_reviews = [doc for doc, label in zip(norm_reviews, sentiments)
                    if label == 'negative']

# Unigram tf-idf features fitted on the positive reviews only.
ptvf = TfidfVectorizer(
    use_idf=True,
    min_df=0.05,
    max_df=0.95,
    ngram_range=(1, 1),
    sublinear_tf=True,
)
ptvf_features = ptvf.fit_transform(positive_reviews)

# Same vectorizer configuration, fitted independently on the negative reviews.
ntvf = TfidfVectorizer(
    use_idf=True,
    min_df=0.05,
    max_df=0.95,
    ngram_range=(1, 1),
    sublinear_tf=True,
)
ntvf_features = ntvf.fit_transform(negative_reviews)

# Report the (documents, terms) shape of each feature matrix.
print(ptvf_features.shape, ntvf_features.shape)
# Output:
# (25000, 331) (25000, 331)
import pyLDAvis
import pyLDAvis.sklearn
from sklearn.decomposition import NMF
import topic_model_utils as tmu

# Switch pyLDAvis to notebook rendering.
pyLDAvis.enable_notebook()

# Number of topics to extract; also passed to the topic printer below.
total_topics = 10

# Fit an NMF topic model on the positive-review tf-idf features.
# NOTE(review): `NMF(alpha=...)` and `get_feature_names()` appear to have been
# removed in newer scikit-learn releases -- confirm the pinned version before
# upgrading.
pos_nmf = NMF(
    n_components=total_topics,
    random_state=42,
    alpha=0.1,
    l1_ratio=0.2,
)
pos_nmf.fit(ptvf_features)

# Vocabulary terms of the vectorizer and the fitted component matrix.
pos_feature_names = ptvf.get_feature_names()
pos_weights = pos_nmf.components_

# Turn the component weights into per-topic terms and print the
# first 15 terms of each topic without their weights.
pos_topics = tmu.get_topics_terms_weights(pos_weights, pos_feature_names)
tmu.print_topics_udf(
    topics=pos_topics,
    total_topics=total_topics,
    num_terms=15,
    display_weights=False,
)
# Output:
# Topic #1 without weights
# ['like', 'not', 'think', 'really', 'say', 'would', 'get', 'know', 'thing', 'much', 'bad', 'go', 'lot', 'could', 'even']
# Topic #2 without weights
# ['movie', 'see', 'watch', 'great', 'good', 'one', 'not', 'time', 'ever', 'enjoy', 'recommend', 'make', 'acting', 'like', 'first']
# Topic #3 without weights
# ['show', 'episode', 'series', 'tv', 'watch', 'dvd', 'first', 'see', 'time', 'one', 'good', 'year', 'remember', 'ever', 'would']
# Topic #4 without weights
# ['performance', 'role', 'play', 'actor', 'cast', 'good', 'well', 'great', 'character', 'excellent', 'give', 'also', 'support', 'star', 'job']
# Topic #5 without weights
# ['man', 'young', 'old', 'two', 'get', 'year', 'woman', 'take', 'go', 'come', 'find', 'back', 'girl', 'father', 'friend']
# Topic #6 without weights
# ['film', 'see', 'one', 'scene', 'make', 'not', 'time', 'director', 'horror', 'music', 'many', 'cinema', 'release', 'work', 'use']
# Topic #7 without weights
# ['story', 'tell', 'character', 'true', 'book', 'well', 'line', 'base', 'interesting', 'end', 'simple', 'read', 'beautiful', 'main', 'different']
# Topic #8 without weights
# ['funny', 'comedy', 'laugh', 'humor', 'fun', 'moment', 'line', 'not', 'guy', 'get', 'make', 'lot', 'one', 'time', 'show']
# Topic #9 without weights
# ['life', 'world', 'people', 'us', 'real', 'live', 'human', 'war', 'many', 'show', 'not', 'way', 'no', 'make', 'feel']
# Topic #10 without weights
# ['love', 'fall', 'song', 'wonderful', 'beautiful', 'music', 'heart', 'girl', 'would', 'watch', 'great', 'favorite', 'always', 'family', 'woman']
# Build the pyLDAvis view of the positive-review topic model with R=15.
# Kept as a bare expression so a notebook cell displays the returned object.
pyLDAvis.sklearn.prepare(
    pos_nmf,
    ptvf_features,
    ptvf,
    R=15,
)
# Fit an NMF topic model on the negative-review tf-idf features.
# The number of components comes from `total_topics` (rather than a
# hard-coded 10) so the model always has as many topics as the printer
# below is asked to show, matching the positive-review model.
neg_nmf = NMF(
    n_components=total_topics,
    random_state=42,
    alpha=0.1,
    l1_ratio=0.2,
)
neg_nmf.fit(ntvf_features)

# Vocabulary terms of the vectorizer and the fitted component matrix.
neg_feature_names = ntvf.get_feature_names()
neg_weights = neg_nmf.components_

# Turn the component weights into per-topic terms and print the
# first 15 terms of each topic without their weights.
neg_topics = tmu.get_topics_terms_weights(neg_weights, neg_feature_names)
tmu.print_topics_udf(
    topics=neg_topics,
    total_topics=total_topics,
    num_terms=15,
    display_weights=False,
)
# Output:
# Topic #1 without weights
# ['get', 'go', 'kill', 'guy', 'scene', 'take', 'end', 'back', 'start', 'around', 'look', 'one', 'thing', 'come', 'first']
# Topic #2 without weights
# ['bad', 'movie', 'ever', 'acting', 'see', 'terrible', 'one', 'plot', 'effect', 'awful', 'not', 'even', 'make', 'horrible', 'special']
# Topic #3 without weights
# ['film', 'make', 'not', 'see', 'would', 'director', 'one', 'many', 'may', 'bad', 'however', 'horror', 'no', 'say', 'feel']
# Topic #4 without weights
# ['character', 'story', 'book', 'plot', 'main', 'seem', 'no', 'interesting', 'not', 'movie', 'read', 'end', 'feel', 'nothing', 'original']
# Topic #5 without weights
# ['movie', 'think', 'would', 'not', 'like', 'say', 'watch', 'could', 'see', 'really', 'people', 'good', 'know', 'want', 'make']
# Topic #6 without weights
# ['funny', 'comedy', 'laugh', 'joke', 'try', 'not', 'stupid', 'suppose', 'moment', 'fun', 'even', 'black', 'guy', 'character', 'really']
# Topic #7 without weights
# ['actor', 'play', 'good', 'cast', 'role', 'performance', 'script', 'much', 'great', 'star', 'act', 'look', 'well', 'give', 'director']
# Topic #8 without weights
# ['man', 'woman', 'old', 'young', 'year', 'life', 'love', 'girl', 'child', 'play', 'sex', 'wife', 'family', 'boy', 'kid']
# Topic #9 without weights
# ['show', 'tv', 'series', 'watch', 'not', 'original', 'people', 'like', 'every', 'new', 'kid', 'us', 'make', 'use', 'american']
# Topic #10 without weights
# ['waste', 'time', 'money', 'watch', 'minute', 'hour', 'movie', 'spend', 'not', 'life', 'save', 'even', 'worth', 'back', 'crap']
# Build the pyLDAvis view of the negative-review topic model with R=15.
# Kept as a bare expression so a notebook cell displays the returned object.
pyLDAvis.sklearn.prepare(
    neg_nmf,
    ntvf_features,
    ntvf,
    R=15,
)