import pandas as pd
import numpy as np
import text_normalizer as tn
import model_evaluation_utils as meu
# Compact numeric formatting for printed metrics.
np.set_printoptions(precision=2, linewidth=80)

# Load the IMDB movie-review dataset: one 'review' text column and one
# 'sentiment' label column per row.
dataset = pd.read_csv(r'movie_reviews.csv')
reviews = np.array(dataset['review'])
sentiments = np.array(dataset['sentiment'])

# Hold out everything past row 35,000 as the evaluation set.
test_reviews = reviews[35000:]
test_sentiments = sentiments[35000:]
# Hand-picked indices (into the test slice) used for qualitative spot checks.
sample_review_ids = [7626, 3533, 13010]

# Pre-normalized copy of the evaluation corpus (cleanup via text_normalizer).
norm_test_reviews = tn.normalize_corpus(test_reviews)

from afinn import Afinn

# AFINN lexicon scorer; emoticon handling helps with informal review text.
afn = Afinn(emoticons=True)
# Qualitative spot check: AFINN polarity vs. the known label for each sample.
sample_pairs = zip(test_reviews[sample_review_ids],
                   test_sentiments[sample_review_ids])
for review, sentiment in sample_pairs:
    polarity = afn.score(review)
    print('REVIEW:', review)
    print('Actual Sentiment:', sentiment)
    print('Predicted Sentiment polarity:', polarity)
    print('-' * 60)
# Output: REVIEW: no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT! Actual Sentiment: negative Predicted Sentiment polarity: -7.0 ------------------------------------------------------------ REVIEW: I don't care if some people voted this movie to be bad. If you want the Truth this is a Very Good Movie! It has every thing a movie should have. You really should Get this one. Actual Sentiment: positive Predicted Sentiment polarity: 3.0 ------------------------------------------------------------ REVIEW: Worst horror film ever but funniest film ever rolled in one you have got to see this film it is so cheap it is unbeliaveble but you have to see it really!!!! P.s watch the carrot Actual Sentiment: positive Predicted Sentiment polarity: -3.0 ------------------------------------------------------------
# Score the full evaluation set with AFINN and binarize at polarity >= 1.0.
sentiment_polarity = list(map(afn.score, test_reviews))
predicted_sentiments = ['positive' if polarity >= 1.0 else 'negative'
                        for polarity in sentiment_polarity]
meu.display_model_performance_metrics(true_labels=test_sentiments,
                                      predicted_labels=predicted_sentiments,
                                      classes=['positive', 'negative'])
# Output: Model Performance metrics: ------------------------------ Accuracy: 0.71 Precision: 0.73 Recall: 0.71 F1 Score: 0.71 Model Classification report: ------------------------------ precision recall f1-score support positive 0.67 0.85 0.75 7510 negative 0.79 0.57 0.67 7490 avg / total 0.73 0.71 0.71 15000 Prediction Confusion Matrix: ------------------------------ Predicted: positive negative Actual: positive 6376 1134 negative 3189 4301
from nltk.corpus import sentiwordnet as swn

# Inspect SentiWordNet scores for the first adjective sense of 'awesome'.
awesome = next(iter(swn.senti_synsets('awesome', 'a')))
print('Positive Polarity Score:', awesome.pos_score())
print('Negative Polarity Score:', awesome.neg_score())
print('Objective Score:', awesome.obj_score())
# Output: Positive Polarity Score: 0.875 Negative Polarity Score: 0.125 Objective Score: 0.0
def analyze_sentiment_sentiwordnet_lexicon(review,
                                           verbose=False):
    """Predict sentiment ('positive'/'negative') for a review via SentiWordNet.

    Tokenizes and POS-tags the text with the project's spaCy pipeline
    (``tn.nlp``), looks up the first senti-synset matching each token's
    coarse POS, and aggregates positive-minus-negative scores over the
    matched tokens.

    Parameters
    ----------
    review : str
        Raw review text.
    verbose : bool, optional
        When True, also prints a one-row DataFrame with the normalized
        objectivity / positive / negative / overall scores.

    Returns
    -------
    str
        'positive' when the normalized net score is >= 0, else 'negative'.
        A review with no scorable tokens yields a net score of 0 and is
        reported 'positive' (previously this case raised ZeroDivisionError).
    """
    # Penn Treebank tag prefix -> WordNet POS code, tried in this order.
    tag_to_pos = (('NN', 'n'), ('VB', 'v'), ('JJ', 'a'), ('RB', 'r'))
    # tokenize and POS tag text tokens
    tagged_text = [(token.text, token.tag_) for token in tn.nlp(review)]
    pos_score = neg_score = obj_score = 0.0
    token_count = 0
    # get wordnet synsets based on POS tags and accumulate sentiment scores
    for word, tag in tagged_text:
        ss_set = None
        for prefix, wn_pos in tag_to_pos:
            if prefix in tag:
                # Build the synset list once (the original built it twice:
                # in the condition and again for indexing).
                synsets = list(swn.senti_synsets(word, wn_pos))
                if synsets:
                    ss_set = synsets[0]  # most common sense
                break
        if ss_set is None:
            continue
        # add scores for the found synset
        pos_score += ss_set.pos_score()
        neg_score += ss_set.neg_score()
        obj_score += ss_set.obj_score()
        token_count += 1
    # aggregate final scores; guard the empty-match case
    final_score = pos_score - neg_score
    denom = token_count if token_count else 1
    norm_final_score = round(float(final_score) / denom, 2)
    final_sentiment = 'positive' if norm_final_score >= 0 else 'negative'
    if verbose:
        norm_obj_score = round(float(obj_score) / denom, 2)
        norm_pos_score = round(float(pos_score) / denom, 2)
        norm_neg_score = round(float(neg_score) / denom, 2)
        # MultiIndex.from_arrays is portable across pandas versions; the
        # labels= kwarg of the plain MultiIndex constructor was renamed
        # codes= in pandas 0.24 and removed in 1.0.
        columns = pd.MultiIndex.from_arrays(
            [['SENTIMENT STATS:'] * 5,
             ['Predicted Sentiment', 'Objectivity',
              'Positive', 'Negative', 'Overall']])
        sentiment_frame = pd.DataFrame([[final_sentiment, norm_obj_score,
                                         norm_pos_score, norm_neg_score,
                                         norm_final_score]],
                                       columns=columns)
        print(sentiment_frame)
    return final_sentiment
# Qualitative spot check of the SentiWordNet model on the sample reviews.
sample_pairs = zip(test_reviews[sample_review_ids],
                   test_sentiments[sample_review_ids])
for review, sentiment in sample_pairs:
    print('REVIEW:', review)
    print('Actual Sentiment:', sentiment)
    pred = analyze_sentiment_sentiwordnet_lexicon(review, verbose=True)
    print('-' * 60)
# Output: REVIEW: no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT! Actual Sentiment: negative SENTIMENT STATS: Predicted Sentiment Objectivity Positive Negative Overall 0 negative 0.76 0.09 0.15 -0.06 ------------------------------------------------------------ REVIEW: I don't care if some people voted this movie to be bad. If you want the Truth this is a Very Good Movie! It has every thing a movie should have. You really should Get this one. Actual Sentiment: positive SENTIMENT STATS: Predicted Sentiment Objectivity Positive Negative Overall 0 positive 0.74 0.2 0.06 0.14 ------------------------------------------------------------ REVIEW: Worst horror film ever but funniest film ever rolled in one you have got to see this film it is so cheap it is unbeliaveble but you have to see it really!!!! P.s watch the carrot Actual Sentiment: positive SENTIMENT STATS: Predicted Sentiment Objectivity Positive Negative Overall 0 positive 0.8 0.14 0.07 0.07 ------------------------------------------------------------
# Label every normalized test review with the SentiWordNet model and report
# accuracy / precision / recall / F1 plus the confusion matrix.
predicted_sentiments = [
    analyze_sentiment_sentiwordnet_lexicon(norm_review, verbose=False)
    for norm_review in norm_test_reviews
]
meu.display_model_performance_metrics(true_labels=test_sentiments,
                                      predicted_labels=predicted_sentiments,
                                      classes=['positive', 'negative'])
# Output: Model Performance metrics: ------------------------------ Accuracy: 0.69 Precision: 0.69 Recall: 0.69 F1 Score: 0.68 Model Classification report: ------------------------------ precision recall f1-score support positive 0.66 0.76 0.71 7510 negative 0.72 0.61 0.66 7490 avg / total 0.69 0.69 0.68 15000 Prediction Confusion Matrix: ------------------------------ Predicted: positive negative Actual: positive 5742 1768 negative 2932 4558
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Output (warning): C:\Program Files\Anaconda3\lib\site-packages\nltk\twitter\__init__.py:20: UserWarning: The twython library has not been installed. Some functionality from the twitter package will not be available. warnings.warn("The twython library has not been installed. "
def analyze_sentiment_vader_lexicon(review,
                                    threshold=0.1,
                                    verbose=False):
    """Predict sentiment ('positive'/'negative') for a review with VADER.

    Parameters
    ----------
    review : str
        Raw review text (may contain HTML, accented chars, contractions).
    threshold : float, optional
        Minimum compound score for the review to be labeled 'positive'.
    verbose : bool, optional
        When True, also prints a one-row DataFrame with the compound score
        and the positive/negative/neutral percentages.

    Returns
    -------
    str
        'positive' when the compound score >= threshold, else 'negative'.
    """
    # pre-process text (VADER itself handles casing/punctuation)
    review = tn.strip_html_tags(review)
    review = tn.remove_accented_chars(review)
    review = tn.expand_contractions(review)
    # Build the analyzer once and cache it on the function object:
    # constructing SentimentIntensityAnalyzer reloads the VADER lexicon,
    # which is wasteful when scoring thousands of reviews.
    analyzer = getattr(analyze_sentiment_vader_lexicon, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        analyze_sentiment_vader_lexicon._analyzer = analyzer
    scores = analyzer.polarity_scores(review)
    # get aggregate score and final sentiment
    agg_score = scores['compound']
    final_sentiment = 'positive' if agg_score >= threshold \
        else 'negative'
    if verbose:
        # Round AFTER scaling to percent: the original round(x, 2) * 100
        # reintroduced float noise and printed e.g. '14.000000000000002%'.
        positive = str(round(scores['pos'] * 100, 2)) + '%'
        negative = str(round(scores['neg'] * 100, 2)) + '%'
        neutral = str(round(scores['neu'] * 100, 2)) + '%'
        final = round(agg_score, 2)
        # MultiIndex.from_arrays is portable across pandas versions; the
        # labels= kwarg of the plain MultiIndex constructor was removed
        # in pandas 1.0.
        columns = pd.MultiIndex.from_arrays(
            [['SENTIMENT STATS:'] * 5,
             ['Predicted Sentiment', 'Polarity Score',
              'Positive', 'Negative', 'Neutral']])
        sentiment_frame = pd.DataFrame([[final_sentiment, final, positive,
                                         negative, neutral]],
                                       columns=columns)
        print(sentiment_frame)
    return final_sentiment
# Qualitative spot check of the VADER model on the sample reviews.
sample_pairs = zip(test_reviews[sample_review_ids],
                   test_sentiments[sample_review_ids])
for review, sentiment in sample_pairs:
    print('REVIEW:', review)
    print('Actual Sentiment:', sentiment)
    pred = analyze_sentiment_vader_lexicon(review, threshold=0.4, verbose=True)
    print('-' * 60)
# Output: REVIEW: no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT! Actual Sentiment: negative SENTIMENT STATS: Predicted Sentiment Polarity Score Positive Negative Neutral 0 negative -0.8 0.0% 40.0% 60.0% ------------------------------------------------------------ REVIEW: I don't care if some people voted this movie to be bad. If you want the Truth this is a Very Good Movie! It has every thing a movie should have. You really should Get this one. Actual Sentiment: positive SENTIMENT STATS: Predicted Sentiment Polarity Score Positive Negative Neutral 0 negative -0.16 16.0% 14.000000000000002% 69.0% ------------------------------------------------------------ REVIEW: Worst horror film ever but funniest film ever rolled in one you have got to see this film it is so cheap it is unbeliaveble but you have to see it really!!!! P.s watch the carrot Actual Sentiment: positive SENTIMENT STATS: Predicted Sentiment Polarity Score Positive Negative Neutral 0 positive 0.49 11.0% 11.0% 77.0% ------------------------------------------------------------
# Label every raw test review with VADER (compound threshold 0.4) and
# report accuracy / precision / recall / F1 plus the confusion matrix.
predicted_sentiments = [
    analyze_sentiment_vader_lexicon(raw_review, threshold=0.4, verbose=False)
    for raw_review in test_reviews
]
meu.display_model_performance_metrics(true_labels=test_sentiments,
                                      predicted_labels=predicted_sentiments,
                                      classes=['positive', 'negative'])
# Output: Model Performance metrics: ------------------------------ Accuracy: 0.71 Precision: 0.72 Recall: 0.71 F1 Score: 0.71 Model Classification report: ------------------------------ precision recall f1-score support positive 0.67 0.83 0.74 7510 negative 0.78 0.59 0.67 7490 avg / total 0.72 0.71 0.71 15000 Prediction Confusion Matrix: ------------------------------ Predicted: positive negative Actual: positive 6235 1275 negative 3068 4422