#!/usr/bin/env python
# coding: utf-8

# # Topic Modeling in Multi-Aspect Reviews
# ### By Ben Cohen - bn.chn2 [at] gmail.com
#
# The purpose of this project is to investigate topic modeling in multi-aspect
# reviews. More specifically, I wanted to investigate a way to find the words
# in reviews which were associated with the different categories being rated.
#
# Since I, like seemingly all data scientists, love beer, I was thrilled to
# find a dataset containing about 1.5 million beer reviews from the
# beeradvocate website.
#
# Below is a summary of my workflow and findings in playing around with this
# dataset.

# # Imports

# In[1]:

import pickle
import glob

import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
from textblob import TextBlob

get_ipython().run_line_magic('matplotlib', 'inline')

# I did a bit of preprocessing on the data. The data came as a single (huge)
# .txt file. I defined a python class as seen below, and parsed the giant file
# into "Review" objects and saved each one as a pickled file in a directory
# called "BeerData".
#
# Each review came with a bunch of data, most of which was ignored for this
# project. There's definitely a lot of digging to do though!

# In[2]:

class Review(object):
    """One beeradvocate review: beer metadata, the five ratings, and the text.

    Ratings (appearance, aroma, palate, taste, overall) are on a 0-5
    half-point scale; `text` is the free-form review body.
    """

    def __init__(self, name, beerID, brewerID, abv, style, appearance, aroma,
                 palate, taste, overall, time, reviewer, text):
        self.name = name
        self.beerID = beerID
        self.brewerID = brewerID
        self.abv = abv
        self.style = style
        self.appearance = appearance
        self.aroma = aroma
        self.palate = palate
        self.taste = taste
        self.overall = overall
        self.time = time
        self.reviewer = reviewer
        self.text = text

    def __repr__(self):
        # Helpful when poking at pickled reviews interactively.
        return "Review(%r by %r, overall=%r)" % (self.name, self.reviewer,
                                                 self.overall)

# Defining a mean function, which will be of use later on.

# In[3]:

def mean(l):
    """Arithmetic mean of a non-empty sequence of numbers, as a float."""
    return sum(l) / float(len(l))

# As described earlier, our dataset contains about 1.5 million reviews. Below
# we read those files and create a list in memory with all of them.
# In[9]:

print(str(len(glob.glob("BeerData/*"))) + " total reviews")

# In[4]:

# NOTE(review): the original `pickle.load(open(review))` leaked a file handle
# per review (~1.5M of them); a context manager closes each file, and 'rb' is
# required for pickle data on Python 3 (harmless on Python 2).
reviews = []
for path in glob.glob("BeerData/*"):
    with open(path, 'rb') as fh:
        reviews.append(pickle.load(fh))

# ## Visualizations
#
# I wanted to get some idea of what my data looked like. Below, I plotted a
# bargraph of the distributions of each of the 4 rating aspects (taste,
# mouthfeel, smell, and appearance), as well as the overall score.
#
# As expected, all of the distributions are roughly skewed left, having a mode
# of 4, and a mean of ~3.8. The only other interesting trend here is that
# taste seems to be the most polarizing individual aspect, and look the least.
# This is unsurprising, as taste is probably the most "interesting" aspect to
# most people, and look the least.

def plot_rating_distribution(values):
    """Summarize one rating aspect across all reviews.

    Prints the per-rating counts, the mean, and the standard deviation of
    *values* (ratings on the 0-5 half-point scale), then shows a bar chart
    of the distribution.
    """
    counts = {z / 2.0: 0 for z in range(0, 11)}
    for v in values:
        counts[v] += 1
    print(counts)
    # Bars are .5 wide, so shift each left by .25 to center it on its rating.
    xs = [z / 2.0 - .25 for z in range(0, 11)]
    ys = [counts[z / 2.0] for z in range(0, 11)]
    print("Mean: " + str(mean(values)))
    print("Std Dev: " + str(np.std(values)))
    plt.bar(xs, ys, .5)
    plt.show()

# In[83]:

#overall
plot_rating_distribution([x.overall for x in reviews])

# In[85]:

#taste
plot_rating_distribution([x.taste for x in reviews])

# In[87]:

#smell
plot_rating_distribution([x.aroma for x in reviews])

# In[88]:

#look
plot_rating_distribution([x.appearance for x in reviews])

# In[89]:

#palate
plot_rating_distribution([x.palate for x in reviews])

# # Topic Modeling
#
# Now comes the fun part!
#
# My thesis here is that the positive sentiment sentences in reviews where one
# aspect is much more positive than the others will likely be related to the
# aspect which got the high rating. In the below cells, we build two
# dictionaries.
#
# Each one has a list of sentences that our hypothesis says are associated
# with each aspect. To build these, we look at reviews which have one aspect
# rated at least 1 full point higher than the second highest. We then classify
# each sentence's sentiment, using the "TextBlob" module, which is trained for
# general use, not for anything specific on our dataset.
#
# Sentences with polarity above 0.2 (arbitrarily chosen) are added to our
# dictionary for further processing.
#
# We then repeat the process for reviews with one aspect much worse than the
# rest, and look at sentences with negative sentiments and polarity < -0.2.
# In[15]:

def _extreme_aspect_sentences(review_list, pick_highest, keep_polarity):
    """Collect sentences from reviews where one aspect's rating stands out.

    A review qualifies when its extreme aspect rating (highest if
    *pick_highest*, else lowest) is at least one full point away from the
    runner-up. Each qualifying review's sentences whose TextBlob polarity
    passes *keep_polarity* are appended to the bucket for that aspect.

    Returns a dict keyed by aspect index:
    0 = aroma, 1 = taste, 2 = palate, 3 = appearance.
    """
    buckets = {i: [] for i in range(4)}
    for rv in review_list:
        scores = [rv.aroma, rv.taste, rv.palate, rv.appearance]
        ordered = sorted(scores, reverse=pick_highest)
        # Require a full point between the extreme aspect and the runner-up.
        if abs(ordered[0] - ordered[1]) < 1:
            continue
        target = ordered[0]
        # Normalize the text in place -- later cells reuse the lowered text.
        # NOTE(review): str.strip only trims the ENDS of the string, so this
        # does not remove punctuation inside the review; kept as-is to
        # preserve the original normalization. ('\\(' strips both a backslash
        # and a paren, matching the original '\(' literal.)
        rv.text = rv.text.lower()
        for chars in ('\\(', '\\)', ',', '.', '-', '\t', "'"):
            rv.text = rv.text.strip(chars)
        for sentence in rv.text.split('.'):
            blob = TextBlob(sentence)
            try:
                if keep_polarity(blob.sentiment.polarity):
                    buckets[scores.index(target)].append(sentence.strip())
            except Exception:
                # Sentiment analysis can fail on degenerate fragments; such
                # sentences are simply skipped (best-effort, as originally).
                pass
    return buckets

# Sentences from reviews where one aspect is rated much HIGHER than the rest,
# keeping only clearly positive sentences.
sentsDct = _extreme_aspect_sentences(reviews, True, lambda p: p > .2)

# In[90]:

# Sentences from reviews where one aspect is rated much LOWER than the rest,
# keeping only clearly negative sentences.
badSentsDct = _extreme_aspect_sentences(reviews, False, lambda p: p < -0.2)

# # Filtering
#
# We then define a function which "filters" out words found in one category by
# looking at how commonly they occur in our other categories.
#
# Put another way, we use this method to keep words that are both positive or
# negative and specifically related to the aspect at hand (ie.
# piney for smell) and remove/lower words that have the correct sentiment, but
# are used across all of our categories (ie. wonderful may be used to describe
# a positive sentiment about a beer, but says nothing about a specific
# category)

# In[16]:

def moreFreqwords(cl, others):
    """Score words of *cl* that are at least as frequent there as elsewhere.

    Frequencies are normalized by vocabulary size (number of distinct words
    in each dict). A word is kept only when its normalized rate in *cl* is
    >= its rate in every dict of *others*; its score is that rate minus the
    average rate across *others*.

    Returns a dict mapping each kept word to its score.
    """
    scored = {}
    vocab_size = float(len(cl))
    for word in cl:
        own_rate = cl[word] / vocab_size
        rival_rates = [
            other[word] / float(len(other)) if word in other else 0.0
            for other in others
        ]
        if own_rate >= max(rival_rates):
            scored[word] = own_rate - sum(rival_rates) / float(len(rival_rates))
    return scored

# # Finding related words
#
# It's finally time to find the words that correspond to "good" and "bad"
# reviews for each aspect.
#
# Then, for each category, we do the following:
# + Build up a frequency dictionary for each unique word used in those reviews
#   (basically just keep count of how many times we've seen that word)
# + Use our filtering method above to rank each word in this dictionary based
#   on relative frequencies between our aspect and the other three.
# + Sort this list based on frequency
# + Check the part of speech of each of these words, and keep the adjectives
#   (JJ), singular nouns (NN), comparative adjectives (JJR), and adverbs (RB)
#
# After this, we're done! I simply print out the top 50 words for each group.
#
# We then repeat this for the other three aspects, followed by repeating the
# process for the "bad" reviews.
# In[60]:

#good

# Parts of speech to keep: singular nouns, adjectives, comparative
# adjectives, and adverbs.
_KEPT_POS = ("NN", "JJ", "JJR", "RB")

def _word_frequencies(sentences):
    """Count how often each whitespace-split token appears in *sentences*."""
    freq = {}
    for sentence in sentences:
        for word in sentence.split(' '):
            freq[word] = freq.get(word, 0) + 1
    return freq

def _ranked_aspect_words(freq, others):
    """Rank the aspect-specific words of *freq* against the *others* dicts.

    Words that survive moreFreqwords filtering are sorted by raw frequency
    (descending) and kept only if TextBlob tags them with a part of speech
    in _KEPT_POS.

    NOTE(review): the original sorted look/feel by uniqueness score while
    taste/smell were sorted by raw frequency; the write-up above says
    frequency, so frequency is used consistently here.
    """
    unique_words = moreFreqwords(freq, others)
    ranked = sorted(unique_words, key=lambda w: freq[w], reverse=True)
    kept = []
    for w in ranked:
        try:
            if TextBlob(w).tags[0][1] in _KEPT_POS:
                kept.append(w)
        except Exception:
            # POS tagging can fail on odd tokens; skip them (best-effort).
            pass
    return kept

# Buckets: 0 = aroma/smell, 1 = taste, 2 = palate/feel, 3 = appearance/look.
smell = _word_frequencies(sentsDct[0])
taste = _word_frequencies(sentsDct[1])
feel = _word_frequencies(sentsDct[2])
look = _word_frequencies(sentsDct[3])

newTaste = _ranked_aspect_words(taste, [smell, look, feel])
print("TASTE WORDS")
print(newTaste[:50])

newSmell = _ranked_aspect_words(smell, [taste, look, feel])
print("SMELL WORDS")
print(newSmell[:50])

newLook = _ranked_aspect_words(look, [taste, smell, feel])
print("LOOK WORDS")
print(newLook[:50])

newFeel = _ranked_aspect_words(feel, [taste, look, smell])
print("FEEL WORDS")
print(newFeel[:50])

# In[94]:

#bad
smell = _word_frequencies(badSentsDct[0])
taste = _word_frequencies(badSentsDct[1])
feel = _word_frequencies(badSentsDct[2])
look = _word_frequencies(badSentsDct[3])

newTasteBad = _ranked_aspect_words(taste, [smell, look, feel])
print("TASTE WORDS")
print(newTasteBad[:50])

newSmellBad = _ranked_aspect_words(smell, [taste, look, feel])
print("SMELL WORDS")
print(newSmellBad[:50])

newLookBad = _ranked_aspect_words(look, [taste, smell, feel])
print("LOOK WORDS")
print(newLookBad[:50])

newFeelBad = _ranked_aspect_words(feel, [taste, look, smell])
print("FEEL WORDS")
print(newFeelBad[:50])

# We use this cell to print out the top 100 words for our categories for the
# purpose of making a slightly more interesting wordcloud.
# In[118]:

print(" ".join(newLookBad[:100]))

# Now let's look at some pretty wordclouds of our findings. Important to note
# is that the sizes of words were determined randomly by the wordcloud
# generator (http://www.jasondavies.com/wordcloud/#) and mean nothing. For
# true ranking of aspects, see the output of the above cells.

# # Positive Aspects

# In[125]:

from IPython.display import SVG, Image

# One wordcloud per aspect, in the order: smell, taste, look, feel.
for aspect in ('smell', 'taste', 'look', 'feel'):
    SVG(filename=aspect + '.svg')

# # Now the bad keywords for each aspect

# In[114]:

SVG(filename='smellBad.svg')

# In[115]:

SVG(filename='tasteBad.svg')

# In[124]:

# Using a PNG instead of an SVG here, as for some reason the outputted SVG is
# malformed.
Image(filename='lookBad.png')

# In[117]:

SVG(filename='feelBad.svg')

# # Analysis
#
# To me, these look great! Almost all of the words seem relevant to their
# specific aspect, and they seem like they have the right polarity.
#
# That said, I wanted a slightly more complex analysis than "well, it looks
# good". Below, I build a basic linear regression classifier trained on the
# first 50 words from each group, and attempt to use this to predict the
# overall score.
#
# I don't explore this very fully at all, but demonstrate that it beats the
# baseline system (assigning every beer the mean score). In future work, I
# plan to explore possible uses for these words, including individual aspect
# prediction.
# In[19]:

def features(txt, keyWordsLst):
    """Relative-frequency feature vector for *txt*.

    Returns one component per keyword (in keyword order): the number of
    occurrences of that keyword in the lowered text, divided by the number
    of whitespace-split tokens.

    NOTE(review): str.count matches substrings, so "hop" also counts inside
    "hoppy"; and strip only trims the string's ends. Both quirks are kept to
    match the normalization used when the keywords were extracted.
    """
    n_tokens = float(len(txt.split(' ')))
    txt = txt.lower()
    for ch in (',', '.', ')', '(', '-'):
        txt = txt.strip(ch)
    return [txt.count(word) / n_tokens for word in keyWordsLst]

# In[22]:

import random
random.shuffle(reviews)

# In[75]:

# Feature set: the top 50 good and top 50 bad words for each of the four
# aspects (400 keywords total).
keyWords = list(newFeel[:50] + newLook[:50] + newTaste[:50] + newSmell[:50]
                + newFeelBad[:50] + newLookBad[:50] + newTasteBad[:50]
                + newSmellBad[:50])

xs, ys = [], []
for review in reviews[:100000]:
    xs.append(features(review.text, keyWords))
    ys.append(review.overall)

regr = linear_model.LinearRegression()
regr.fit(xs, ys)

# In[46]:

def dist(x, y):
    """Euclidean distance between two numpy vectors.

    Not used in the evaluation below; kept for interactive experimentation.
    """
    return np.sqrt(np.sum((x - y) ** 2))

# In[128]:

# Mean squared error of the regression on 1000 held-out reviews.
diffs = []
for review in reviews[100000:101000]:
    vec = features(review.text, keyWords)
    # predict expects a 2D array (a list of samples), so wrap the single
    # feature vector and unwrap the single prediction.
    pre = regr.predict([vec])[0]
    diffs.append(abs(review.overall - pre) ** 2)
print("Mean difference: " + str(mean(diffs)))

# In[129]:

# Baseline: always predict the global mean rating (~3.82, see the
# distribution plots above).
diffs = []
for review in reviews[100000:101000]:
    diffs.append(abs(review.overall - 3.82) ** 2)
print("Mean difference: " + str(mean(diffs)))

# # Conclusions and Future work
#
# Above we took a look at a very simple, yet seemingly very effective way to
# isolate words that indicate either a positive or negative sentiment about a
# single aspect of a multi-aspect review. Additionally, we do so in an
# unsupervised manner (ie. no seed words, word or sentence labeling are
# required). Additionally, we apply no domain specific knowledge, leading me
# to believe that this method will generalize well to other datasets/tasks.
#
# In future work, I hope to:
# + Look at domain specific sentiment analysis models to improve sentiment
#   tagging.
# + Use this to build a more robust and accurate prediction model.

# # License
#
# Copyright (c) 2015, Ben Cohen All rights reserved.
# # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, pulverize, distribute, synergize, compost, defenestrate, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. # # If the Author of the Software (the "Author") needs a place to crash and you have a sofa available, you should maybe give the Author a break and let him sleep on your couch. # # If you are caught in a dire situation wherein you only have enough time to save one person out of a group, and the Author is a member of that group, you must save the Author. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO BLAH BLAH BLAH ISN'T IT FUNNY HOW UPPER-CASE MAKES IT SOUND LIKE THE LICENSE IS ANGRY AND SHOUTING AT YOU. # In[ ]: