Adapted from NLP Crash Course by Charlie Greenbacker and Introduction to NLP by Dan Jurafsky
NLP requires an understanding of the language and the world.
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.cross_validation import train_test_split  # note: lives in sklearn.model_selection in newer scikit-learn versions
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline
# read yelp.csv into a DataFrame
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/yelp.csv'
yelp = pd.read_csv(url)
# create a new DataFrame that only contains the 5-star and 1-star reviews
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]
# define X and y
X = yelp_best_worst.text
y = yelp_best_worst.stars
# split the new DataFrame into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# use CountVectorizer to create document-term matrices from X_train and X_test
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)
# rows are documents, columns are terms (aka "tokens" or "features")
X_train_dtm.shape
(3064, 16825)
# last 50 features
print vect.get_feature_names()[-50:]
[u'yyyyy', u'z11', u'za', u'zabba', u'zach', u'zam', u'zanella', u'zankou', u'zappos', u'zatsiki', u'zen', u'zero', u'zest', u'zexperience', u'zha', u'zhou', u'zia', u'zihuatenejo', u'zilch', u'zin', u'zinburger', u'zinburgergeist', u'zinc', u'zinfandel', u'zing', u'zip', u'zipcar', u'zipper', u'zippers', u'zipps', u'ziti', u'zoe', u'zombi', u'zombies', u'zone', u'zones', u'zoning', u'zoo', u'zoyo', u'zucca', u'zucchini', u'zuchinni', u'zumba', u'zupa', u'zuzu', u'zwiebel', u'zzed', u'\xe9clairs', u'\xe9cole', u'\xe9m']
# show vectorizer options
vect
CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict', dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None, stop_words=None, strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)
# don't convert to lowercase
vect = CountVectorizer(lowercase=False)
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm.shape
(3064, 20838)
# include 1-grams and 2-grams
vect = CountVectorizer(ngram_range=(1, 2))
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm.shape
(3064, 169847)
# last 50 features
print vect.get_feature_names()[-50:]
[u'zone out', u'zone when', u'zones', u'zones dolls', u'zoning', u'zoning issues', u'zoo', u'zoo and', u'zoo is', u'zoo not', u'zoo the', u'zoo ve', u'zoyo', u'zoyo for', u'zucca', u'zucca appetizer', u'zucchini', u'zucchini and', u'zucchini bread', u'zucchini broccoli', u'zucchini carrots', u'zucchini fries', u'zucchini pieces', u'zucchini strips', u'zucchini veal', u'zucchini very', u'zucchini with', u'zuchinni', u'zuchinni again', u'zuchinni the', u'zumba', u'zumba class', u'zumba or', u'zumba yogalates', u'zupa', u'zupa flavors', u'zuzu', u'zuzu in', u'zuzu is', u'zuzu the', u'zwiebel', u'zwiebel kr\xe4uter', u'zzed', u'zzed in', u'\xe9clairs', u'\xe9clairs napoleons', u'\xe9cole', u'\xe9cole len\xf4tre', u'\xe9m', u'\xe9m all']
Predicting the star rating:
# use default options for CountVectorizer
vect = CountVectorizer()
# create document-term matrices
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)
# use Naive Bayes to predict the star rating
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)
# calculate accuracy
print metrics.accuracy_score(y_test, y_pred_class)
0.918786692759
# calculate null accuracy
y_test_binary = np.where(y_test==5, 1, 0)
max(y_test_binary.mean(), 1 - y_test_binary.mean())
0.81996086105675148
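The same null accuracy can be read straight off the class proportions; a quick equivalent sketch, assuming a pandas version where value_counts supports normalize:
# null accuracy: accuracy from always predicting the most frequent class
y_test.value_counts(normalize=True).max()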
# define a function that accepts a vectorizer and calculates the accuracy
def tokenize_test(vect):
    X_train_dtm = vect.fit_transform(X_train)
    print 'Features: ', X_train_dtm.shape[1]
    X_test_dtm = vect.transform(X_test)
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    print 'Accuracy: ', metrics.accuracy_score(y_test, y_pred_class)
# include 1-grams and 2-grams
vect = CountVectorizer(ngram_range=(1, 2))
tokenize_test(vect)
Features: 169847
Accuracy: 0.854207436399
# show vectorizer options
vect
CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict', dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ngram_range=(1, 2), preprocessor=None, stop_words=None, strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)
# remove English stop words
vect = CountVectorizer(stop_words='english')
tokenize_test(vect)
Features: 16528
Accuracy: 0.915851272016
# set of stop words
print vect.get_stop_words()
frozenset(['all', 'six', 'less', 'being', 'indeed', 'over', 'move', 'anyway', 'four', 'not', 'own', 'through', 'yourselves', 'fify', 'where', 'mill', 'only', 'find', 'before', 'one', 'whose', 'system', 'how', 'somewhere', 'with', 'thick', 'show', 'had', 'enough', 'should', 'to', 'must', 'whom', 'seeming', 'under', 'ours', 'has', 'might', 'thereafter', 'latterly', 'do', 'them', 'his', 'around', 'than', 'get', 'very', 'de', 'none', 'cannot', 'every', 'whether', 'they', 'front', 'during', 'thus', 'now', 'him', 'nor', 'name', 'several', 'hereafter', 'always', 'who', 'cry', 'whither', 'this', 'someone', 'either', 'each', 'become', 'thereupon', 'sometime', 'side', 'two', 'therein', 'twelve', 'because', 'often', 'ten', 'our', 'eg', 'some', 'back', 'up', 'go', 'namely', 'towards', 'are', 'further', 'beyond', 'ourselves', 'yet', 'out', 'even', 'will', 'what', 'still', 'for', 'bottom', 'mine', 'since', 'please', 'forty', 'per', 'its', 'everything', 'behind', 'un', 'above', 'between', 'it', 'neither', 'seemed', 'ever', 'across', 'she', 'somehow', 'be', 'we', 'full', 'never', 'sixty', 'however', 'here', 'otherwise', 'were', 'whereupon', 'nowhere', 'although', 'found', 'alone', 're', 'along', 'fifteen', 'by', 'both', 'about', 'last', 'would', 'anything', 'via', 'many', 'could', 'thence', 'put', 'against', 'keep', 'etc', 'amount', 'became', 'ltd', 'hence', 'onto', 'or', 'con', 'among', 'already', 'co', 'afterwards', 'formerly', 'within', 'seems', 'into', 'others', 'while', 'whatever', 'except', 'down', 'hers', 'everyone', 'done', 'least', 'another', 'whoever', 'moreover', 'couldnt', 'throughout', 'anyhow', 'yourself', 'three', 'from', 'her', 'few', 'together', 'top', 'there', 'due', 'been', 'next', 'anyone', 'eleven', 'much', 'call', 'therefore', 'interest', 'then', 'thru', 'themselves', 'hundred', 'was', 'sincere', 'empty', 'more', 'himself', 'elsewhere', 'mostly', 'on', 'fire', 'am', 'becoming', 'hereby', 'amongst', 'else', 'part', 'everywhere', 'too', 'herself', 'former', 'those', 'he', 'me', 'myself', 'made', 'twenty', 'these', 'bill', 'cant', 'us', 'until', 'besides', 'nevertheless', 'below', 'anywhere', 'nine', 'can', 'of', 'toward', 'my', 'something', 'and', 'whereafter', 'whenever', 'give', 'almost', 'wherever', 'is', 'describe', 'beforehand', 'herein', 'an', 'as', 'itself', 'at', 'have', 'in', 'seem', 'whence', 'ie', 'any', 'fill', 'again', 'hasnt', 'inc', 'thereby', 'thin', 'no', 'perhaps', 'latter', 'meanwhile', 'when', 'detail', 'same', 'wherein', 'beside', 'also', 'that', 'other', 'take', 'which', 'becomes', 'you', 'if', 'nobody', 'see', 'though', 'may', 'after', 'upon', 'most', 'hereupon', 'eight', 'but', 'serious', 'nothing', 'such', 'your', 'why', 'a', 'off', 'whereby', 'third', 'i', 'whole', 'noone', 'sometimes', 'well', 'amoungst', 'yours', 'their', 'rather', 'without', 'so', 'five', 'the', 'first', 'whereas', 'once'])
# remove English stop words and only keep 100 features
vect = CountVectorizer(stop_words='english', max_features=100)
tokenize_test(vect)
Features: 100
Accuracy: 0.869863013699
# all 100 features
print vect.get_feature_names()
[u'amazing', u'area', u'atmosphere', u'awesome', u'bad', u'bar', u'best', u'better', u'big', u'came', u'cheese', u'chicken', u'clean', u'coffee', u'come', u'day', u'definitely', u'delicious', u'did', u'didn', u'dinner', u'don', u'eat', u'excellent', u'experience', u'favorite', u'feel', u'food', u'free', u'fresh', u'friendly', u'friends', u'going', u'good', u'got', u'great', u'happy', u'home', u'hot', u'hour', u'just', u'know', u'like', u'little', u'll', u'location', u'long', u'looking', u'lot', u'love', u'lunch', u'make', u'meal', u'menu', u'minutes', u'need', u'new', u'nice', u'night', u'order', u'ordered', u'people', u'perfect', u'phoenix', u'pizza', u'place', u'pretty', u'prices', u'really', u'recommend', u'restaurant', u'right', u'said', u'salad', u'sandwich', u'sauce', u'say', u'service', u'staff', u'store', u'sure', u'table', u'thing', u'things', u'think', u'time', u'times', u'took', u'town', u'tried', u'try', u've', u'wait', u'want', u'way', u'went', u'wine', u'work', u'worth', u'years']
# include 1-grams and 2-grams, and limit the number of features
vect = CountVectorizer(ngram_range=(1, 2), max_features=100000)
tokenize_test(vect)
Features: 100000
Accuracy: 0.885518590998
# include 1-grams and 2-grams, and only include terms that appear at least 2 times
vect = CountVectorizer(ngram_range=(1, 2), min_df=2)
tokenize_test(vect)
Features: 43957
Accuracy: 0.932485322896
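These options can also be combined; a quick sketch (the best settings should really be chosen by cross-validation rather than picked by hand):
# combine 1-grams and 2-grams, English stop word removal, and a minimum document frequency
vect = CountVectorizer(ngram_range=(1, 2), stop_words='english', min_df=2)
tokenize_test(vect)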
TextBlob: "Simplified Text Processing"
# print the first review
print yelp_best_worst.text[0]
My wife took me here on my birthday for breakfast and it was excellent. The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure. Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning. It looked like the place fills up pretty quickly so the earlier you get here the better. Do yourself a favor and get their Bloody Mary. It was phenomenal and simply the best I've ever had. I'm pretty sure they only use ingredients from their garden and blend them fresh when you order it. It was amazing. While EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious. It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete. It was the best "toast" I've ever had. Anyway, I can't wait to go back!
# save it as a TextBlob object
review = TextBlob(yelp_best_worst.text[0])
# list the words
review.words
WordList(['My', 'wife', 'took', 'me', 'here', 'on', 'my', 'birthday', 'for', 'breakfast', 'and', 'it', 'was', 'excellent', 'The', 'weather', 'was', 'perfect', 'which', 'made', 'sitting', 'outside', 'overlooking', 'their', 'grounds', 'an', 'absolute', 'pleasure', 'Our', 'waitress', 'was', 'excellent', 'and', 'our', 'food', 'arrived', 'quickly', 'on', 'the', 'semi-busy', 'Saturday', 'morning', 'It', 'looked', 'like', 'the', 'place', 'fills', 'up', 'pretty', 'quickly', 'so', 'the', 'earlier', 'you', 'get', 'here', 'the', 'better', 'Do', 'yourself', 'a', 'favor', 'and', 'get', 'their', 'Bloody', 'Mary', 'It', 'was', 'phenomenal', 'and', 'simply', 'the', 'best', 'I', "'ve", 'ever', 'had', 'I', "'m", 'pretty', 'sure', 'they', 'only', 'use', 'ingredients', 'from', 'their', 'garden', 'and', 'blend', 'them', 'fresh', 'when', 'you', 'order', 'it', 'It', 'was', 'amazing', 'While', 'EVERYTHING', 'on', 'the', 'menu', 'looks', 'excellent', 'I', 'had', 'the', 'white', 'truffle', 'scrambled', 'eggs', 'vegetable', 'skillet', 'and', 'it', 'was', 'tasty', 'and', 'delicious', 'It', 'came', 'with', '2', 'pieces', 'of', 'their', 'griddled', 'bread', 'with', 'was', 'amazing', 'and', 'it', 'absolutely', 'made', 'the', 'meal', 'complete', 'It', 'was', 'the', 'best', 'toast', 'I', "'ve", 'ever', 'had', 'Anyway', 'I', 'ca', "n't", 'wait', 'to', 'go', 'back'])
# list the sentences
review.sentences
[Sentence("My wife took me here on my birthday for breakfast and it was excellent."), Sentence("The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure."), Sentence("Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning."), Sentence("It looked like the place fills up pretty quickly so the earlier you get here the better."), Sentence("Do yourself a favor and get their Bloody Mary."), Sentence("It was phenomenal and simply the best I've ever had."), Sentence("I'm pretty sure they only use ingredients from their garden and blend them fresh when you order it."), Sentence("It was amazing."), Sentence("While EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious."), Sentence("It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete."), Sentence("It was the best "toast" I've ever had."), Sentence("Anyway, I can't wait to go back!")]
# some string methods are available
review.lower()
TextBlob("my wife took me here on my birthday for breakfast and it was excellent. the weather was perfect which made sitting outside overlooking their grounds an absolute pleasure. our waitress was excellent and our food arrived quickly on the semi-busy saturday morning. it looked like the place fills up pretty quickly so the earlier you get here the better. do yourself a favor and get their bloody mary. it was phenomenal and simply the best i've ever had. i'm pretty sure they only use ingredients from their garden and blend them fresh when you order it. it was amazing. while everything on the menu looks excellent, i had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious. it came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete. it was the best "toast" i've ever had. anyway, i can't wait to go back!")
Stemming:
# initialize stemmer
stemmer = SnowballStemmer('english')
# stem each word
print [stemmer.stem(word) for word in review.words]
[u'my', u'wife', u'took', u'me', u'here', u'on', u'my', u'birthday', u'for', u'breakfast', u'and', u'it', u'was', u'excel', u'the', u'weather', u'was', u'perfect', u'which', u'made', u'sit', u'outsid', u'overlook', u'their', u'ground', u'an', u'absolut', u'pleasur', u'our', u'waitress', u'was', u'excel', u'and', u'our', u'food', u'arriv', u'quick', u'on', u'the', u'semi-busi', u'saturday', u'morn', u'it', u'look', u'like', u'the', u'place', u'fill', u'up', u'pretti', u'quick', u'so', u'the', u'earlier', u'you', u'get', u'here', u'the', u'better', u'do', u'yourself', u'a', u'favor', u'and', u'get', u'their', u'bloodi', u'mari', u'it', u'was', u'phenomen', u'and', u'simpli', u'the', u'best', u'i', u've', u'ever', u'had', u'i', u"'m", u'pretti', u'sure', u'they', u'onli', u'use', u'ingredi', u'from', u'their', u'garden', u'and', u'blend', u'them', u'fresh', u'when', u'you', u'order', u'it', u'it', u'was', u'amaz', u'while', u'everyth', u'on', u'the', u'menu', u'look', u'excel', u'i', u'had', u'the', u'white', u'truffl', u'scrambl', u'egg', u'veget', u'skillet', u'and', u'it', u'was', u'tasti', u'and', u'delici', u'it', u'came', u'with', u'2', u'piec', u'of', u'their', u'griddl', u'bread', u'with', u'was', u'amaz', u'and', u'it', u'absolut', u'made', u'the', u'meal', u'complet', u'it', u'was', u'the', u'best', u'toast', u'i', u've', u'ever', u'had', u'anyway', u'i', u'ca', u"n't", u'wait', u'to', u'go', u'back']
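The stemmer can also be plugged into CountVectorizer, mirroring the split_into_lemmas function defined below; a minimal sketch (split_into_stems is a hypothetical helper, not part of the original notebook):
# define a function that accepts text and returns a list of stems (hypothetical helper)
def split_into_stems(text):
    text = unicode(text, 'utf-8').lower()
    return [stemmer.stem(word) for word in TextBlob(text).words]

# use split_into_stems as the feature extraction function (WARNING: SLOW!)
# vect = CountVectorizer(analyzer=split_into_stems)
# tokenize_test(vect)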
Lemmatization:
# assume every word is a noun
print [word.lemmatize() for word in review.words]
['My', 'wife', 'took', 'me', 'here', 'on', 'my', 'birthday', 'for', 'breakfast', 'and', 'it', u'wa', 'excellent', 'The', 'weather', u'wa', 'perfect', 'which', 'made', 'sitting', 'outside', 'overlooking', 'their', u'ground', 'an', 'absolute', 'pleasure', 'Our', 'waitress', u'wa', 'excellent', 'and', 'our', 'food', 'arrived', 'quickly', 'on', 'the', 'semi-busy', 'Saturday', 'morning', 'It', 'looked', 'like', 'the', 'place', u'fill', 'up', 'pretty', 'quickly', 'so', 'the', 'earlier', 'you', 'get', 'here', 'the', 'better', 'Do', 'yourself', 'a', 'favor', 'and', 'get', 'their', 'Bloody', 'Mary', 'It', u'wa', 'phenomenal', 'and', 'simply', 'the', 'best', 'I', "'ve", 'ever', 'had', 'I', "'m", 'pretty', 'sure', 'they', 'only', 'use', u'ingredient', 'from', 'their', 'garden', 'and', 'blend', 'them', 'fresh', 'when', 'you', 'order', 'it', 'It', u'wa', 'amazing', 'While', 'EVERYTHING', 'on', 'the', 'menu', u'look', 'excellent', 'I', 'had', 'the', 'white', 'truffle', 'scrambled', u'egg', 'vegetable', 'skillet', 'and', 'it', u'wa', 'tasty', 'and', 'delicious', 'It', 'came', 'with', '2', u'piece', 'of', 'their', 'griddled', 'bread', 'with', u'wa', 'amazing', 'and', 'it', 'absolutely', 'made', 'the', 'meal', 'complete', 'It', u'wa', 'the', 'best', 'toast', 'I', "'ve", 'ever', 'had', 'Anyway', 'I', 'ca', "n't", 'wait', 'to', 'go', 'back']
# assume every word is a verb
print [word.lemmatize(pos='v') for word in review.words]
['My', 'wife', u'take', 'me', 'here', 'on', 'my', 'birthday', 'for', 'breakfast', 'and', 'it', u'be', 'excellent', 'The', 'weather', u'be', 'perfect', 'which', u'make', u'sit', 'outside', u'overlook', 'their', u'ground', 'an', 'absolute', 'pleasure', 'Our', 'waitress', u'be', 'excellent', 'and', 'our', 'food', u'arrive', 'quickly', 'on', 'the', 'semi-busy', 'Saturday', 'morning', 'It', u'look', 'like', 'the', 'place', u'fill', 'up', 'pretty', 'quickly', 'so', 'the', 'earlier', 'you', 'get', 'here', 'the', 'better', 'Do', 'yourself', 'a', 'favor', 'and', 'get', 'their', 'Bloody', 'Mary', 'It', u'be', 'phenomenal', 'and', 'simply', 'the', 'best', 'I', "'ve", 'ever', u'have', 'I', "'m", 'pretty', 'sure', 'they', 'only', 'use', 'ingredients', 'from', 'their', 'garden', 'and', 'blend', 'them', 'fresh', 'when', 'you', 'order', 'it', 'It', u'be', u'amaze', 'While', 'EVERYTHING', 'on', 'the', 'menu', u'look', 'excellent', 'I', u'have', 'the', 'white', 'truffle', u'scramble', u'egg', 'vegetable', 'skillet', 'and', 'it', u'be', 'tasty', 'and', 'delicious', 'It', u'come', 'with', '2', u'piece', 'of', 'their', u'griddle', 'bread', 'with', u'be', u'amaze', 'and', 'it', 'absolutely', u'make', 'the', 'meal', 'complete', 'It', u'be', 'the', 'best', 'toast', 'I', "'ve", 'ever', u'have', 'Anyway', 'I', 'ca', "n't", 'wait', 'to', 'go', 'back']
# define a function that accepts text and returns a list of lemmas
def split_into_lemmas(text):
    text = unicode(text, 'utf-8').lower()
    words = TextBlob(text).words
    return [word.lemmatize() for word in words]
# use split_into_lemmas as the feature extraction function (WARNING: SLOW!)
vect = CountVectorizer(analyzer=split_into_lemmas)
tokenize_test(vect)
Features: 16452
Accuracy: 0.920743639922
# last 50 features
print vect.get_feature_names()[-50:]
[u'yuyuyummy', u'yuzu', u'z', u'z-grill', u'z11', u'zach', u'zam', u'zanella', u'zankou', u'zappos', u'zatsiki', u'zen', u'zen-like', u'zero', u'zero-star', u'zest', u'zexperience', u'zha', u'zhou', u'zia', u'zilch', u'zin', u'zinburger', u'zinburgergeist', u'zinc', u'zinfandel', u'zing', u'zip', u'zipcar', u'zipper', u'zipps', u'ziti', u'zoe', u'zombi', u'zombie', u'zone', u'zoning', u'zoo', u'zoyo', u'zucca', u'zucchini', u'zuchinni', u'zumba', u'zupa', u'zuzu', u'zwiebel-kr\xe4uter', u'zzed', u'\xe9clairs', u'\xe9cole', u'\xe9m']
# example documents
simple_train = ['call you tonight', 'Call me a cab', 'please call me... PLEASE!']
# Term Frequency
vect = CountVectorizer()
tf = pd.DataFrame(vect.fit_transform(simple_train).toarray(), columns=vect.get_feature_names())
tf
|   | cab | call | me | please | tonight | you |
|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 0 | 0 | 1 | 1 |
| 1 | 1 | 1 | 1 | 0 | 0 | 0 |
| 2 | 0 | 1 | 1 | 2 | 0 | 0 |
# Document Frequency
vect = CountVectorizer(binary=True)
df = vect.fit_transform(simple_train).toarray().sum(axis=0)
pd.DataFrame(df.reshape(1, 6), columns=vect.get_feature_names())
|   | cab | call | me | please | tonight | you |
|---|---|---|---|---|---|---|
| 0 | 1 | 3 | 2 | 1 | 1 | 1 |
# Term Frequency-Inverse Document Frequency (simple version)
tf/df
|   | cab | call | me | please | tonight | you |
|---|---|---|---|---|---|---|
| 0 | 0 | 0.333333 | 0.0 | 0 | 1 | 1 |
| 1 | 1 | 0.333333 | 0.5 | 0 | 0 | 0 |
| 2 | 0 | 0.333333 | 0.5 | 2 | 0 | 0 |
# TfidfVectorizer
vect = TfidfVectorizer()
pd.DataFrame(vect.fit_transform(simple_train).toarray(), columns=vect.get_feature_names())
|   | cab | call | me | please | tonight | you |
|---|---|---|---|---|---|---|
| 0 | 0.000000 | 0.385372 | 0.000000 | 0.000000 | 0.652491 | 0.652491 |
| 1 | 0.720333 | 0.425441 | 0.547832 | 0.000000 | 0.000000 | 0.000000 |
| 2 | 0.000000 | 0.266075 | 0.342620 | 0.901008 | 0.000000 | 0.000000 |
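These weights can be reproduced by hand; a minimal sketch, assuming TfidfVectorizer's defaults (smooth_idf=True, sublinear_tf=False, norm='l2'): each weight is tf times ln((1 + n_docs) / (1 + df)) + 1, and each row is then L2-normalized.
# reproduce TfidfVectorizer's default weighting using the tf and df objects from above (a sketch)
n_docs = tf.shape[0]
idf = np.log((1 + n_docs) / (1.0 + df)) + 1
tfidf = tf * idf
tfidf.div(np.sqrt((tfidf ** 2).sum(axis=1)), axis=0)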
More details: TF-IDF is about what matters
# create a document-term matrix using TF-IDF
vect = TfidfVectorizer(stop_words='english')
dtm = vect.fit_transform(yelp.text)
features = vect.get_feature_names()
dtm.shape
(10000, 28881)
def summarize():

    # choose a random review that is at least 300 characters
    review_length = 0
    while review_length < 300:
        review_id = np.random.randint(0, len(yelp))
        review_text = unicode(yelp.text[review_id], 'utf-8')
        review_length = len(review_text)

    # create a dictionary of words and their TF-IDF scores
    word_scores = {}
    for word in TextBlob(review_text).words:
        word = word.lower()
        if word in features:
            word_scores[word] = dtm[review_id, features.index(word)]

    # print words with the top 5 TF-IDF scores
    print 'TOP SCORING WORDS:'
    top_scores = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)[:5]
    for word, score in top_scores:
        print word

    # print 5 random words
    print '\n' + 'RANDOM WORDS:'
    random_words = np.random.choice(word_scores.keys(), size=5, replace=False)
    for word in random_words:
        print word

    # print the review
    print '\n' + review_text
summarize()
TOP SCORING WORDS:
restaurant
machaca
cactus
pickup
rental

RANDOM WORDS:
did
pickup
hole
burrito
arrived

The place is a hole in the wall right behind the PHX car rental center. Arrived at dinner time, and had the best Mexican food! I had the #5 Machaca Plate w/rice, beans, & one of their homemade tortilla's burrito style. It was just amazing, and so cheap ($5.65). The only issue I have with this place is that its very well used, and looks very dirty, I did find it a bit sticky. Maybe this is a better place for pickup. If you want a clean restaurant, their North restaurant on Cactus Road restaurant is much cleaner.
print review
My wife took me here on my birthday for breakfast and it was excellent. The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure. Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning. It looked like the place fills up pretty quickly so the earlier you get here the better. Do yourself a favor and get their Bloody Mary. It was phenomenal and simply the best I've ever had. I'm pretty sure they only use ingredients from their garden and blend them fresh when you order it. It was amazing. While EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious. It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete. It was the best "toast" I've ever had. Anyway, I can't wait to go back!
# polarity ranges from -1 (most negative) to 1 (most positive)
review.sentiment.polarity
0.40246913580246907
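TextBlob's sentiment property also exposes a subjectivity score; a quick check on the same review:
# subjectivity ranges from 0 (very objective) to 1 (very subjective)
review.sentiment.subjectivity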
# understanding the apply method
yelp['length'] = yelp.text.apply(len)
yelp.head(1)
|   | business_id | date | review_id | stars | text | type | user_id | cool | useful | funny | length |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 9yKzy9PApeiPPOUJEtnvkg | 2011-01-26 | fWKvX83p0-ka4JS3dc6E5A | 5 | My wife took me here on my birthday for breakf... | review | rLtl8ZkDX5vH5nAx9C3q5Q | 2 | 5 | 0 | 889 |
# define a function that accepts text and returns the polarity
def detect_sentiment(text):
    return TextBlob(text.decode('utf-8')).sentiment.polarity
# create a new DataFrame column for sentiment (WARNING: SLOW!)
yelp['sentiment'] = yelp.text.apply(detect_sentiment)
# box plot of sentiment grouped by stars
yelp.boxplot(column='sentiment', by='stars')
<matplotlib.axes._subplots.AxesSubplot at 0x1f6f0b00>
# reviews with most positive sentiment
yelp[yelp.sentiment == 1].text.head()
254    Our server Gary was awesome. Food was amazing....
347    3 syllables for this place. \nA-MAZ-ING!\n\nTh...
420    LOVE the food!!!!
459    Love it!!! Wish we still lived in Arizona as C...
679    Excellent burger
Name: text, dtype: object
# reviews with most negative sentiment
yelp[yelp.sentiment == -1].text.head()
773     This was absolutely horrible. I got the suprem...
1517    Nasty workers and over priced trash
3266    Absolutely awful... these guys have NO idea wh...
4766    Very bad food!
5812    I wouldn't send my worst enemy to this place.
Name: text, dtype: object
# widen the column display
pd.set_option('max_colwidth', 500)
# negative sentiment in a 5-star review
yelp[(yelp.stars == 5) & (yelp.sentiment < -0.3)].head(1)
|   | business_id | date | review_id | stars | text | type | user_id | cool | useful | funny | length | sentiment |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 390 | 106JT5p8e8Chtd0CZpcARw | 2009-08-06 | KowGVoP_gygzdSu6Mt3zKQ | 5 | RIP AZ Coffee Connection. :( I stopped by two days ago unaware that they had closed. I am severely bummed. This place is irreplaceable! Damn you, Starbucks and McDonalds! | review | jKeaOrPyJ-dI9SNeVqrbww | 1 | 0 | 0 | 175 | -0.302083 |
# positive sentiment in a 1-star review
yelp[(yelp.stars == 1) & (yelp.sentiment > 0.5)].head(1)
|   | business_id | date | review_id | stars | text | type | user_id | cool | useful | funny | length | sentiment |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1781 | 53YGfwmbW73JhFiemNeyzQ | 2012-06-22 | Gi-4O3EhE175vujbFGDIew | 1 | If you like the stuck up Scottsdale vibe this is a good place for you. The food isn't impressive. Nice outdoor seating. | review | Hqgx3IdJAAaoQjvrUnbNvw | 0 | 1 | 2 | 119 | 0.766667 |
# reset the column display width
pd.reset_option('max_colwidth')
# create a DataFrame that only contains the 5-star and 1-star reviews
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]
# define X and y
feature_cols = ['text', 'sentiment', 'cool', 'useful', 'funny']
X = yelp_best_worst[feature_cols]
y = yelp_best_worst.stars
# split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# use CountVectorizer with text column only
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train.text)
X_test_dtm = vect.transform(X_test.text)
print X_train_dtm.shape
print X_test_dtm.shape
(3064, 16825)
(1022, 16825)
# shape of other four feature columns
X_train.drop('text', axis=1).shape
(3064, 4)
# cast other feature columns to float and convert to a sparse matrix
extra = sp.sparse.csr_matrix(X_train.drop('text', axis=1).astype(float))
extra.shape
(3064, 4)
# combine sparse matrices
X_train_dtm_extra = sp.sparse.hstack((X_train_dtm, extra))
X_train_dtm_extra.shape
(3064, 16829)
# repeat for testing set
extra = sp.sparse.csr_matrix(X_test.drop('text', axis=1).astype(float))
X_test_dtm_extra = sp.sparse.hstack((X_test_dtm, extra))
X_test_dtm_extra.shape
(1022, 16829)
# use logistic regression with text column only
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train_dtm, y_train)
y_pred_class = logreg.predict(X_test_dtm)
print metrics.accuracy_score(y_test, y_pred_class)
0.917808219178
# use logistic regression with all features
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train_dtm_extra, y_train)
y_pred_class = logreg.predict(X_test_dtm_extra)
print metrics.accuracy_score(y_test, y_pred_class)
0.922700587084
# spelling correction
TextBlob('15 minuets late').correct()
TextBlob("15 minutes late")
# spellcheck
Word('parot').spellcheck()
[('part', 0.9929478138222849), (u'parrot', 0.007052186177715092)]
# definitions
Word('bank').define('v')
[u'tip laterally', u'enclose with a bank', u'do business with a bank or keep an account at a bank', u'act as the banker in a game or in gambling', u'be in the banking business', u'put into a bank account', u'cover with ashes so to control the rate of burning', u'have confidence or faith in']
# language identification
TextBlob('Hola amigos').detect_language()
u'es'