#!/usr/bin/env python
# coding: utf-8
# # Case study
# Given a user’s past reviews on Yelp (available from yelp-challenge dataset),
#
# When the user writes a review for a business she hasn't reviewed before,
#
# How likely will it be a Five-Star review?
#
# - Load data
# - Visualize the data
# - Featurize the data
# - Join tables to populate the features
# - Model the data: Logistic regression
# - Evaluate the model
# - Make prediction with the model
# # Load data
# In[9]:
import pandas as pd
# Root directory of the unpacked yelp-challenge dataset (CSV exports).
PATH = '/scratch/xun/docs/yelp_dataset_challenge_academic_dataset/'
# Load the three tables: businesses, users, and reviews.
biz_df = pd.read_csv(PATH + 'yelp_academic_dataset_business.csv')
user_df = pd.read_csv(PATH + 'yelp_academic_dataset_user.csv')
review_df = pd.read_csv(PATH + 'yelp_academic_dataset_review.csv')
# In[10]:
# Index each table by its natural primary key so later joins can use it.
review_df = review_df.set_index('review_id')
user_df = user_df.set_index('user_id')
biz_df = biz_df.set_index('business_id')
# # Visualize the data
# ## Example: Plot distribution of review star ratings
# In[11]:
import seaborn as sns
# Render matplotlib figures inline in the notebook.
get_ipython().run_line_magic('matplotlib', 'inline')
# Set context to "talk" for figure aesthetics
sns.set_context(context="talk")
# set plot figure size to larger
sns.set(palette='Set2', rc={"figure.figsize": (15, 8)}, style="ticks")
# In[12]:
# Bar chart of review counts per star rating, split by the `type` column.
ax = sns.countplot(x='stars', data=review_df, hue='type')
# Removing spines
sns.despine()
# ## Example: Plot review star ratings by year
# In[13]:
# Derive a `year` column from the review date string, then plot review
# counts per year, broken down by star rating.
review_df['datetime'] = pd.to_datetime(review_df['date'])
review_df['year'] = review_df['datetime'].dt.year
ax = sns.countplot(x='year', data=review_df, hue='stars')
sns.despine()
# # Featurize the data
#
# - Convert date string to date delta
# - For example, business_age
# - Convert strings to categorical features
# - For example, noise level: quiet, loud, very loud.
# - Drop unused features
# - For example, business_name
# In[14]:
def calculate_date_delta(df, from_column, to_column):
    """Convert a date-string column into "days before the newest date".

    Parses `df[from_column]` as datetimes, writes the integer number of
    days between each row's date and the column maximum into
    `df[to_column]`, and drops the original column in place.
    """
    parsed = pd.to_datetime(df[from_column])
    # Vectorized .dt.days replaces the per-row apply(lambda x: x.days);
    # `parsed` also avoids shadowing the common `datetime` module name.
    df[to_column] = (parsed.max() - parsed).dt.days
    df.drop(from_column, axis=1, inplace=True)
# In[15]:
def to_length(df, from_column, to_column):
    """Replace a sized-value column with its element-wise length.

    Writes len(value) for each entry of `df[from_column]` into
    `df[to_column]`, then removes the source column in place.
    """
    df[to_column] = df[from_column].apply(len)
    df.drop(from_column, axis=1, inplace=True)
# In[16]:
def drop_columns(df, columns):
    """Drop the given columns from `df` in place.

    A single vectorized drop replaces the original per-column loop,
    which mutated the frame once for every column.
    """
    df.drop(list(columns), axis=1, inplace=True)
# In[17]:
def to_boolean(df, columns):
    """Replace each listed column with a truthiness flag column.

    For every column name given, writes bool(value) into a new
    `<column>_bool` column and drops the original column in place.
    """
    for name in columns:
        df[name + '_bool'] = df[name].map(bool)
        df.drop(name, axis=1, inplace=True)
# In[18]:
# Placeholder for missing values; also pre-registered as an extra category
# on categorical columns so fillna(FILL_WITH) succeeds on them.
FILL_WITH = 0.0
# In[19]:
def to_category(df, columns, fill_with=None):
    """Cast the listed columns of `df` to 'category' dtype in place.

    Also registers `fill_with` as a category on each column so that a
    later fillna(fill_with) does not raise "fill value must be in
    categories".

    df: DataFrame, mutated in place.
    columns: iterable of column names to convert.
    fill_with: value to pre-register as a category; defaults to the
        module-level FILL_WITH (backward compatible with the old
        two-argument call).
    """
    if fill_with is None:
        fill_with = FILL_WITH
    for column in columns:
        df[column] = df[column].astype('category')
        # add fill_with category for fillna() to work w/o error
        if fill_with not in df[column].cat.categories:
            df[column] = df[column].cat.add_categories([fill_with])
# In[20]:
def category_rename_to_int(df, columns):
    """Re-label each categorical column's categories as integers 1..N, in place.

    Bug fix: the original called `df[column].cat.remove_unused_categories()`
    without assigning the result. That accessor returns a new Series, so the
    call was a no-op and unused categories still received integer labels.
    """
    for column in columns:
        # Assign the result back; remove_unused_categories() does not mutate.
        df[column] = df[column].cat.remove_unused_categories()
        size = len(df[column].cat.categories)
        df[column] = df[column].cat.rename_categories(range(1, size + 1))
# In[21]:
# Review table: turn the date string into an age-in-days feature.
calculate_date_delta(df=review_df, from_column='date', to_column='date_delta')
# In[22]:
# Replace the free-form review text with its length.
to_length(df=review_df, from_column='text', to_column='text_len')
# In[23]:
# Drop columns that are non-informative or were only added for plotting.
drop_columns(df=review_df, columns=['type', 'year', 'datetime'])
# In[24]:
review_df.fillna(value=0.0, inplace=True)
# In[25]:
# User table: same age-in-days treatment for the signup date.
calculate_date_delta(df=user_df, from_column='yelping_since', to_column='date_delta')
# In[26]:
# Friends / elite lists become simple counts.
to_length(df=user_df, from_column='friends', to_column='friends_count')
# In[27]:
to_length(df=user_df, from_column='elite', to_column='elite_count')
# In[28]:
drop_columns(df=user_df, columns=['name', 'type'])
# In[29]:
user_df.fillna(value=0.0, inplace=True)
# In[30]:
# Business table: drop identifiers, free-text/location fields, and the
# per-day opening-hours columns, keeping only predictive attributes.
drop_columns(
    df=biz_df,
    columns=[
        'type',
        'name',
        'city',
        'full_address',
        'state',
        'categories',
        'longitude',
        'latitude',
        'neighborhoods',
        'hours.Monday.open',
        'hours.Monday.close',
        'hours.Tuesday.open',
        'hours.Tuesday.close',
        'hours.Wednesday.open',
        'hours.Wednesday.close',
        'hours.Thursday.open',
        'hours.Thursday.close',
        'hours.Friday.open',
        'hours.Friday.close',
        'hours.Saturday.open',
        'hours.Saturday.close',
        'hours.Sunday.open',
        'hours.Sunday.close',
    ]
)
# In[31]:
# String-valued business attributes to convert to categorical features.
to_cat_columns = [
    'attributes.Ambience.casual',
    'attributes.Attire',
    'attributes.Alcohol',
    'attributes.Noise Level',
    'attributes.Smoking',
    'attributes.Wi-Fi',
    'attributes.Ages Allowed',
    'attributes.BYOB/Corkage',
]
to_category(
    df=biz_df,
    columns=to_cat_columns,
)
# In[32]:
# Fill missing attribute values with the FILL_WITH placeholder
# (pre-registered as a category by to_category above).
biz_df.fillna(value=FILL_WITH, inplace=True)
# In[33]:
# Re-label the categories as integers so the model receives numeric input.
category_rename_to_int(
    df=biz_df,
    columns=to_cat_columns,
)
# # Join tables to populate the features
#
# Join three tables (review, biz, user) to one (review-with-all-info).
# Each join is a many-to-one join.
# In[34]:
# The `user_df` DataFrame is already indexed by the join key (`user_id`). Make sure it's on the right side of join.
review_join_user = review_df.join(user_df, on='user_id', lsuffix='_review', rsuffix='_user')
# In[35]:
review_join_user_join_biz = review_join_user.join(biz_df, on='business_id', rsuffix='_biz')
# In[36]:
# The id columns were only needed as join keys; drop them from the features.
drop_columns(df=review_join_user_join_biz, columns=['user_id', 'business_id'])
# # Identify data X and target y
# Data X: all features we gathered from business, user, and review tables.
#
# Target y: what we'd like to predict: Whether the review is Five-star or not.
# In[37]:
# target y is whether a review is five-star
y = review_join_user_join_biz.stars.apply(lambda x: x == 5)
# Non-informative features were already dropped above, so X is the joined frame.
X = review_join_user_join_biz
# NOTE: X and review_join_user_join_biz are the same object, so this in-place
# drop removes the label column from X as well (y was computed first, above).
review_join_user_join_biz.drop('stars', axis=1, inplace=True)
# get the feature names - this will be useful for the model visualization and feature analysis
features = X.columns.values
# # Split training set and testing set
# In[38]:
from sklearn.cross_validation import train_test_split
# Split the data into a training set and a test set
# (scikit-learn default: 75% train / 25% test, random shuffle).
X_train, X_test, y_train, y_test = train_test_split(X, y)
# In[39]:
print 'training data shape', X_train.shape
print 'test data shape', X_test.shape
print 'converted label data shape', y_train.shape
#print 'features', features
# # Model the data: Logistic regression
#
# Logistic regression estimates the probability of a binary response based on one or more features.
#
# Here we estimate the probability of a review being five-star.
# In[40]:
from sklearn import preprocessing
# Standardize features by removing the mean and scaling to unit variance.
# The scaler is fit on the training split only, then applied to both splits,
# so no test-set statistics leak into training.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# In[41]:
from sklearn.cross_validation import cross_val_score
import numpy as np
# Function used to print cross-validation scores
def training_score(est, X, y, cv):
    """Print mean +/- std of cross-validated accuracy and ROC AUC.

    est: unfitted estimator; X, y: training features / labels;
    cv: cross-validation splitter (here a 5-fold StratifiedKFold).
    Calls cross_val_score once per metric, so the estimator is fit
    2 * n_folds times.
    """
    acc = cross_val_score(est, X, y, cv = cv, scoring='accuracy')
    roc = cross_val_score(est, X, y, cv = cv, scoring='roc_auc')
    print '5-fold Train CV | Accuracy:', round(np.mean(acc), 3),'+/-', \
    round(np.std(acc), 3),'| ROC AUC:', round(np.mean(roc), 3), '+/-', round(np.std(roc), 3)
# In[42]:
from sklearn import linear_model
# Build model using default parameter values
lrc = linear_model.LogisticRegression()
# In[43]:
# NOTE(review): sklearn.cross_validation (and this pre-0.18 StratifiedKFold
# signature) was removed in scikit-learn 0.20; newer code should use
# sklearn.model_selection instead.
from sklearn.cross_validation import StratifiedKFold
# cross-validation
# Stratified folds preserve the five-star / non-five-star ratio per fold.
cv = StratifiedKFold(y_train, n_folds=5, shuffle=True)
# In[44]:
# print cross-validation scores
training_score(est=lrc, X=X_train_scaled, y=y_train, cv=cv)
# # Evaluation via Confusion Matrix
#
# False positive (upper right).
#
# False negative (bottom left)
# In[45]:
# Compute confusion matrix
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    """Render a confusion matrix as a matplotlib heat-map image.

    cm: 2-D array (rows = true labels, columns = predicted labels).
    title: plot title.
    cmap: matplotlib colormap used for the cells.
    Draws on the current figure; the caller is responsible for plt.show().
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# Run classifier
# Fit logistic regression on the scaled training data, then predict the
# held-out test split.
lrc_fit = lrc.fit(X_train_scaled, y_train)
y_pred = lrc_fit.predict(X_test_scaled)
# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
# In[46]:
# Normalize the confusion matrix by row (i.e by the number of samples in each class)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Normalized confusion matrix')
print(cm_normalized)
# Plot the per-class-normalized matrix as a heat map.
plt.figure(figsize=(8, 8))
plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')
plt.show()
# # Make prediction with the model
#
# Randomly pick an anonymous user from the dataset.
#
# Example: Predict whether the user will give a business a Five-star rating.
# In[47]:
def predict_given_user_biz(user, biz, biz_name):
a_user = user.copy()
a_biz = biz.copy()
for column in {
'review_count',
'stars',
}:
a_biz[column+'_biz'] = a_biz[column]
a_biz.drop(column, axis=1, inplace=True)
for column in {
'date_delta',
'votes.funny',
'votes.useful',
'votes.cool',
}:
a_user[column+'_user'] = a_user[column]
a_user.drop(column, axis=1, inplace=True)
a_X_test = pd.concat([a_user.iloc[0], a_biz.iloc[0]], axis=0).to_frame().transpose()
a_X_test['review_id'] = 'xun_on_postino-arcadia-phoenix'
a_X_test = a_X_test.set_index('review_id')
for column in {
'votes.cool_review',
'votes.funny_review',
'votes.useful_review',
'date_delta_review',
'text_len',
}:
a_X_test[column] = ""
a_X_test[column] = '0.0'
# XXX(xun): fix this
a_X_test['attributes.BYOB/Corkage'] = 4
#a_X_test
a_X_test_scaled = scaler.transform(a_X_test)
a_y_pred = lrc_fit.predict(a_X_test_scaled)
print 'prediction for user ', a_user.index.values[0], ' on business ', biz_name, ' is ', a_y_pred[0]
# In[48]:
# An example user and two example businesses they have not reviewed.
a_user = user_df[user_df.index == 'HcOguFNyg9jNkNpTBD2D3g']
a_user.review_count
# In[49]:
a_biz = biz_df[biz_df.index == 'SDwYQ6eSu1htn8vHWv128g']
# # https://www.yelp.com/biz/postino-arcadia-phoenix
#
#
#
# In[57]:
predict_given_user_biz(user=a_user, biz=a_biz, biz_name="Postino Arcadia Phoenix")
# In[58]:
another_user = user_df[user_df.index == 'o625WyBtvJ_G3s0FRr6RmQ']
another_user.review_count
# In[51]:
another_biz = biz_df[biz_df.index == '1n0n_-Iz0e3iVpH8sereiA']
# # https://www.yelp.com/biz/port-authority-of-allegheny-county-pittsburgh
#
#
# In[52]:
# Bug fix: this call passes `another_biz` (Port Authority) but previously
# labeled it "Postino Arcadia Phoenix". The name only affects the printed
# message, not the prediction itself.
predict_given_user_biz(user=a_user, biz=another_biz, biz_name="Port Authority of Allegheny County")
# In[54]:
predict_given_user_biz(user=another_user, biz=another_biz, biz_name="Port Authority of Allegheny County")