%matplotlib inline
from collections import defaultdict
import json

import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl

#colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'

def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks

    The top/right/left/bottom keywords toggle whether the corresponding plot
    border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)

    #turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    #now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)

fulldf = pd.read_csv("bigdf.csv")
fulldf.head(2)

Each row of this dataframe is a single review, with the following fields:

'stars': star rating, integer 1-5
'date': date, formatted like '2011-04-19'
'review_id': unique id for the review
'business_id': unique identifier for the business
'biz_name': the full business name
'latitude': latitude
'longitude': longitude
'business_review_count': review count for the restaurant (repeated on every review of that restaurant)
'categories': list of localized category names
'business_avg': average stars over all reviews of the business (repeated on every review of that restaurant)
'user_id': unique user identifier
'user_name': first name, last initial, like 'Matt J.'
'user_review_count': count of restaurants reviewed by the user (repeated on every review by that user)
'user_avg': floating-point average of the user's reviews over all businesses, like 4.31 (repeated on every review by that user)
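Before going further, it can help to confirm the load matched the schema above. A minimal sanity check, as a sketch: the expected column list is transcribed from the field descriptions and may need adjusting if your copy of bigdf.csv differs.

# Sanity check (sketch): verify the columns described above are present.
expected = ['stars', 'date', 'review_id', 'business_id', 'biz_name',
            'latitude', 'longitude', 'business_review_count', 'categories',
            'business_avg', 'user_id', 'user_name', 'user_review_count',
            'user_avg']
missing = [c for c in expected if c not in fulldf.columns]
print "missing columns:", missing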
#your code here
urc = fulldf.groupby('user_id').review_id.count()
ax = urc.hist(bins=50, log=True)
remove_border(ax)
plt.xlabel("Reviews per user")
plt.grid(False)
plt.grid(axis='y', color='white', linestyle='-')
plt.title("Review Count per User");

#your code here
brc = fulldf.groupby('business_id').review_id.count()
ax = brc.hist(bins=50, log=True)
remove_border(ax)
plt.xlabel("Reviews per restaurant")
plt.grid(False)
plt.grid(axis='y', color='white', linestyle='-')
plt.title("Review Count per Restaurant");

#your code here
print "Number of Reviews", fulldf.shape[0]
print "Number of Users", fulldf.user_id.unique().shape[0], "Number of Businesses", fulldf.business_id.unique().shape[0]

#your code here
print "Mean stars over all reviews:", fulldf.stars.mean()
stars = fulldf.stars
ax = stars.hist(bins=5)
remove_border(ax)
plt.xlabel("Star rating")
plt.grid(False)
plt.grid(axis='y', color='white', linestyle='-')
plt.title("Star ratings over all reviews");

def recompute_frame(ldf):
    """
    takes a dataframe ldf, makes a copy of it, and returns the copy
    with all averages and review counts recomputed.
    this is used when a frame is subsetted.
    """
    ldfu = ldf.groupby('user_id')
    ldfb = ldf.groupby('business_id')
    user_avg = ldfu.stars.mean()
    user_review_count = ldfu.review_id.count()
    business_avg = ldfb.stars.mean()
    business_review_count = ldfb.review_id.count()
    nldf = ldf.copy()
    nldf.set_index(['business_id'], inplace=True)
    nldf['business_avg'] = business_avg
    nldf['business_review_count'] = business_review_count
    nldf.reset_index(inplace=True)
    nldf.set_index(['user_id'], inplace=True)
    nldf['user_avg'] = user_avg
    nldf['user_review_count'] = user_review_count
    nldf.reset_index(inplace=True)
    return nldf

#your code here
smallidf = fulldf[(fulldf.user_review_count > 60) & (fulldf.business_review_count > 150)]
smalldf = recompute_frame(smallidf)

#your code here
print "Total Number of Reviews", smalldf.shape[0]
print "Users in this set", smalldf.user_id.unique().shape[0], "Restaurants", smalldf.business_id.unique().shape[0]

plt.figure()
ax = smalldf.groupby('user_id').review_id.count().hist()
remove_border(ax)
plt.xlabel("Reviews per user")
plt.grid(False)
plt.grid(axis='y', color='white', linestyle='-')

plt.figure()
ax = smalldf.groupby('business_id').review_id.count().hist()
remove_border(ax)
plt.xlabel("Reviews per restaurant")
plt.grid(False)
plt.grid(axis='y', color='white', linestyle='-')

#your code here
plt.figure()
avg_ratings_by_user = smalldf.groupby('user_id').stars.mean()
ax = avg_ratings_by_user.hist()
remove_border(ax)
plt.xlabel("Average review score")
plt.grid(False)
plt.grid(axis='y', color='white', linestyle='-')
plt.title("Average User Rating")

plt.figure()
avg_ratings_by_biz = smalldf.groupby('business_id').stars.mean()
ax = avg_ratings_by_biz.hist()
remove_border(ax)
plt.xlabel("Average review score")
plt.grid(False)
plt.grid(axis='y', color='white', linestyle='-')
plt.title("Average Restaurant Rating")

print smalldf.stars.mean()

plt.figure()
restaurants = smalldf.business_id.unique()
supports = []
for i, rest1 in enumerate(restaurants):
    for j, rest2 in enumerate(restaurants):
        if i < j:
            rest1_reviewers = smalldf[smalldf.business_id == rest1].user_id.unique()
            rest2_reviewers = smalldf[smalldf.business_id == rest2].user_id.unique()
            common_reviewers = set(rest1_reviewers).intersection(rest2_reviewers)
            supports.append(len(common_reviewers))
print "Mean support is:", np.mean(supports)
plt.hist(supports)
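The double loop above re-subsets the dataframe for every pair of restaurants, which means O(n^2) dataframe scans. A sketch of an equivalent computation that scans the dataframe only once per restaurant, using the same restaurants array and smalldf defined above:

# Precompute each restaurant's reviewer set once, then intersect sets.
# This should reproduce the supports computed above, just faster.
reviewers_of = {r: set(smalldf[smalldf.business_id == r].user_id)
                for r in restaurants}
supports_fast = [len(reviewers_of[r1] & reviewers_of[r2])
                 for i, r1 in enumerate(restaurants)
                 for j, r2 in enumerate(restaurants) if i < j]
print "Mean support (precomputed sets):", np.mean(supports_fast)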
from scipy.stats.stats import pearsonr

def pearson_sim(rest1_reviews, rest2_reviews, n_common):
    """
    Given a subframe of restaurant 1 reviews and a subframe of restaurant 2
    reviews, where the reviewers are those who have reviewed both restaurants,
    return the pearson correlation coefficient between the user-average-subtracted
    ratings.
    The case of zero common reviewers is handled separately. It's ok to return
    a NaN if any of the individual variances are 0.
    """
    if n_common == 0:
        rho = 0.
    else:
        diff1 = rest1_reviews['stars'] - rest1_reviews['user_avg']
        diff2 = rest2_reviews['stars'] - rest2_reviews['user_avg']
        rho = pearsonr(diff1, diff2)[0]
    return rho

def get_restaurant_reviews(restaurant_id, df, set_of_users):
    """
    given a restaurant id and a set of reviewers, return the sub-dataframe
    of their reviews.
    """
    mask = (df.user_id.isin(set_of_users)) & (df.business_id == restaurant_id)
    reviews = df[mask]
    reviews = reviews[reviews.user_id.duplicated() == False]
    return reviews

"""
Function
--------
calculate_similarity

Parameters
----------
rest1 : string
    The id of restaurant 1
rest2 : string
    The id of restaurant 2
df : DataFrame
    A dataframe of reviews, such as the smalldf above
similarity_func : func
    A function like pearson_sim above which takes two dataframes of individual
    restaurant reviews made by a common set of reviewers, and the number of
    common reviews. This function returns the similarity of the two restaurants
    based on the common reviews.

Returns
--------
A tuple
    The first element of the tuple is the similarity and the second the
    common support n_common. If the similarity is a NaN, set it to 0
"""
#your code here
def calculate_similarity(rest1, rest2, df, similarity_func):
    # find common reviewers
    rest1_reviewers = df[df.business_id == rest1].user_id.unique()
    rest2_reviewers = df[df.business_id == rest2].user_id.unique()
    common_reviewers = set(rest1_reviewers).intersection(rest2_reviewers)
    n_common = len(common_reviewers)
    # get reviews
    rest1_reviews = get_restaurant_reviews(rest1, df, common_reviewers)
    rest2_reviews = get_restaurant_reviews(rest2, df, common_reviewers)
    sim = similarity_func(rest1_reviews, rest2_reviews, n_common)
    if np.isnan(sim):
        return 0, n_common
    return sim, n_common
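As a quick sketch of how pearson_sim behaves, here are two hypothetical review subframes for the same three common reviewers; the stars and user_avg values are fabricated for illustration.

# Toy check: ratings that move together give rho = 1; zero common
# reviewers short-circuits to 0. Both frames are made-up data.
toy1 = pd.DataFrame({'stars': [5., 3., 1.], 'user_avg': [3., 3., 3.]})
toy2 = pd.DataFrame({'stars': [4., 3., 2.], 'user_avg': [3., 3., 3.]})
print pearson_sim(toy1, toy2, 3)   # perfectly correlated -> 1.0
print pearson_sim(toy1, toy2, 0)   # no common support -> 0.0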
class Database:
    "A class representing a database of similarities and common supports"

    def __init__(self, df):
        "the constructor, takes a reviews dataframe like smalldf as its argument"
        self.df = df
        self.uniquebizids = {v: k for (k, v) in enumerate(df.business_id.unique())}
        keys = self.uniquebizids.keys()
        l_keys = len(keys)
        self.database_sim = np.zeros([l_keys, l_keys])
        self.database_sup = np.zeros([l_keys, l_keys], dtype=np.int)

    def populate_by_calculating(self, similarity_func):
        """
        a populator for every pair of businesses in df.
        takes similarity_func like pearson_sim as argument
        """
        items = self.uniquebizids.items()
        for b1, i1 in items:
            for b2, i2 in items:
                if i1 < i2:
                    sim, nsup = calculate_similarity(b1, b2, self.df, similarity_func)
                    self.database_sim[i1][i2] = sim
                    self.database_sim[i2][i1] = sim
                    self.database_sup[i1][i2] = nsup
                    self.database_sup[i2][i1] = nsup
                elif i1 == i2:
                    nsup = self.df[self.df.business_id == b1].user_id.count()
                    self.database_sim[i1][i1] = 1.
                    self.database_sup[i1][i1] = nsup

    def get(self, b1, b2):
        "returns a tuple of similarity, common_support given two business ids"
        sim = self.database_sim[self.uniquebizids[b1]][self.uniquebizids[b2]]
        nsup = self.database_sup[self.uniquebizids[b1]][self.uniquebizids[b2]]
        return (sim, nsup)

db = Database(smalldf)
db.populate_by_calculating(pearson_sim)

db.get("z3yFuLVrmH-3RJruPEMYKw", "zruUQvFySeXyEd7_rQixBg")

def shrunk_sim(sim, n_common, reg=3.):
    "takes a similarity and shrinks it down by using the regularizer"
    ssim = (n_common * sim) / (n_common + reg)
    return ssim

"""
Function
--------
knearest

Parameters
----------
restaurant_id : string
    The id of the restaurant whose nearest neighbors we want
set_of_restaurants : array
    The set of restaurants from which we want to find the nearest neighbors
dbase : instance of Database class.
    A database of similarities, on which the get method can be used to get
    the similarity of two businesses. e.g. dbase.get(rid1, rid2)
k : int
    the number of nearest neighbors desired, default 7
reg : float
    the regularization.

Returns
--------
A sorted list of the top k similar restaurants. The list is a list of tuples
(business_id, shrunken similarity, common support).
"""
#your code here
from operator import itemgetter

def knearest(restaurant_id, set_of_restaurants, dbase, k=7, reg=3.):
    """
    Given a restaurant_id, dataframe, and database, get a sorted list of the
    k most similar restaurants from the entire database.
    """
    similars = []
    for other_rest_id in set_of_restaurants:
        if other_rest_id != restaurant_id:
            sim, nc = dbase.get(restaurant_id, other_rest_id)
            ssim = shrunk_sim(sim, nc, reg=reg)
            similars.append((other_rest_id, ssim, nc))
    similars = sorted(similars, key=itemgetter(1), reverse=True)
    return similars[0:k]

testbizid = "eIxSLxzIlfExI6vgAbn2JA"
testbizid2 = "L-uPZxooP_ziXCtRrWi8Pw"

def biznamefromid(df, theid):
    return df['biz_name'][df['business_id'] == theid].values[0]

def usernamefromid(df, theid):
    return df['user_name'][df['user_id'] == theid].values[0]

print testbizid, biznamefromid(smalldf, testbizid)
print testbizid2, biznamefromid(smalldf, testbizid2)

tops = knearest(testbizid, smalldf.business_id.unique(), db, k=7, reg=3.)
print "For ", biznamefromid(smalldf, testbizid), ", top matches are:"
for i, (biz_id, sim, nc) in enumerate(tops):
    print i, biznamefromid(smalldf, biz_id), "| Sim", sim, "| Support", nc

tops2 = knearest(testbizid2, smalldf.business_id.unique(), db, k=7, reg=3.)
print "For ", biznamefromid(smalldf, testbizid2), ", top matches are:"
for i, (biz_id, sim, nc) in enumerate(tops2):
    print i, biznamefromid(smalldf, biz_id), "| Sim", sim, "| Support", nc

def get_user_top_choices(user_id, df, numchoices=5):
    "get the sorted top 5 restaurants for a user by the star rating the user gave them"
    udf = df[df.user_id == user_id][['business_id', 'stars']].sort(['stars'], ascending=False).head(numchoices)
    return udf

testuserid = "7cR92zkDv4W3kqzii6axvg"
print "For user", usernamefromid(smalldf, testuserid), "top choices are:"
bizs = get_user_top_choices(testuserid, smalldf)['business_id'].values
[biznamefromid(smalldf, biz_id) for biz_id in bizs]
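One way to build intuition for the reg parameter: a perfect similarity of 1.0 gets pulled toward zero when the common support is small. A short illustrative sketch:

# shrunk_sim scales a similarity by n/(n+reg), damping low-support pairs.
for n in [1, 3, 10, 100]:
    print "n_common =", n, "-> shrunk sim =", shrunk_sim(1.0, n, reg=3.)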
"""
Function
--------
get_top_recos_for_user

Parameters
----------
userid : string
    The id of the user for whom we want the top recommendations
df : Dataframe
    The dataframe of restaurant reviews such as smalldf
dbase : instance of Database class.
    A database of similarities, on which the get method can be used to get
    the similarity of two businesses. e.g. dbase.get(rid1, rid2)
n : int
    the n top choices of the user by star rating
k : int
    the number of nearest neighbors desired, default 8
reg : float
    the regularization.

Returns
--------
A sorted list of the top recommendations. The list is a list of tuples
(business_id, business_avg). You are combining the k-nearest recommendations
for each of the user's n top choices, removing duplicates and the ones the
user has already rated.
"""
#your code here
def get_top_recos_for_user(userid, df, dbase, n=5, k=7, reg=3.):
    bizs = get_user_top_choices(userid, df, numchoices=n)['business_id'].values
    rated_by_user = df[df.user_id == userid].business_id.values
    tops = []
    for ele in bizs:
        t = knearest(ele, df.business_id.unique(), dbase, k=k, reg=reg)
        for e in t:
            if e[0] not in rated_by_user:
                tops.append(e)
    # there might be repeats. unique them
    ids = [e[0] for e in tops]
    uids = {bid: 0 for bid in set(ids)}
    topsu = []
    for e in tops:
        if uids[e[0]] == 0:
            topsu.append(e)
            uids[e[0]] = 1
    topsr = []
    for r, s, nc in topsu:
        avg_rate = df[df.business_id == r].stars.mean()
        topsr.append((r, avg_rate))
    topsr = sorted(topsr, key=itemgetter(1), reverse=True)
    if n < len(topsr):
        return topsr[0:n]
    else:
        return topsr

print "For user", usernamefromid(smalldf, testuserid), "the top recommendations are:"
toprecos = get_top_recos_for_user(testuserid, smalldf, db, n=5, k=7, reg=3.)
for biz_id, biz_avg in toprecos:
    print biznamefromid(smalldf, biz_id), "| Average Rating |", biz_avg

"""
Function
--------
knearest_amongst_userrated

Parameters
----------
restaurant_id : string
    The id of the restaurant whose nearest neighbors we want
user_id : string
    The id of the user, in whose reviewed restaurants we want to find the neighbors
df : Dataframe
    The dataframe of reviews such as smalldf
dbase : instance of Database class.
    A database of similarities, on which the get method can be used to get
    the similarity of two businesses. e.g. dbase.get(rid1, rid2)
k : int
    the number of nearest neighbors desired, default 7
reg : float
    the regularization.

Returns
--------
A sorted list of the top k similar restaurants. The list is a list of tuples
(business_id, shrunken similarity, common support).
"""
#your code here
def knearest_amongst_userrated(restaurant_id, user_id, df, dbase, k=7, reg=3.):
    dfuser = df[df.user_id == user_id]
    bizsuserhasrated = dfuser.business_id.unique()
    return knearest(restaurant_id, bizsuserhasrated, dbase, k=k, reg=reg)
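A quick spot check, as a sketch using the test ids defined earlier: the neighbor search is now restricted to restaurants the test user has rated, so every returned id should come from that user's review history.

# Neighbors of testbizid, drawn only from the test user's rated restaurants.
print knearest_amongst_userrated(testbizid, testuserid, smalldf, db, k=3, reg=3.)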
"""
Function
--------
rating

Parameters
----------
df : Dataframe
    The dataframe of reviews such as smalldf
dbase : instance of Database class.
    A database of similarities, on which the get method can be used to get
    the similarity of two businesses. e.g. dbase.get(rid1, rid2)
restaurant_id : string
    The id of the restaurant whose nearest neighbors we want
user_id : string
    The id of the user, in whose reviewed restaurants we want to find the neighbors
k : int
    the number of nearest neighbors desired, default 7
reg : float
    the regularization.

Returns
--------
A float which is the imputed rating that we predict user_id will make
for restaurant_id
"""
#your code here
def rating(df, dbase, restaurant_id, user_id, k=7, reg=3.):
    mu = df.stars.mean()
    users_reviews = df[df.user_id == user_id]
    nsum = 0.
    scoresum = 0.
    nears = knearest_amongst_userrated(restaurant_id, user_id, df, dbase, k=k, reg=reg)
    restaurant_mean = df[df.business_id == restaurant_id].business_avg.values[0]
    user_mean = users_reviews.user_avg.values[0]
    scores = []
    for r, s, nc in nears:
        scoresum = scoresum + s
        scores.append(s)
        r_reviews_row = users_reviews[users_reviews['business_id'] == r]
        r_stars = r_reviews_row.stars.values[0]
        r_avg = r_reviews_row.business_avg.values[0]
        rminusb = (r_stars - (r_avg + user_mean - mu))
        nsum = nsum + s * rminusb
    baseline = (user_mean + restaurant_mean - mu)
    #we might have nears, but there might be no commons, giving us a pearson of 0
    if scoresum > 0.:
        val = nsum / scoresum + baseline
    else:
        val = baseline
    return val

print "User Average", smalldf[smalldf.user_id == testuserid].stars.mean(), "for", usernamefromid(smalldf, testuserid)
print "Predicted ratings for top choices calculated earlier:"
for biz_id, biz_avg in toprecos:
    print biznamefromid(smalldf, biz_id), "|", rating(smalldf, db, biz_id, testuserid, k=7, reg=3.), "|", "Average", biz_avg

def get_other_ratings(restaurant_id, user_id, df):
    "get a user's rating for a restaurant and the restaurant's average rating"
    choice = df[(df.business_id == restaurant_id) & (df.user_id == user_id)]
    users_score = choice.stars.values[0]
    average_score = choice.business_avg.values[0]
    return users_score, average_score

print "for user", usernamefromid(smalldf, testuserid), 'avg', smalldf[smalldf.user_id == testuserid].stars.mean()
for biz_id in bizs:
    print "----------------------------------"
    print biznamefromid(smalldf, biz_id)
    print "Predicted Rating:", rating(smalldf, db, biz_id, testuserid, k=7, reg=3.)
    u, a = get_other_ratings(biz_id, testuserid, smalldf)
    print "Actual User Rating:", u, "Avg Rating", a

def compare_results(stars_actual, stars_predicted, ylow=-10, yhigh=15, title=""):
    """
    plot predicted results against actual results. Takes 2 arguments: a numpy
    array of actual ratings and a numpy array of predicted ratings.
    scatterplots the predictions, a unit-slope line, line segments joining the
    means, and a filled-in area of the standard deviations.
    """
    fig = plt.figure()
    df = pd.DataFrame(dict(actual=stars_actual, predicted=stars_predicted))
    ax = plt.scatter(df.actual, df.predicted, alpha=0.2, s=30, label="predicted")
    plt.ylim([ylow, yhigh])
    plt.plot([1, 5], [1, 5], label="slope 1")
    xp = [1, 2, 3, 4, 5]
    yp = df.groupby('actual').predicted.mean().values
    plt.plot(xp, yp, 'k', label="means")
    sig = df.groupby('actual').predicted.std().values
    plt.fill_between(xp, yp - sig, yp + sig, color='k', alpha=0.2)
    plt.xlabel("actual")
    plt.ylabel("predicted")
    plt.legend(frameon=False)
    remove_border()
    plt.grid(False)
    plt.title(title)
    print "fraction between -15 and 15 rating", np.mean(np.abs(df.predicted) < 15)

#your code here
def make_results_plot(df, k, reg):
    uid = df.user_id.values
    bid = df.business_id.values
    actual = df.stars.values
    predicted = np.zeros(len(actual))
    counter = 0
    for user_id, biz_id in zip(uid, bid):
        predicted[counter] = rating(df, db, biz_id, user_id, k=k, reg=reg)
        counter = counter + 1
    compare_results(actual, predicted)
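The plots from compare_results are qualitative; a numeric companion makes the k/reg comparisons below easier to rank. A sketch (in-sample RMSE, so it will be optimistic):

# In-sample RMSE for a given k and reg, to compare settings numerically.
def rmse_for(df, dbase, k, reg):
    preds = np.array([rating(df, dbase, b, u, k=k, reg=reg)
                      for u, b in zip(df.user_id.values, df.business_id.values)])
    return np.sqrt(np.mean((df.stars.values - preds)**2))

print "RMSE (k=3, reg=3.):", rmse_for(smalldf, db, 3, 3.)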
#your code here
print "k=3, reg=3."
make_results_plot(smalldf, 3, 3.)
plt.title("k=3, reg=3.")

print "k=3, reg=15."
make_results_plot(smalldf, 3, 15.)
plt.title("k=3, reg=15.")

print "k=10, reg=3."
make_results_plot(smalldf, 10, 3.)
plt.title("k=10, reg=3.")

print "k=10, reg=15."
make_results_plot(smalldf, 10, 15.)
plt.title("k=10, reg=15.")

def knearest_pos(restaurant_id, set_of_restaurants, dbase, k=7, reg=3.):
    """
    Given a restaurant_id, dataframe, and database, get a sorted list of the
    k most similar restaurants from the entire database, using a similarity
    rescaled to be positive.
    """
    similars = []
    for other_rest_id in set_of_restaurants:
        if other_rest_id != restaurant_id:
            sim, nc = dbase.get(restaurant_id, other_rest_id)
            ssim = shrunk_sim(sim, nc, reg=reg)
            similars.append((other_rest_id, ssim / 2.0 + float(nc) / (float(nc) + reg), nc))
    similars = sorted(similars, key=itemgetter(1), reverse=True)
    return similars[0:k]

def knearest_amongst_userrated_pos(restaurant_id, user_id, df, dbase, k=7, reg=3.):
    dfuser = df[df.user_id == user_id]
    bizsuserhasrated = dfuser.business_id.unique()
    return knearest_pos(restaurant_id, bizsuserhasrated, dbase, k=k, reg=reg)

def rating_pos(df, dbase, restaurant_id, user_id, k=7, reg=3.):
    mu = df.stars.mean()
    users_reviews = df[df.user_id == user_id]
    nsum = 0.
    scoresum = 0.
    nears = knearest_amongst_userrated_pos(restaurant_id, user_id, df, dbase, k=k, reg=reg)
    restaurant_mean = df[df.business_id == restaurant_id].business_avg.values[0]
    user_mean = users_reviews.user_avg.values[0]
    scores = []
    for r, sold, nc in nears:
        s = sold / 2.0
        shrink_factor = float(nc) / (float(nc) + reg)
        s = s + shrink_factor / 2.0
        scoresum = scoresum + s
        scores.append(s)
        r_reviews_row = users_reviews[users_reviews['business_id'] == r]
        r_stars = r_reviews_row.stars.values[0]
        r_avg = r_reviews_row.business_avg.values[0]
        rminusb = (r_stars - (r_avg + user_mean - mu))
        nsum = nsum + s * rminusb
    baseline = (user_mean + restaurant_mean - mu)
    #we might have nears, but there might be no commons, giving us a pearson of 0
    if scoresum > 0.:
        val = nsum / scoresum + baseline
    else:
        val = baseline
    return val

def make_results_plot_pos(df, k, reg):
    uid = df.user_id.values
    bid = df.business_id.values
    actual = df.stars.values
    predicted = np.zeros(len(actual))
    counter = 0
    for user_id, biz_id in zip(uid, bid):
        predicted[counter] = rating_pos(df, db, biz_id, user_id, k=k, reg=reg)
        counter = counter + 1
    compare_results(actual, predicted, ylow=1, yhigh=5)

print "k=2, reg=1."
make_results_plot_pos(smalldf, 2, 1.)
plt.title("k=2, reg=1.")

print "k=2, reg=15."
make_results_plot_pos(smalldf, 2, 15.)
plt.title("k=2, reg=15.")

print "k=15, reg=1."
make_results_plot_pos(smalldf, 15, 1.)
plt.title("k=15, reg=1.")

print "k=15, reg=15."
make_results_plot_pos(smalldf, 15, 15.)
plt.title("k=15, reg=15.")

"""
Function
--------
gamma_m_draw

Draw a single sample from the conditional posterior distribution of gamma_m.

Inputs
-------
X_m: A g-by-L+1 matrix, defined above.
Y_m: A 1D vector of length g, defined above.
sig2: Residual _variance_, as defined above.
Lambda_gamma: Prior precision matrix.

Outputs
--------
Single draw from conditional posterior, defined above.
"""
#Item-specific parameters given all else
#your code here
def gamma_m_draw(X_m, Y_m, sig2, Lambda_gamma):
    #Compute matrices that define conditional posterior.
    Q_m_inv = np.linalg.inv(np.dot(X_m.T, X_m) / sig2 + Lambda_gamma)
    XtY = np.dot(X_m.T, Y_m)
    #Draw item-specific parameters.
    return np.random.multivariate_normal(np.dot(Q_m_inv, XtY) / sig2, Q_m_inv)
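A sanity check on the conditional draw, as a sketch with fabricated data: with an overwhelming prior precision the draw should collapse toward zero, and with a negligible prior it should land near the least-squares estimate (here, the mean of Y_m).

# Intercept-only toy problem: X is a column of ones, so the OLS solution
# is mean(Y) = 3.0. Strong prior -> ~0; weak prior -> ~3.
np.random.seed(0)
X_toy = np.ones([5, 1])
Y_toy = np.array([1., 2., 3., 4., 5.])
print gamma_m_draw(X_toy, Y_toy, 1.0, np.eye(1) * 1e8)   # approx [0.]
print gamma_m_draw(X_toy, Y_toy, 1.0, np.eye(1) * 1e-8)  # approx [3.]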
""" #User-specific parameters given all else #your code here def theta_u_draw(X_u, Y_u, sig2, Lambda_theta): #Compute matrices that define conditional posterior. Q_u_inv = np.linalg.inv(np.dot(X_u.T, X_u)/sig2+Lambda_theta) XtY = np.dot(X_u.T, Y_u) #Draw the user-specific parameters return np.random.multivariate_normal(np.dot(Q_u_inv, XtY)/sig2, Q_u_inv) """ Function -------- factor_gibbs Runs a gibbs sampler to infer mean, variance, user-specific, and item-specific parameters. Inputs ------- data: A dataframe containing ratings data. L: Dimension of latent factors. maxit: Number of samples to draw from posterior. Lambda_theta_diag: Hyperparameter controlling regularization of Theta. Lambda_gamma_diag: Hyperparameter controlling regularization of Gamma. progress: if true, print iteration number every 100 iterations. Outputs -------- Dictionary with elements mu: Draws of mu. 1D array of length maxiter. sig2: Draws of sig2, residual _variance_. 1D array of length maxiter. theta: Draws of Theta. U-by-L-by-maxiter array. gamma: Draws of Gamma. M-by-L-by-maxiter array. EY: Draws of fitted values of Y. N-by-maxiter array. """ def factor_gibbs(data, L, maxit, Lambda_theta_diag, Lambda_gamma_diag, progress=True): data = data.copy() N = data.shape[0] #Create indices that allow us to map users and restaurants to rows #in parameter vectors. uusers, uidx = np.unique(data.user_id, return_inverse=True) uitems, midx = np.unique(data.business_id, return_inverse=True) nusers = uusers.size nitems = uitems.size #Add numerical indices to dataframe. data["uidx"] = uidx data["midx"] = midx #Group observations by user and by business. ugroups = data.groupby("uidx") mgroups = data.groupby("midx") all_avg = data.stars.mean() u_avg = ugroups.stars.mean() m_avg = mgroups.stars.mean() #Initialize parameters and set up data structures for #holding draws. #Overall mean mu = all_avg mu_draws = np.zeros(maxit) #Residual variance sig2 = 0.5 sig2_draws = np.zeros(maxit) #Matrix of user-specific bias and L latent factors. theta = np.zeros([nusers, L+1]) theta[:,0] = u_avg-all_avg theta_draws = np.zeros([nusers, L+1, maxit]) #Matrix of item-specific bias and L latent factors. gamma = np.zeros([nitems, L+1]) gamma[:,0] = m_avg-all_avg gamma_draws = np.zeros([nitems, L+1, maxit]) #Matrix for holding the expected number of stars #for each observation at each draw from the posterior. EY_draws = np.zeros([data.shape[0], maxit]) #Inverse covariance matrices from the prior on each theta_u #and gamma_b. These are diagonal, like Ridge regression. Lambda_theta = np.eye(L+1)*Lambda_theta_diag Lambda_gamma = np.eye(L+1)*Lambda_gamma_diag #Main sampler code for i in range(maxit): if i%100==0 and progress: print i #The entire regression equation except for the overall mean. nomu = np.sum(theta[data.uidx,1:]*gamma[data.midx,1:], axis=1) +\ theta[data.uidx,0] + gamma[data.midx,0] #Compute the expectation of each observation given the current #parameter values. EY_draws[:,i]=mu+nomu #Draw overall mean from a normal distribution mu = np.random.normal(np.mean(data.stars-nomu), np.sqrt(sig2/N)) #Draw overall residual variance from a scaled inverse-Chi squared distribution. sig2 = np.sum(np.power(data.stars-nomu-mu,2))/np.random.chisquare(N-2) #For each item for mi,itemdf in mgroups: #Gather relevant observations, and subtract out overall mean and #user-specific biases, which we are holding fixed. Y_m = itemdf.stars-mu-theta[itemdf.uidx,0] #Build the regression design matrix implied by holding user factors #fixed. 
            X_m = np.hstack((np.ones([itemdf.shape[0], 1]),
                             theta[itemdf.uidx, 1:]))
            gamma[mi, :] = gamma_m_draw(X_m, Y_m, sig2, Lambda_gamma)

        #For each user
        for ui, userdf in ugroups:
            #Gather relevant observations, and subtract out overall mean and
            #business-specific biases, which we are holding fixed.
            Y_u = userdf.stars - mu - gamma[userdf.midx, 0]
            #Build the regression design matrix implied by holding business factors
            #fixed.
            X_u = np.hstack((np.ones([userdf.shape[0], 1]),
                             gamma[userdf.midx, 1:]))
            theta[ui, :] = theta_u_draw(X_u, Y_u, sig2, Lambda_theta)

        #Record draws
        mu_draws[i] = mu
        sig2_draws[i] = sig2
        theta_draws[:, :, i] = theta
        gamma_draws[:, :, i] = gamma

    return {"mu": mu_draws, "sig2": sig2_draws, "theta": theta_draws,
            "gamma": gamma_draws, "EY": EY_draws}

#your code here
gibbs_out = factor_gibbs(smalldf, 2, 1000, 0.1, 0.1)
burnin = 200
predicted = np.mean(gibbs_out['EY'][:, burnin:], axis=1)

#your code here
compare_results(smalldf.stars.values, predicted, ylow=1, yhigh=5, title="From Gibbs Sampler")

gibbs_out = factor_gibbs(smalldf, 15, 1000, 0.1, 0.1)
burnin = 200
predicted = np.mean(gibbs_out['EY'][:, burnin:], axis=1)
compare_results(smalldf.stars.values, predicted, ylow=1, yhigh=5, title="From Gibbs Sampler")

subsetoffull = fulldf[['user_id', 'business_id', 'stars', 'business_avg', 'user_avg']]
subsetoffull.to_csv("subset-full.csv", index=False, header=False)
subsetofsmall = smalldf[['user_id', 'business_id', 'stars', 'business_avg', 'user_avg']]
subsetofsmall.to_csv("subset-small.csv", index=False, header=False)

from pygments import highlight
from pygments.lexers import PythonLexer
from pygments.formatters import HtmlFormatter
from IPython.display import HTML
import urllib

skelcode = urllib.urlopen("https://raw.github.com/cs109/content/master/skeleton.py").read()
skelhtml = highlight(skelcode, PythonLexer(), HtmlFormatter())
HTML(skelhtml)

def upper_generator(words):
    for word in words:
        yield word.upper()

words = ['a', 'couple', 'of', 'words', 'to', 'process']
print upper_generator(words)
print list(upper_generator(words))
for u in upper_generator(words):
    print u

thecode = open("computesim.py").read()
thehtml = highlight(thecode, PythonLexer(), HtmlFormatter())
HTML(thehtml)

output_small_local = [[json.loads(j) for j in line.strip().split("\t")]
                      for line in open("./output.small.local.txt")]
output_small_local[0]

def make_database_from_pairs(df, bizpairs):
    """
    make the database from the pairs returned from mrjob.
    df is the dataframe, smalldf or fulldf.
    bizpairs are a list of elements, each of which is a list of two lists.
    The first of these lists has the two business ids, while the second has
    the similarity and the common support.
    Returns an instance of the Database class.
    """
    dbase = Database(df)
    cache = {}
    for bp, corrs in bizpairs:
        b1, b2 = bp
        i1 = dbase.uniquebizids[b1]
        i2 = dbase.uniquebizids[b2]
        sim, nsup = corrs
        dbase.database_sim[i1][i2] = sim
        dbase.database_sim[i2][i1] = sim
        dbase.database_sup[i1][i2] = nsup
        dbase.database_sup[i2][i1] = nsup
        if cache.has_key(b1):
            nsup1 = cache[b1]
        else:
            nsup1 = dbase.df[dbase.df.business_id == b1].user_id.count()
            cache[b1] = nsup1
        if cache.has_key(b2):
            nsup2 = cache[b2]
        else:
            nsup2 = dbase.df[dbase.df.business_id == b2].user_id.count()
            cache[b2] = nsup2
        dbase.database_sim[i1][i1] = 1.0
        dbase.database_sim[i2][i2] = 1.0
        dbase.database_sup[i1][i1] = nsup1
        dbase.database_sup[i2][i2] = nsup2
    return dbase

db_mrjob_local = make_database_from_pairs(smalldf, output_small_local)

print db.get("zruUQvFySeXyEd7_rQixBg", "z3yFuLVrmH-3RJruPEMYKw")
print db_mrjob_local.get("zruUQvFySeXyEd7_rQixBg", "z3yFuLVrmH-3RJruPEMYKw")
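Since both databases were built from smalldf, their uniquebizids maps should agree, so the similarity and support matrices can be compared element-wise. A stricter check than the running sum below, as a sketch:

# Largest absolute discrepancy between the exact and mrjob-built matrices.
print "max |sim diff|:", np.max(np.abs(db.database_sim - db_mrjob_local.database_sim))
print "max |sup diff|:", np.max(np.abs(db.database_sup - db_mrjob_local.database_sup))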
sums = 0.
count = 0
for k in db.uniquebizids.keys():
    for k2 in db.uniquebizids.keys():
        count = count + 1
        sums = sums + db.get(k, k2)[0] - db_mrjob_local.get(k, k2)[0]
print sums, count

output_full_emr = [[json.loads(j) for j in l.strip().split("\t")]
                   for l in open("./output.full.emr.txt")]
dbfull = make_database_from_pairs(fulldf, output_full_emr)

#your code here
print "for user", usernamefromid(fulldf, testuserid), 'avg', fulldf[fulldf.user_id == testuserid].stars.mean()
for i in bizs:
    print "========="
    print biznamefromid(fulldf, i), i
    print rating(fulldf, dbfull, i, testuserid, k=7, reg=3.)
    u, a = get_other_ratings(i, testuserid, fulldf)
    print "User Score:", u, "Avg score", a

thecode = open("computesim2.py").read()
thehtml = highlight(thecode, PythonLexer(), HtmlFormatter())
HTML(thehtml)
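As a final spot check, the loop above can be reduced to a single number; a sketch of the mean absolute error of the full-data predictions for this user's top-rated restaurants:

# MAE of predicted vs. actual ratings for the test user's top choices.
errs = []
for i in bizs:
    pred = rating(fulldf, dbfull, i, testuserid, k=7, reg=3.)
    u, a = get_other_ratings(i, testuserid, fulldf)
    errs.append(abs(pred - u))
print "MAE over", len(errs), "restaurants:", np.mean(errs)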