#!/usr/bin/env python
# coding: utf-8

# ### Author: [Pratik Sharma](https://github.com/sharmapratik88/)

# ## Project 6 - Recommendation System
# Amazon Reviews data [source](http://jmcauley.ucsd.edu/data/amazon/). The repository has several datasets. For this case study, we are using the Electronics dataset.
#
# **Domain**: E-Commerce
#
# **Context**: Online e-commerce websites like Amazon and Flipkart use different recommendation models to provide suggestions to different users. Amazon currently uses item-to-item collaborative filtering, which scales to massive datasets and produces high-quality recommendations in real time.
#
# **Attribute Information**
# * `UserID`: Every user identified with a unique id.
# * `ProductID`: Every product identified with a unique id.
# * `Rating`: Rating of the corresponding product by the corresponding user.
# * `Timestamp`: Time of the rating.
#
# **Learning Outcomes**
# * Exploratory Data Analysis
# * Creating a recommendation system using real data
# * Collaborative filtering
#
# **Objective**: Build a recommendation system to recommend products to customers based on their previous ratings for other products.

# In[1]:

# Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

# In[ ]:

# Setting the current working directory
import os; os.chdir('drive/My Drive/Great Learning/Recommendation System')

# In[3]:

get_ipython().system("ls '/content/drive/My Drive/Great Learning/Recommendation System'")

# ### Import Packages

# In[4]:

get_ipython().system('pip install scikit-surprise')

# In[ ]:

# Imports
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
import matplotlib.style as style; style.use('fivethirtyeight')
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse.linalg import svds
import gc
get_ipython().run_line_magic('matplotlib', 'inline')

# Surprise package for making recommendations
from surprise import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise.model_selection import GridSearchCV, cross_validate, KFold
from surprise import accuracy, Reader, Dataset, dump

# For sklearn NearestNeighbors based recommendation
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import correlation, cosine
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import sklearn.metrics as metrics
from math import sqrt

# Display settings
pd.options.display.max_rows = 999
pd.options.display.max_columns = 20
pd.options.display.float_format = "{:.2f}".format
random_state = 2019
np.random.seed(random_state)

# Suppress warnings
import warnings; warnings.filterwarnings('ignore')

# ### Read and explore the dataset

# In[6]:

# Reading the data as a dataframe and printing the first five rows
ratings = pd.read_csv('ratings_Electronics.csv', header = None)
ratings.columns = ['UserID', 'ProductID', 'Rating', 'Timestamp']
ratings.head()

# In[7]:

# Get info on the dataframe columns
print('Get info on the dataframe columns'); print('--'*40)
ratings.info()

# In[8]:

# Check if there are any null values in the dataframe
print('There are no null values in the dataset'); print('--'*40)
ratings.isnull().sum()

# In[9]:

# Check if there are any duplicate rows
print('Dataset has no duplicate rows'); print('--'*40)
ratings[ratings.duplicated(keep = 'first')]

# In[10]:

# Checking the unique values in the `Rating` column
print('Checking the unique values in the Rating column'); print('--'*40)
sorted(list(ratings['Rating'].unique()))
# #### Observation 1 - Dataset shape
# Dataset has more than 7.8 million reviews, with information on user id, product id, rating and timestamp. There are no missing values or duplicates in the dataset. Ratings are on a scale of 1-5.
#
# #### Observation 2 - Information on the type of variable
#
# * `UserID`: Every user identified with a unique id (Categorical, Nominal).
# * `ProductID`: Every product identified with a unique id (Categorical, Nominal).
# * `Rating`: Rating of the corresponding product by the corresponding user (Numerical, Discrete).
# * `Timestamp`: Time of the rating (Timestamp).

# In[11]:

### Five-point summary of numerical attributes and check of unique values in 'object' columns
print('Five-point summary of the dataframe'); print('--'*40)
ratings.describe(include = 'all')

# In[12]:

display(sorted(list(ratings['ProductID'].unique()))[0:5], sorted(list(ratings['ProductID'].unique()))[-5:])

# #### Observation 3 - Descriptive statistics
# * **`UserID`**: Categorical column with alphanumeric user ids. Number of users in the dataset: 4,201,696.
# * **`ProductID`**: Categorical column where some of the product ids are numerical entries and some are alphanumeric. Number of rated products: 476,002.
# * **`Rating`**: Users have rated the products on a scale of 1 to 5.
# * **`Timestamp`**: Can be useful if we convert the numerical timestamp to datetime.

# In[13]:

fig = plt.figure(figsize = (15, 7.2))
ax = fig.add_subplot(121)
g = sns.distplot(ratings['Rating'], ax = ax).set_title('Distribution of Ratings')
ax = fig.add_subplot(122)
g = sns.countplot(ratings['Rating']).set_title('Count of Ratings')

# In[ ]:

ratings['Timestamp'] = pd.to_datetime(ratings['Timestamp'], unit = 's')
ratings['Year'] = ratings['Timestamp'].dt.year

# In[15]:

print('Trend of ratings over the years'); print('--'*40)
ratings_over_years = ratings.groupby(by = 'Year', as_index = False)['Rating'].count()
fig = plt.figure(figsize = (15, 7.2))
g = sns.lineplot(x = 'Year', y = 'Rating', data = ratings_over_years).set_title('Trend of Ratings over the Years')
del g, ratings_over_years
# In[16]:

# http://jonathansoma.com/lede/data-studio/classes/small-multiples/long-explanation-of-using-plt-subplots-to-create-small-multiples/
print('Yearwise counts for ratings. Trend is similar across rating categories.')
print('Most users have rated products 5 and the highest number of ratings came in 2013.'); print('--'*40)
year_wise_ratings = pd.DataFrame(ratings.groupby(['Rating', 'Year'], as_index = False)['UserID'].count())
year_wise_ratings.rename(columns = {'UserID': 'Counts'}, inplace = True)
ratings_ = sorted(year_wise_ratings['Rating'].unique())
fig, axes = plt.subplots(nrows = 2, ncols = 3, squeeze = False, figsize = (15, 7.2))
plt.subplots_adjust(hspace = 0.5)
axes_list = [item for sublist in axes for item in sublist]
for rating in ratings_:
    ax = axes_list.pop(0)
    g = year_wise_ratings[year_wise_ratings['Rating'] == rating].plot(kind = 'bar', x = 'Year', y = 'Counts', label = f'Rating = {rating}', ax = ax, legend = True)
    ax.set_title(f'Yearwise Count for Rating = {rating}')
for ax in axes_list:
    ax.remove()
del ax, axes, axes_list, fig, rating, ratings_, year_wise_ratings

# In[17]:

print('Adding a column with the count of ratings per user'); print('--'*40)
userid = ratings['UserID'].value_counts()
userid = pd.DataFrame(userid).reset_index()
userid.columns = ['UserID', 'UserIDCounts']
ratings_df = ratings.merge(userid, how = 'left', on = ['UserID'])
display(ratings_df.shape, ratings_df.head())
del userid

# In[18]:

# Number of unique user ids and product ids in the data
print('Number of unique USERS and PRODUCT IDs in the raw ratings dataframe'); print('--'*40)
print('Number of unique USERS in raw ratings dataframe = ', ratings_df['UserID'].nunique())
print('Number of unique PRODUCTS in raw ratings dataframe = ', ratings_df['ProductID'].nunique())

# In[19]:

print('Distribution of ratings per user is sparse')
print('Maximum number of ratings per user is {maxm} and minimum is {minm}'.format(maxm = ratings_df['UserIDCounts'].max(), minm = ratings_df['UserIDCounts'].min()))
print('--'*40)
fig = plt.figure(figsize = (15, 7.2))
g = sns.distplot(ratings_df['UserIDCounts'], bins = 50).set_title('Distribution of Ratings per User')
del fig, g

# In[20]:

print('Taking a subset of the dataset to make it less sparse/denser')
print('Keeping users who have given 50 or more ratings'); print('--'*40)
ratings_df = ratings_df[ratings_df['UserIDCounts'] >= 50]
print('Number of rows after filtering: {}'.format(ratings_df.shape[0]))

# In[21]:

fig = plt.figure(figsize = (15, 7.2))
g = sns.distplot(ratings_df['UserIDCounts'], bins = 50).set_title('Distribution of Ratings per User after filtering out users with less than 50 ratings')
del fig, g

# In[22]:

print('Number of product ids after filtering based on ratings given by users: {}'.format(ratings_df['ProductID'].nunique()))

# In[23]:

print('Selecting only the UserID, ProductID and Rating columns'); print('--'*40)
ratings = ratings_df[['UserID', 'ProductID', 'Rating']]

# In[24]:

# Number of unique user ids and product ids in the data
print('Number of unique USERS and PRODUCT IDs in the filtered ratings dataframe'); print('--'*40)
print('Number of unique USERS in filtered ratings dataframe = ', ratings['UserID'].nunique())
print('Number of unique PRODUCTS in filtered ratings dataframe = ', ratings['ProductID'].nunique())

# In[25]:

# Top and bottom 10 users based on # of ratings given
print('Top 10 users based on # of ratings given'); print('--'*40)
most_rated = ratings.groupby('UserID').size().sort_values(ascending = False)[:10]
display(most_rated)
print('\nBottom 10 users based on # of ratings given'); print('--'*40)
least_rated = ratings.groupby('UserID').size().sort_values(ascending = True)[:10]
display(least_rated)
del most_rated, least_rated
# ### Recommenders
# We will explore the following methods of making recommendations:
# * Popularity based recommendations
# * Collaborative filtering (User-based and Item-based recommendations)

# In[26]:

train_data, test_data = train_test_split(ratings, test_size = 0.30, random_state = random_state)
display(train_data.shape, test_data.shape)

# In[27]:

print('Number of unique users in training dataframe: {}'.format(train_data['UserID'].nunique()))
print('Number of unique users in test dataframe: {}'.format(test_data['UserID'].nunique()))
print('Number of users that aren\'t present in test dataframe: {}'.format(len(set(train_data['UserID'].unique()) - set(test_data['UserID'].unique()))))

# In[28]:

print('Number of unique products in training dataframe: {}'.format(train_data['ProductID'].nunique()))
print('Number of unique products in test dataframe: {}'.format(test_data['ProductID'].nunique()))
print('Number of products that aren\'t present in test dataframe: {}'.format(len(set(train_data['ProductID'].unique()) - set(test_data['ProductID'].unique()))))

# #### **Popularity based recommendations**
# * Create a class to make recommendations using the popularity based method.
# * Get top 5 recommendations for a couple of users; recommendations are based on the mean Rating for each Product ID. We will later explore other methods as well (see the compact sketch after the class definition).
# * Comment on the findings.

# In[ ]:

# Class for the popularity based recommender system
class popularity_recommender():
    def __init__(self):
        self.trainSet = None
        self.userId = None
        self.productId = None
        self.popularity_recommendations = None
        self.topN = None

    def create(self, trainSet, userId, productId, topN):
        self.trainSet = trainSet
        self.userId = userId
        self.productId = productId
        self.topN = topN

        # Mean rating per product, ranked from highest to lowest
        byRating = self.trainSet.groupby('ProductID', sort = False, as_index = False)['Rating'].mean().sort_values(by = 'Rating', ascending = False)
        byRating['RatingRank'] = byRating['Rating'].rank(ascending = False, method = 'first')

        # Number of ratings per product
        byUsers = self.trainSet.groupby('ProductID', sort = False, as_index = False)['Rating'].count().sort_values(by = 'Rating', ascending = False)
        byUsers.columns = ['ProductID', 'RatingCount']

        byRatingUsers = pd.merge(byRating, byUsers, on = 'ProductID', how = 'left')
        byRatingUsers = byRatingUsers.sort_values(by = 'RatingRank', ascending = False)
        self.popularity_recommendations = byRating.head(self.topN)
        return byRatingUsers

    def recommend(self, user_id):
        user_recommendations = self.popularity_recommendations
        user_recommendations['UserID'] = user_id
        cols = user_recommendations.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        user_recommendations = user_recommendations[cols]
        try:
            print('User has already rated products (from data in training set): {}'.format(self.trainSet.loc[(self.trainSet['UserID'] == user_id), 'ProductID'].nunique()))
            print('Top 5 products from what\'s already been rated: {}'.format(list(self.trainSet[(self.trainSet['UserID'] == user_id)].sort_values(by = 'Rating', ascending = False).head(5)['ProductID'])))
        except:
            print('There\'s no data for the selected user in the training set')
        print('\nTop 5 recommendations for the user based on the popularity based method: {}'.format(list(user_recommendations['ProductID'])))
        return list(user_recommendations['ProductID'])
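# The heart of the class above is a groupby aggregation. As a minimal sketch (not part of the original notebook), the same top-N-by-mean-rating ranking can be produced directly with pandas; `top_n` is an illustrative helper name, not something used elsewhere in this notebook.

# In[ ]:

# Hypothetical compact equivalent of popularity_recommender.create():
# aggregate mean and count of ratings per product, then take the N best means.
def top_n(train_df, n = 5):
    stats = train_df.groupby('ProductID')['Rating'].agg(['mean', 'count'])
    return stats.sort_values('mean', ascending = False).head(n)

# top_n(train_data)  # same products for every user, since popularity ignores the user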
# In[30]:

# Get lists of unique user and product ids in the testset
print('Get lists of unique user and product ids in the testset'); print('--'*40)
test_userids = sorted(list(test_data['UserID'].unique()))
test_productids = sorted(list(test_data['ProductID'].unique()))

# In[31]:

# Get top 5 recommendations
print('Popularity recommendation is based on the mean of Ratings received and not Rating counts; later we will explore other methods as well.')
print('Get top-K (K = 5) recommendations.')
print('Since our goal is to recommend new products to each user based on his/her habits, we will recommend 5 new products.'); print('--'*40)
compare_dict = {}; result = {}
popularity = popularity_recommender()
byRatingUsers = popularity.create(train_data, 'UserID', 'ProductID', 5)
print('\nMake recommendation for the user id selected from the testset = "A11D1KHM7DVOQK"')
user_id = "A11D1KHM7DVOQK"
result[user_id] = popularity.recommend(user_id)
print('\n\nMake recommendation for the user id selected from the testset = "A149RNR5RH19YY"'); print('--'*40)
user_id = "A149RNR5RH19YY"
result[user_id] = popularity.recommend(user_id)

# In[32]:

print('Store the recommendations in a dictionary'); print('--'*40)
compare_dict['PopularityRec'] = result

# In[33]:

print('Evaluating the popularity based recommender')
print('Creating a new dataframe with the mean rating for each product in the test dataframe and using our prediction dataframe, i.e. byRatingUsers, to calculate RMSE'); print('--'*40)
test_means = test_data.groupby('ProductID', sort = False, as_index = False)['Rating'].mean().sort_values(by = 'Rating', ascending = False)
test_means = test_means.merge(byRatingUsers, on = 'ProductID', how = 'left', suffixes = ('_act', '_pred')).drop(['RatingRank', 'RatingCount'], axis = 1).fillna(0)
print('Shape of test mean dataframe: {}'.format(test_means.shape))
print('Shape of predicted (recommender) dataframe: {}'.format(byRatingUsers.shape))
RMSE_pop = sqrt(mean_squared_error(test_means['Rating_act'], test_means['Rating_pred']))
print('--' * 40)
print('RMSE OF THE POPULARITY BASED RECOMMENDER: {}'.format(round(RMSE_pop, 4)))

# In[34]:

print('Recommendations based on the mean of Rating, which is the method used above'); print('--'*40)
display(byRatingUsers.sort_values(by = 'RatingRank', ascending = True).head(5)['ProductID'].tolist())
print('\nRecommendations based on the count of Rating'); print('--'*40)
display(byRatingUsers.sort_values(by = 'RatingCount', ascending = False).head(5)['ProductID'].tolist())
print('\nRecommendations based on a mix of mean and count of Rating'); print('--'*40)
display(byRatingUsers.sort_values(by = ['Rating', 'RatingCount'], ascending = False).head(5)['ProductID'].tolist())

# In[35]:

print('Plot of average ratings versus number of ratings'); print('--'*40)
g = sns.jointplot(x = 'Rating', y = 'RatingCount', data = byRatingUsers, alpha = 0.4, height = 10)
del g, byRatingUsers, popularity_recommender, user_id
# ##### Observation 4 - Popularity Based Recommendation
# * For the popularity recommendation system, we recommended products based on the *mean of Ratings* given by users. We saw that the top 5 products recommended to users are ones that only 1 user from the training set has rated.
# * We then also explored other methods for popularity recommendations, based on:
#     * *Count of Ratings* received for the product
#     * A *hybrid method*, wherein we used both the mean and the count of ratings to decide on the products recommended
# * For all of the above cases (recommendations based on mean, count, and mean plus count), the popularity based method lacks personalization, i.e. all users get the same recommendations. However, using a popularity based recommendation system, it would be easier to recommend products to a new user without any knowledge of who the user is or what their preferences are, by recommending the products that are in trend.
# * **RMSE of the popularity based recommendation method using the mean of ratings is 3.0894.**

# #### **Collaborative Filtering**
# The objective is to build a recommendation system to recommend products to customers based on their previous ratings for other products, i.e. item-based collaborative filtering.
#
# **"You tend to like that item because you've liked those items."**
#
# whereas, as we know, in user-based filtering it's "You may like it because your friends liked it".
#
# * Model-based collaborative filtering: Singular Value Decomposition, plus an evaluation of k-NN based algos.
# * Use the filtered ratings dataframe and scipy-based SVD to evaluate the item-based collaborative filtering method for suggesting products to users based on what they have liked in the past.
# * Also explore user-based collaborative filtering.
# * Comment on the findings.

# ##### **Model based Collaborative Filtering: SVD**

# In[36]:

# Item-based Collaborative Filtering
print('Matrix with one row per \'User\' and one column per \'Product\' for item-based collaborative filtering'); print('--'*40)
ratings_item = ratings.pivot(index = 'UserID', columns = 'ProductID', values = 'Rating').fillna(0)
ratings_item.head()

# In[37]:

# Calculate the density of the ratings matrix
print('Calculate the density of the ratings matrix'); print('--'*40)
print('Shape of ratings matrix: ', ratings_item.shape)
given_num_of_ratings = np.count_nonzero(ratings_item)
print('given_num_of_ratings = ', given_num_of_ratings)
possible_num_of_ratings = ratings_item.shape[0] * ratings_item.shape[1]
print('possible_num_of_ratings = ', possible_num_of_ratings)
density = (given_num_of_ratings/possible_num_of_ratings)
density *= 100
print('density: {:4.2f}%'.format(density))

# In[38]:

# Singular Value Decomposition: factorize the ratings matrix with k = 10 latent factors
U, sigma, Vt = svds(ratings_item, k = 10)
sigma = np.diag(sigma)
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt)
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = ratings_item.columns, index = ratings_item.index)
preds_df.head()  # predicted ratings
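# To make the factorization step concrete, here is a small self-contained sketch (not part of the original notebook) showing how a truncated SVD of rank k reconstructs an approximation of a toy ratings matrix; `toy` is hypothetical data, and the zeros (unrated cells) get filled with estimated affinities, which is exactly what `preds_df` holds above.

# In[ ]:

# A minimal illustration of truncated SVD on a toy ratings matrix (hypothetical data)
import numpy as np
from scipy.sparse.linalg import svds

toy = np.array([[5., 4., 0., 1.],
                [4., 5., 0., 0.],
                [0., 0., 5., 4.],
                [1., 0., 4., 5.]])
U_, s_, Vt_ = svds(toy, k = 2)        # keep the 2 largest singular values
approx = U_ @ np.diag(s_) @ Vt_       # rank-2 approximation of `toy`
print(np.round(approx, 2))            # unrated cells now carry predicted scores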
# In[39]:

# Recommend the products with the highest predicted ratings
print('Creating a function to recommend the products with the highest predicted ratings'); print('--'*40)

def recommend_items(user_id, ratings_item, preds_df, num_recommendations = 5):
    try:
        print('User has already rated products (from data in training set): {}'.format(train_data.loc[(train_data['UserID'] == user_id), 'ProductID'].nunique()))
        print('Top 5 products from what\'s already been rated: {}'.format(list(train_data[(train_data['UserID'] == user_id)].sort_values(by = 'Rating', ascending = False).head(5)['ProductID'])))
    except:
        print('There\'s no data for the selected user in the training set')
    sorted_user_ratings = ratings_item.loc[user_id].sort_values(ascending = False)
    sorted_user_predictions = preds_df.loc[user_id].sort_values(ascending = False)
    temp = pd.concat([sorted_user_ratings, sorted_user_predictions], axis = 1)
    temp.index.name = 'Recommended Items'
    temp.columns = ['user_ratings', 'user_predictions']
    # Keep only products the user hasn't rated yet, ranked by predicted rating
    temp = temp.loc[temp.user_ratings == 0]
    temp = temp.sort_values('user_predictions', ascending = False)
    print('\nTop 5 recommendations for the user based on the item-based collaborative filtering method')
    display(temp.head(num_recommendations))
    return temp.head(num_recommendations).index.tolist()

# In[40]:

print('Get top-K (K = 5) recommendations.')
print('Since our goal is to recommend new products to each user based on his/her habits, we will recommend 5 new products.'); print('--'*40)
result = {}
user_id = "A11D1KHM7DVOQK"
print(f'\nMake recommendation for the user id selected from the testset = "{user_id}"')
result[user_id] = recommend_items(user_id, ratings_item, preds_df)
user_id = "A149RNR5RH19YY"
print(f'\n\nMake recommendation for the user id selected from the testset = "{user_id}"')
result[user_id] = recommend_items(user_id, ratings_item, preds_df)

# In[ ]:

compare_dict['SVD Item-based Collaborative Filtering'] = result

# In[42]:

print('Evaluating SVD for item-based collaborative filtering'); print('--'*60)
rmse_df = pd.concat([ratings_item.mean(), preds_df.mean()], axis = 1)
rmse_df.columns = ['Avg_actual_ratings', 'Avg_predicted_ratings']
RMSE = round((((rmse_df['Avg_actual_ratings'] - rmse_df['Avg_predicted_ratings']) ** 2).mean() ** 0.5), 4)
print('RMSE OF ITEM BASED COLLABORATIVE FILTERING USING MATRIX FACTORIZATION METHOD (SVD): {}'.format(RMSE))

# ###### Observation 5 - Item Based Collaborative Filtering -- SVD
# * Above we evaluated SVD for item-based collaborative filtering, and the RMSE of the SVD model is 0.0033. Note that this RMSE is computed on per-product average ratings rather than on individual predictions, which is why it is so much lower than the k-NN scores below.

# In[ ]:

del (RMSE, U, sigma, Vt, all_user_predicted_ratings, given_num_of_ratings, possible_num_of_ratings,
     result, rmse_df, density, preds_df, recommend_items, user_id)

# ##### **Product Similarity based on Sklearn Nearest Neighbor**

# In[44]:

print('Product similarity based on sklearn NearestNeighbors'); print('--'*40)
k = 5
df_knn = ratings.pivot(index = 'ProductID', columns = 'UserID', values = 'Rating').fillna(0)
df_knn_matrix = csr_matrix(df_knn.values)
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute', n_neighbors = k)
model_knn.fit(df_knn_matrix)
query_item = np.random.choice(df_knn.shape[0])
distances, indices = model_knn.kneighbors(df_knn.iloc[query_item, :].values.reshape(1, -1), n_neighbors = k + 1)
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(df_knn.index[query_item]))
    else:
        print('{0}: {1}, with distance of {2}'.format(i, df_knn.index[indices.flatten()[i]], distances.flatten()[i]))

# ##### **Model based Collaborative Filtering: k-NN**

# In[45]:

print('Further reducing the number of users')
print('Earlier we had considered users who rated >= 50 products; now, to avoid memory issues, let\'s take users who\'ve rated > 100 products')
print('--'*40)
ratings_df = ratings_df[ratings_df['UserIDCounts'] > 100]
print(f'Number of rows {ratings_df.shape[0]} and number of columns {ratings_df.shape[1]} in the filtered dataframe')
print('Number of unique USERS in further filtered ratings dataframe = ', ratings_df['UserID'].nunique())
print('Number of unique PRODUCTS in further filtered ratings dataframe = ', ratings_df['ProductID'].nunique())
ratings = ratings_df[['UserID', 'ProductID', 'Rating']]

# In[46]:

ratings['UserID'].value_counts().min()

# In[47]:

train_data, test_data = train_test_split(ratings, test_size = 0.30, random_state = random_state)
display(train_data.shape, test_data.shape)

# In[48]:

print('Getting the trainset and testset ready for the recommender to be used'); print('--'*40)
reader = Reader(rating_scale = (1, 5))  # ratings in this dataset range from 1 to 5
data = Dataset.load_from_df(ratings[['UserID', 'ProductID', 'Rating']], reader)
trainset = Dataset.load_from_df(train_data[['UserID', 'ProductID', 'Rating']], reader)
testset = Dataset.load_from_df(test_data[['UserID', 'ProductID', 'Rating']], reader)
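# Note that `Dataset.load_from_df` returns a `Dataset`, not a ready-to-use trainset: the `GridSearchCV` cells below accept it directly, but fitting a single algorithm by hand needs an explicit conversion. A minimal sketch of that pattern (illustrative only, assuming Surprise's `build_full_trainset` and `construct_testset` helpers; it is not used in this notebook's flow):

# In[ ]:

# Illustrative: how the Dataset objects above would be consumed directly by an algorithm
fit_set = trainset.build_full_trainset()                    # inner-id trainset for .fit()
eval_set = testset.construct_testset(testset.raw_ratings)   # list of (user, item, rating) tuples for .test()
# algo = KNNBasic(); algo.fit(fit_set); accuracy.rmse(algo.test(eval_set))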
# In[49]:

get_ipython().run_cell_magic('time', '', "print('ITEM BASED COLLABORATIVE FILTERING USING k-NN INSPIRED ALGOS')\nprint('Grid search across the parameter grid to find the best parameters for the KNNBasic algorithm'); print('--'*40)\nparam_grid_KNNBasic = {'k': [3, 5, 10], 'sim_options': {'name': ['pearson_baseline', 'cosine'], 'user_based': [False]}, 'verbose': [False]}\n\ngs_KNNBasic = GridSearchCV(KNNBasic, param_grid_KNNBasic, measures = ['rmse', 'mae'], cv = 3)\ngs_KNNBasic.fit(trainset)\nprint(gs_KNNBasic.best_score['rmse'])\nprint(gs_KNNBasic.best_params['rmse'])\n")

# In[50]:

get_ipython().run_cell_magic('time', '', "print('ITEM BASED COLLABORATIVE FILTERING USING k-NN INSPIRED ALGOS')\nprint('Grid search across the parameter grid to find the best parameters for the KNNWithMeans algorithm'); print('--'*40)\nparam_grid_KNNWithMeans = {'k': [3, 5, 10], 'sim_options': {'name': ['pearson_baseline', 'cosine'], 'user_based': [False]}, 'verbose': [False]}\n\ngs_KNNWithMeans = GridSearchCV(KNNWithMeans, param_grid_KNNWithMeans, measures = ['rmse', 'mae'], cv = 3)\ngs_KNNWithMeans.fit(trainset)\nprint(gs_KNNWithMeans.best_score['rmse'])\nprint(gs_KNNWithMeans.best_params['rmse'])\n")

# In[51]:

get_ipython().run_cell_magic('time', '', "print('ITEM BASED COLLABORATIVE FILTERING USING k-NN INSPIRED ALGOS')\nprint('Grid search across the parameter grid to find the best parameters for the KNNWithZScore algorithm'); print('--'*40)\nparam_grid_KNNWithZScore = {'k': [3, 5, 10], 'sim_options': {'name': ['pearson_baseline', 'cosine'], 'user_based': [False]}, 'verbose': [False]}\n\ngs_KNNWithZScore = GridSearchCV(KNNWithZScore, param_grid_KNNWithZScore, measures = ['rmse', 'mae'], cv = 3)\ngs_KNNWithZScore.fit(trainset)\nprint(gs_KNNWithZScore.best_score['rmse'])\nprint(gs_KNNWithZScore.best_params['rmse'])\n")

# In[52]:

get_ipython().run_cell_magic('time', '', "print('ITEM BASED COLLABORATIVE FILTERING USING k-NN INSPIRED ALGOS')\nprint('Grid search across the parameter grid to find the best parameters for the KNNBaseline algorithm'); print('--'*40)\nparam_grid_KNNBaseline = {'bsl_options': {'method': ['als', 'sgd'], 'reg': [1, 2]}, 'k': [2, 3, 5], \n                          'sim_options': {'name': ['pearson_baseline', 'cosine'], 'user_based': [False]},\n                          'verbose': [False]}\n\ngs_KNNBaseline = GridSearchCV(KNNBaseline, param_grid_KNNBaseline, measures = ['rmse', 'mae'], cv = 3)\ngs_KNNBaseline.fit(trainset)\nprint(gs_KNNBaseline.best_score['rmse'])\nprint(gs_KNNBaseline.best_params['rmse'])\n")

# In[53]:

del param_grid_KNNBasic, param_grid_KNNWithMeans, param_grid_KNNWithZScore, gs_KNNBasic, gs_KNNWithMeans, gs_KNNWithZScore
gc.collect()

# ###### Observation 6 - Algorithm chosen for model based (item) collaborative filtering using the k-NN inspired method
# * Above we evaluated different k-NN inspired algorithms for item-based collaborative filtering. The KNNBaseline algorithm gives the lowest RMSE of ~0.966.
# * Next, k-fold cross-validate the KNNBaseline algorithm using the best parameters to see if there's any improvement in the RMSE (a compact alternative using Surprise's built-in helper is sketched below).
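# As a side note, Surprise's built-in `cross_validate` (already imported above) can produce the same kind of k-fold RMSE estimate with far less code than the manual KFold loop in the next cell. A minimal sketch, assuming `gs_KNNBaseline` is still in scope; `algo_cv` and `cv_results` are illustrative names:

# In[ ]:

# Compact cross-validation sketch: an illustrative alternative to the manual KFold loop below
algo_cv = KNNBaseline(**gs_KNNBaseline.best_params['rmse'])
cv_results = cross_validate(algo_cv, data, measures = ['rmse'], cv = 2, verbose = False)
print('Mean RMSE across folds: {:.4f}'.format(np.mean(cv_results['test_rmse'])))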
# In[54]:

get_ipython().run_cell_magic('time', '', "print('ITEM BASED COLLABORATIVE FILTERING USING k-NN INSPIRED ALGOS')\nprint('2-fold cross validation using KNNBaseline with the best parameters identified during grid search'); print('--'*40)\nkf = KFold(n_splits = 2)\nalgo = KNNBaseline(**gs_KNNBaseline.best_params['rmse'])\nrmse_scores = []\n\nfor train_, test_ in kf.split(data):\n    algo.fit(train_)\n    predictions = algo.test(test_)\n    rmse = round(accuracy.rmse(predictions, verbose = True), 4)\n    rmse_scores.append(rmse)\n\n    # Overwritten each fold, so the dump keeps the last fold's predictions and model\n    dump.dump('./dump_KNNBaseline_Item', predictions, algo)\n\nprint('--'*40)\nprint(f'RMSE OF ITEM BASED COLLABORATIVE FILTERING USING k-NN INSPIRED ALGORITHM AND 2-FOLD CROSS VALIDATION {round(np.mean(rmse_scores), 4)}')\n")

# In[55]:

# `predictions` is a list of Surprise Prediction namedtuples: (uid, iid, r_ui, est, details)
predictions, algo = dump.load('./dump_KNNBaseline_Item')
df_user = pd.DataFrame(predictions, columns = ['UserID', 'ProductID', 'ActualRating', 'EstRating', 'Details'])
df_user['Error'] = abs(df_user['EstRating'] - df_user['ActualRating'])
df_user.sort_values('Error', inplace = True, ascending = True)
display(df_user.head())

# In[56]:

# Actual vs prediction comparison
print('Actual vs Prediction Comparison'); print('--'*40)
fig, ax = plt.subplots(figsize = (15, 7.2))
fig.suptitle('Actual vs Prediction Comparison', fontsize = 14)
df_user['EstRating'].plot.hist(bins = 25, alpha = 0.8)
df_user['ActualRating'].plot.hist(bins = 25, alpha = 0.8)
ax.legend(['Predictions', 'Actual'])
plt.show()

# In[57]:

# Query the top 5 recommendations for a specific UserID
print('Get top-K (K = 5) recommendations.')
print('Since our goal is to recommend new products to each user based on his/her habits, we will recommend 5 new products.'); print('--'*40)
result = {}

def query_user(user_id):
    try:
        print('User "{}" has already rated products (from data in training set): {}'.format(user_id, train_data.loc[(train_data['UserID'] == user_id), 'ProductID'].nunique()))
        print('Top 5 products from what\'s already been rated: {}'.format(list(train_data[(train_data['UserID'] == user_id)].sort_values(by = 'Rating', ascending = False).head(5)['ProductID'])))
    except:
        print('There\'s no data for the selected user in the training set')
    print('Top 5 recommendations for the user are: {}'.format(list(df_user[(df_user['UserID'] == user_id)].sort_values(by = 'EstRating', ascending = False).head(5)['ProductID'])))
    return list(df_user[(df_user['UserID'] == user_id)].sort_values(by = 'EstRating', ascending = False).head(5)['ProductID'])

# For e.g. querying for the following users
print('A check on what the user has liked in the past (based on data available in the training set, if any) and making recommendations'); print('--'*40, '\n')
result['A11D1KHM7DVOQK'] = query_user('A11D1KHM7DVOQK')
print('\n')
result['A149RNR5RH19YY'] = query_user('A149RNR5RH19YY')

# In[58]:

compare_dict['k-NN Item-based Collaborative Filtering'] = result
display(compare_dict)

# In[59]:

df_user.head()

# In[ ]:

del (algo, ax, fig, gs_KNNBaseline, kf, param_grid_KNNBaseline, predictions, rmse, rmse_scores, train_, test_)

# ###### Observation 7 - Item based Collaborative Filtering (k-NN)
# * Using k-NN inspired algos for item-based collaborative filtering and 2-fold cross validation, we get an RMSE score of ~0.9655.
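# The only change needed to move from item-based to user-based neighborhoods in Surprise is the `user_based` flag in `sim_options`; everything else in the pipeline stays the same. A minimal sketch before the full user-based grid search below (`sim_options_user` and `algo_user` are illustrative names):

# In[ ]:

# Illustrative: the same KNNBaseline model, switched to user-user similarities
sim_options_user = {'name': 'pearson_baseline', 'user_based': True}   # True = user-based, False = item-based
algo_user = KNNBaseline(k = 5, sim_options = sim_options_user, verbose = False)
# algo_user.fit(...) / algo_user.test(...) work exactly as in the item-based cells above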
# In[61]:

get_ipython().run_cell_magic('time', '', "print('USER BASED COLLABORATIVE FILTERING USING k-NN INSPIRED ALGOS')\nprint('Grid search across the parameter grid to find the best parameters for the KNNBaseline algorithm'); print('--'*40)\nparam_grid_KNNBaseline = {'bsl_options': {'method': ['als', 'sgd'], 'reg': [1, 2]}, 'k': [2, 3, 5], \n                          'sim_options': {'name': ['pearson_baseline', 'cosine'], 'user_based': [True]},\n                          'verbose': [False]}\n\ngs_KNNBaseline = GridSearchCV(KNNBaseline, param_grid_KNNBaseline, measures = ['rmse', 'mae'], cv = 3)\ngs_KNNBaseline.fit(trainset)\nprint(gs_KNNBaseline.best_score['rmse'])\nprint(gs_KNNBaseline.best_params['rmse'])\n")

# In[62]:

get_ipython().run_cell_magic('time', '', "print('USER BASED COLLABORATIVE FILTERING USING k-NN INSPIRED ALGOS')\nprint('2-fold cross validation using KNNBaseline with the best parameters identified during grid search'); print('--'*40)\nkf = KFold(n_splits = 2)\nalgo = KNNBaseline(**gs_KNNBaseline.best_params['rmse'])\nrmse_scores = []\n\nfor train_, test_ in kf.split(data):\n    algo.fit(train_)\n    predictions = algo.test(test_)\n    rmse = round(accuracy.rmse(predictions, verbose = True), 4)\n    rmse_scores.append(rmse)\n\n    dump.dump('./dump_KNNBaseline_User', predictions, algo)\n\nprint('--'*40)\nprint(f'RMSE OF USER BASED COLLABORATIVE FILTERING USING k-NN INSPIRED ALGORITHM AND 2-FOLD CROSS VALIDATION {round(np.mean(rmse_scores), 4)}')\n")

# In[63]:

predictions, algo = dump.load('./dump_KNNBaseline_User')
df_user = pd.DataFrame(predictions, columns = ['UserID', 'ProductID', 'ActualRating', 'EstRating', 'Details'])
df_user['Error'] = abs(df_user['EstRating'] - df_user['ActualRating'])
df_user.sort_values('Error', inplace = True, ascending = True)
display(df_user.head())

# In[64]:

print('A check on what the user has liked in the past (based on data available in the training set, if any) and making recommendations'); print('--'*40, '\n')
result = {}
result['A11D1KHM7DVOQK'] = query_user('A11D1KHM7DVOQK')
print('\n')
result['A149RNR5RH19YY'] = query_user('A149RNR5RH19YY')

# In[65]:

compare_dict['k-NN User-based Collaborative Filtering'] = result
display(compare_dict)

# ###### Observation 8 - User based Collaborative Filtering (k-NN)
# * Using k-NN inspired algos for user-based collaborative filtering and 2-fold cross validation, we get an RMSE score of ~0.9826.

# ### Conclusion
# * A non-personalized recommendation system (such as popularity) is generated by averaging the recommendations across all users. Here we recommended the top 5 products to each user. We also saw how counts can be used to suggest popular products, as well as a hybrid popularity recommender based on a combination of mean and count. However, in popularity based recommendation, all users receive the same recommendations. The RMSE of the popularity recommendation method based on the mean of ratings was 3.0894.
#
# * Collaborative-filtering recommendations are personalized, since the rating "prediction" differs depending on the target user, and they are based on:
#     * User-to-user: ratings for a given product expressed by users that are similar to the active user.
#     * Item-to-item: a weighted average of the active user's ratings for similar items.
#
# * The collaborative filtering method requires minimal knowledge-engineering effort compared to methods such as content-based recsys. The method is based on user history, but what if the user is new (i.e. there is no user history)? This is one of its limitations, known as the cold-start problem.
# * Items with lots of history get recommended a lot, while those without never make it into the recommendation engine.
#
# * Additionally, collaborative filtering methods face scalability issues, particularly in our case where the numbers of users (4,201,696) and items (476,002) were high (sparse data), especially when recommendations need to be generated in real time online. To overcome this, we filtered for users who have rated at least 50 products, which left about 1,540 users and 48,190 products in the dataframe; these were further reduced to only users with > 100 ratings to avoid memory issues while using the k-NN inspired algorithms.
#
# * Since our goal was to build a recsys to recommend products to customers based on their previous ratings for other products, we built an item-based collaborative filtering recommendation system, using two model-based approaches: SVD and k-NN inspired algos.
#
# * We saw that SVD had an RMSE score of 0.0033 (computed on per-product average ratings, as noted in Observation 5). We also compared various k-NN based algorithms using grid search and found that the KNNBaseline algo gave the lowest RMSE; 2-fold cross validation with it then gave an RMSE of 0.9655.
#
# * We also explored the KNNBaseline algo for user-based collaborative filtering; its RMSE (0.9826) was slightly higher than that of item-based CF.
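# To close, a small sketch (not part of the original analysis) collecting the RMSE values reported above into one table for a side-by-side view. Note that the popularity and SVD scores were computed on aggregated (per-product average) ratings, so they are not directly comparable to the per-prediction Surprise scores.

# In[ ]:

# Summary of the RMSE values reported in this notebook (values copied from the observations above)
summary = pd.DataFrame({
    'Method': ['Popularity (mean rating)', 'SVD (item-based, averaged)',
               'KNNBaseline (item-based, 2-fold CV)', 'KNNBaseline (user-based, 2-fold CV)'],
    'RMSE': [3.0894, 0.0033, 0.9655, 0.9826]
})
display(summary)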