#!/usr/bin/env python
# coding: utf-8

# # Comparison of two algorithms
#
# We will see in this notebook how we can compare the prediction accuracy of two
# algorithms.

# In[18]:


from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import pickle
import os

import pandas as pd

from surprise import SVD
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import PredefinedKFold
from surprise import dump
from surprise.accuracy import rmse


# In[ ]:


# We will train and test on the u1.base and u1.test files of the movielens-100k
# dataset. If you haven't already, you need to download the movielens-100k dataset.
# You can do it manually, or by running:
# Dataset.load_builtin('ml-100k')

# Now, let's load the dataset.
train_file = os.path.expanduser('~') + '/.surprise_data/ml-100k/ml-100k/u1.base'
test_file = os.path.expanduser('~') + '/.surprise_data/ml-100k/ml-100k/u1.test'
data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))
pkf = PredefinedKFold()

# We'll use the well-known SVD algorithm and a basic nearest neighbors approach.
algo_svd = SVD()
algo_knn = KNNBasic()

for trainset, testset in pkf.split(data):
    algo_svd.fit(trainset)
    predictions_svd = algo_svd.test(testset)

    algo_knn.fit(trainset)
    predictions_knn = algo_knn.test(testset)

    rmse(predictions_svd)
    rmse(predictions_knn)

    dump.dump('./dump_SVD', predictions_svd, algo_svd)
    dump.dump('./dump_KNN', predictions_knn, algo_knn)


# In[10]:


# The dumps have been saved, and we can now use them whenever we want.

predictions_svd, algo_svd = dump.load('./dump_SVD')
predictions_knn, algo_knn = dump.load('./dump_KNN')

df_svd = pd.DataFrame(predictions_svd, columns=['uid', 'iid', 'rui', 'est', 'details'])
df_knn = pd.DataFrame(predictions_knn, columns=['uid', 'iid', 'rui', 'est', 'details'])

df_svd['err'] = abs(df_svd.est - df_svd.rui)
df_knn['err'] = abs(df_knn.est - df_knn.rui)

# We now have two dataframes with all the predictions for each algorithm. The cool
# thing is that, as both algorithms have been tested on the same testset, the
# indexes of the two dataframes are the same!

# In[11]:


df_svd.head()


# In[12]:


df_knn.head()


# In[13]:


# Let's check how good the KNN predictions are when the SVD has a huge error:
df_knn[df_svd.err >= 3.5]


# In[14]:


# Well... not much better.
# Now, let's look at the predictions of SVD on the 10 worst predictions for KNN.
df_svd.iloc[df_knn.sort_values(by='err')[-10:].index]


# In[15]:


# How different are the predictions of the two algorithms?
# Let's count the number of predictions for each rating value.

import matplotlib.pyplot as plt
import matplotlib
get_ipython().run_line_magic('matplotlib', 'notebook')
matplotlib.style.use('ggplot')

figure, (ax1, ax2) = plt.subplots(1, 2)
df_svd.est.plot(kind='hist', title='SVD', ax=ax1)
df_knn.est.plot(kind='hist', title='KNN', ax=ax2)

# As expected, one of the drawbacks of the NN algorithms is that their predictions
# are often quite concentrated around the mean. The SVD algorithm seems more
# comfortable predicting extreme rating values.
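# In[ ]:


# One way to put a number on that impression (a small sanity-check sketch, using
# the dataframes built above): if the KNN predictions really are more concentrated
# around the mean, the standard deviation of their estimates should be smaller
# than that of the SVD estimates.

df_svd.est.std(), df_knn.est.std()
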
""" try: return len(trainset.ur[trainset.to_inner_uid(uid)]) except ValueError: # user was not part of the trainset return 0 df_knn['Iu'] = df_knn.uid.apply(get_Iu) df_svd['Iu'] = df_svd.uid.apply(get_Iu) df_knn[df_knn.Iu < 10].err.mean(), df_svd[df_svd.Iu < 10].err.mean() # In[ ]: