#!/usr/bin/env python
# coding: utf-8

# # Analysis of the KNNBasic algorithm
#
# In this notebook, we will run a basic neighborhood algorithm on the movielens dataset, dump the results, and use pandas to do some data analysis.

# In[1]:


from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import os

import pandas as pd

from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import PredefinedKFold
from surprise import dump
from surprise.accuracy import rmse


# In[2]:


# We will train and test on the u1.base and u1.test files of the
# movielens-100k dataset. If you haven't already, you need to download it.
# You can do it manually, or by running:
# Dataset.load_builtin('ml-100k')

# Now, let's load the dataset
train_file = os.path.expanduser('~') + '/.surprise_data/ml-100k/ml-100k/u1.base'
test_file = os.path.expanduser('~') + '/.surprise_data/ml-100k/ml-100k/u1.test'
data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))
pkf = PredefinedKFold()

# We'll use a basic nearest neighbor approach, where similarities are computed
# between users.
algo = KNNBasic()

for trainset, testset in pkf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse(predictions)

    dump.dump('./dump_file', predictions, algo)


# In[3]:


# The dump has been saved and we can now use it whenever we want.
# Let's load it and see what we can do
predictions, algo = dump.load('./dump_file')


# In[4]:


trainset = algo.trainset
print('algo: {0}, k = {1}, min_k = {2}'.format(
    algo.__class__.__name__, algo.k, algo.min_k))


# In[5]:


# Let's build a pandas dataframe with all the predictions

def get_Iu(uid):
    """Return the number of items rated by given user.

    Args:
        uid: The raw id of the user.
    Returns:
        The number of items rated by the user.
    """

    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError:  # user was not part of the trainset
        return 0


def get_Ui(iid):
    """Return the number of users that have rated given item.

    Args:
        iid: The raw id of the item.
    Returns:
        The number of users that have rated the item.
    """

    try:
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:  # item was not part of the trainset
        return 0


df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['Iu'] = df.uid.apply(get_Iu)
df['Ui'] = df.iid.apply(get_Ui)
df['err'] = abs(df.est - df.rui)


# In[6]:


df.head()


# In[7]:


best_predictions = df.sort_values(by='err')[:10]
worst_predictions = df.sort_values(by='err')[-10:]


# In[8]:


# Let's take a look at the best predictions of the algorithm
best_predictions


# It's interesting to note that these perfect predictions are actually lucky shots: $|U_i|$ is always very small, meaning that very few users have rated the target item. This implies that the set of neighbors is very small (see the ``actual_k`` field)... And it just happens that all the ratings from the neighbors are the same (and, mostly, equal to that of the target user).
#
# This may be a bit surprising, but these lucky shots are actually very important to the accuracy of the algorithm... Try running the same algorithm with a value of ``min_k`` equal to $10$, as in the sketch below. This means that if there are fewer than $10$ neighbors, the prediction is set to the mean of all ratings. You'll see your accuracy decrease!
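# The ``actual_k`` field lives in the ``details`` dict of each prediction. As a small illustration, we can pull it into the dataframe (the ``actual_k`` column below is our own, hypothetical addition):

# In[ ]:


# Extract 'actual_k' from the details dict; predictions for which the
# neighborhood was empty may not carry the key, hence dict.get().
df['actual_k'] = df.details.apply(lambda d: d.get('actual_k'))
df.head()


# And here is a minimal sketch of the ``min_k`` experiment suggested above, reusing the ``data`` and ``pkf`` objects defined earlier (the variable names are ours):

# In[ ]:


# Require at least 10 neighbors; with fewer, the prediction falls back
# to the mean of all ratings.
algo_min_k = KNNBasic(min_k=10)
for trainset_mk, testset_mk in pkf.split(data):
    algo_min_k.fit(trainset_mk)
    predictions_min_k = algo_min_k.test(testset_mk)
    rmse(predictions_min_k)  # expect a higher (worse) RMSE than above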
# In[9]:


# Now, let's look at the predictions with the biggest errors
worst_predictions


# Let's focus first on the last two predictions. Well, we can't do much about them: we should have predicted $5$, but the only available neighbor had a rating of $1$, so we were screwed. The only way to avoid this kind of error would be to increase the ``min_k`` parameter, but that would actually hurt the accuracy (see the note above).
#
# How about the other ones? It seems that for each prediction, the users are outliers of a sort: they gave their item a rating of $1$ when most of the ratings for that item were high (or, conversely, gave a *bad* item a rating of $5$). See the plot below as an illustration for the first of these predictions.
#
# These are situations where baseline estimates would be quite helpful, in order to deal with highly biased users (and items); see the sketch at the end of the notebook.

# In[10]:


from collections import Counter
import matplotlib.pyplot as plt
import matplotlib
get_ipython().run_line_magic('matplotlib', 'notebook')
matplotlib.style.use('ggplot')

# Count how many users gave each rating value to item 302.
counter = Counter([r for (_, r) in trainset.ir[trainset.to_inner_iid('302')]])

pd.DataFrame.from_dict(counter, orient='index').plot(kind='bar', legend=False)
plt.xlabel('Rating value')
plt.ylabel('Number of users')
plt.title('Number of users having rated item 302')
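# As a minimal sketch of that last point (our addition, not part of the original analysis): ``KNNBaseline`` combines the same neighborhood approach with baseline estimates for user and item biases. Reusing the ``data`` and ``pkf`` objects defined earlier:

# In[ ]:


from surprise import KNNBaseline

# Same evaluation loop as before, with baseline estimates folded into
# the similarity-based prediction.
algo_baseline = KNNBaseline()
for trainset_bl, testset_bl in pkf.split(data):
    algo_baseline.fit(trainset_bl)
    predictions_baseline = algo_baseline.test(testset_bl)
    rmse(predictions_baseline)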