In [2]:
import numpy as np
import surprise  # run 'pip install scikit-surprise' to install surprise

In [3]:
class MatrixFacto(surprise.AlgoBase):
    '''A basic rating prediction algorithm based on matrix factorization.

    Learns one latent factor vector per user (p_u) and per item (q_i) by
    plain SGD on the unregularized squared error; the predicted rating is
    the dot product <p_u, q_i>.
    '''

    def __init__(self, learning_rate, n_epochs, n_factors):
        # Initialize the surprise base class so the prediction machinery
        # (predict/test plumbing) is set up correctly for custom algorithms.
        surprise.AlgoBase.__init__(self)

        self.lr = learning_rate  # learning rate for SGD
        self.n_epochs = n_epochs  # number of iterations of SGD
        self.n_factors = n_factors  # number of factors

    def fit(self, trainset):
        '''Learn the vectors p_u and q_i with SGD.

        trainset: a surprise Trainset; ratings are iterated via
        trainset.all_ratings() as (inner_user_id, inner_item_id, rating).
        '''

        print('Fitting data with SGD...')

        # Randomly initialize the user and item factors.
        p = np.random.normal(0, .1, (trainset.n_users, self.n_factors))
        q = np.random.normal(0, .1, (trainset.n_items, self.n_factors))

        # SGD procedure
        for _ in range(self.n_epochs):
            for u, i, r_ui in trainset.all_ratings():
                err = r_ui - np.dot(p[u], q[i])
                # Use the *pre-update* value of p_u when updating q_i:
                # both gradients are taken at the same point, which is the
                # mathematically correct simultaneous SGD step.
                p_u_old = p[u].copy()
                p[u] += self.lr * err * q[i]
                q[i] += self.lr * err * p_u_old

        self.p, self.q = p, q
        self.trainset = trainset
        return self  # conventional for fit(); callers ignoring the return are unaffected

    def estimate(self, u, i):
        '''Return the estimated rating of user u for item i.'''

        # Return scalar product between p_u and q_i if user and item are
        # known, else fall back to the global mean of all training ratings.
        if self.trainset.knows_user(u) and self.trainset.knows_item(i):
            return np.dot(self.p[u], self.q[i])
        else:
            return self.trainset.global_mean

In [11]:
# data loading. We'll use the movielens dataset (https://grouplens.org/datasets/movielens/100k/)
# `data` was used below but never defined — without this load the cell raises
# NameError on a fresh kernel. load_builtin downloads the dataset on first use.
data = surprise.Dataset.load_builtin('ml-100k')
data.split(2)  # split data for 2-folds cross validation

In [12]:
# Evaluate our custom matrix-factorization algorithm with cross-validation.
# NOTE(review): surprise.evaluate() was deprecated and later removed in favor
# of surprise.model_selection.cross_validate() — confirm the installed version.
algo = MatrixFacto(learning_rate=.01, n_epochs=10, n_factors=10)
surprise.evaluate(algo, data, measures=['RMSE'])  # prints per-fold and mean RMSE

Evaluating RMSE of algorithm MatrixFacto.

------------
Fold 1
Fitting data with SGD...
RMSE: 0.9826
------------
Fold 2
Fitting data with SGD...
RMSE: 0.9873
------------
------------
Mean RMSE: 0.9849
------------
------------

Out[12]:
CaseInsensitiveDefaultDict(list,
{'rmse': [0.98263312180825368, 0.9872549391926676]})
In [13]:
# try a neighborhood-based algorithm (on the same data)
# try a neighborhood-based algorithm (on the same data)
# KNNBasic with default options (msd similarity, user-based neighborhoods).
algo = surprise.KNNBasic()
surprise.evaluate(algo, data, measures=['RMSE'])  # prints per-fold and mean RMSE

Evaluating RMSE of algorithm KNNBasic.

------------
Fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0101
------------
Fold 2
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9982
------------
------------
Mean RMSE: 1.0042
------------
------------

Out[13]:
CaseInsensitiveDefaultDict(list,
{'rmse': [1.0101383334175613, 0.99823558896449016]})
In [14]:
# try a more sophisticated matrix factorization algorithm (on the same data)
# try a more sophisticated matrix factorization algorithm (on the same data)
# SVD adds user/item biases and regularization, unlike our bare MatrixFacto.
algo = surprise.SVD()
surprise.evaluate(algo, data, measures=['RMSE'])  # prints per-fold and mean RMSE

Evaluating RMSE of algorithm SVD.

------------
Fold 1
RMSE: 0.9604
------------
Fold 2
RMSE: 0.9538
------------
------------
Mean RMSE: 0.9571
------------
------------

Out[14]:
CaseInsensitiveDefaultDict(list,
{'rmse': [0.96042083843476056,
0.95382688332712151]})