This notebook shows how to implement a simple recommender system following two different approaches: Collaborative Filtering (user-based) and Content-Based recommendation.
DISCLAIMER: The dataset used here is NOT a real dataset; it has been artificially generated for the purposes of this tutorial. It should absolutely NOT be used as training data for any application.
import pandas as pd
import numpy as np
import scipy.spatial.distance as distance
# Load the user/composer interaction table; the first CSV column is used as the row index.
data = pd.read_csv('data.csv', index_col=0)
# Preview the first rows
data.head()
Wolfgang Amadeus Mozart | Franz Liszt | Joseph Haydn | Johannes Brahms | Robert Schumann | Antonio Vivaldi | Roland de Lassus | Frédéric Chopin | Franz Schubert | Domenico Scarlatti | ... | Arnold Schoenberg | Bruno Mantovani | Antonín Dvořák | Piotr Ilitch Tchaïkovski | Johann Christian Bach | Aaron Copland | Ferruccio Busoni | Ralph Vaughan Williams | Zoltán Kodály | Leonard Bernstein | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | ... | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 1 |
2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
5 rows × 100 columns
Here is a (quite predictable) list of the 10 most popular composers.
# Sum each composer column and show the ten largest totals
data.sum().sort_values(ascending=False).head(10)
Wolfgang Amadeus Mozart 89 Ludwig van Beethoven 83 Johann Strauss 80 Antonio Vivaldi 79 Johann Sebastian Bach 76 Joseph Haydn 73 Georg Friedrich Haendel 71 Franz Liszt 71 Maurice Ravel 71 Giuseppe Verdi 70 dtype: int64
The 100 artists involved, each with a label, a URI and a 17-dimensional embedding taken from the music embeddings repository.
# Load the artist table; the first CSV column (the artist URI) becomes the row index.
artists = pd.read_csv('artists.csv', index_col=0)
# Preview the first rows
artists.head()
label | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uri | ||||||||||||||||||
http://data.doremus.org/artist/4802a043-23bb-3b8d-a443-4a3bd22ccc63 | Wolfgang Amadeus Mozart | -0.049424 | 0.012972 | 0.030435 | 0.672381 | 0.705714 | -0.003292 | 0.040397 | 0.033374 | 0.023954 | -0.085231 | 0.158234 | 0.044664 | 0.018166 | 0.010425 | -0.111956 | 0.142959 | -0.030154 |
http://data.doremus.org/artist/aabcd2ee-ac9b-30f2-8096-e9de8b3c7a81 | Franz Liszt | 0.000628 | -0.008797 | -0.007513 | 0.724762 | 0.796191 | 0.010984 | 0.017313 | 0.072120 | 0.025766 | -0.085152 | 0.156424 | 0.043058 | 0.020398 | 0.032555 | -0.048986 | 0.033035 | -0.001050 |
http://data.doremus.org/artist/12fa21ff-cfa4-31d6-87d9-a22315193b04 | Joseph Haydn | -2.000000 | -2.000000 | -2.000000 | 0.649524 | 0.722857 | -0.001422 | 0.035583 | 0.029520 | 0.023965 | -0.085292 | 0.158400 | 0.042192 | 0.008899 | 0.007677 | -0.091435 | 0.147538 | 0.000403 |
http://data.doremus.org/artist/f9a2ac39-a62d-3be2-8abb-e564de0ec96d | Johannes Brahms | -0.010495 | -0.003960 | 0.000920 | 0.745714 | 0.806667 | 0.003321 | 0.019724 | 0.059037 | 0.024170 | -0.085409 | 0.158257 | 0.044797 | 0.028943 | 0.022953 | -0.063101 | 0.111707 | -0.025850 |
http://data.doremus.org/artist/f753314d-87a7-32a9-9218-da98ae4f9812 | Robert Schumann | 0.000628 | -0.008797 | -0.007513 | 0.723810 | 0.767619 | 0.003386 | 0.021836 | 0.059332 | 0.023935 | -0.084914 | 0.158104 | 0.045242 | 0.030560 | 0.023502 | -0.129961 | 0.110820 | -0.069069 |
def get_items(id):
    """Return the labels of the items the given user has interacted with.

    ``id`` is the row label of the user in ``data``. An item counts as
    interacted-with when the user's cell for that column equals 1.
    """
    row = data.loc[id]
    interacted = row == 1
    return row[interacted].index.tolist()
# Example: list the items that user 7 has interacted with
get_items(7)
['Wolfgang Amadeus Mozart', 'Joseph Haydn', 'Robert Schumann', 'Antonio Vivaldi', 'Franz Schubert', 'Georg Philipp Telemann', 'Ludwig van Beethoven', 'Alessandro Scarlatti', 'Benjamin Britten', 'Johann Sebastian Bach', 'Richard Wagner', 'Luigi Cherubini', 'Giuseppe Verdi', 'Johann Strauss', 'Niccolò Paganini', 'Gaetano Donizetti', 'Jean-Baptiste Lully', 'George Gershwin', 'Piotr Ilitch Tchaïkovski']
def get_emb(label):
    """Return the embedding of the artist with the given label.

    The first row of ``artists`` whose ``label`` column matches is used.
    The result is a NumPy masked array in which every component equal to
    -2 is masked.
    """
    matches = artists[artists['label'] == label]
    vector = matches.drop(columns='label').values[0]
    return np.ma.array(vector, mask=(vector == -2.))
# Example: the masked embedding vector stored for Mozart
get_emb('Wolfgang Amadeus Mozart')
masked_array(data=[-0.049423877149820335, 0.012972225435078144, 0.030434519052505493, 0.6723809242248535, 0.7057142853736877, -0.003291688393801451, 0.04039749875664711, 0.03337432071566582, 0.023954134434461597, -0.08523057401180267, 0.1582336723804474, 0.04466380551457405, 0.018166353926062584, 0.01042507402598858, -0.11195577681064606, 0.14295919239521027, -0.030153987929224968], mask=[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], fill_value=1e+20)
def most_similar_users(user, k):
    """Find the users most similar to the given one.

    Parameters
    ----------
    user : pandas.Series
        A row of ``data``; ``user.name`` is the row label of that user.
    k : int
        Number of neighbours to return.

    Returns
    -------
    pandas.DataFrame
        The (at most) ``k`` rows of ``data``, excluding ``user`` itself, with
        the smallest cosine distance to ``user``, closest first.
    """
    user_vec = user.values  # user listening
    # search among all the other users
    pool = data.drop(user.name)
    # Cosine distance from the given user to every candidate. The distances
    # live in their own Series so no temporary column is added to the pool.
    distances = pd.Series(
        [distance.cosine(user_vec, row) for row in pool.values],
        index=pool.index,
        dtype=float,
    )
    # closest users first; keep the first k
    closest = distances.sort_values().index[:k]
    return pool.loc[closest]
def most_popular_among(user_subset, k=10):
    """Rank the items by popularity among a subset of users.

    Returns the labels of ALL the columns of ``user_subset``, ordered by
    decreasing column sum. ``k`` is accepted but not used by the current
    implementation: callers are expected to truncate the list themselves.
    """
    totals = user_subset.sum()
    ranked = totals.sort_values(ascending=False)
    return ranked.index.tolist()
def collaborative_filtering(user, k=10):
    """Recommend artists by looking at similar users.

    Parameters
    ----------
    user : row label of the user in ``data``.
    k : int
        Used both as the number of neighbours to consider and as the maximum
        number of recommendations returned.

    Returns
    -------
    list
        Up to ``k`` item labels, most popular among the neighbours first,
        excluding the items the user has already interacted with.
    """
    profile = data.loc[user]
    # find k most similar users
    similar_users = most_similar_users(profile, k)
    # rank every item by its popularity among those neighbours
    ranked = most_popular_among(similar_users)
    # look the user's own items up once, not once per candidate
    already_known = set(get_items(user))
    # remove the ones already in the list and keep the first k
    return [item for item in ranked if item not in already_known][:k]
Which artists would be recommended to our user? They have already listened to these:
# Use user 8 as the running example and list the items they already interacted with
user_example = 8
get_items(user_example)
['Wolfgang Amadeus Mozart', 'Frédéric Chopin', 'Franz Schubert', 'Ludwig van Beethoven', 'Carl Philipp Emanuel Bach', 'Richard Strauss', 'Francis Poulenc', 'Maurice Ravel', 'Felix Mendelssohn Bartholdy', 'Giuseppe Verdi', 'Igor Stravinsky', 'Georg Friedrich Haendel']
The recommendation proposes other German composers:
# Collaborative-filtering recommendations for the example user (default k=10)
collaborative_filtering(user_example)
['Johann Strauss', 'Franz Liszt', 'Johannes Brahms', 'Richard Wagner', 'Claude Debussy', 'Johann Sebastian Bach', 'Gustav Mahler', 'Luigi Cherubini', 'Jean-Baptiste Lully', 'Antonio Vivaldi']
Define the similarity metric.
def compute_similarity(seed, target, w=1):
    """Score how similar ``target`` is to ``seed``.

    Only the dimensions that are unmasked in BOTH vectors contribute to the
    distance. The score is then scaled down by the fraction of dimensions
    that are available in ``seed`` but masked in ``target``, so the measure
    is not symmetric.

    Parameters
    ----------
    seed, target : array-like
        Embedding vectors of the same length, usually NumPy masked arrays.
        Plain arrays are treated as fully unmasked.
    w : scalar or array-like
        Per-dimension weights. A scalar is applied to every dimension.

    Returns
    -------
    float
        ``0`` when the two vectors share no usable dimension, otherwise
        ``(max_distance - d) / max_distance * (1 - penalty)``.

    Notes
    -----
    ``max_distance`` is computed for 17 dimensions with unit weights.
    NOTE(review): this presumably assumes components in [-1, 1] -- confirm.
    """
    seed_mask = np.ma.getmaskarray(seed)
    target_mask = np.ma.getmaskarray(target)
    # dimensions usable for the comparison: present in both vectors
    usable = ~(seed_mask | target_mask)
    if not usable.any():
        return 0

    # Expand a scalar weight to one weight per dimension, so the weights can
    # be filtered with the same mask as the vectors.
    weights = np.broadcast_to(np.asarray(w, dtype=float), seed_mask.shape)

    # distance
    d = weighted_l2(
        np.ma.getdata(seed)[usable],
        np.ma.getdata(target)[usable],
        weights[usable],
    )

    # how much info I am not finding: known for the seed, missing in the target
    penalty = np.count_nonzero(target_mask & ~seed_mask) / len(seed)

    # score
    s = (max_distance - d) / max_distance
    return s * (1 - penalty)


def weighted_l2(a, b, w=1):
    """Return the weighted Euclidean distance ``sqrt(sum(w * (a - b) ** 2))``."""
    q = a - b
    return np.sqrt((w * q * q).sum())


# Reference distance used to normalise the scores: the distance between the
# all-ones and the all-minus-ones vectors in 17 dimensions, with unit weights.
_ones = np.ones(17)
max_distance = weighted_l2(_ones, -_ones, _ones)
Compute the similarity score between every pair of artists and store the results in a DataFrame.
# Look every embedding up once instead of once per (seed, target) pair
labels = artists['label'].tolist()
embeddings = [get_emb(label) for label in labels]

# Fill a plain float array first: it avoids chained `.iloc[i][j] = ...`
# assignments (which may write to a temporary) and yields numeric columns.
# The diagonal stays at 1: every artist is fully similar to itself.
scores = np.ones((len(labels), len(labels)))
for i, seed_emb in enumerate(embeddings):
    for j, target_emb in enumerate(embeddings):
        if i != j:
            # rows are seeds, columns are targets (the score is not symmetric)
            scores[i, j] = compute_similarity(seed_emb, target_emb)

similarity_matrix = pd.DataFrame(scores, index=artists['label'], columns=artists['label'])
similarity_matrix.head()
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel_launcher.py:8: DeprecationWarning: in the future the special handling of scalars will be removed from delete and raise an error
label | Wolfgang Amadeus Mozart | Franz Liszt | Joseph Haydn | Johannes Brahms | Robert Schumann | Antonio Vivaldi | Roland de Lassus | Frédéric Chopin | Franz Schubert | Domenico Scarlatti | ... | Arnold Schoenberg | Bruno Mantovani | Antonín Dvořák | Piotr Ilitch Tchaïkovski | Johann Christian Bach | Aaron Copland | Ferruccio Busoni | Ralph Vaughan Williams | Zoltán Kodály | Leonard Bernstein |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
label | |||||||||||||||||||||
Wolfgang Amadeus Mozart | 1 | 0.977317 | 0.818705 | 0.981598 | 0.985071 | 0.814204 | 0.624089 | 0.976958 | 0.987308 | 0.813783 | ... | 0.803884 | 0.573075 | 0.808275 | 0.808963 | 0.818815 | 0.628535 | 0.805652 | 0.80302 | 0.797534 | 0.627795 |
Franz Liszt | 0.977317 | 1 | 0.806518 | 0.989032 | 0.983511 | 0.798843 | 0.616185 | 0.98243 | 0.979911 | 0.800846 | ... | 0.807869 | 0.576931 | 0.813884 | 0.813808 | 0.805256 | 0.6364 | 0.807672 | 0.80757 | 0.798923 | 0.635377 |
Joseph Haydn | 0.994142 | 0.979343 | 1 | 0.982494 | 0.984184 | 0.988096 | 0.794793 | 0.977726 | 0.988863 | 0.990187 | ... | 0.975743 | 0.742932 | 0.981111 | 0.982625 | 0.993293 | 0.799882 | 0.976397 | 0.974162 | 0.970276 | 0.798628 |
Johannes Brahms | 0.981598 | 0.989032 | 0.809113 | 1 | 0.988774 | 0.800753 | 0.614646 | 0.979436 | 0.982709 | 0.802281 | ... | 0.815153 | 0.578398 | 0.818782 | 0.817388 | 0.807047 | 0.638088 | 0.81323 | 0.811394 | 0.803853 | 0.637382 |
Robert Schumann | 0.985071 | 0.983511 | 0.810504 | 0.988774 | 1 | 0.804982 | 0.617933 | 0.976734 | 0.981611 | 0.804426 | ... | 0.810159 | 0.576856 | 0.813493 | 0.812816 | 0.810574 | 0.634682 | 0.814013 | 0.809981 | 0.798908 | 0.634023 |
5 rows × 100 columns
def content_based(user, k=10):
    """Recommend the artists most similar to the ones the user already has.

    For every artist the user has not interacted with, the similarity scores
    from each of the user's items (rows of ``similarity_matrix``) are summed.
    The ``k`` artists with the largest totals are returned, best first.
    """
    listened = get_items(user)
    # rows: the user's own items; columns: everything not yet in their list
    scores = similarity_matrix.loc[listened].drop(labels=listened, axis=1)
    # choose the artists that maximise the total similarity to the user's items
    ranking = scores.sum().sort_values(ascending=False)
    return ranking.index.tolist()[:k]
# Reminder of the items the example user already interacted with
get_items(user_example)
['Wolfgang Amadeus Mozart', 'Frédéric Chopin', 'Franz Schubert', 'Ludwig van Beethoven', 'Carl Philipp Emanuel Bach', 'Richard Strauss', 'Francis Poulenc', 'Maurice Ravel', 'Felix Mendelssohn Bartholdy', 'Giuseppe Verdi', 'Igor Stravinsky', 'Georg Friedrich Haendel']
# Content-based recommendations for the example user (default k=10)
content_based(user_example)
['Johannes Brahms', 'Claude Debussy', 'Robert Schumann', 'Bedřich Smetana', 'César Franck', 'Jean Sibelius', 'Carl Maria von Weber', 'Gabriel Fauré', 'Edward Elgar', 'Franz Liszt']
What happens with a user who appreciates just one particular composer?
# Append a new user whose only interaction is with Antonio Vivaldi.
# Start from an all-zero row with one entry per item column.
new_user = np.zeros(len(data.columns))
# NOTE(review): assumes the existing row labels are 0..len(data)-1, so that
# len(data) is an unused label -- confirm against data.csv.
new_user_id = len(data)
data.loc[new_user_id] = new_user
# Set the cell with a single .loc[row, column] call: chained indexing
# (data.loc[row][column] = ...) may write to a temporary copy instead.
data.loc[new_user_id, 'Antonio Vivaldi'] = 1
data.loc[[new_user_id]]
Wolfgang Amadeus Mozart | Franz Liszt | Joseph Haydn | Johannes Brahms | Robert Schumann | Antonio Vivaldi | Roland de Lassus | Frédéric Chopin | Franz Schubert | Domenico Scarlatti | ... | Arnold Schoenberg | Bruno Mantovani | Antonín Dvořák | Piotr Ilitch Tchaïkovski | Johann Christian Bach | Aaron Copland | Ferruccio Busoni | Ralph Vaughan Williams | Zoltán Kodály | Leonard Bernstein | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
100 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 rows × 100 columns
# Collaborative-filtering recommendations for the newly added user
collaborative_filtering(new_user_id)
['Wolfgang Amadeus Mozart', 'Johann Strauss', 'Claudio Monteverdi', 'Joseph Haydn', 'Piotr Ilitch Tchaïkovski', 'Georg Philipp Telemann', 'Johannes Brahms', 'Johann Sebastian Bach', 'Niccolò Paganini', 'Franz Liszt']
# Content-based recommendations for the newly added user
content_based(new_user_id)
['Alessandro Scarlatti', 'Johann Sebastian Bach', 'Georg Friedrich Haendel', 'François Couperin', 'Henry Purcell', 'Carl Philipp Emanuel Bach', 'Domenico Scarlatti', 'Georg Philipp Telemann', 'Baldassare Galuppi', 'André Campra']