In this notebook we will be experimenting with the embedding features. Using the faiss
package we can efficiently find similar movies based on the cosine similarity.
import pandas as pd
import numpy as np
from src.similarity import create_search_index, find_top_similar, compute_cosine_similarity
Here we will get the 30 nearest neighbors for each movie, using both the content-based and the collaborative-filtering embedding features.
# Metadata tables (tab-separated) that accompany the content and the
# collaborative-filtering embeddings.
content_meta = pd.read_csv('output/content_embedding_meta.tsv', sep='\t')
cb_meta = pd.read_csv('output/cb_embedding_meta.tsv', sep='\t')

# Every vector file is read with the same options: tab-separated, with the
# column labels factor_0 .. factor_299 supplied explicitly.
col_names = list(map('factor_{}'.format, range(300)))
vector_opts = {'sep': '\t', 'names': col_names}
cb_embedding = pd.read_csv('output/collab_filt_vectors.tsv', **vector_opts)
tfidf_embedding = pd.read_csv('output/tfidf_embedding_vectors.tsv', **vector_opts)
tfidf_crew_embedding = pd.read_csv('output/tfidf_crew_embedding_vectors.tsv', **vector_opts)
tfidf_crew_cast_embedding = pd.read_csv('output/tfidf_crew_cast_embedding_vectors.tsv', **vector_opts)

# Preview the first rows of the collaborative-filtering vectors.
cb_embedding.head()
factor_0 | factor_1 | factor_2 | factor_3 | factor_4 | factor_5 | factor_6 | factor_7 | factor_8 | factor_9 | ... | factor_290 | factor_291 | factor_292 | factor_293 | factor_294 | factor_295 | factor_296 | factor_297 | factor_298 | factor_299 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.13718 | -0.03387 | -0.03017 | 0.05062 | 0.03955 | -0.00645 | -0.16905 | 0.03667 | 0.06681 | -0.00628 | ... | 0.04458 | -0.00807 | -0.02188 | 0.01180 | 0.06933 | 0.11840 | -0.09077 | 0.07540 | 0.00299 | -0.07051 |
1 | 0.03843 | 0.02700 | -0.01565 | 0.02153 | -0.09215 | 0.00628 | 0.00412 | -0.04694 | 0.07562 | -0.04912 | ... | -0.01289 | 0.06133 | -0.12022 | -0.02296 | -0.00038 | 0.10698 | -0.09957 | 0.01236 | 0.07305 | -0.00638 |
2 | 0.07394 | -0.06563 | -0.05454 | 0.03857 | 0.06494 | -0.03999 | -0.12235 | -0.07590 | 0.08604 | -0.01678 | ... | -0.05933 | 0.02223 | -0.02603 | 0.00313 | 0.08757 | 0.00446 | -0.05378 | -0.09055 | 0.08543 | -0.05654 |
3 | 0.05019 | -0.07952 | -0.03727 | 0.05436 | -0.09217 | -0.02115 | -0.06872 | -0.03769 | 0.04223 | -0.06436 | ... | 0.00114 | 0.04767 | -0.15716 | -0.04998 | 0.01877 | 0.08810 | -0.11753 | -0.02773 | 0.05437 | -0.03121 |
4 | 0.00977 | 0.00258 | -0.04626 | -0.03330 | -0.04444 | 0.14469 | 0.09473 | 0.02177 | -0.00838 | 0.01385 | ... | 0.03475 | 0.06998 | -0.03812 | 0.02543 | -0.00995 | 0.09979 | -0.02725 | -0.03844 | 0.00601 | 0.03093 |
5 rows × 300 columns
%%time
# Build the search index over the collaborative-filtering vectors
# (create_search_index comes from src.similarity; the notebook intro says
# faiss is used — implementation not visible here).
cb_index = create_search_index(cb_embedding)
CPU times: user 20 ms, sys: 20 ms, total: 40 ms Wall time: 37.1 ms
%%time
# Look up the most similar movies to movie id 260 in the
# collaborative-filtering space.
# NOTE(review): 15 looks like the number of neighbours requested (the merged
# table further down lists 15 rows) — confirm against find_top_similar.
similars = find_top_similar([260], cb_index, 15, cb_embedding, cb_meta)
similars.head()
CPU times: user 368 ms, sys: 32 ms, total: 400 ms Wall time: 120 ms
id_right | similarity | id_left | |
---|---|---|---|
836 | 1196 | 0.898831 | 260 |
847 | 1210 | 0.805315 | 260 |
9828 | 122886 | 0.696018 | 260 |
10564 | 166528 | 0.646935 | 260 |
431 | 561 | 0.544250 | 260 |
# Show the metadata row of the query movie (id 260) ...
query_row = cb_meta.loc[cb_meta['id'] == 260]
display(query_row)

# ... and then its neighbours with titles attached, joining each neighbour id
# (id_right) to the metadata id.
titled_similars = similars.merge(cb_meta, left_on='id_right', right_on='id', how='inner')
display(titled_similars)
id | title | |
---|---|---|
204 | 260 | Star Wars |
id_right | similarity | id_left | id | title | |
---|---|---|---|---|---|
0 | 1196 | 0.898831 | 260 | 1196 | The Empire Strikes Back |
1 | 1210 | 0.805315 | 260 | 1210 | Return of the Jedi |
2 | 122886 | 0.696018 | 260 | 122886 | Star Wars: The Force Awakens |
3 | 166528 | 0.646935 | 260 | 166528 | Rogue One: A Star Wars Story |
4 | 561 | 0.544250 | 260 | 561 | Killer |
5 | 31247 | 0.525195 | 260 | 31247 | The Fighting Sullivans |
6 | 33493 | 0.487151 | 260 | 33493 | Star Wars: Episode III - Revenge of the Sith |
7 | 5199 | 0.470113 | 260 | 5199 | The Long Riders |
8 | 136485 | 0.467294 | 260 | 136485 | Robot Chicken: Star Wars |
9 | 168026 | 0.452794 | 260 | 168026 | Marvel One-Shot: Agent Carter |
10 | 26502 | 0.432812 | 260 | 26502 | A Christmas Carol |
11 | 6528 | 0.431993 | 260 | 6528 | Start the Revolution Without Me |
12 | 5378 | 0.426094 | 260 | 5378 | Star Wars: Episode II - Attack of the Clones |
13 | 44041 | 0.424542 | 260 | 44041 | All Quiet on the Western Front |
14 | 2628 | 0.418522 | 260 | 2628 | Star Wars: Episode I - The Phantom Menace |
%%time
# Build the search index over the TF-IDF content vectors
# (create_search_index comes from src.similarity; implementation not visible here).
tfidf_index = create_search_index(tfidf_embedding)
CPU times: user 440 ms, sys: 48 ms, total: 488 ms Wall time: 72.1 ms
%%time
# Same query as for the collaborative-filtering index (movie id 260), now
# against the TF-IDF content vectors; ids are resolved through content_meta.
# NOTE(review): 15 looks like the number of neighbours requested — confirm
# against find_top_similar.
similars = find_top_similar([260], tfidf_index, 15, tfidf_embedding, content_meta)
similars.head()
CPU times: user 396 ms, sys: 12 ms, total: 408 ms Wall time: 42.5 ms
id_right | similarity | id_left | |
---|---|---|---|
845 | 1196 | 0.900626 | 260 |
856 | 1210 | 0.876032 | 260 |
3921 | 5378 | 0.808551 | 260 |
1923 | 2628 | 0.795053 | 260 |
6935 | 33493 | 0.787766 | 260 |
# Show the metadata row of the query movie (id 260) ...
query_row = content_meta.loc[content_meta['id'] == 260]
display(query_row)

# ... and then its neighbours with titles attached, joining each neighbour id
# (id_right) to the metadata id.
titled_similars = similars.merge(content_meta, left_on='id_right', right_on='id', how='inner')
display(titled_similars)
id | title | |
---|---|---|
204 | 260 | Star Wars |
id_right | similarity | id_left | id | title | |
---|---|---|---|---|---|
0 | 1196 | 0.900626 | 260 | 1196 | The Empire Strikes Back |
1 | 1210 | 0.876032 | 260 | 1210 | Return of the Jedi |
2 | 5378 | 0.808551 | 260 | 5378 | Star Wars: Episode II - Attack of the Clones |
3 | 2628 | 0.795053 | 260 | 2628 | Star Wars: Episode I - The Phantom Menace |
4 | 33493 | 0.787766 | 260 | 33493 | Star Wars: Episode III - Revenge of the Sith |
5 | 1200 | 0.697495 | 260 | 1200 | Aliens |
6 | 113345 | 0.697349 | 260 | 113345 | Jupiter Ascending |
7 | 68358 | 0.674341 | 260 | 68358 | Star Trek |
8 | 122886 | 0.665957 | 260 | 122886 | Star Wars: The Force Awakens |
9 | 112852 | 0.659662 | 260 | 112852 | Guardians of the Galaxy |
10 | 102445 | 0.641535 | 260 | 102445 | Star Trek Into Darkness |
11 | 1097 | 0.640180 | 260 | 1097 | E.T. the Extra-Terrestrial |
12 | 166528 | 0.635602 | 260 | 166528 | Rogue One: A Star Wars Story |
13 | 316 | 0.629126 | 260 | 316 | Stargate |
14 | 1214 | 0.626877 | 260 | 1214 | Alien |
To reduce the search space, let's consider only the movies we selected when computing the similarity by user correlation:
# Pairs produced by the user-correlation similarity step.
target = pd.read_csv(
    'output/movie_similarity.csv',
    usecols=['id_left', 'id_right', 'similarity'],
)

# Keep only the left-hand ids that appear in both metadata tables, so each one
# can be looked up for either kind of embedding.
target_ids = list(
    set(target['id_left'])
    & set(content_meta['id'].values)
    & set(cb_meta['id'].values)
)
print(target_ids)
[2, 122886, 21, 57368, 2080, 34, 122918, 77866, 6187, 50, 2100, 65588, 116797, 62, 69, 70, 72, 6218, 2132, 2133, 2137, 30812, 2144, 6242, 26729, 59501, 110, 111, 112, 2160, 4223, 161922, 140, 145, 2194, 150, 161, 162, 165, 8360, 86190, 6323, 6333, 6339, 2247, 2253, 6349, 208, 4306, 4318, 223, 6377, 235, 2288, 247, 253, 2302, 55553, 6403, 260, 292, 296, 2352, 2353, 316, 318, 57669, 329, 333, 337, 8533, 2394, 2395, 4444, 350, 4446, 4447, 4452, 364, 367, 4465, 2420, 380, 4478, 6541, 2467, 6565, 441, 2490, 8633, 45499, 8641, 8643, 457, 104913, 94677, 8665, 4571, 6620, 76251, 480, 481, 6636, 493, 2542, 47610, 115210, 2571, 2572, 527, 2580, 4641, 551, 2599, 555, 6713, 94777, 588, 589, 593, 594, 4700, 608, 4718, 6769, 43635, 635, 2683, 166526, 8831, 31364, 2694, 647, 68237, 49815, 2716, 60069, 60074, 4782, 4801, 53953, 2763, 33493, 4823, 733, 736, 103141, 2791, 6893, 750, 60141, 8949, 778, 2826, 4874, 4878, 784, 786, 4886, 6934, 2840, 4901, 2858, 832, 838, 6987, 103249, 37729, 2924, 4973, 7022, 4975, 2941, 4993, 904, 908, 2959, 913, 5010, 33683, 50068, 920, 2968, 924, 2973, 62376, 2985, 2987, 2990, 109487, 95167, 72641, 3033, 996, 7147, 7153, 7154, 5107, 134130, 7162, 3074, 3082, 1042, 97304, 3101, 74789, 3114, 1073, 1089, 1090, 1091, 1092, 1093, 1097, 3147, 1101, 27728, 3160, 5218, 3175, 134248, 64620, 1136, 3210, 25750, 1176, 31900, 44191, 44195, 1193, 5291, 1196, 1197, 1201, 3253, 1206, 1208, 1209, 1210, 1214, 1215, 1216, 3264, 58559, 83134, 1222, 1225, 3273, 3275, 1231, 1235, 42197, 1240, 1244, 1245, 1246, 1252, 5349, 54503, 60647, 1258, 1259, 1266, 79091, 1268, 1270, 1272, 1278, 34048, 5377, 1285, 1287, 1288, 48394, 1291, 87306, 85261, 7438, 1307, 3362, 72998, 5418, 1333, 3384, 7482, 1343, 1347, 5445, 93510, 64839, 132424, 3408, 3409, 5463, 1370, 3421, 1374, 91500, 1390, 66934, 89470, 48516, 56715, 3471, 79251, 3499, 5580, 3536, 3555, 46578, 52722, 1527, 5629, 44555, 106002, 5669, 1594, 1608, 1610, 1619, 3671, 1625, 3676, 3677, 48738, 34405, 1639, 63082, 3699, 3708, 
3713, 1673, 1676, 59022, 1682, 52885, 1693, 5791, 3751, 1704, 3753, 50872, 1732, 134853, 1735, 52973, 5881, 53000, 69406, 3880, 1833, 104241, 1848, 1857, 32587, 48982, 1887, 71520, 1892, 48997, 3948, 1907, 3988, 3996, 1953, 1956, 1959, 112552, 1961, 1963, 6059, 1968, 81845, 81847, 4025, 4034, 2009, 2011, 2012, 2028, 30707, 4085]
%%time
# Neighbours for every id in target_ids using the collaborative-filtering
# index; the result is written to CSV without the DataFrame index.
# NOTE(review): 30 presumably is the neighbour count mentioned in the notebook
# text ("30 nearest neighbors") — confirm against find_top_similar.
cb_similars = find_top_similar(target_ids, cb_index, 30, cb_embedding, cb_meta)
cb_similars.to_csv('output/movie_similarity_cb.csv', index=False)
CPU times: user 4.74 s, sys: 188 ms, total: 4.92 s Wall time: 555 ms
%%time
# Neighbours for every id in target_ids using the TF-IDF index; the result is
# written to CSV without the DataFrame index.
# NOTE(review): 30 presumably is the neighbour count mentioned in the notebook
# text ("30 nearest neighbors") — confirm against find_top_similar.
tfidf_similars = find_top_similar(target_ids, tfidf_index, 30, tfidf_embedding, content_meta)
tfidf_similars.to_csv('output/movie_similarity_tfidf.csv', index=False)
CPU times: user 2.91 s, sys: 44 ms, total: 2.95 s Wall time: 155 ms
%%time
# Index the TF-IDF + crew vectors, query it for every id in target_ids, and
# write the result to CSV without the DataFrame index.
# NOTE(review): content_meta is reused as the id lookup, which assumes the
# rows of tfidf_crew_embedding line up with content_meta — TODO confirm.
tfidf_crew_index = create_search_index(tfidf_crew_embedding)
tfidf_crew_similars = find_top_similar(target_ids, tfidf_crew_index, 30, tfidf_crew_embedding, content_meta)
tfidf_crew_similars.to_csv('output/movie_similarity_tfidf_crew.csv', index=False)
CPU times: user 6.68 s, sys: 200 ms, total: 6.88 s Wall time: 398 ms
%%time
# Index the TF-IDF + crew + cast vectors, query it for every id in target_ids,
# and write the result to CSV without the DataFrame index.
# NOTE(review): content_meta is reused as the id lookup, which assumes the
# rows of tfidf_crew_cast_embedding line up with content_meta — TODO confirm.
tfidf_crew_cast_index = create_search_index(tfidf_crew_cast_embedding)
tfidf_crew_cast_similars = find_top_similar(target_ids, tfidf_crew_cast_index, 30, tfidf_crew_cast_embedding, content_meta)
tfidf_crew_cast_similars.to_csv('output/movie_similarity_tfidf_crew_cast.csv', index=False)
CPU times: user 5.8 s, sys: 168 ms, total: 5.97 s Wall time: 381 ms
Now, let's apply the embedding features to compute the similarity between a pair of movies. We will consider the same pairs we found by user correlation:
# Right-hand ids that are present in both metadata tables.
right_ids = list(
    set(target['id_right'])
    & set(content_meta['id'].values)
    & set(cb_meta['id'].values)
)

# Keep only the pairs whose two ids can both be looked up, rename the
# user-correlation score to 'target', and index the pairs by id_right.
left_known = target['id_left'].isin(target_ids)
right_known = target['id_right'].isin(right_ids)
target_new = (
    target.loc[left_known & right_known]
    .rename(columns={'similarity': 'target'})
    .set_index('id_right')
)
target_new.head()
id_left | target | |
---|---|---|
id_right | ||
364 | 2 | 0.335812 |
480 | 2 | 0.317939 |
367 | 2 | 0.314605 |
588 | 2 | 0.298155 |
589 | 2 | 0.263034 |
%%time
# Score every (id_left, id_right) pair of target_new with the
# collaborative-filtering embedding, keeping the user-correlation value
# ('target') next to the computed similarity.
parts = []
for left_id in target_new['id_left'].unique():
    # Pairs for this left movie; target_new is indexed by id_right, so the
    # index of `selected` holds the right-hand ids to score.
    selected = target_new.loc[target_new['id_left'] == left_id, ['target']]
    similars = compute_cosine_similarity([left_id], selected.index, cb_embedding, cb_meta)
    # Index by id_right (keeping the column) so the join below lines each
    # computed similarity up with its target value.
    similars.set_index('id_right', drop=False, inplace=True)
    parts.append(similars.join(selected, how='inner'))

# Stack the per-movie results into one table with a fresh 0..n-1 index.
cb_ranking = pd.concat(parts, axis=0, ignore_index=True)
cb_ranking.to_csv('output/movie_ranking_cb.csv', index=False)
cb_ranking.head()
CPU times: user 3.26 s, sys: 0 ns, total: 3.26 s Wall time: 3.26 s
id_right | similarity | id_left | target | |
---|---|---|---|---|
0 | 253 | 0.343631 | 2 | 0.237054 |
1 | 110 | 0.289399 | 2 | 0.242349 |
2 | 2011 | 0.279259 | 2 | 0.248665 |
3 | 165 | 0.244945 | 2 | 0.234542 |
4 | 1097 | 0.237925 | 2 | 0.233132 |
%%time
# Score every (id_left, id_right) pair of target_new with the TF-IDF
# embedding, keeping the user-correlation value ('target') next to the
# computed similarity.
parts = []
for left_id in target_new['id_left'].unique():
    # Pairs for this left movie; target_new is indexed by id_right, so the
    # index of `selected` holds the right-hand ids to score.
    selected = target_new.loc[target_new['id_left'] == left_id, ['target']]
    similars = compute_cosine_similarity([left_id], selected.index, tfidf_embedding, content_meta)
    # Index by id_right (keeping the column) so the join below lines each
    # computed similarity up with its target value.
    similars.set_index('id_right', drop=False, inplace=True)
    parts.append(similars.join(selected, how='inner'))

# Stack the per-movie results into one table with a fresh 0..n-1 index.
tfidf_ranking = pd.concat(parts, axis=0, ignore_index=True)
tfidf_ranking.to_csv('output/movie_ranking_tfidf.csv', index=False)
tfidf_ranking.head()
CPU times: user 2.9 s, sys: 0 ns, total: 2.9 s Wall time: 2.9 s
id_right | similarity | id_left | target | |
---|---|---|---|---|
0 | 4886 | 0.433568 | 2 | 0.235485 |
1 | 110 | 0.401254 | 2 | 0.242349 |
2 | 4306 | 0.387088 | 2 | 0.240394 |
3 | 1097 | 0.383907 | 2 | 0.233132 |
4 | 1291 | 0.352362 | 2 | 0.232228 |
%%time
# Score every (id_left, id_right) pair of target_new with the TF-IDF + crew
# embedding, keeping the user-correlation value ('target') next to the
# computed similarity.
parts = []
for left_id in target_new['id_left'].unique():
    # Pairs for this left movie; target_new is indexed by id_right, so the
    # index of `selected` holds the right-hand ids to score.
    selected = target_new.loc[target_new['id_left'] == left_id, ['target']]
    similars = compute_cosine_similarity([left_id], selected.index, tfidf_crew_embedding, content_meta)
    # Index by id_right (keeping the column) so the join below lines each
    # computed similarity up with its target value.
    similars.set_index('id_right', drop=False, inplace=True)
    parts.append(similars.join(selected, how='inner'))

# Stack the per-movie results into one table with a fresh 0..n-1 index.
tfidf_crew_ranking = pd.concat(parts, axis=0, ignore_index=True)
tfidf_crew_ranking.to_csv('output/movie_ranking_tfidf_crew.csv', index=False)
tfidf_crew_ranking.head()
CPU times: user 2.75 s, sys: 0 ns, total: 2.75 s Wall time: 2.74 s
id_right | similarity | id_left | target | |
---|---|---|---|---|
0 | 6377 | 0.382239 | 2 | 0.229354 |
1 | 4306 | 0.336810 | 2 | 0.240394 |
2 | 364 | 0.310683 | 2 | 0.335812 |
3 | 1097 | 0.279052 | 2 | 0.233132 |
4 | 4886 | 0.272782 | 2 | 0.235485 |
%%time
# Score every (id_left, id_right) pair of target_new with the TF-IDF + crew +
# cast embedding, keeping the user-correlation value ('target') next to the
# computed similarity.
parts = []
for left_id in target_new['id_left'].unique():
    # Pairs for this left movie; target_new is indexed by id_right, so the
    # index of `selected` holds the right-hand ids to score.
    selected = target_new.loc[target_new['id_left'] == left_id, ['target']]
    similars = compute_cosine_similarity([left_id], selected.index, tfidf_crew_cast_embedding, content_meta)
    # Index by id_right (keeping the column) so the join below lines each
    # computed similarity up with its target value.
    similars.set_index('id_right', drop=False, inplace=True)
    parts.append(similars.join(selected, how='inner'))

# Stack the per-movie results into one table with a fresh 0..n-1 index.
tfidf_crew_cast_ranking = pd.concat(parts, axis=0, ignore_index=True)
tfidf_crew_cast_ranking.to_csv('output/movie_ranking_tfidf_crew_cast.csv', index=False)
tfidf_crew_cast_ranking.head()
CPU times: user 2.93 s, sys: 0 ns, total: 2.93 s Wall time: 2.93 s
id_right | similarity | id_left | target | |
---|---|---|---|---|
0 | 4306 | 0.384327 | 2 | 0.240394 |
1 | 2716 | 0.223461 | 2 | 0.222825 |
2 | 364 | 0.164086 | 2 | 0.335812 |
3 | 736 | 0.158615 | 2 | 0.256185 |
4 | 2011 | 0.147242 | 2 | 0.248665 |