import scipy.sparse as sparse
import pandas as pd
import numpy as np
# Load the play-count matrix from a SciPy sparse .npz file.
# NOTE(review): later cells treat rows as songs (row sums are used as per-song
# listener counts) — confirm the orientation against the code that wrote this file.
play_count_matrix = sparse.load_npz('play_count_matrix.npz')
# Load the song metadata table stored under the 'data' key of the HDF file.
metadata = pd.read_hdf('metadata.hdf', key='data')
두 노래를 모두 들어본 사람의 수를 구하기 위해서는 먼저 '어떤 노래를 들어본 적 있는가? -> 1, 그렇지 않으면 -> 0' 과 같은 형태로 플레이 카운트 행렬을 이진화 해야 합니다.
이진화를 하고 나면, 간단하게 X * X.T를 수행함으로써 두 노래를 모두 들어본 사람의 수를 계산할 수 있습니다.
# Binarize: every stored positive play count becomes 1 ("has listened").
# Writing to `.data` touches only the stored entries, which is equivalent to the
# boolean-mask assignment on the sparse matrix but avoids sparse fancy indexing.
play_count_binary = play_count_matrix.tocsr(copy=True)
play_count_binary.data[play_count_binary.data > 0] = 1
# Pairwise intersections: entry (i, j) is the dot product of binary rows i and j,
# i.e. the number of columns in which both rows are 1.
# `@` is used because `*` is matrix multiplication only for scipy `spmatrix`
# objects; on scipy sparse *arrays* it is element-wise.
all_pair_intersections = (play_count_binary @ play_count_binary.transpose()).toarray()
all_pair_intersections.shape
(10000, 10000)
def query_intersection(song_name, return_k=10):
    """Rank songs by the number of listeners they share with ``song_name``.

    Args:
        song_name: Title compared for equality against ``metadata.title``.
        return_k: Maximum number of neighbouring songs to return.

    Returns:
        A list whose first element is the code of the query song, followed by
        up to ``return_k`` codes with the largest intersection counts, in
        descending order.

    Raises:
        IndexError: If no metadata row has the given title.
    """
    title_rows = metadata[metadata.title == song_name]
    # Category code of the first matching row's `song` value; it is used as the
    # row index into the intersection matrix.
    # NOTE(review): assumes the category codes follow the matrix row order — confirm.
    query_code = title_rows.song.cat.codes.values[0]
    # Take one extra candidate because the query song itself normally ranks first.
    shortlist = all_pair_intersections[query_code].argsort()[::-1][:return_k + 1]
    neighbours = shortlist[shortlist != query_code][:return_k]
    # The query song is placed at the top of the result.
    return [query_code] + list(neighbours)
# Show the metadata rows for the query song and its top matches by shared-listener count.
metadata.loc[query_intersection('You Belong With Me')]
track | song | artist | title | style | genre | |
---|---|---|---|---|---|---|
code | ||||||
7330 | TRJPXVB128F9316916 | SOSROFB12AAF3B4C5D | Taylor Swift | You Belong With Me | Country_Traditional | Country |
7761 | TRSDRPY128F933E202 | SOTWSXL12A8C143349 | Taylor Swift | Love Story | NaN | NaN |
7761 | TROPUMP128F92EC162 | SOTWSXL12A8C143349 | Taylor Swift | Love Story | Country_Traditional | Country |
5509 | TROAQBZ128F9326213 | SONYKOW12AB01849C9 | OneRepublic | Secrets | Rock_Contemporary | Pop_Rock |
4414 | TRVSBTV12903CC6670 | SOLFXKT12AB017E3E0 | Charttraxx Karaoke | Fireflies | NaN | NaN |
355 | TRHKJNX12903CEFCDF | SOAXGDH12A8C13F8A1 | Florence + The Machine | Dog Days Are Over (Radio Edit) | NaN | NaN |
8567 | TRIEXMF128F92FDD60 | SOWCKVR12A8C142411 | Kings Of Leon | Use Somebody | NaN | Pop_Rock |
8567 | TRTEHXL128F931687B | SOWCKVR12A8C142411 | Kings Of Leon | Use Somebody | NaN | NaN |
620 | TRCPAGR128F423A01A | SOBOAFP12A8C131F36 | Jason Mraz & Colbie Caillat | Lucky (Album Version) | NaN | NaN |
8065 | TRSLDDC12903CC36E7 | SOUSMXX12AB0185C24 | Usher featuring will.i.am | OMG | NaN | NaN |
4254 | TRCBRTN12903CC4BD1 | SOKUPAO12AB018D576 | Paramore | The Only Exception (Album Version) | NaN | NaN |
7395 | TRONYHY128F92C9D11 | SOSXLTC12AF72A7F54 | Kings Of Leon | Revelry | Rock_College | Pop_Rock |
6216 | TROHFJK12903CC4BCE | SOPTLQL12AB018D56F | Travie McCoy | Billionaire [feat. Bruno Mars] (Explicit Albu... | NaN | NaN |
# Same shared-listener query for a second song.
metadata.loc[query_intersection('Fix You')]
track | song | artist | title | style | genre | |
---|---|---|---|---|---|---|
code | ||||||
8597 | TRYVBMA128E0789D39 | SOWEJXA12A6701C574 | Coldplay | Fix You | NaN | NaN |
1125 | TRENTGL128E0780C8E | SOCVTLJ12A6310F0FD | Coldplay | Clocks | Rock_College | Pop_Rock |
4114 | TRQFXKD128E0780CAE | SOKLRPJ12A8C13C3FE | Coldplay | The Scientist | Pop_Contemporary | Pop_Rock |
6284 | TRIKGRK128E0780DB0 | SOPXKYD12A6D4FA876 | Coldplay | Yellow | Pop_Contemporary | Pop_Rock |
6284 | TRTZNQZ12903CD044C | SOPXKYD12A6D4FA876 | Coldplay | Yellow | NaN | NaN |
5509 | TROAQBZ128F9326213 | SONYKOW12AB01849C9 | OneRepublic | Secrets | Rock_Contemporary | Pop_Rock |
355 | TRHKJNX12903CEFCDF | SOAXGDH12A8C13F8A1 | Florence + The Machine | Dog Days Are Over (Radio Edit) | NaN | NaN |
8567 | TRIEXMF128F92FDD60 | SOWCKVR12A8C142411 | Kings Of Leon | Use Somebody | NaN | Pop_Rock |
8567 | TRTEHXL128F931687B | SOWCKVR12A8C142411 | Kings Of Leon | Use Somebody | NaN | NaN |
4414 | TRVSBTV12903CC6670 | SOLFXKT12AB017E3E0 | Charttraxx Karaoke | Fireflies | NaN | NaN |
7395 | TRONYHY128F92C9D11 | SOSXLTC12AF72A7F54 | Kings Of Leon | Revelry | Rock_College | Pop_Rock |
620 | TRCPAGR128F423A01A | SOBOAFP12A8C131F36 | Jason Mraz & Colbie Caillat | Lucky (Album Version) | NaN | NaN |
4254 | TRCBRTN12903CC4BD1 | SOKUPAO12AB018D576 | Paramore | The Only Exception (Album Version) | NaN | NaN |
# Same shared-listener query for a third song.
metadata.loc[query_intersection('Welcome To The Black Parade (Album Version)')]
track | song | artist | title | style | genre | |
---|---|---|---|---|---|---|
code | ||||||
6562 | TRKJHNE128F42380FC | SOQQAAQ12A67ADE34D | My Chemical Romance | Welcome To The Black Parade (Album Version) | Grunge_Emo | Pop_Rock |
4714 | TRYZMOC128F423E58D | SOLXXZI12A8AE4733A | My Chemical Romance | Helena (So Long & Goodnight) (Album Version) | Grunge_Emo | Pop_Rock |
5509 | TROAQBZ128F9326213 | SONYKOW12AB01849C9 | OneRepublic | Secrets | Rock_Contemporary | Pop_Rock |
4414 | TRVSBTV12903CC6670 | SOLFXKT12AB017E3E0 | Charttraxx Karaoke | Fireflies | NaN | NaN |
6078 | TRRJZWL128F146D790 | SOPKPSQ12A58A7A5E4 | My Chemical Romance | I'm Not Okay (I Promise) (Live From Sessions@AOL) | Grunge_Emo | Pop_Rock |
968 | TRQTLTB128F92F785B | SOCKSGZ12A58A7CA4B | Linkin Park | Bleed It Out [Live At Milton Keynes] | NaN | NaN |
9932 | TRNTALF128EF343800 | SOZVCRW12A67ADA0B7 | The Killers | When You Were Young | NaN | NaN |
7935 | TRRNFHH128F92D262D | SOUJVIT12A8C1451C1 | Rise Against | Savior | Punk | Pop_Rock |
1125 | TRENTGL128E0780C8E | SOCVTLJ12A6310F0FD | Coldplay | Clocks | Rock_College | Pop_Rock |
355 | TRHKJNX12903CEFCDF | SOAXGDH12A8C13F8A1 | Florence + The Machine | Dog Days Are Over (Radio Edit) | NaN | NaN |
8567 | TRIEXMF128F92FDD60 | SOWCKVR12A8C142411 | Kings Of Leon | Use Somebody | NaN | Pop_Rock |
8567 | TRTEHXL128F931687B | SOWCKVR12A8C142411 | Kings Of Leon | Use Somebody | NaN | NaN |
위의 결과를 보면, 블로그에서와 똑같이, 유명한 곡이 유사도와 관계 없이 계속 상위권을 차지하는 문제를 볼 수 있습니다. 이를 약간 완화하는 방법에는 Jaccard Index가 있는데, 이는 단순히 교집합의 크기를 비교하는 것이 아니라, (교집합의 크기) / (합집합의 크기) 를 이용합니다. 즉, 집합의 크기로 노멀라이즈 하는 것이라고 생각해볼 수 있겠죠?
다행히 자카드 지수에 대한 쿼리 역시 교집합 행렬을 이용하면 간단하게 계산할 수 있습니다. 합집합의 크기는 $|A \cup B| = |A| + |B| - |A \cap B|$ 이므로, 각 노래를 들어본 사람의 수를 열 벡터로 나타낸 뒤 이 벡터와 그 전치를 더하고 교집합 행렬을 빼 주면 모든 쌍에 대한 합집합의 크기도 간단하게 구할 수 있습니다. (참고: numpy broadcasting)
# Row sums of the binary matrix: for each song, how many entries are 1,
# i.e. the number of listeners. The result keeps a column shape (n_songs, 1).
play_count_binary_per_song = play_count_binary.sum(axis=1)
play_count_binary_per_song.shape
(10000, 1)
# Inclusion-exclusion: |A ∪ B| = |A| - |A ∩ B| + |B|.
# The (n, 1) per-song counts and their (1, n) transpose broadcast against the
# (n, n) intersection matrix.
all_pair_union = play_count_binary_per_song - all_pair_intersections + play_count_binary_per_song.transpose()
all_pair_union.shape
(10000, 10000)
# Jaccard index for every pair of songs: |A ∩ B| / |A ∪ B|.
# Pairs with an empty union (0 / 0) are set to 0.0 instead of NaN: NaN entries
# sort last in `argsort()`, so they would come first in the reversed ranking
# used when querying, and the plain division also emits a RuntimeWarning.
all_pair_jaccard = np.divide(
    all_pair_intersections,
    np.asarray(all_pair_union),
    out=np.zeros(np.shape(all_pair_union), dtype=np.float64),
    where=np.asarray(all_pair_union) != 0,
)
all_pair_jaccard.shape
(10000, 10000)
아까랑 똑같습니다. 그냥 정렬하면 끝.
def query_jaccard(song_name, return_k=10):
    """Rank songs by their Jaccard index with ``song_name``.

    Args:
        song_name: Title compared for equality against ``metadata.title``.
        return_k: Maximum number of neighbouring songs to return.

    Returns:
        A list whose first element is the code of the query song, followed by
        up to ``return_k`` codes with the highest Jaccard index, in descending
        order.

    Raises:
        IndexError: If no metadata row has the given title.
    """
    title_rows = metadata[metadata.title == song_name]
    # Category code of the first matching row's `song` value, used as the row
    # index into the Jaccard matrix.
    # NOTE(review): assumes the category codes follow the matrix row order — confirm.
    query_code = title_rows.song.cat.codes.values[0]
    # Take one extra candidate so that dropping the query song still leaves return_k.
    shortlist = all_pair_jaccard[query_code].argsort()[::-1][:return_k + 1]
    neighbours = shortlist[shortlist != query_code][:return_k]
    return [query_code] + list(neighbours)
# Show the metadata rows for the query song and its top matches by Jaccard index.
metadata.loc[query_jaccard('You Belong With Me')]
track | song | artist | title | style | genre | |
---|---|---|---|---|---|---|
code | ||||||
7330 | TRJPXVB128F9316916 | SOSROFB12AAF3B4C5D | Taylor Swift | You Belong With Me | Country_Traditional | Country |
7761 | TRSDRPY128F933E202 | SOTWSXL12A8C143349 | Taylor Swift | Love Story | NaN | NaN |
7761 | TROPUMP128F92EC162 | SOTWSXL12A8C143349 | Taylor Swift | Love Story | Country_Traditional | Country |
977 | TRGMZNT128F92DE267 | SOCLMAD12AB017FC09 | Taylor Swift | Tim McGraw | Country_Traditional | Country |
2664 | TRLVQME128F931BAF3 | SOGTQNI12AB0184A5C | Owl City | Vanilla Twilight | Grunge_Emo | Pop_Rock |
3418 | TRBTNPR12903D13765 | SOISNSU12AC468C0D8 | Adam Lambert | If I Had You | NaN | NaN |
2195 | TRBNYBX128F422EC61 | SOFRCGW12A81C21EA6 | Plain White T's | Hey There Delilah | NaN | NaN |
8691 | TRLCLEM128F93402D3 | SOWKQYL12AB0183B15 | Jason Derulo | Whatcha Say | Hip_Hop_Rap | RnB |
620 | TRCPAGR128F423A01A | SOBOAFP12A8C131F36 | Jason Mraz & Colbie Caillat | Lucky (Album Version) | NaN | NaN |
5676 | TRMOYCC128C7196947 | SOOJJCT12A6310E1C0 | 3 Doors Down | Here Without You | NaN | Pop_Rock |
2634 | TRMEQQX12903CCD9D5 | SOGSAYQ12AB018BA14 | Sean Kingston and Justin Bieber | Eenie Meenie | NaN | NaN |
4254 | TRCBRTN12903CC4BD1 | SOKUPAO12AB018D576 | Paramore | The Only Exception (Album Version) | NaN | NaN |
# Same Jaccard query for a second song.
metadata.loc[query_jaccard('Fix You')]
track | song | artist | title | style | genre | |
---|---|---|---|---|---|---|
code | ||||||
8597 | TRYVBMA128E0789D39 | SOWEJXA12A6701C574 | Coldplay | Fix You | NaN | NaN |
4114 | TRQFXKD128E0780CAE | SOKLRPJ12A8C13C3FE | Coldplay | The Scientist | Pop_Contemporary | Pop_Rock |
1125 | TRENTGL128E0780C8E | SOCVTLJ12A6310F0FD | Coldplay | Clocks | Rock_College | Pop_Rock |
6284 | TRIKGRK128E0780DB0 | SOPXKYD12A6D4FA876 | Coldplay | Yellow | Pop_Contemporary | Pop_Rock |
6284 | TRTZNQZ12903CD044C | SOPXKYD12A6D4FA876 | Coldplay | Yellow | NaN | NaN |
7873 | TRXWAZC128F9314B3E | SOUFPNI12A8C142D19 | John Mayer | Heartbreak Warfare | Rock_Contemporary | Pop_Rock |
620 | TRCPAGR128F423A01A | SOBOAFP12A8C131F36 | Jason Mraz & Colbie Caillat | Lucky (Album Version) | NaN | NaN |
4254 | TRCBRTN12903CC4BD1 | SOKUPAO12AB018D576 | Paramore | The Only Exception (Album Version) | NaN | NaN |
8567 | TRIEXMF128F92FDD60 | SOWCKVR12A8C142411 | Kings Of Leon | Use Somebody | NaN | Pop_Rock |
8567 | TRTEHXL128F931687B | SOWCKVR12A8C142411 | Kings Of Leon | Use Somebody | NaN | NaN |
7755 | TRKRRUV128F92F20F1 | SOTWNDJ12A8C143984 | Train | Marry Me | NaN | NaN |
7761 | TRSDRPY128F933E202 | SOTWSXL12A8C143349 | Taylor Swift | Love Story | NaN | NaN |
7761 | TROPUMP128F92EC162 | SOTWSXL12A8C143349 | Taylor Swift | Love Story | Country_Traditional | Country |
4414 | TRVSBTV12903CC6670 | SOLFXKT12AB017E3E0 | Charttraxx Karaoke | Fireflies | NaN | NaN |
# Same Jaccard query for a third song.
metadata.loc[query_jaccard('Welcome To The Black Parade (Album Version)')]
track | song | artist | title | style | genre | |
---|---|---|---|---|---|---|
code | ||||||
6562 | TRKJHNE128F42380FC | SOQQAAQ12A67ADE34D | My Chemical Romance | Welcome To The Black Parade (Album Version) | Grunge_Emo | Pop_Rock |
6078 | TRRJZWL128F146D790 | SOPKPSQ12A58A7A5E4 | My Chemical Romance | I'm Not Okay (I Promise) (Live From Sessions@AOL) | Grunge_Emo | Pop_Rock |
4714 | TRYZMOC128F423E58D | SOLXXZI12A8AE4733A | My Chemical Romance | Helena (So Long & Goodnight) (Album Version) | Grunge_Emo | Pop_Rock |
2583 | TRHWCVS128F14895A3 | SOGOZLT12A6D4FB302 | My Chemical Romance | Teenagers (Album Version) | Grunge_Emo | Pop_Rock |
8774 | TRWLOCY128F9338118 | SOWPLVJ12AB0183586 | Fall Out Boy | Sugar_ We're Goin Down | NaN | NaN |
8774 | TRSSSJW128F146D5DC | SOWPLVJ12AB0183586 | Fall Out Boy | Sugar_ We're Goin Down | NaN | NaN |
9415 | TRSJBHU128F1496F3E | SOYIOZB12A58A797FC | Fall Out Boy | This Ain't A Scene_ It's An Arms Race | NaN | NaN |
5882 | TRMAFWC128F423E58F | SOOXMSN12A58A7A8D3 | My Chemical Romance | To The End (Album Version) | Grunge_Emo | Pop_Rock |
3796 | TRZIQMY128F146E70D | SOJRFWQ12AB0183582 | Fall Out Boy | Dance_ Dance | NaN | NaN |
3796 | TRJQKHL128F9304295 | SOJRFWQ12AB0183582 | Fall Out Boy | Dance_ Dance | NaN | NaN |
3796 | TRYMVOW128F92ECE27 | SOJRFWQ12AB0183582 | Fall Out Boy | Dance_ Dance | NaN | NaN |
8086 | TREFJVA128F423E591 | SOUTXQX12A8AE4734A | My Chemical Romance | The Ghost Of You (Album Version) | Grunge_Emo | Pop_Rock |
5711 | TRLOMKL128F4295BDA | SOOLYZQ12A6D4FA5B7 | The Red Jumpsuit Apparatus | Face Down (Album Version) | NaN | NaN |
9284 | TRVVSIM128F423E594 | SOXZEWL12A8AE47353 | My Chemical Romance | Thank You For The Venom (Album Version) | Grunge_Emo | Pop_Rock |
이제는 넓은 범위에서 봤을 때엔 비슷하다고 할 수 있는 노래가 나옵니다. 여전히 <You Belong With Me> 에서 <Whatcha Say> 가 나오는 등, 뜬금없는 경우가 존재하지만요.
def tf_idf(matrix):
    """Apply a TF-IDF style weighting to a sparse count matrix.

    The term-frequency part is the square root of every stored value. The
    inverse-document-frequency part is one weight per row:
    ``1 + log(1 + n_columns) - log(1 + stored_entries_in_row)``.

    Args:
        matrix: A SciPy sparse matrix of counts.

    Returns:
        A CSR matrix of the same shape holding the weighted values.
    """
    tf = matrix.sqrt()
    # Number of stored entries in each row. `minlength` guarantees one count per
    # row; without it `bincount` stops at the last non-empty row and the weight
    # vector no longer matches the matrix when trailing rows are empty.
    entries_per_row = np.bincount(matrix.tocoo().row, minlength=matrix.shape[0])
    idf = 1 + np.log1p(matrix.shape[-1]) - np.log1p(entries_per_row)
    # NOTE(review): the weight is constant within a row, so it cancels out in a
    # row-versus-row cosine similarity — confirm this is the intended axis.
    return tf.multiply(idf.reshape(-1, 1)).tocsr()
# Weight the raw play counts with the TF-IDF style transform defined above.
play_count_tfidf = tf_idf(play_count_matrix)
def cosine_similarity_sparse(v1, v2):
    """Return the cosine similarity of two sparse row vectors.

    Args:
        v1: A sparse matrix with a single row.
        v2: A sparse matrix with a single row and the same number of columns.

    Returns:
        ``v1 · v2 / (|v1| |v2|)`` as a scalar, or ``0.0`` when either vector
        has zero norm.
    """
    dot = v1.dot(v2.transpose())[0, 0]
    norm1 = np.sqrt(v1.dot(v1.transpose())[0, 0])
    norm2 = np.sqrt(v2.dot(v2.transpose())[0, 0])
    # An all-zero vector would give 0 / 0 = NaN, and NaN ends up first in a
    # descending `argsort()[::-1]` ranking; treat it as "no similarity".
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot / norm1 / norm2
사실 이렇게 큰 벡터를 비교할 때에는 일일이 비교하기보다 Approximate Nearest Neighbor를 사용하는게 상식적인 접근이지만, 우선은 그냥 해 봅시다.
def query_tfidf(song_name, return_k=10):
    """Rank songs by cosine similarity of their TF-IDF weighted rows.

    The similarity is computed row by row against every row of
    ``play_count_tfidf`` (brute force, no approximate nearest-neighbour index).

    Args:
        song_name: Title compared for equality against ``metadata.title``.
        return_k: Maximum number of neighbouring songs to return.

    Returns:
        A list whose first element is the code of the query song, followed by
        up to ``return_k`` codes with the highest cosine similarity, in
        descending order.

    Raises:
        IndexError: If no metadata row has the given title.
    """
    title_rows = metadata[metadata.title == song_name]
    # Category code of the first matching row's `song` value, used as the row
    # index into the TF-IDF matrix.
    # NOTE(review): assumes the category codes follow the matrix row order — confirm.
    query_code = title_rows.song.cat.codes.values[0]
    query_vec = play_count_tfidf[query_code]
    n_rows = play_count_tfidf.shape[0]
    scores = np.array(
        [cosine_similarity_sparse(query_vec, play_count_tfidf[row]) for row in range(n_rows)],
        dtype=np.float64,
    )
    # Take one extra candidate so that dropping the query song still leaves return_k.
    shortlist = scores.argsort()[::-1][:return_k + 1]
    neighbours = shortlist[shortlist != query_code][:return_k]
    return [query_code] + list(neighbours)
# Show the metadata rows for the query song and its top matches by TF-IDF cosine similarity.
metadata.loc[query_tfidf('You Belong With Me')]
track | song | artist | title | style | genre | |
---|---|---|---|---|---|---|
code | ||||||
7330 | TRJPXVB128F9316916 | SOSROFB12AAF3B4C5D | Taylor Swift | You Belong With Me | Country_Traditional | Country |
7761 | TRSDRPY128F933E202 | SOTWSXL12A8C143349 | Taylor Swift | Love Story | NaN | NaN |
7761 | TROPUMP128F92EC162 | SOTWSXL12A8C143349 | Taylor Swift | Love Story | Country_Traditional | Country |
977 | TRGMZNT128F92DE267 | SOCLMAD12AB017FC09 | Taylor Swift | Tim McGraw | Country_Traditional | Country |
5241 | TRFSCMH128F425CB85 | SONHLJN12A81C2169B | Mickie Krause | Orange Trägt Nur Die Müllabfuhr (Go West) | Pop_Contemporary | Pop_Rock |
4414 | TRVSBTV12903CC6670 | SOLFXKT12AB017E3E0 | Charttraxx Karaoke | Fireflies | NaN | NaN |
6216 | TROHFJK12903CC4BCE | SOPTLQL12AB018D56F | Travie McCoy | Billionaire [feat. Bruno Mars] (Explicit Albu... | NaN | NaN |
8474 | TRGCHLH12903CB7352 | SOVWADY12AB0189C63 | Miley Cyrus | Party In The U.S.A. | NaN | Pop_Rock |
620 | TRCPAGR128F423A01A | SOBOAFP12A8C131F36 | Jason Mraz & Colbie Caillat | Lucky (Album Version) | NaN | NaN |
5509 | TROAQBZ128F9326213 | SONYKOW12AB01849C9 | OneRepublic | Secrets | Rock_Contemporary | Pop_Rock |
4254 | TRCBRTN12903CC4BD1 | SOKUPAO12AB018D576 | Paramore | The Only Exception (Album Version) | NaN | NaN |
2195 | TRBNYBX128F422EC61 | SOFRCGW12A81C21EA6 | Plain White T's | Hey There Delilah | NaN | NaN |
# Same TF-IDF cosine query for a second song.
metadata.loc[query_tfidf('Fix You')]
track | song | artist | title | style | genre | |
---|---|---|---|---|---|---|
code | ||||||
8597 | TRYVBMA128E0789D39 | SOWEJXA12A6701C574 | Coldplay | Fix You | NaN | NaN |
4114 | TRQFXKD128E0780CAE | SOKLRPJ12A8C13C3FE | Coldplay | The Scientist | Pop_Contemporary | Pop_Rock |
1125 | TRENTGL128E0780C8E | SOCVTLJ12A6310F0FD | Coldplay | Clocks | Rock_College | Pop_Rock |
6284 | TRIKGRK128E0780DB0 | SOPXKYD12A6D4FA876 | Coldplay | Yellow | Pop_Contemporary | Pop_Rock |
6284 | TRTZNQZ12903CD044C | SOPXKYD12A6D4FA876 | Coldplay | Yellow | NaN | NaN |
7944 | TRYNYSX128E07897B3 | SOUKJBT12A6701C4D6 | Coldplay | Speed Of Sound | Pop_Contemporary | Pop_Rock |
5509 | TROAQBZ128F9326213 | SONYKOW12AB01849C9 | OneRepublic | Secrets | Rock_Contemporary | Pop_Rock |
4414 | TRVSBTV12903CC6670 | SOLFXKT12AB017E3E0 | Charttraxx Karaoke | Fireflies | NaN | NaN |
8567 | TRIEXMF128F92FDD60 | SOWCKVR12A8C142411 | Kings Of Leon | Use Somebody | NaN | Pop_Rock |
8567 | TRTEHXL128F931687B | SOWCKVR12A8C142411 | Kings Of Leon | Use Somebody | NaN | NaN |
7873 | TRXWAZC128F9314B3E | SOUFPNI12A8C142D19 | John Mayer | Heartbreak Warfare | Rock_Contemporary | Pop_Rock |
4254 | TRCBRTN12903CC4BD1 | SOKUPAO12AB018D576 | Paramore | The Only Exception (Album Version) | NaN | NaN |
355 | TRHKJNX12903CEFCDF | SOAXGDH12A8C13F8A1 | Florence + The Machine | Dog Days Are Over (Radio Edit) | NaN | NaN |
# Same TF-IDF cosine query for a third song.
metadata.loc[query_tfidf('Welcome To The Black Parade (Album Version)')]
track | song | artist | title | style | genre | |
---|---|---|---|---|---|---|
code | ||||||
6562 | TRKJHNE128F42380FC | SOQQAAQ12A67ADE34D | My Chemical Romance | Welcome To The Black Parade (Album Version) | Grunge_Emo | Pop_Rock |
6078 | TRRJZWL128F146D790 | SOPKPSQ12A58A7A5E4 | My Chemical Romance | I'm Not Okay (I Promise) (Live From Sessions@AOL) | Grunge_Emo | Pop_Rock |
4714 | TRYZMOC128F423E58D | SOLXXZI12A8AE4733A | My Chemical Romance | Helena (So Long & Goodnight) (Album Version) | Grunge_Emo | Pop_Rock |
2583 | TRHWCVS128F14895A3 | SOGOZLT12A6D4FB302 | My Chemical Romance | Teenagers (Album Version) | Grunge_Emo | Pop_Rock |
5882 | TRMAFWC128F423E58F | SOOXMSN12A58A7A8D3 | My Chemical Romance | To The End (Album Version) | Grunge_Emo | Pop_Rock |
9415 | TRSJBHU128F1496F3E | SOYIOZB12A58A797FC | Fall Out Boy | This Ain't A Scene_ It's An Arms Race | NaN | NaN |
7303 | TRQFMAE128F92FA8F2 | SOSPING12A58A7B4FF | Lou Reed & John Cale | I Believe (LP Version) | NaN | NaN |
3796 | TRZIQMY128F146E70D | SOJRFWQ12AB0183582 | Fall Out Boy | Dance_ Dance | NaN | NaN |
3796 | TRJQKHL128F9304295 | SOJRFWQ12AB0183582 | Fall Out Boy | Dance_ Dance | NaN | NaN |
3796 | TRYMVOW128F92ECE27 | SOJRFWQ12AB0183582 | Fall Out Boy | Dance_ Dance | NaN | NaN |
8396 | TRIAOSK128E0785308 | SOVPSKL12A670206B9 | Jaci Velasquez | Something (Album Version) | Pop_Latin | Religious |
5711 | TRLOMKL128F4295BDA | SOOLYZQ12A6D4FA5B7 | The Red Jumpsuit Apparatus | Face Down (Album Version) | NaN | NaN |
7091 | TRQUXPB128F147C1DA | SOSCGFN12AF729B231 | My Chemical Romance | Vampires Will Never Hurt You | NaN | NaN |
그야말로 총체적 난국. 처음의 문제가 그다지 완화되지 않은 채 계속 발생하고, 그로 인해 추천 퀄리티가 자카드 지수 때보다 오히려 안 좋아졌습니다.
이번에는 SVD를 이용해서 희소 행렬에 숨어있는 취향의 구조를 찾아내 봅시다.
from sklearn.decomposition import TruncatedSVD
def svd(matrix, K, n_iter=10, random_state=None):
    """Reduce ``matrix`` to ``K`` components with truncated SVD.

    Args:
        matrix: Input matrix (dense or sparse) handed to
            ``TruncatedSVD.fit_transform``.
        K: Number of components to keep.
        n_iter: Number of iterations passed to ``TruncatedSVD``. Defaults to
            10, the value that was previously hard-coded.
        random_state: Seed or random state passed to ``TruncatedSVD``. The
            default ``None`` keeps the previous unseeded behaviour; pass an
            integer to make the decomposition reproducible.

    Returns:
        The transformed matrix returned by ``fit_transform``, with one row per
        input row and ``K`` columns.
    """
    transformer = TruncatedSVD(n_components=K, n_iter=n_iter, random_state=random_state)
    return transformer.fit_transform(matrix)
# Project the TF-IDF weighted matrix onto 50 SVD components.
play_count_lsa = svd(play_count_tfidf, K=50)
play_count_lsa.shape
(10000, 50)
def cosine_similarity_dense(v1, v2):
    """Return the cosine similarity of two dense vectors.

    Args:
        v1: A NumPy array, normally one-dimensional.
        v2: A NumPy array of the same length.

    Returns:
        ``v1 · v2 / (|v1| |v2|)``. For one-dimensional inputs the result is a
        scalar, and ``0.0`` is returned when either vector has zero norm.
    """
    dot = v1.dot(v2.transpose())
    norm1 = np.sqrt(v1.dot(v1.transpose()))
    norm2 = np.sqrt(v2.dot(v2.transpose()))
    # An all-zero vector would give 0 / 0 = NaN, and NaN ends up first in a
    # descending `argsort()[::-1]` ranking; treat it as "no similarity".
    # The guard applies only when the products are scalars, so inputs of other
    # shapes are evaluated exactly as before.
    if np.ndim(dot) == 0 and (norm1 == 0 or norm2 == 0):
        return 0.0
    return dot / norm1 / norm2
역시 이런 벡터의 경우에도 Nearest Neighbor를 쓰는게 일반적이지만, 50차원에 1만 곡 밖에 안 되니까 그냥.
def query_lsa(song_name, return_k=10):
    """Rank songs by cosine similarity of their SVD-reduced vectors.

    The similarity is computed against every row of ``play_count_lsa`` (brute
    force, no nearest-neighbour index).

    Args:
        song_name: Title compared for equality against ``metadata.title``.
        return_k: Maximum number of neighbouring songs to return.

    Returns:
        A list whose first element is the code of the query song, followed by
        up to ``return_k`` codes with the highest cosine similarity, in
        descending order.

    Raises:
        IndexError: If no metadata row has the given title.
    """
    title_rows = metadata[metadata.title == song_name]
    # Category code of the first matching row's `song` value, used as the row
    # index into the reduced matrix.
    # NOTE(review): assumes the category codes follow the matrix row order — confirm.
    query_code = title_rows.song.cat.codes.values[0]
    query_vec = play_count_lsa[query_code]

    def similarity_to_query(row):
        return cosine_similarity_dense(query_vec, row)

    scores = np.apply_along_axis(similarity_to_query, 1, play_count_lsa)
    # Take one extra candidate so that dropping the query song still leaves return_k.
    shortlist = scores.argsort()[::-1][:return_k + 1]
    neighbours = shortlist[shortlist != query_code][:return_k]
    return [query_code] + list(neighbours)
# Show the metadata rows for the query song and its top matches by cosine similarity in the SVD space.
metadata.loc[query_lsa('You Belong With Me')]
track | song | artist | title | style | genre | |
---|---|---|---|---|---|---|
code | ||||||
7330 | TRJPXVB128F9316916 | SOSROFB12AAF3B4C5D | Taylor Swift | You Belong With Me | Country_Traditional | Country |
977 | TRGMZNT128F92DE267 | SOCLMAD12AB017FC09 | Taylor Swift | Tim McGraw | Country_Traditional | Country |
7761 | TRSDRPY128F933E202 | SOTWSXL12A8C143349 | Taylor Swift | Love Story | NaN | NaN |
7761 | TROPUMP128F92EC162 | SOTWSXL12A8C143349 | Taylor Swift | Love Story | Country_Traditional | Country |
747 | TRIUFKY12903CE6EA6 | SOBWSGV12AB018B5E0 | Selena Gomez & The Scene | Naturally | NaN | NaN |
2195 | TRBNYBX128F422EC61 | SOFRCGW12A81C21EA6 | Plain White T's | Hey There Delilah | NaN | NaN |
4968 | TRHCDSY128F931692E | SOMPTCI12AB017C416 | Taylor Swift | Forever & Always | Country_Traditional | Country |
3418 | TRBTNPR12903D13765 | SOISNSU12AC468C0D8 | Adam Lambert | If I Had You | NaN | NaN |
5241 | TRFSCMH128F425CB85 | SONHLJN12A81C2169B | Mickie Krause | Orange Trägt Nur Die Müllabfuhr (Go West) | Pop_Contemporary | Pop_Rock |
6421 | TRKQRVO128F92D630B | SOQGJZA12A8C1367AE | Katy Perry | Thinking Of You | NaN | NaN |
1227 | TRFGWOW12903CF14C3 | SODCADR12AF72A1A99 | Adam Lambert | Whataya Want From Me | NaN | Pop_Rock |
8105 | TRVEKLH128F92DE271 | SOUVGJL12AB017FC35 | Taylor Swift | Our Song | Country_Traditional | Country |
# Same SVD-space cosine query for a second song.
metadata.loc[query_lsa('Fix You')]
track | song | artist | title | style | genre | |
---|---|---|---|---|---|---|
code | ||||||
8597 | TRYVBMA128E0789D39 | SOWEJXA12A6701C574 | Coldplay | Fix You | NaN | NaN |
4114 | TRQFXKD128E0780CAE | SOKLRPJ12A8C13C3FE | Coldplay | The Scientist | Pop_Contemporary | Pop_Rock |
1125 | TRENTGL128E0780C8E | SOCVTLJ12A6310F0FD | Coldplay | Clocks | Rock_College | Pop_Rock |
6284 | TRIKGRK128E0780DB0 | SOPXKYD12A6D4FA876 | Coldplay | Yellow | Pop_Contemporary | Pop_Rock |
6284 | TRTZNQZ12903CD044C | SOPXKYD12A6D4FA876 | Coldplay | Yellow | NaN | NaN |
7944 | TRYNYSX128E07897B3 | SOUKJBT12A6701C4D6 | Coldplay | Speed Of Sound | Pop_Contemporary | Pop_Rock |
4486 | TRNGAAK128F147DF92 | SOLJWIQ12A6D4FA875 | Coldplay | Sparks | Pop_Contemporary | Pop_Rock |
8567 | TRIEXMF128F92FDD60 | SOWCKVR12A8C142411 | Kings Of Leon | Use Somebody | NaN | Pop_Rock |
8567 | TRTEHXL128F931687B | SOWCKVR12A8C142411 | Kings Of Leon | Use Somebody | NaN | NaN |
5509 | TROAQBZ128F9326213 | SONYKOW12AB01849C9 | OneRepublic | Secrets | Rock_Contemporary | Pop_Rock |
9176 | TRAALAH128E078234A | SOXQYSC12A6310E908 | The Verve | Bitter Sweet Symphony | Pop_Contemporary | Pop_Rock |
4695 | TRKBJPL12903CED195 | SOLWZVR12AB01849C6 | OneRepublic | All The Right Moves | Rock_Contemporary | Pop_Rock |
6176 | TROTIUH128E0782538 | SOPQLBY12A6310E992 | Radiohead | Creep (Explicit) | Pop_Contemporary | Pop_Rock |
# Same SVD-space cosine query for a third song.
metadata.loc[query_lsa('Welcome To The Black Parade (Album Version)')]
track | song | artist | title | style | genre | |
---|---|---|---|---|---|---|
code | ||||||
6562 | TRKJHNE128F42380FC | SOQQAAQ12A67ADE34D | My Chemical Romance | Welcome To The Black Parade (Album Version) | Grunge_Emo | Pop_Rock |
2583 | TRHWCVS128F14895A3 | SOGOZLT12A6D4FB302 | My Chemical Romance | Teenagers (Album Version) | Grunge_Emo | Pop_Rock |
3796 | TRZIQMY128F146E70D | SOJRFWQ12AB0183582 | Fall Out Boy | Dance_ Dance | NaN | NaN |
3796 | TRJQKHL128F9304295 | SOJRFWQ12AB0183582 | Fall Out Boy | Dance_ Dance | NaN | NaN |
3796 | TRYMVOW128F92ECE27 | SOJRFWQ12AB0183582 | Fall Out Boy | Dance_ Dance | NaN | NaN |
6078 | TRRJZWL128F146D790 | SOPKPSQ12A58A7A5E4 | My Chemical Romance | I'm Not Okay (I Promise) (Live From Sessions@AOL) | Grunge_Emo | Pop_Rock |
9595 | TRUNAXR128F93119C6 | SOYWRLV12AB0186090 | Panic At The Disco | I Write Sins Not Tragedies [Live In Chicago] | NaN | NaN |
8774 | TRWLOCY128F9338118 | SOWPLVJ12AB0183586 | Fall Out Boy | Sugar_ We're Goin Down | NaN | NaN |
8774 | TRSSSJW128F146D5DC | SOWPLVJ12AB0183586 | Fall Out Boy | Sugar_ We're Goin Down | NaN | NaN |
9415 | TRSJBHU128F1496F3E | SOYIOZB12A58A797FC | Fall Out Boy | This Ain't A Scene_ It's An Arms Race | NaN | NaN |
8396 | TRIAOSK128E0785308 | SOVPSKL12A670206B9 | Jaci Velasquez | Something (Album Version) | Pop_Latin | Religious |
4714 | TRYZMOC128F423E58D | SOLXXZI12A8AE4733A | My Chemical Romance | Helena (So Long & Goodnight) (Album Version) | Grunge_Emo | Pop_Rock |
4480 | TRXVBBR128F92E26C8 | SOLJPFF12A8C133221 | Panic At The Disco | Nine In The Afternoon (Album Version) | Grunge_Emo | Pop_Rock |
9935 | TRQHUZE128F42385E8 | SOZVMYF12A8C132646 | Fall Out Boy | Beat It | NaN | NaN |
자카드 지수나 TF-IDF와 비슷한 것 같지만, <You Belong With Me>의 경우에는 케이티 페리나 셀레나 고메즈가 나온 걸 보아 많이 좋아진 것을 확인할 수 있습니다!! 다만, 여전히 나머지 두 곡의 결과는 골때리는 문제를 보여주고 있습니다. 다음 포스팅과 노트북에서는 이런 부분을 해결할 수 있는 다른 방법에 대해 알아봅시다.