import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the movie catalogue, keeping only the two columns used below.
# NOTE(review): latin-1 decoding never raises, but it will garble non-ASCII
# titles if movies.csv is actually UTF-8 encoded -- confirm the file encoding.
movies = pd.read_csv(
    'movies.csv',
    usecols=['title', 'genres'],
    encoding='latin-1',
    sep=',',
)
movies.head()
| | title | genres |
|---|---|---|
| 0 | Toy Story | Adventure\|Animation\|Children\|Comedy\|Fantasy |
| 1 | Jumanji | Adventure\|Children\|Fantasy |
| 2 | Grumpier Old Men | Comedy\|Romance |
| 3 | Waiting to Exhale | Comedy\|Drama\|Romance |
| 4 | Father of the Bride Part II | Comedy |
# Turn each pipe-delimited genre string into text the vectorizer can tokenize:
# split on '|' into a list, replace missing values with an empty string, then
# store the string form of each list back in the column.
movies['genres'] = movies['genres'].str.split('|').fillna("").astype('str')
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF features over the genre text using unigrams and bigrams.
# An integer min_df is a document count and must be at least 1: recent
# scikit-learn releases reject the integer 0 with InvalidParameterError.
# A count of 1 keeps every term, which is what 0 did on older releases.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])
tfidf_matrix.shape
(9742, 177)
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity of every genre TF-IDF row against every other row.
# With a single argument the matrix is compared against itself.
cosine_sim = cosine_similarity(tfidf_matrix)
# Peek at the top-left corner of the similarity matrix.
cosine_sim[:4, :4]
array([[1. , 0.31379419, 0.0611029 , 0.05271111], [0.31379419, 1. , 0. , 0. ], [0.0611029 , 0. , 1. , 0.35172407], [0.05271111, 0. , 0.35172407, 1. ]])
# Title column; used to turn row positions back into movie titles.
titles = movies['title']
# Reverse lookup: movie title -> row label of that movie in `movies`.
indices = pd.Series(data=movies.index, index=titles)
# Recommend movies based on the cosine similarity of their genre features.
def genre_recommendations(title, top_n=20):
    """Return the titles of the movies most similar to ``title``.

    The function reads three module-level objects when it is called:
    ``indices`` (title -> row number), ``cosine_sim`` (pairwise similarity
    matrix) and ``titles`` (row number -> title). Note that ``cosine_sim`` is
    rebound further down this file to a title-based matrix, so calls made
    after that point rank by whatever matrix is current.

    Args:
        title: Movie title exactly as it appears in ``indices``.
        top_n: Maximum number of recommendations to return (default 20).

    Returns:
        A slice of ``titles`` holding up to ``top_n`` entries, ordered from
        most to least similar; equal scores keep ascending row order.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    idx = indices[title]
    # A title that occurs more than once yields a Series of row numbers;
    # use the first occurrence so exactly one similarity row is selected.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable sort on the negated scores: highest score first, ties kept in
    # ascending row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the queried movie by position instead of assuming it sorts
    # first: other movies can tie with it at the top score.
    ranked = ranked[ranked != idx][:top_n]
    return titles.iloc[ranked]
# Display the genre-based recommendations for "Dark Knight ".
# The lookup key ends with a space; presumably titles are stored that way in
# movies.csv -- confirm against the data.
genre_recommendations('Dark Knight ').head(20)
8387 Need for Speed 8149 Grandmaster, The (Yi dai zong shi) 123 Apollo 13 8026 Life of Pi 8396 Noah 38 Dead Presidents 341 Bad Company 347 Faster Pussycat! Kill! Kill! 430 Menace II Society 568 Substitute, The 665 Nothing to Lose 1645 Untouchables, The 1696 Monument Ave. 2563 Death Wish 2574 Band of the Hand 3037 Foxy Brown 3124 Harley Davidson and the Marlboro Man 3167 Scarface 3217 Swordfish 3301 Above the Law Name: title, dtype: object
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF features over the movie titles using unigrams and bigrams.
# This rebinds `tf` and `tfidf_matrix`, replacing the genre-based versions.
# An integer min_df is a document count and must be at least 1: recent
# scikit-learn releases reject the integer 0 with InvalidParameterError.
# A count of 1 keeps every term, which is what 0 did on older releases.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['title'])
tfidf_matrix.shape
(9742, 20558)
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity of every title TF-IDF row against every other row.
# This rebinds `cosine_sim`, replacing the genre-based matrix computed earlier.
cosine_sim = cosine_similarity(tfidf_matrix)
# Peek at the top-left corner of the similarity matrix.
cosine_sim[:4, :4]
array([[1., 0., 0., 0.], [0., 1., 0., 0.], [0., 0., 1., 0.], [0., 0., 0., 1.]])
# Title column; used to turn row positions back into movie titles.
titles = movies['title']
# Reverse lookup: movie title -> row label of that movie in `movies`.
indices = pd.Series(data=movies.index, index=titles)
# Recommend movies based on the cosine similarity of their title features.
def title_recommendations(title, top_n=20):
    """Return the titles of the movies most similar to ``title``.

    The function reads three module-level objects when it is called:
    ``indices`` (title -> row number), ``cosine_sim`` (pairwise similarity
    matrix, title-based at this point in the file) and ``titles``
    (row number -> title).

    Args:
        title: Movie title exactly as it appears in ``indices``.
        top_n: Maximum number of recommendations to return (default 20).

    Returns:
        A slice of ``titles`` holding up to ``top_n`` entries, ordered from
        most to least similar; equal scores keep ascending row order.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    idx = indices[title]
    # A title that occurs more than once yields a Series of row numbers;
    # use the first occurrence so exactly one similarity row is selected.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable sort on the negated scores: highest score first, ties kept in
    # ascending row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the queried movie by position instead of assuming it sorts
    # first: a title whose TF-IDF vector is all zeros scores 0 against
    # itself, and other titles can tie with the query.
    ranked = ranked[ranked != idx][:top_n]
    return titles.iloc[ranked]
# Display the title-based recommendations for "Dark Knight ".
# The lookup key ends with a space; presumably titles are stored that way in
# movies.csv -- confirm against the data.
title_recommendations('Dark Knight ').head(20)
7768 Dark Knight Rises, The 8032 Batman: The Dark Knight Returns, Part 1 8080 Batman: The Dark Knight Returns, Part 2 140 First Knight 2417 Cry in the Dark, A 5778 Alone in the Dark 7375 Knight and Day 3576 Black Knight 3190 Knight's Tale, A 6858 Alone in the Dark II 4242 Dark Blue 5060 Dark Days 1305 Dark City 5483 Dark Star 6815 Batman: Gotham Knight 5934 Dark Water 4749 Shot in the Dark, A 7877 Dark Shadows 8766 The Dark Valley 6690 Taxi to the Dark Side Name: title, dtype: object