Importing Libraries and Loading Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [2]:
# Load the movie catalogue, keeping only the two columns this notebook uses.
movies = pd.read_csv(
    'movies.csv',
    usecols=['title', 'genres'],
    encoding='latin-1',
    sep=',',
)
In [3]:
# Preview the first five rows to sanity-check the load.
movies.head(n=5)
Out[3]:
title genres
0 Toy Story Adventure|Animation|Children|Comedy|Fantasy
1 Jumanji Adventure|Children|Fantasy
2 Grumpier Old Men Comedy|Romance
3 Waiting to Exhale Comedy|Drama|Romance
4 Father of the Bride Part II Comedy
In [4]:
# Turn the pipe-delimited genre string into space-separated text for the
# TF-IDF vectorizer used further down. Missing genres become the empty string.
#
# The replacement is a literal (non-regex) one, and text that contains no '|'
# is left untouched, so this cell can be re-run safely. The previous
# split-then-stringify approach stored the Python list repr
# ("['Comedy', 'Romance']") and wrapped it again on every re-run.
movies['genres'] = (
    movies['genres']
    .fillna("")
    .astype('str')
    .str.replace('|', ' ', regex=False)
)

Recommendation based on Genre

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the genre text with unigrams and bigrams.
# min_df=1 keeps every term that occurs in at least one row, which is what the
# earlier min_df=0 amounted to; recent scikit-learn releases reject an integer
# min_df below 1 during parameter validation.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])
# (number of movies, number of distinct uni/bigram terms)
tfidf_matrix.shape
Out[5]:
(9742, 177)
In [6]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of rows of the TF-IDF matrix.
# With a single argument the matrix is compared against itself.
# The result is a dense square array with one row and one column per movie.
cosine_sim = cosine_similarity(tfidf_matrix)
# Peek at the top-left 4x4 corner; the diagonal is each movie against itself.
cosine_sim[:4, :4]
Out[6]:
array([[1.        , 0.31379419, 0.0611029 , 0.05271111],
       [0.31379419, 1.        , 0.        , 0.        ],
       [0.0611029 , 0.        , 1.        , 0.35172407],
       [0.05271111, 0.        , 0.35172407, 1.        ]])
In [7]:
# Title column, used to map row positions back to movie names.
titles = movies['title']
# Reverse lookup: movie title -> row label in `movies`.
indices = pd.Series(movies.index, index=titles)

# Function that gets movie recommendations based on the cosine similarity score of movie genres
def genre_recommendations(title, top_n=20, sim_matrix=None):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in the notebook-level ``indices`` Series. Matching is
        literal, so case and surrounding whitespace must match the stored
        title exactly.
    top_n : int, default 20
        Maximum number of recommendations to return.
    sim_matrix : 2-D array-like, optional
        Square similarity matrix whose rows and columns follow the row order
        of ``titles``. When omitted, the notebook-level ``cosine_sim`` is read
        at call time. A later cell of this notebook reassigns ``cosine_sim``
        to the title-based matrix, so pass the genre matrix explicitly when
        calling this function after that cell has run.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles ordered from most to least similar. Ties keep
        their original row order. The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    if sim_matrix is None:
        sim_matrix = cosine_sim
    if title not in indices.index:
        raise KeyError(f"Movie title not found: {title!r}")

    idx = indices.loc[title]
    # A title that occurs more than once yields a Series of row labels;
    # fall back to the first occurrence instead of failing during the sort.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    # NOTE: the row label is used as a position below, which assumes `movies`
    # still has its default 0..n-1 index.
    idx = int(idx)

    scores = np.asarray(sim_matrix[idx], dtype=float).ravel()
    # Stable descending sort: equal scores stay in row order.
    order = np.argsort(-scores, kind='stable')
    # Drop the queried movie explicitly. Skipping "the first result" is wrong
    # whenever another movie ties with it at the maximum similarity.
    order = order[order != idx][:top_n]
    return titles.iloc[order]
In [8]:
# The lookup is an exact match; the query string here ends with a space.
genre_recommendations(title='Dark Knight ').head(n=20)
Out[8]:
8387                          Need for Speed 
8149      Grandmaster, The (Yi dai zong shi) 
123                                Apollo 13 
8026                              Life of Pi 
8396                                    Noah 
38                           Dead Presidents 
341                              Bad Company 
347             Faster Pussycat! Kill! Kill! 
430                        Menace II Society 
568                          Substitute, The 
665                          Nothing to Lose 
1645                       Untouchables, The 
1696                           Monument Ave. 
2563                              Death Wish 
2574                        Band of the Hand 
3037                              Foxy Brown 
3124    Harley Davidson and the Marlboro Man 
3167                                Scarface 
3217                               Swordfish 
3301                           Above the Law 
Name: title, dtype: object

Recommendation based on Title

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the movie titles with unigrams and bigrams.
# min_df=1 keeps every term that occurs in at least one title, which is what
# the earlier min_df=0 amounted to; recent scikit-learn releases reject an
# integer min_df below 1 during parameter validation.
# NOTE: this rebinds `tf` and `tfidf_matrix`, replacing the genre-based ones.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['title'])
# (number of movies, number of distinct uni/bigram terms)
tfidf_matrix.shape
Out[9]:
(9742, 20558)
In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of title vectors.
# With a single argument the matrix is compared against itself.
# NOTE: this rebinds `cosine_sim`, replacing the genre-based matrix.
cosine_sim = cosine_similarity(tfidf_matrix)
# Peek at the top-left 4x4 corner; the diagonal is each movie against itself.
cosine_sim[:4, :4]
Out[10]:
array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])
In [11]:
# Title column, used to map row positions back to movie names.
titles = movies['title']
# Reverse lookup: movie title -> row label in `movies`.
indices = pd.Series(movies.index, index=titles)

# Function that gets movie recommendations based on the cosine similarity score of movie titles
def title_recommendations(title, top_n=20, sim_matrix=None):
    """Return the titles of the movies whose titles are most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in the notebook-level ``indices`` Series. Matching is
        literal, so case and surrounding whitespace must match the stored
        title exactly.
    top_n : int, default 20
        Maximum number of recommendations to return.
    sim_matrix : 2-D array-like, optional
        Square similarity matrix whose rows and columns follow the row order
        of ``titles``. When omitted, the notebook-level ``cosine_sim`` is read
        at call time, so the result depends on which matrix that name is
        currently bound to.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles ordered from most to least similar. Ties keep
        their original row order. The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    if sim_matrix is None:
        sim_matrix = cosine_sim
    if title not in indices.index:
        raise KeyError(f"Movie title not found: {title!r}")

    idx = indices.loc[title]
    # A title that occurs more than once yields a Series of row labels;
    # fall back to the first occurrence instead of failing during the sort.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    # NOTE: the row label is used as a position below, which assumes `movies`
    # still has its default 0..n-1 index.
    idx = int(idx)

    scores = np.asarray(sim_matrix[idx], dtype=float).ravel()
    # Stable descending sort: equal scores stay in row order.
    order = np.argsort(-scores, kind='stable')
    # Drop the queried movie explicitly. Skipping "the first result" is wrong
    # whenever another movie ties with it at the maximum similarity.
    order = order[order != idx][:top_n]
    return titles.iloc[order]
In [12]:
# The lookup is an exact match; the query string here ends with a space.
title_recommendations(title='Dark Knight ').head(n=20)
Out[12]:
7768                     Dark Knight Rises, The 
8032    Batman: The Dark Knight Returns, Part 1 
8080    Batman: The Dark Knight Returns, Part 2 
140                                First Knight 
2417                         Cry in the Dark, A 
5778                          Alone in the Dark 
7375                             Knight and Day 
3576                               Black Knight 
3190                           Knight's Tale, A 
6858                       Alone in the Dark II 
4242                                  Dark Blue 
5060                                  Dark Days 
1305                                  Dark City 
5483                                  Dark Star 
6815                      Batman: Gotham Knight 
5934                                 Dark Water 
4749                        Shot in the Dark, A 
7877                               Dark Shadows 
8766                            The Dark Valley 
6690                      Taxi to the Dark Side 
Name: title, dtype: object