#!/usr/bin/env python
# coding: utf-8

# # Understanding Movies Using LSA

# In[1]:

import numpy as np
import pandas as pd
import math
import random
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import colors
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

get_ipython().run_line_magic('matplotlib', 'inline')

# ## Loading the Data
#
# I've left the data in single files for each year, so that an enthusiastic person of the future could add extra years' data easily.

# In[2]:

dfs = []
for year in range(1940, 2018):
    dfs.append(pd.read_csv('scraped_movies/top_movies_of_%d.csv' % year, encoding='cp1252'))
movie_data = pd.concat(dfs)

# In[3]:

dfs = []
for year in range(1940, 2018):
    dfs.append(pd.read_csv('scraped_movies/keywords_for_top_movies_of_%d.csv' % year, encoding='cp1252'))
keywords = pd.concat(dfs)

# In[4]:

movie_data.index = range(len(movie_data))
keywords.index = range(len(keywords))

# In[5]:

movie_data.head()

# In[6]:

keywords.head()

# We can see there are some movies missing keywords, and probably a bunch of missing data elsewhere too. I'm basically ignoring that at the moment.

# ## Lookup objects
#
# For convenience, I'm defining here a bunch of lookups I'll later use to filter the datasets - they're mostly boolean series, or dicts.

# In[7]:

marvel_lookup = keywords.keywords.fillna('').str.contains('marvel-cinematic-universe')

# In[8]:

title_lookup = pd.Series(movie_data.title)
title_lookup.index = movie_data.IMDbId
title_lookup = title_lookup.to_dict()

# In[9]:

furiouses = ['The Fast and the Furious (2001)',
             '2 Fast 2 Furious (2003)',
             'The Fast and the Furious: Tokyo Drift (2006)',
             'Fast & Furious (2009)',
             'Fast Five (2011)',
             'Furious 6 (2013)',
             'Furious Seven (2015)',
             'The Fate of the Furious (2017)']

furious_lookup = keywords.IMDbId.map(title_lookup).isin(furiouses)

# In[10]:

aliens = ['Alien (1979)',
          'Aliens (1986)',
          'Alien³ (1992)',
          'Alien Resurrection (1997)',
          'AVP: Alien vs. Predator (2004)',
          'Prometheus (I) (2012)',
          'Alien: Covenant (2017)']

aliens_lookup = keywords.IMDbId.map(title_lookup).isin(aliens)

# In[11]:

princess_lookup = keywords.keywords.fillna('').str.contains('disney-princess')

# In[12]:

jaws_lookup = keywords.IMDbId.map(title_lookup).apply(lambda x: 'jaws' in x.lower())

# In[13]:

decade_lookup = pd.DataFrame(movie_data.release_year.apply(lambda x: math.floor(x / 10) * 10))
decade_lookup.index = movie_data.IMDbId
decade_lookup = decade_lookup.to_dict()['release_year']

# In[14]:

chocula = CountVectorizer(tokenizer=lambda x: x.split(', '))
genres = chocula.fit_transform(movie_data.genre_list.fillna('xxx')).toarray()
genre_lookup = pd.DataFrame(genres, columns=chocula.get_feature_names())
genre_lookup.index = movie_data.IMDbId

# In[15]:

rank_lookup = pd.Series(movie_data.box_office_rank)
rank_lookup.index = movie_data.IMDbId
rank_lookup = rank_lookup.to_dict()
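# As a quick illustration (not part of the original analysis): the boolean series slice the keywords table directly, and the dicts translate IMDbIds into something human-readable - for example, listing the Jaws movies by title.

# In[ ]:

keywords[jaws_lookup].IMDbId.map(title_lookup)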
# ## Process Example
#
# I'm running through the whole transformation with a sample of the data, to demonstrate what it looks like at each stage.
#
# We're transforming our list of movies and keywords into a list of movies and a set of columns storing an abstract numeric representation of each movie, derived from its similarity to other movies, in terms of their shared keywords.
#
# The steps are:
#
# - "Bag of words" encoding - making a table with a column for each keyword
# - "TFIDF" weighting - weighting each movie's keyword entries to account for movies with more or fewer total keywords
# - Taking the "Dot Product" - creating a matrix comparing each movie to every other movie, in terms of keyword similarity
# - Applying "Dimensionality Reduction" - reducing the number of columns in the matrix from one per movie to a smaller number of abstract values

# In[16]:

sample = keywords[(keywords.IMDbId.map(rank_lookup) < 10) &              # Make sure they're movies someone has heard of
                  (keywords.IMDbId.map(title_lookup).apply(len) < 25)    # Make sure the titles aren't too long to display nicely
                  ].sample(10)

pd.DataFrame([sample.IMDbId.map(title_lookup),
              sample.keywords.apply(lambda x: ', '.join(str(x).split('|')))]).transpose()

# ### Bag of Words

# Creating a "CountVectorizer" object and a "TfidfTransformer", which will do the bag-of-words encoding and the TFIDF weighting, respectively.

# In[17]:

vlad = CountVectorizer(tokenizer=lambda x: x.split('|'), min_df=0)
megatron = TfidfTransformer()

sparse = vlad.fit_transform(pd.Series(keywords.keywords.fillna('').values))
sample_sparse = vlad.fit_transform(pd.Series(sample.keywords.fillna('').values))

# Choose some random columns to show

# In[18]:

ns = random.sample(range(len(vlad.get_feature_names())), 5)

# The sample dataset has been "bag of words" encoded - we've created a column for each keyword, and each movie gets a flag for whether it has that keyword or not.

# In[19]:

pd.DataFrame(sample_sparse.toarray()[:, ns],
             index=sample.IMDbId.map(title_lookup),
             columns=[vlad.get_feature_names()[i] for i in ns])

# ### TFIDF Weighting
#
# Now we apply the TFIDF weighting.

# In[20]:

weighted = megatron.fit_transform(sample_sparse)
pd.DataFrame(weighted.toarray()[:, ns],
             index=sample.IMDbId.map(title_lookup),
             columns=[vlad.get_feature_names()[i] for i in ns]).apply(round, args=(2,))

# ### Dot Product
#
# Now we take the "dot product" of the weighted values - every movie's correlation with every other movie, in terms of keywords in common.

# In[21]:

# Cheeky wee method for shading the dataframe nicely. Pretty sure I nicked this offa StackOverflow.
def background_gradient(s, m, M, cmap='PuBu', low=0, high=0):
    rng = M - m
    norm = colors.Normalize(m - (rng * low), M + (rng * high))
    normed = norm(s.values)
    c = [colors.rgb2hex(x) for x in plt.cm.get_cmap(cmap)(normed)]
    return ['background-color: %s' % color for color in c]

# In[22]:

weighted = megatron.fit_transform(sparse)
the_matrix = weighted.dot(weighted.T)
the_matrix = the_matrix[:, sample.index][sample.index, :]

dot = pd.DataFrame(the_matrix.toarray(),
                   index=sample.IMDbId.map(title_lookup),
                   columns=sample.IMDbId.map(title_lookup)).apply(round, args=(2,))

dot.style.apply(background_gradient,
                cmap=sns.light_palette("grey", as_cmap=True),
                m=dot.min().min(),
                M=0.05,
                low=0)

# ### Dimensionality Reduction
#
# Finally, we apply dimensionality reduction. I'm using "Non-negative matrix factorization", because the similarity calculations I'm doing later - cosine similarity - get weird if there are negative values. I'm not convinced this is the right approach though.
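# (A tiny, made-up illustration of what "gets weird" means - the numbers below are arbitrary, not from the movie data. With mixed-sign vectors, cosine similarity can go all the way to -1, which doesn't map onto any intuitive notion of keyword overlap; NMF keeps every component non-negative, so the similarities stay between 0 and 1.)

# In[ ]:

cosine_similarity(np.array([[0.6, -0.8]]), np.array([[-0.6, 0.8]]))  # -1.0: maximally "anti-similar"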
# In[23]:

shrinky = NMF(n_components=2)
shrunk_sample = shrinky.fit_transform(the_matrix.toarray())

# In[24]:

reduced = pd.DataFrame(shrunk_sample, index=sample.IMDbId.map(title_lookup)).apply(round, args=(2,))
reduced.style.apply(background_gradient,
                    cmap=sns.light_palette("grey", as_cmap=True),
                    m=reduced.min().min(),
                    M=reduced.max().max(),
                    low=0)

# ## Apply to the Whole Dataset

# In[94]:

# Throwing the whole process into a little method
def make_matrix(df, countvectoriser, tfidf):
    sparse = countvectoriser.fit_transform(pd.Series(df.keywords.fillna('').values))
    weighted = tfidf.fit_transform(sparse)
    matrix = weighted.dot(weighted.T)
    movies = pd.Series(countvectoriser.get_feature_names())
    return matrix, movies, weighted

# In[95]:

vlad_ = CountVectorizer(tokenizer=lambda x: x.split('|'), min_df=10)
megatron_ = TfidfTransformer()
matrix, words, weighted = make_matrix(keywords, vlad_, megatron_)

# In[96]:

target = matrix[10].toarray()

# In[97]:

vector = megatron_.transform(vlad_.transform([keywords.loc[10].keywords]))

# In[98]:

vector

# In[107]:

new = weighted.dot(vector.T).toarray().T

# I'm keeping the dimensionality reduction separate, 'cause it's SLOW.

# In[26]:

shrinky = NMF(n_components=100)
shrunk_100 = shrinky.fit_transform(matrix.toarray())

# Visualising the sample from above, but with the full set of columns.

# In[27]:

reduced = pd.DataFrame(shrunk_100[sample.index, :15], index=sample.IMDbId.map(title_lookup)).apply(round, args=(2,))
reduced['...'] = pd.Series([''] * 10, index=sample.IMDbId.map(title_lookup))
reduced.style.apply(background_gradient,
                    subset=pd.IndexSlice[:, range(0, 15)],
                    cmap=sns.light_palette("grey", as_cmap=True),
                    m=reduced[[i for i in range(0, 10)]].min().min(),
                    M=reduced[[i for i in range(0, 10)]].max().max(),
                    )

# ## Doing some Movie Maths!

# In[80]:

movie_one = list(keywords.IMDbId.map(title_lookup)).index("Universal Soldier (1992)")

# In[81]:

movie_two = list(keywords.IMDbId.map(title_lookup)).index("Piranha (1978)")

# In[82]:

avg_movie = shrunk_100.mean(axis=0).reshape(1, -1)

# In[83]:

targets = [movie_one, movie_two]  # , 4225]

# In[84]:

target = shrunk_100[movie_one].reshape(1, -1) + shrunk_100[movie_two].reshape(1, -1)

# In[85]:

best_list = [i for i in np.argsort(cosine_similarity(target, shrunk_100))[0][::-1] if i not in targets][:10]
[keywords.IMDbId.map(title_lookup)[i] for i in best_list]

# ## Plotting movies

# In[276]:

plotting_matrix = matrix
similar_matrix = plotting_matrix[aliens_lookup[aliens_lookup == True].index]

axis_1_title = 'Bambi (1942)'
axis_2_title = "Showgirls (1995)"

axis_1_movie = list(keywords.IMDbId.map(title_lookup)).index(axis_1_title)
axis_2_movie = list(keywords.IMDbId.map(title_lookup)).index(axis_2_title)

# In[277]:

titles = keywords[aliens_lookup].IMDbId.map(title_lookup)

# In[119]:

titles

# In[279]:

axis_1 = cosine_similarity(plotting_matrix[axis_1_movie], similar_matrix)[0]
axis_2 = cosine_similarity(plotting_matrix[axis_2_movie], similar_matrix)[0]
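# (Not part of the original flow - just a convenient way to eyeball the two similarity scores as a table before fiddling with the plot below.)

# In[ ]:

pd.DataFrame({axis_1_title: axis_1, axis_2_title: axis_2}, index=titles.values).round(3)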
# In[280]:

# An extremely janky method for stopping the titles from overlapping each other.
def avoid_overlap(axis_1, axis_2, x_tolerance=0.05, y_tolerance=0.02, increment=0.01):
    # Nudge each label upwards until it no longer sits inside the tolerance box
    # of any label that's already been placed.
    fixed = []
    for x, y in zip(axis_1, axis_2):
        Xs = pd.Series([i[0] for i in fixed])
        Ys = pd.Series([i[1] for i in fixed])
        while ((Xs < x + x_tolerance) & (Xs > x - x_tolerance) &
               (Ys < y + y_tolerance) & (Ys > y - y_tolerance)).any():
            y += increment
        fixed.append((x, y))
    return fixed

# In[287]:

pd.DataFrame(list(zip(axis_1, axis_2))).plot(kind='scatter', x=0, y=1, c='w')

# Cause I'm lazy, you gotta fiddle with the values to get the titles to show up in the right spots.
for label, (x, y) in zip(list(titles.values),
                         avoid_overlap(axis_1, axis_2, y_tolerance=0.0006, increment=0.00002, x_tolerance=1)):
    label = label[:label.find('(') - 1]
    plt.annotate(label,
                 fontsize=10,
                 fontname='Garamond',
                 xy=(x - (len(label) * 0.0007), y + 0.0025))

plt.xlabel(axis_1_title, fontname='Garamond', fontsize=14)
plt.ylabel(axis_2_title, fontname='Garamond', fontsize=14)
# plt.xlim(0.55, 0.78)
plt.show()

# In[240]:

two_shrunk_two_furious = NMF(n_components=4)
shrunk_2 = two_shrunk_two_furious.fit_transform(matrix[aliens_lookup[aliens_lookup == True].index].toarray())

alien_df = pd.DataFrame(shrunk_2)
alien_df.plot(kind='scatter', x=0, y=1, c='w')

for label, x, y in zip(list(titles.values), alien_df[0], alien_df[1]):
    label = label[:label.find('(') - 1]
    plt.annotate(label,
                 fontsize=10,
                 fontname='Garamond',
                 xy=(x - (len(label) * 0.007), y + 0.06))

plt.show()

# In[241]:

alien_df.index = titles
alien_df.index.name = 'Title'
alien_df.style.apply(background_gradient,
                     subset=pd.IndexSlice[:, range(0, 5)],
                     cmap=sns.light_palette("grey", as_cmap=True),
                     m=0.05,
                     M=2,
                     )

# In[214]:

[(label, x, y) for label, x, y in zip(list(titles.values), shrunk_2[:, 0], shrunk_2[:, 1])]

# In[ ]:
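# The "Movie Maths" cells above could be wrapped into a reusable helper. A rough sketch - the function name and signature are mine, not from the original notebook; it just repackages the add-two-vectors-and-rank-by-cosine-similarity steps, assuming shrunk_100 and title_lookup from earlier.

def most_similar_to_sum(title_a, title_b, n=10):
    # Look up the two movies' row positions, add their NMF vectors, and return
    # the n titles whose vectors are most cosine-similar to that sum.
    all_titles = list(keywords.IMDbId.map(title_lookup))
    idx = [all_titles.index(title_a), all_titles.index(title_b)]
    combined = shrunk_100[idx[0]].reshape(1, -1) + shrunk_100[idx[1]].reshape(1, -1)
    ranked = np.argsort(cosine_similarity(combined, shrunk_100))[0][::-1]
    return [all_titles[i] for i in ranked if i not in idx][:n]

most_similar_to_sum("Universal Soldier (1992)", "Piranha (1978)")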