All imports and shared helper functions.

In [20]:
%matplotlib inline

from __future__ import division

import logging, sys, random
from time import time

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.cross_validation import cross_val_score
from sklearn import metrics
from sklearn.metrics import pairwise_distances

import numpy as np
from scipy.stats import mode
import matplotlib.pyplot as plt, mpld3
from mpld3 import plugins
# mpld3.enable_notebook()

def find_movie(os_id):
    """Return every entry of the global ``movies`` whose 'osID' equals ``str(os_id)``.

    ``os_id`` may be given as an int or a string; it is compared in string form.
    An empty list is returned when nothing matches.
    """
    wanted = str(os_id)
    return [movie for movie in movies if movie['osID'] == wanted]

def make_histogram(innerDict):
    """Show a bar chart with one bar per entry of ``innerDict``.

    The dict keys become the x-axis tick labels (rotated 70 degrees) and the
    dict values become the bar heights. The figure is displayed immediately
    and nothing is returned.
    """
    tick_labels = innerDict.keys()
    bar_heights = innerDict.values()
    positions = np.arange(len(tick_labels))

    axes = plt.figure(figsize=(20, 10)).add_subplot(1, 1, 1)
    axes.bar(positions, bar_heights)
    axes.set_xticks(positions)
    axes.set_xticklabels(tick_labels, rotation=70)
    plt.show()

Load Movie Data

In [21]:
from load import movies
movies = np.array(movies)
print "loaded data", len(movies)

genres = set()
for movie in movies:
    for genre in movie.get('Genre', []):
        genres.add(genre)
genres = list(genres)

print genres
loaded data 6253
[u'Sci-Fi', u'Crime', u'Romance', u'Animation', u'Music', u'Adult', u'Comedy', u'War', u'Horror', u'Film-Noir', u'Adventure', u'News', u'Thriller', u'Western', u'Mystery', u'Short', u'N/A', u'Drama', u'Action', u'Documentary', u'Musical', u'History', u'Family', u'Fantasy', u'Sport', u'Biography']
In [22]:
# Number of k-means clusters to fit (also drives the per-cluster loops below).
N_CLUSTERS = 5
# Fixed seed so that the SVD projection and the k-means initialisation, and
# therefore the cluster assignments, are identical on every Restart & Run All.
RANDOM_SEED = 42
# True: cluster on the 2-D LSA projection; False: cluster on the raw TF-IDF matrix.
reduce_dimensionality = True
# n_init=1 runs a single initialisation, so without random_state the result
# would change from run to run.
k_means = KMeans(n_clusters=N_CLUSTERS, init='k-means++', max_iter=100, n_init=1,
                 verbose=True, random_state=RANDOM_SEED)
# max_df=0.5 / min_df=0.1: ignore terms that occur in more than half of the
# documents or in fewer than 10% of them; English stop words are removed as well.
vectorizer = TfidfVectorizer(max_df=0.5, min_df=0.1, stop_words='english')
# Two components so the reduced data can be drawn directly as an x/y scatter plot.
lsa = TruncatedSVD(n_components=2, random_state=RANDOM_SEED)
In [23]:
# Script text of every movie, in the same order as `movies`.
text = [movie['script'] for movie in movies]
# TF-IDF document-term matrix (one row per movie, one column per term).
vectors = vectorizer.fit_transform(text)
# Cluster either on the LSA projection or on the full TF-IDF matrix.
X = lsa.fit_transform(vectors) if reduce_dimensionality else vectors
In [24]:
# Fit the k-means estimator on the feature matrix X (verbose progress is printed below).
# NOTE(review): `km` is not referenced by any other visible cell; later cells read
# the fitted attributes from `k_means` directly.
km = k_means.fit(X)
Initialization complete
Iteration  0, inertia 55.010
Iteration  1, inertia 43.620
Iteration  2, inertia 42.596
Iteration  3, inertia 42.334
Iteration  4, inertia 42.189
Iteration  5, inertia 42.068
Iteration  6, inertia 41.976
Iteration  7, inertia 41.896
Iteration  8, inertia 41.823
Iteration  9, inertia 41.768
Iteration 10, inertia 41.717
Iteration 11, inertia 41.675
Iteration 12, inertia 41.617
Iteration 13, inertia 41.565
Iteration 14, inertia 41.545
Iteration 15, inertia 41.532
Iteration 16, inertia 41.520
Iteration 17, inertia 41.507
Iteration 18, inertia 41.494
Iteration 19, inertia 41.478
Iteration 20, inertia 41.458
Iteration 21, inertia 41.430
Iteration 22, inertia 41.384
Iteration 23, inertia 41.309
Iteration 24, inertia 41.166
Iteration 25, inertia 40.922
Iteration 26, inertia 40.615
Iteration 27, inertia 40.323
Iteration 28, inertia 40.076
Iteration 29, inertia 39.915
Iteration 30, inertia 39.809
Iteration 31, inertia 39.731
Iteration 32, inertia 39.676
Iteration 33, inertia 39.649
Iteration 34, inertia 39.635
Iteration 35, inertia 39.624
Iteration 36, inertia 39.611
Iteration 37, inertia 39.601
Iteration 38, inertia 39.593
Iteration 39, inertia 39.586
Iteration 40, inertia 39.579
Iteration 41, inertia 39.571
Iteration 42, inertia 39.562
Iteration 43, inertia 39.558
Converged at iteration 43
In [25]:
# Vocabulary of the TF-IDF vectorizer; terms[i] names column i of `vectors`.
terms = vectorizer.get_feature_names()

# Convenience aliases for the fitted clustering results.
k_means_cluster_centers = k_means.cluster_centers_
k_means_labels = k_means.labels_
k_means_labels_unique = np.unique(k_means_labels)
In [26]:
if reduce_dimensionality == False:
    order_centroids = k_means_cluster_centers.argsort()[:, ::-1]
    for i in range(N_CLUSTERS):
        print [(terms[ind], k_means_cluster_centers[i][ind]) for ind in order_centroids[i, :100]]
        print ""
In [27]:
for k in range(N_CLUSTERS):
    z = vectors.toarray()[k_means_labels == k]
    wordz_tfidf = [(terms[i], z[:,i].sum()) for i in range(z.shape[1])]
    wordz_tfidf = sorted(wordz_tfidf, key=lambda x: x[1], reverse=True )
    print wordz_tfidf[:100]
    print ""
[(u'lord', 98.166686329454137), (u'master', 87.992903421941833), (u'king', 86.417228141490284), (u'brother', 62.734085101417911), (u'doctor', 49.950350683161602), (u'child', 49.6539755022592), (u'daughter', 49.474835442101373), (u'madame', 47.464176046913664), (u'war', 46.057368458975986), (u'sister', 44.063794614458814), (u'children', 42.38001294540318), (u'captain', 39.291449633931492), (u'prince', 39.240420784714772), (u'return', 38.619742058993552), (u'professor', 38.350708392844808), (u'forgive', 38.159126004400107), (u'general', 37.7188227993448), (u'il', 37.623688566480851), (u'sword', 36.903735481964667), (u'queen', 36.345847558836383), (u'madam', 36.255053534978728), (u'blood', 33.76066425438578), (u'fight', 33.3239474662251), (u'uncle', 33.323042316150634), (u'peace', 33.282213189053202), (u'village', 33.208947462947542), (u'marry', 33.20434950606014), (u'alright', 32.652671516456877), (u'earth', 32.637549376388932), (u'paris', 30.971632311217952), (u'fear', 30.92269903018942), (u'power', 29.125756022444445), (u'army', 28.605320252379926), (u'body', 28.481515611053673), (u'count', 28.406224776401356), (u'soul', 28.344065656038676), (u'lf', 28.293534797196919), (u'land', 27.83488359055578), (u'letter', 27.785725545527111), (u'ah', 27.758717731719067), (u'law', 27.612010659391149), (u'died', 27.566598467770852), (u'alive', 26.852529536084322), (u'palace', 26.851780814156832), (u'strange', 26.790929694884724), (u'priest', 26.477122567518663), (u'evil', 26.392173123375581), (u'dr', 26.346973378536035), (u'gentlemen', 26.150341940388738), (u'france', 26.107387943190794), (u'wine', 26.018223569640398), (u'quickly', 25.82244126990868), (u'words', 25.7726867772963), (u'goodbye', 25.546196257918091), (u'soldiers', 25.527329329179508), (u'attack', 24.947663650148762), (u'dare', 24.935881022114845), (u'enemy', 24.836143721218807), (u'city', 24.834608676988516), (u'marriage', 24.688555927470802), (u'heaven', 24.57086985297893), (u'court', 24.468604506328376), 
(u'police', 24.11540750468928), (u'devil', 24.071194773297961), (u'fool', 23.995063883930818), (u'anymore', 23.99308382479753), (u'music', 23.762656050431236), (u'holy', 23.493135567424027), (u'beg', 23.484087983976703), (u'follow', 23.471963832088388), (u'escape', 23.442706393194733), (u'accept', 23.370182329324038), (u'human', 23.20604140774104), (u'sea', 23.160864646419746), (u'welcome', 23.090340126613491), (u'gold', 22.920189007017001), (u'happiness', 22.798702274633872), (u'secret', 22.792581890442079), (u'mercy', 22.583039183886502), (u'ship', 22.386948742093384), (u'pray', 22.184788989026824), (u'loved', 22.099068383797295), (u'promise', 21.930702104541712), (u'person', 21.85233975894058), (u'german', 21.761274492439952), (u'ok', 21.684835985034006), (u'news', 21.667074453876083), (u'hell', 21.544458466120567), (u'sing', 21.530996830325314), (u'horse', 21.503560465630187), (u'dream', 21.364926666055268), (u'given', 21.33903145329981), (u'possible', 21.312688931764097), (u'sun', 21.274245624786445), (u'sake', 21.250641255285409), (u'calm', 21.247391490715962), (u'honor', 21.194368455826822), (u'write', 21.124735267073085), (u'act', 21.122372850815992), (u'school', 21.043781410262199)]

[(u'gonna', 161.84550229849327), (u'ain', 122.7619020961188), (u'uh', 83.598679243389427), (u'okay', 80.136981782766185), (u'em', 59.880793571960204), (u'huh', 57.938751894394258), (u'gotta', 57.032350278322653), (u'wanna', 55.926115780950745), (u'ya', 46.129738794174514), (u'guy', 45.91604319976009), (u'kid', 44.684743957504992), (u'boys', 39.026440477154708), (u'joe', 37.979969181035287), (u'honey', 36.726462822180011), (u'mrs', 36.340216863791731), (u'town', 36.326354987141031), (u'ma', 33.663398225457605), (u'baby', 32.770838557724588), (u'hi', 29.74996928059312), (u'somebody', 29.008058644905184), (u'gun', 28.647541889049393), (u'guys', 27.085612837578871), (u'car', 26.095637084381885), (u'anybody', 22.792936002866796), (u'doc', 22.771985122440192), (u'hmm', 22.650167502634169), (u'horse', 22.454534770834975), (u'bet', 21.269709223533013), (u'folks', 21.206494183888083), (u'couple', 20.898596364267664), (u'bye', 20.33367139426646), (u'ha', 20.159212865178439), (u'ought', 19.445082909716177), (u'suppose', 19.197442375353539), (u'darling', 19.095881143540083), (u'charlie', 18.838439582134384), (u'hit', 18.720820848150776), (u'dad', 18.705058828353401), (u'gee', 18.664376373144975), (u'frank', 18.429420896789871), (u'kids', 18.406032104856504), (u'till', 18.329340293364286), (u'captain', 18.085648176418577), (u'stuff', 18.045284465553738), (u'000', 18.043093917950568), (u'shot', 17.424274360823897), (u'mm', 17.375372908921364), (u'pick', 17.191526391961574), (u'jack', 17.068030899755925), (u'police', 16.797409860944356), (u'ride', 16.785360962237501), (u'fight', 16.715653171826069), (u'swell', 16.668687757828582), (u'phone', 16.513659668871512), (u'00', 16.490572669217038), (u'funny', 16.482091381671257), (u'bank', 16.305875714539908), (u'10', 16.261802879042634), (u'pop', 16.204671623198482), (u'girls', 16.18605499313384), (u'george', 16.184555293693094), (u'york', 16.18244193879654), (u'week', 16.077636488089468), (u'brother', 15.993776775122305), (u'fellas', 
15.923736683215134), (u'office', 15.914925692189392), (u'doctor', 15.676298607845636), (u'fella', 15.674513712535814), (u'boss', 15.600815301906838), (u'dollars', 15.492344446117933), (u'shoot', 15.436952679270158), (u'figure', 15.36609997815337), (u'wonderful', 15.338803580776451), (u'gentlemen', 15.264614314158976), (u'number', 15.093516418709395), (u'john', 15.062816054670659), (u'lieutenant', 14.830829390491669), (u'school', 14.669446031570736), (u'mom', 14.623329754686853), (u'coffee', 14.605980386424294), (u'mama', 14.526322039485297), (u'train', 14.494903118890932), (u'tough', 14.458855781278725), (u'deal', 14.456313756087173), (u'city', 14.426503852492708), (u'party', 14.400646653180232), (u'bucks', 14.361534086805896), (u'hell', 13.98803613120468), (u'street', 13.970775936209744), (u'hot', 13.893859173273142), (u'ah', 13.705145995123925), (u'beat', 13.700710169084614), (u'picture', 13.477086622564725), (u'game', 13.472003757571109), (u'certainly', 13.451912016993619), (u'law', 13.451297071976203), (u'dog', 13.425250262649492), (u'boat', 13.375937756829597), (u'check', 13.323123345055546), (u'line', 13.268444468671213)]

[(u'mrs', 80.778055109660713), (u'doctor', 78.751160245974404), (u'police', 77.353771964761279), (u'car', 75.93989498994803), (u'ok', 70.23984099623253), (u'darling', 65.890574892253056), (u'goodbye', 64.377442625787097), (u'ah', 63.868804397011033), (u'bye', 62.34597493023827), (u'alright', 60.192429725917876), (u'dr', 59.824923596323075), (u'okay', 59.547344992358759), (u'000', 55.755744841630758), (u'brother', 55.285016227948418), (u'war', 54.620577355348118), (u'child', 53.813976462973798), (u'captain', 53.592929318460151), (u'dad', 51.786135021933717), (u'children', 48.750266474203414), (u'madam', 48.428769391443211), (u'boss', 47.812049910174501), (u'gentlemen', 47.746676062943898), (u'girls', 47.557633627225655), (u'paris', 46.148995535121479), (u'sister', 45.69372582997066), (u'daughter', 45.143083549645837), (u'school', 44.704587584127445), (u'guy', 44.417345954150505), (u'damn', 43.183431132392855), (u'anymore', 42.594616275535678), (u'uncle', 42.462916409057655), (u'train', 41.898225670446813), (u'professor', 41.315343257319711), (u'eh', 40.517713671353761), (u'marry', 39.090134194494624), (u'10', 38.635899417110338), (u'stupid', 38.426252738834464), (u'dance', 37.655620014237115), (u'madame', 37.248865691469248), (u'hotel', 36.886164646816319), (u'mom', 36.586392208411183), (u'number', 36.133510457559211), (u'wonderful', 35.852226286465324), (u'write', 35.794733441041963), (u'suppose', 35.674959637545982), (u'letter', 35.373206804457226), (u'london', 35.150880299670426), (u'general', 35.072156654250456), (u'strange', 34.922837939552124), (u'fun', 34.92164214150111), (u'hell', 34.74687100354582), (u'colonel', 34.610486960985149), (u'baby', 34.509564136218046), (u'lf', 33.887431626832758), (u'person', 33.698516343452937), (u'tired', 33.614981355675269), (u'office', 33.453528697839921), (u'boat', 33.368624669104349), (u'careful', 33.008225440076124), (u'hi', 32.981297289940088), (u'music', 32.781936803989595), (u'funny', 32.635177324264049), (u'ma', 
32.591850819590746), (u'forgive', 32.363214627552168), (u'dinner', 32.215057436441015), (u'party', 32.134109245564275), (u'aunt', 32.083076916299802), (u'shoot', 31.882453722253846), (u'calm', 31.701881315853672), (u'fool', 31.55706567406471), (u'law', 31.537327023654534), (u'story', 31.454580494157042), (u'mad', 31.379624961294347), (u'yesterday', 31.34308070988132), (u'phone', 31.205204579961666), (u'tea', 31.148052455131801), (u'small', 30.877794027384731), (u'town', 30.87261836370822), (u'coffee', 30.653814767267207), (u'lovely', 30.37473975981878), (u'scared', 30.268542637253866), (u'lord', 30.193427250231068), (u'company', 30.066540205969883), (u'guys', 30.006822281321003), (u'kiss', 30.002883676898083), (u'months', 29.998958490925602), (u'died', 29.993072770027972), (u'sick', 29.983694226464852), (u'murder', 29.919617870362696), (u'fight', 29.897614827534355), (u'certainly', 29.847455469344354), (u'huh', 29.703762341975146), (u'idiot', 29.700302212027932), (u'body', 29.594252095403323), (u'question', 29.515483558251969), (u'quick', 29.275363169138828), (u'sergeant', 29.197317061869228), (u'boys', 29.163262226953805), (u'german', 29.107141965164395), (u'fault', 29.075271216246829)]

[(u'00', 61.241217838360988), (u'il', 36.30973410810288), (u'film', 33.451323379100472), (u'captain', 23.584632234086662), (u'brother', 20.703125876235031), (u'ok', 20.690674732657271), (u'gold', 19.992385336378327), (u'sea', 19.472720655472919), (u'village', 19.207294696916204), (u'city', 18.520323048318659), (u'music', 18.465933387164796), (u'boss', 18.404623150200422), (u'sister', 17.739331618841533), (u'director', 17.731259612454327), (u'daughter', 17.23925563740881), (u'child', 16.650342230320717), (u'la', 16.649290816214879), (u'act', 16.635133590528447), (u'dr', 16.510830671826803), (u'000', 16.503997134752527), (u'colonel', 16.464349305435469), (u'master', 16.140836277034794), (u'ha', 16.049521363384052), (u'war', 16.026749530565692), (u'san', 15.824401010655748), (u'german', 15.79883989003563), (u'children', 15.640149564768709), (u'land', 15.464394540294895), (u'doctor', 15.191657215125977), (u'fight', 15.176828649485056), (u'mary', 14.902008527790047), (u'police', 14.871716846405359), (u'professor', 14.690805542954394), (u'ship', 14.594393131070539), (u'prince', 14.475868331784334), (u'okay', 14.243486911968038), (u'chief', 13.974574470693781), (u'uncle', 13.925146143235338), (u'mountain', 13.916842636011328), (u'school', 13.795774165624707), (u'return', 13.791412836174471), (u'horse', 13.515227063656367), (u'paris', 13.437367388602642), (u'song', 13.417257041127264), (u'black', 13.359408010566606), (u'dad', 13.172501019998835), (u'goodbye', 13.030411887879859), (u'john', 12.968469933304995), (u'island', 12.923736048739933), (u'general', 12.905286804294505), (u'story', 12.834998087099105), (u'damn', 12.755886294104339), (u'town', 12.693362024411272), (u'lieutenant', 12.677018695065238), (u'ah', 12.435499869846845), (u'marry', 12.298391665411941), (u'king', 12.292609154369345), (u'red', 12.224024949966974), (u'earth', 12.042473890456684), (u'dance', 11.951296272157499), (u'lord', 11.950620038741686), (u'alright', 11.913208097628008), (u'car', 
11.848353954886141), (u'madame', 11.78675614554135), (u'dog', 11.765869011269167), (u'white', 11.749010738794343), (u'fish', 11.666899287286189), (u'mrs', 11.551047549964597), (u'gentlemen', 11.420062818371981), (u'train', 11.393892366376917), (u'blood', 11.302063641404215), (u'died', 11.280279265997093), (u'anymore', 11.27986972526665), (u'farm', 11.210366565802193), (u'sing', 11.207733136761743), (u'guy', 11.069247906021582), (u'power', 10.930564192398847), (u'army', 10.923550326153263), (u'happiness', 10.866966843867525), (u'small', 10.848660306274772), (u'sword', 10.812056650286806), (u'boys', 10.768138508440829), (u'boat', 10.739462879484375), (u'party', 10.72831468457059), (u'directed', 10.727635826685965), (u'bye', 10.629549068792148), (u'enemy', 10.584205199673409), (u'letter', 10.582579944535453), (u'mom', 10.556716881111452), (u'sun', 10.515508334314228), (u'gonna', 10.473946136105873), (u'person', 10.397435360690531), (u'gun', 10.32176078318329), (u'secret', 10.268998045519972), (u'major', 10.199282596814395), (u'subtitles', 10.134087437326812), (u'alive', 10.133741954962158), (u'follow', 10.077238082504584), (u'count', 9.8624948394607959), (u'art', 9.8522805898439465)]

[(u'gonna', 107.51743808417224), (u'okay', 83.935160877685561), (u'mrs', 74.275636102465796), (u'uh', 67.641783541518151), (u'ain', 57.909289758974467), (u'huh', 48.474639389936804), (u'guy', 48.226697872024843), (u'darling', 45.763478338705923), (u'car', 43.70648522870728), (u'wanna', 39.170665567706436), (u'ok', 38.910269087424211), (u'town', 38.494505695152199), (u'baby', 38.039119812473189), (u'dad', 37.913317295473831), (u'captain', 37.561958650628718), (u'joe', 37.124626310256701), (u'em', 36.929787450242884), (u'gun', 36.836452412287173), (u'boys', 36.395714896846187), (u'gotta', 35.835340404035847), (u'dr', 35.324404584919513), (u'kid', 34.975280253648123), (u'police', 34.820822434390521), (u'somebody', 34.206230100254231), (u'doctor', 34.128793679387059), (u'george', 33.845813279541126), (u'hi', 33.826561633435809), (u'ma', 32.875332229082446), (u'honey', 32.697703855820492), (u'suppose', 31.913058552886248), (u'000', 31.81812146654655), (u'goodbye', 30.953483877846892), (u'john', 30.766020240590329), (u'bye', 30.695466624497026), (u'guys', 29.866236691313489), (u'charlie', 29.17315453562647), (u'lieutenant', 28.894325317513633), (u'ha', 28.411268944033498), (u'wonderful', 28.410120860524898), (u'phone', 28.264827089323671), (u'brother', 28.199932742455708), (u'anybody', 27.869643953059693), (u'colonel', 27.721705537664047), (u'york', 27.632859722989583), (u'jack', 26.742294550326754), (u'horse', 26.501798906148988), (u'lf', 26.453738131430224), (u'00', 26.297976780569783), (u'boss', 26.13138289842313), (u'office', 26.071981208283372), (u'girls', 26.06022340076747), (u'gentlemen', 25.943786316573672), (u'ought', 25.606042472908978), (u'ah', 25.54476343288859), (u'doc', 25.427510999354574), (u'war', 25.227993363851748), (u'bet', 24.666091745663024), (u'shot', 24.646189916634263), (u'10', 24.599818387904165), (u'number', 24.439797211458831), (u'certainly', 24.329185665028245), (u'hell', 24.196137934753235), (u'funny', 23.811826176848616), (u'hotel', 
23.719050237377722), (u'dance', 23.673757363931657), (u'dinner', 23.670872320099395), (u'week', 23.616167627784947), (u'boat', 23.584595525523277), (u'hmm', 23.434848655290345), (u'kids', 23.271255228672366), (u'till', 23.242648079107227), (u'couple', 22.966225494430503), (u'hit', 22.665843790868344), (u'pop', 22.341781354019218), (u'murder', 22.275183536159258), (u'party', 22.274811324728837), (u'check', 22.029274793903106), (u'train', 22.011934171945477), (u'fight', 21.864380383778656), (u'professor', 21.858784176726481), (u'stuff', 21.827162989797294), (u'ya', 21.700743508850021), (u'pick', 21.671081446188701), (u'story', 21.579452582270136), (u'school', 21.546243547325428), (u'sergeant', 21.337786242281311), (u'daddy', 21.314074959772448), (u'city', 21.193060556723164), (u'probably', 21.068544531616546), (u'marry', 21.025524266685686), (u'uncle', 20.876262647152835), (u'mary', 20.647851132735077), (u'deal', 20.3727703949573), (u'mom', 20.219730848553091), (u'music', 20.163161185311282), (u'shoot', 20.122747703706082), (u'dollars', 19.94223719554984), (u'children', 19.805535044032005), (u'white', 19.774816849663871), (u'line', 19.639365499679748)]

In [28]:
# The scatter plot uses the first two columns of X as coordinates, so it is only
# drawn when the data was reduced with LSA.
if reduce_dimensionality:
    mpld3.enable_notebook()
    fig, ax = plt.subplots(figsize=(15, 10))

    # One random RGB colour per cluster (unseeded, so colours vary between runs).
    colors = [(random.random(), random.random(), random.random()) for x in range(N_CLUSTERS)]
    for k, col in enumerate(colors):
        my_members = k_means_labels == k
        cluster_center = k_means_cluster_centers[k]
        # Members of the cluster as small dots, its centre as a larger outlined circle.
        points = ax.plot(X[my_members, 0], X[my_members, 1], 'w',
                         markerfacecolor=col, marker='.', label='Cluster %i' % k)
        centers = ax.plot(cluster_center[0], cluster_center[1], 'o',
                          markerfacecolor=col, markeredgecolor='k', markersize=6)

        # Tooltip text for each plotted point, in the same order as the points.
        labels = [
            " ".join([
                movie.get('Title', ''),
                movie.get('osID', ''),
                movie.get('imdbID', ''),
                ", ".join(movie.get('Genre', '')),
                # NOTE(review): the empty-string key looks like an unfinished
                # field name; kept as-is because the intended key is unknown.
                movie.get('', ''),
            ]) + " "
            for movie in movies[my_members]
        ]

        tooltip = plugins.PointHTMLTooltip(points[0], labels, voffset=10, hoffset=10)
        plugins.connect(fig, tooltip)

    ax.set_title('KMeans')
    ax.set_xticks(())
    ax.set_yticks(())
    ax.legend()