Initial Set Up

In [1]:
from bs4 import BeautifulSoup
import urllib2
import re
import dateutil.parser
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import seaborn as sns
import matplotlib as mpl
%matplotlib inline

url = "http://boxofficemojo.com/oscar/bestpichist.htm?view=bymovie&sort=year&order=DESC&p=.htm"
page = urllib2.urlopen(url)
soup = BeautifulSoup(page)

Movie Value Function

In [2]:
# Define movie value function that allows for quick searching in web page
def get_movie_value(soup, field_name):
    obj = soup.find(text=re.compile(field_name))
    if not obj:
        return None
    next_value = obj.find_next()
    if next_value.text:
        return next_value.text
    else:
        return None

Movie Page Urls

In [10]:
#pulls out URLs from Oscar Page and changes to correct url
def get_url_from_oscarlist(urlname):
   page = urllib2.urlopen(urlname)
   soup_name = BeautifulSoup(page)
   
   movie_url_list = []
   for link in soup_name.find_all('a', href=re.compile('oscar/movies')):
       movie_nominees = ("http://boxofficemojo.com" + link.get('href'))
       movie_url_list.append(movie_nominees)
   return movie_url_list
    
#"&adjust_yr=2015&p=.htm"
# subset data set to just get links since 2000
nominees_url = get_url_from_oscarlist(url)[:100]

# just print first few for example
nominees_url[:10]
Out[10]:
['http://boxofficemojo.com/oscar/movies/?id=americansniper.htm',
 'http://boxofficemojo.com/oscar/movies/?id=selma.htm',
 'http://boxofficemojo.com/oscar/movies/?id=imitationgame.htm',
 'http://boxofficemojo.com/oscar/movies/?id=whiplash.htm',
 'http://boxofficemojo.com/oscar/movies/?id=birdman.htm',
 'http://boxofficemojo.com/oscar/movies/?id=theoryofeverything.htm',
 'http://boxofficemojo.com/oscar/movies/?id=boyhood.htm',
 'http://boxofficemojo.com/oscar/movies/?id=grandbudapesthotel.htm',
 'http://boxofficemojo.com/oscar/movies/?id=her2013.htm',
 'http://boxofficemojo.com/oscar/movies/?id=nebraska.htm']

Oscar Nominee Urls

In [11]:
#pulls out URLs from Oscar Page and changes Url
def get_oscar_url_from_oscarlist(urlname):
   page = urllib2.urlopen(urlname)
   soup_name = BeautifulSoup(page)
   
   movie_url_list = []
   for link in soup_name.find_all('a', href=re.compile('oscar/movies')):
       one_year_movies = ("http://boxofficemojo.com" + link.get('href').split('/oscar')[1])
       movie_url_list.append(one_year_movies)
   return movie_url_list
    
# subset url set to just get links since 2000
final_url = get_oscar_url_from_oscarlist(url)[:100]

final_url[:10]
Out[11]:
['http://boxofficemojo.com/movies/?id=americansniper.htm',
 'http://boxofficemojo.com/movies/?id=selma.htm',
 'http://boxofficemojo.com/movies/?id=imitationgame.htm',
 'http://boxofficemojo.com/movies/?id=whiplash.htm',
 'http://boxofficemojo.com/movies/?id=birdman.htm',
 'http://boxofficemojo.com/movies/?id=theoryofeverything.htm',
 'http://boxofficemojo.com/movies/?id=boyhood.htm',
 'http://boxofficemojo.com/movies/?id=grandbudapesthotel.htm',
 'http://boxofficemojo.com/movies/?id=her2013.htm',
 'http://boxofficemojo.com/movies/?id=nebraska.htm']

Get nominations, wins, production budget from movie page urls

In [12]:
nominations = []
wins = []
productions = []
title = []
    
for url in nominees_url:

#     print '----%s-----' % url
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page)
    raw_nominations = int(get_movie_value(soup, "Total Nominations:"))
    nominations.append(raw_nominations)
#     print raw_nominations
    raw_wins = int(get_movie_value(soup, "Total Wins:"))
    wins.append(raw_wins)
#     print raw_wins
    raw_production = get_movie_value(soup, "Prod. Budget:")
    if raw_production == None:
        productions.append(np.nan)
        continue
    production_num = raw_production.replace('$','').replace('million','')
    production_total = int(float(production_num)*1000000)
    productions.append(production_total)
#     print production_total
    
   

Make lists with top directors, actors, writers, producer, distributor

In [14]:
title = []

directors = []
director_data = []

actors = []
actor_data = []

writers = []
writer_data = []

producers = []
producer_data = []

distributors = []
distributor_data = []

for url in final_url:

    
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page)
    title_string = soup.find('title').text
    raw_title = title_string.split('(')[0].strip()
    title.append(raw_title)
    
    
    director = soup.find(text=re.compile('Director'))
    if director is None:
        print None
        continue
    director = director.find_next()
    try:
        for row in director.find_all('a'):
            directors.append(row.text)
#             print raw_title, row.text
            director_dict= {"Title": raw_title, "Director": row.text.strip('*')}
            director_data.append(director_dict)
    except:
        print None 
    
    
    actor = soup.find(text=re.compile('Actor'))
    if actor is None:
        print None
        continue
    actor = actor.find_next()
    try:
        for row in actor.find_all('a'):
            actors.append(row.text)
#             print raw_title, row.text
            actor_dict= {"Title": raw_title, "Actor": row.text.strip('*')}
            actor_data.append(actor_dict)
    except:
        print None
        
    
    writer = soup.find(text=re.compile('Writer'))
    if writer is None:
        print None
        continue
    writer = writer.find_next()
    try:
        for row in writer.find_all('a'):
            writers.append(row.text)
#             print raw_title, row.text
            writer_dict= {"Title": raw_title, "Writer": row.text.strip('*')}
            writer_data.append(writer_dict)
    except:
        print None 
    
    
    producer = soup.find(text=re.compile('Producer'))
    if producer is None:
        print None
        continue
    producer = producer.find_next()
    try:
        for row in producer.find_all('a'):
            producers.append(row.text)
#             print raw_title, row.text
            producer_dict= {"Title": raw_title, "Producer": row.text.strip("(co-producer)")}
            producer_data.append(producer_dict)
    except:
        print None 
    
    
    
    distributor = soup.find(text=re.compile('Distributor'))
    if distributor is None:
        print None
        continue
    distributor = distributor.find_next()
    try:
        for row in distributor.find_all('a'):
            distributors.append(row.text)
#             print raw_title, row.text
            distributor_dict= {"Title": raw_title, "Distributor": row.text}
            distributor_data.append(distributor_dict)
    except:
        print None 
    
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
In [15]:
# Convert above lists to dataframes and label them
# Using pre 2014 years to not skew data

all_wins=pd.DataFrame(wins[:8])
all_noms = pd.DataFrame(nominations[:8])
all_titles=pd.DataFrame(title[:8])

all_wins.columns=["Wins"]
all_noms.columns=["Nominations"]
all_titles.columns=["Title"]

pre_14_wins=pd.DataFrame(wins[8:])
pre_14_noms = pd.DataFrame(nominations[8:])
pre_14_titles=pd.DataFrame(title[8:])

pre_14_wins.columns=["Wins"]
pre_14_noms.columns=["Nominations"]
pre_14_titles.columns=["Title"]

all_titles
Out[15]:
Title
0 American Sniper
1 Selma
2 The Imitation Game
3 Whiplash
4 Birdman
5 The Theory of Everything
6 Boyhood
7 The Grand Budapest Hotel

Original DF with Nominations, Wins, and Title

In [17]:
#join above data frames
df = pd.DataFrame(pre_14_noms.join(pre_14_wins))
original_df  = pd.DataFrame(df.join(pre_14_titles))
original_df.head(15)
Out[17]:
Nominations Wins Title
0 5 1 Her
1 6 0 Nebraska
2 4 0 Philomena
3 6 3 Dallas Buyers Club
4 5 0 The Wolf of Wall Street
5 10 0 American Hustle
6 9 3 12 Years a Slave
7 10 7 Gravity
8 6 0 Captain Phillips
9 5 1 Zero Dark Thirty
10 5 1 Amour
11 8 1 Silver Linings Playbook
12 5 2 Django Unchained
13 8 3 Les Miserables
14 12 2 Lincoln
In [18]:
#Original summary of all nominations vs wins
original_df['Ones'] = 1.0
X = original_df[['Nominations', 'Ones']]
Y = original_df['Wins']
linmodel = sm.OLS(Y, X).fit()
linmodel.summary()
Out[18]:
OLS Regression Results
Dep. Variable: Wins R-squared: 0.392
Model: OLS Adj. R-squared: 0.386
Method: Least Squares F-statistic: 58.14
Date: Sat, 31 Jan 2015 Prob (F-statistic): 2.40e-11
Time: 17:29:16 Log-Likelihood: -173.33
No. Observations: 92 AIC: 350.7
Df Residuals: 90 BIC: 355.7
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Nominations 0.4854 0.064 7.625 0.000 0.359 0.612
Ones -1.3572 0.466 -2.913 0.005 -2.283 -0.432
Omnibus: 27.040 Durbin-Watson: 2.342
Prob(Omnibus): 0.000 Jarque-Bera (JB): 64.553
Skew: 1.021 Prob(JB): 9.60e-15
Kurtosis: 6.559 Cond. No. 20.6
In [19]:
# Use seaborn to plot linear regression w/ 95% confidence bands
sns.lmplot("Nominations", "Wins", original_df);

Top Actors

In [20]:
#Convert dictionary to data frame and reorder columns
actor_data_df = pd.DataFrame(actor_data)
actor_data_df.reindex(columns=['Title', 'Actor'])

# make new object that is groupby of actors
just_actors = actor_data_df.groupby('Actor')
In [21]:
top_actors = just_actors.count().sort('Title', ascending= False).head(8)
top_actors
Out[21]:
Title
Actor
Leonardo DiCaprio 6
Cate Blanchett 6
Brad Pitt 6
George Clooney 5
Tom Wilkinson 4
Sandra Bullock 4
Russell Crowe 4
Benedict Cumberbatch 4
In [22]:
#Get list of just  actors above
l = just_actors.count().sort('Title', ascending= False).head(8)
top_actors_list = list(l.index)
# print top_actors_list

# gets movies that are just in above list and then drop the duplicates
top_titles_actors_dups = actor_data_df[actor_data_df['Actor'].isin(top_actors_list)]
top_titles_actors = top_titles_actors_dups.drop_duplicates('Title')

top_titles_actors_list = list(top_titles_actors['Title'])
# print top_titles_actors_list

#make new df from original df and take out just the data that matches titles that top actors were in
top_actor_movies = original_df[original_df['Title'].isin(top_titles_actors_list)]
top_actor_movies
Out[22]:
Nominations Wins Title Ones
4 5 0 The Wolf of Wall Street 1
6 9 3 12 Years a Slave 1
7 10 7 Gravity 1
12 5 2 Django Unchained 1
13 8 3 Les Miserables 1
18 2 0 Extremely Loud & Incredibly Close 1
20 5 1 The Descendants 1
22 6 0 War Horse 1
24 6 0 Moneyball 1
26 3 0 The Tree of Life 1
36 8 4 Inception 1
40 6 0 Up in the Air 1
41 2 1 The Blind Side 1
43 8 1 Inglourious Basterds 1
51 13 3 The Curious Case of Benjamin Button 1
53 7 1 Atonement 1
56 7 1 Michael Clayton 1
59 7 1 Babel 1
60 5 4 The Departed 1
64 6 0 Good Night, and Good Luck. 1
66 6 3 Crash 1
69 11 5 The Aviator 1
74 10 2 Master and Commander: The Far Side of the World 1
75 11 11 The Lord of the Rings: The Return of the King 1
80 10 0 Gangs of New York 1
81 6 2 The Lord of the Rings: The Two Towers 1
82 5 0 In the Bedroom 1
84 8 4 A Beautiful Mind 1
85 13 4 The Lord of the Rings: The Fellowship of the Ring 1
90 12 5 Gladiator 1
In [418]:
# Looking at specific titles for an actor
actor_data_df[actor_data_df['Actor']=='Benedict Cumberbatch']
Out[418]:
Actor Title
6 Benedict Cumberbatch The Imitation Game
55 Benedict Cumberbatch 12 Years a Slave
109 Benedict Cumberbatch War Horse
232 Benedict Cumberbatch Atonement
In [265]:
#Graph Summary of nominations of top actor's films vs wins
top_actor_movies['Ones'] = 1.0
X = top_actor_movies[['Nominations', 'Ones']]
Y = top_actor_movies['Wins']
linmodel = sm.OLS(Y, X).fit()
linmodel.summary()
Out[265]:
OLS Regression Results
Dep. Variable: Wins R-squared: 0.327
Model: OLS Adj. R-squared: 0.305
Method: Least Squares F-statistic: 15.04
Date: Fri, 30 Jan 2015 Prob (F-statistic): 0.000511
Time: 00:16:18 Log-Likelihood: -69.852
No. Observations: 33 AIC: 143.7
Df Residuals: 31 BIC: 146.7
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Nominations 0.4771 0.123 3.878 0.001 0.226 0.728
Ones -1.3945 0.961 -1.451 0.157 -3.355 0.566
Omnibus: 16.322 Durbin-Watson: 2.501
Prob(Omnibus): 0.000 Jarque-Bera (JB): 20.930
Skew: 1.313 Prob(JB): 2.85e-05
Kurtosis: 5.885 Cond. No. 21.1
In [250]:
sns.lmplot("Nominations", "Wins", top_actor_movies)
Out[250]:
<seaborn.axisgrid.FacetGrid at 0x112d25450>

Top Directors

In [23]:
#Convert dictionary to data frame and reorder columns
direc_df = pd.DataFrame(director_data)
direc_df.reindex(columns=['Title', 'Director'])

# make new object that is groupby of directors
top_direcs = direc_df.groupby('Director')
In [369]:
# Find out who were the top 10 directors
top_directors = top_direcs.count().sort('Title', ascending= False).head(10)
top_directors
Out[369]:
Title
Director
Martin Scorsese 5
Clint Eastwood 4
Stephen Daldry 3
David O. Russell 3
Ang Lee 3
Peter Jackson 3
Alexander Payne 3
Joel Coen 3
Steven Spielberg 3
Ethan Coen 3
In [368]:
direc_df[direc_df['Director']=='Ethan Coen']
Out[368]:
Director Title
37 Ethan Coen True Grit
52 Ethan Coen A Serious Man
64 Ethan Coen No Country for Old Men
In [283]:
#Get list of just directors above
direc = top_direcs.count().sort('Title', ascending= False).head(10)
top_directors_list = list(direc.index)
# print top_directors_list

# gets movies that are just in above list and then drop the duplicates
top_titles_directors_dups = direc_df[direc_df['Director'].isin(top_directors_list)]
top_titles_directors = top_titles_directors_dups.drop_duplicates('Title')

top_titles_directors_list = list(top_titles_directors['Title'])
# print top_titles_directors_list

#make new df from original df and take out just the data that matches titles that top actors were in
top_directors_movies = original_df[original_df['Title'].isin(top_titles_directors_list)]
top_directors_movies
Out[283]:
Nominations Wins Title Ones
1 6 0 Nebraska 1
4 5 0 The Wolf of Wall Street 1
5 10 0 American Hustle 1
11 8 1 Silver Linings Playbook 1
14 12 2 Lincoln 1
15 11 4 Life of Pi 1
18 2 0 Extremely Loud & Incredibly Close 1
20 5 1 The Descendants 1
21 11 5 Hugo 1
22 6 0 War Horse 1
29 7 2 The Fighter 1
31 10 0 True Grit 1
45 2 0 A Serious Man 1
48 5 1 The Reader 1
55 8 4 No Country for Old Men 1
57 4 1 Letters from Iwo Jima 1
60 5 4 The Departed 1
63 8 3 Brokeback Mountain 1
65 5 0 Munich 1
67 7 4 Million Dollar Baby 1
68 5 1 Sideways 1
69 11 5 The Aviator 1
72 6 2 Mystic River 1
75 11 11 The Lord of the Rings: The Return of the King 1
77 9 1 The Hours 1
80 10 0 Gangs of New York 1
81 6 2 The Lord of the Rings: The Two Towers 1
85 13 4 The Lord of the Rings: The Fellowship of the Ring 1
88 10 4 Crouching Tiger, Hidden Dragon 1
In [332]:
#Graph Summary of nominations of top director's films vs wins
top_directors_movies['Ones'] = 1.0
X = top_directors_movies[['Nominations', 'Ones']]
Y = top_directors_movies['Wins']
linmodel = sm.OLS(Y, X).fit()
linmodel.summary()
Out[332]:
OLS Regression Results
Dep. Variable: Wins R-squared: 0.259
Model: OLS Adj. R-squared: 0.232
Method: Least Squares F-statistic: 9.438
Date: Fri, 30 Jan 2015 Prob (F-statistic): 0.00481
Time: 02:16:25 Log-Likelihood: -62.052
No. Observations: 29 AIC: 128.1
Df Residuals: 27 BIC: 130.8
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Nominations 0.4163 0.136 3.072 0.005 0.138 0.694
Ones -0.9916 1.093 -0.907 0.372 -3.234 1.251
Omnibus: 17.117 Durbin-Watson: 1.929
Prob(Omnibus): 0.000 Jarque-Bera (JB): 24.855
Skew: 1.314 Prob(JB): 4.01e-06
Kurtosis: 6.697 Cond. No. 22.6
In [333]:
sns.lmplot("Nominations", "Wins", top_directors_movies)
Out[333]:
<seaborn.axisgrid.FacetGrid at 0x11777f090>

Top Producers

In [24]:
#Convert dictionary to data frame and reorder columns
produce_df = pd.DataFrame(producer_data)
produce_df.reindex(columns=['Title', 'Producer'])

# make new object that is groupby of producers
top_produces = produce_df.groupby('Producer')
In [372]:
top_producers = top_produces.count().sort('Title', ascending= False).head(3)
top_producers
Out[372]:
Title
Producer
Scott Rudin 10
Brad Pitt 5
Graham King 5
In [375]:
produce_df[produce_df['Producer']=='Graham King']
Out[375]:
Producer Title
69 Graham King Argo
76 Graham King Hugo
163 Graham King The Departed
189 Graham King The Aviator
205 Graham King Gangs of New York
In [29]:
#Get list of just producers above
prod = top_produces.count().sort('Title', ascending= False).head(3)
top_producers_list = list(prod.index)

# print top_producers_list

# gets movies that are just in above list and then drop the duplicates
top_titles_producers_dups = produce_df[produce_df['Producer'].isin(top_producers_list)]
top_titles_producers = top_titles_producers_dups.drop_duplicates('Title')

top_titles_producers_list = list(top_titles_producers['Title'])
# print top_titles_producers_list

#make new df from original df and take out just the data that matches titles that top actors were in
top_producers_movies = original_df[original_df['Title'].isin(top_titles_producers_list)]
top_producers_movies
Out[29]:
Nominations Wins Title Ones
6 9 3 12 Years a Slave 1
8 6 0 Captain Phillips 1
16 7 3 Argo 1
18 2 0 Extremely Loud & Incredibly Close 1
21 11 5 Hugo 1
24 6 0 Moneyball 1
26 3 0 The Tree of Life 1
31 10 0 True Grit 1
33 8 3 The Social Network 1
52 8 2 There Will Be Blood 1
55 8 4 No Country for Old Men 1
58 6 1 The Queen 1
60 5 4 The Departed 1
69 11 5 The Aviator 1
77 9 1 The Hours 1
80 10 0 Gangs of New York 1
In [335]:
#Graph Summary of nominations of top producer's films vs wins
top_producers_movies['Ones'] = 1.0
X = top_producers_movies[['Nominations', 'Ones']]
Y = top_producers_movies['Wins']
linmodel = sm.OLS(Y, X).fit()
linmodel.summary()
Out[335]:
OLS Regression Results
Dep. Variable: Wins R-squared: 0.204
Model: OLS Adj. R-squared: 0.147
Method: Least Squares F-statistic: 3.583
Date: Fri, 30 Jan 2015 Prob (F-statistic): 0.0792
Time: 02:16:59 Log-Likelihood: -30.749
No. Observations: 16 AIC: 65.50
Df Residuals: 14 BIC: 67.04
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Nominations 0.3251 0.172 1.893 0.079 -0.043 0.693
Ones -0.4802 1.352 -0.355 0.728 -3.379 2.419
Omnibus: 0.845 Durbin-Watson: 1.915
Prob(Omnibus): 0.655 Jarque-Bera (JB): 0.678
Skew: -0.080 Prob(JB): 0.712
Kurtosis: 2.004 Cond. No. 24.4
In [336]:
sns.lmplot("Nominations", "Wins", top_producers_movies)
Out[336]:
<seaborn.axisgrid.FacetGrid at 0x1179167d0>

Top Writers

In [25]:
#Convert dictionary to data frame and reorder columns
write_df = pd.DataFrame(writer_data)
write_df.reindex(columns=['Title', 'Writer'])

# make new object that is groupby of writers
top_writes = write_df.groupby('Writer')
In [308]:
top_writers = top_writes.count().sort('Title', ascending= False).head(5)
top_writers
Out[308]:
Title
Writer
Philippa Boyens 3
Paul Haggis 3
Fran Walsh 3
Eric Roth 3
John Logan 3
In [417]:
write_df[write_df['Writer']=='Philippa Boyens']
Out[417]:
Title Writer
87 The Lord of the Rings: The Return of the King Philippa Boyens
94 The Lord of the Rings: The Two Towers Philippa Boyens
97 The Lord of the Rings: The Fellowship of the Ring Philippa Boyens
In [329]:
#Get list of just writers above
writ = top_writes.count().sort('Title', ascending= False).head(5)
top_writers_list = list(writ.index)

# print top_writers_list

# gets movies that are just in above list and then drop the duplicates
top_titles_writers_dups = write_df[write_df['Writer'].isin(top_writers_list)]
top_titles_writers = top_titles_writers_dups.drop_duplicates('Title')

top_titles_writers_list = list(top_titles_writers['Title'])
# print top_titles_writers_list

#make new df from original df and take out just the data that matches titles that top actors were in
top_writers_movies = original_df[original_df['Title'].isin(top_titles_writers_list)]
top_writers_movies
Out[329]:
Nominations Wins Title Ones
18 2 0 Extremely Loud & Incredibly Close 1
21 11 5 Hugo 1
51 13 3 The Curious Case of Benjamin Button 1
57 4 1 Letters from Iwo Jima 1
65 5 0 Munich 1
66 6 3 Crash 1
67 7 4 Million Dollar Baby 1
69 11 5 The Aviator 1
75 11 11 The Lord of the Rings: The Return of the King 1
81 6 2 The Lord of the Rings: The Two Towers 1
85 13 4 The Lord of the Rings: The Fellowship of the Ring 1
90 12 5 Gladiator 1
In [338]:
#Graph Summary of nominations of top writer's films vs wins
top_writers_movies['Ones'] = 1.0
X = top_writers_movies[['Nominations', 'Ones']]
Y = top_writers_movies['Wins']
linmodel = sm.OLS(Y, X).fit()
linmodel.summary()
Out[338]:
OLS Regression Results
Dep. Variable: Wins R-squared: 0.432
Model: OLS Adj. R-squared: 0.375
Method: Least Squares F-statistic: 7.606
Date: Fri, 30 Jan 2015 Prob (F-statistic): 0.0202
Time: 02:17:44 Log-Likelihood: -26.167
No. Observations: 12 AIC: 56.33
Df Residuals: 10 BIC: 57.30
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Nominations 0.5101 0.185 2.758 0.020 0.098 0.922
Ones -0.7100 1.698 -0.418 0.685 -4.493 3.073
Omnibus: 14.559 Durbin-Watson: 1.965
Prob(Omnibus): 0.001 Jarque-Bera (JB): 9.337
Skew: 1.613 Prob(JB): 0.00939
Kurtosis: 5.875 Cond. No. 23.2
In [339]:
sns.lmplot("Nominations", "Wins", top_writers_movies)
Out[339]:
<seaborn.axisgrid.FacetGrid at 0x1170a38d0>

Top Distributors

In [27]:
#Convert dictionary to data frame and reorder columns
distribute_df = pd.DataFrame(distributor_data)
distribute_df.reindex(columns=['Title', 'Distributor'])

# make new object that is groupby of writers
top_distributes = distribute_df.groupby('Distributor')
In [381]:
top_distributors = top_distributes.count().sort('Title', ascending= False).head(6)
top_distributors
Out[381]:
Title
Distributor
Warner Bros. 12
Fox Searchlight 10
Paramount 7
Weinstein Company 7
Miramax 6
Universal 6
In [387]:
distribute_df[distribute_df['Distributor']=='Universal']
Out[387]:
Distributor Title
17 Universal Les Miserables
45 Universal Frost/Nixon
59 Universal Munich
68 Universal Seabiscuit
73 Universal A Beautiful Mind
77 Universal Erin Brockovich
In [32]:
#Get list of just distributors above
writ = top_distributes.count().sort('Title', ascending= False).head(6)
top_distributors_list = list(writ.index)

# print top_distributors_list

# gets movies that are just in above list and then drop the duplicates
top_titles_distributors_dups = distribute_df[distribute_df['Distributor'].isin(top_distributors_list)]
top_titles_distributors = top_titles_distributors_dups.drop_duplicates('Title')

top_titles_distributors_list = list(top_titles_distributors['Title'])
# print top_titles_distributors_list

#make new df from original df and take out just the data that matches titles that top actors were in
top_distributors_movies = original_df[original_df['Title'].isin(top_titles_distributors_list)]
top_distributors_movies
Out[32]:
Nominations Wins Title Ones
0 5 1 Her 1
2 4 0 Philomena 1
4 5 0 The Wolf of Wall Street 1
6 9 3 12 Years a Slave 1
7 10 7 Gravity 1
12 5 2 Django Unchained 1
13 8 3 Les Miserables 1
16 7 3 Argo 1
18 2 0 Extremely Loud & Incredibly Close 1
19 10 5 The Artist 1
20 5 1 The Descendants 1
21 11 5 Hugo 1
26 3 0 The Tree of Life 1
27 12 4 The King's Speech 1
28 6 0 127 Hours 1
30 5 1 Black Swan 1
31 10 0 True Grit 1
36 8 4 Inception 1
40 6 0 Up in the Air 1
41 2 1 The Blind Side 1
43 8 1 Inglourious Basterds 1
47 5 0 Frost/Nixon 1
48 5 1 The Reader 1
51 13 3 The Curious Case of Benjamin Button 1
54 4 1 Juno 1
55 8 4 No Country for Old Men 1
56 7 1 Michael Clayton 1
57 4 1 Letters from Iwo Jima 1
58 6 1 The Queen 1
60 5 4 The Departed 1
61 4 2 Little Miss Sunshine 1
65 5 0 Munich 1
67 7 4 Million Dollar Baby 1
68 5 1 Sideways 1
69 11 5 The Aviator 1
70 7 1 Finding Neverland 1
72 6 2 Mystic River 1
76 7 0 Seabiscuit 1
77 9 1 The Hours 1
78 13 6 Chicago 1
80 10 0 Gangs of New York 1
84 8 4 A Beautiful Mind 1
91 5 1 Erin Brockovich 1
In [340]:
#Graph Summary of nominations of top writer's films vs wins
top_distributors_movies['Ones'] = 1.0
X = top_distributors_movies[['Nominations', 'Ones']]
Y = top_distributors_movies['Wins']
linmodel = sm.OLS(Y, X).fit()
linmodel.summary()
Out[340]:
OLS Regression Results
Dep. Variable: Wins R-squared: 0.399
Model: OLS Adj. R-squared: 0.384
Method: Least Squares F-statistic: 27.21
Date: Fri, 30 Jan 2015 Prob (F-statistic): 5.58e-06
Time: 02:18:23 Log-Likelihood: -77.487
No. Observations: 43 AIC: 159.0
Df Residuals: 41 BIC: 162.5
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Nominations 0.4332 0.083 5.216 0.000 0.265 0.601
Ones -1.0183 0.614 -1.658 0.105 -2.259 0.222
Omnibus: 0.282 Durbin-Watson: 2.401
Prob(Omnibus): 0.869 Jarque-Bera (JB): 0.006
Skew: -0.005 Prob(JB): 0.997
Kurtosis: 3.055 Cond. No. 20.1
In [341]:
sns.lmplot("Nominations", "Wins", top_distributors_movies)
Out[341]:
<seaborn.axisgrid.FacetGrid at 0x1171c4150>

Top actor, producer, and distributor

In [34]:
top_titles_actors_list
top_titles_producers_list
top_titles_distributors_list

first = top_titles_actors_list
second = top_titles_producers_list
third = list(set(first) | set(second))
# len(first)
# len(second)

fourth = list(set(third) | set(top_titles_distributors_list))
fourth

actor_producer_distrib = original_df[original_df['Title'].isin(fourth)]

actor_producer_distrib
Out[34]:
Nominations Wins Title Ones
0 5 1 Her 1
2 4 0 Philomena 1
4 5 0 The Wolf of Wall Street 1
6 9 3 12 Years a Slave 1
7 10 7 Gravity 1
8 6 0 Captain Phillips 1
12 5 2 Django Unchained 1
13 8 3 Les Miserables 1
16 7 3 Argo 1
18 2 0 Extremely Loud & Incredibly Close 1
19 10 5 The Artist 1
20 5 1 The Descendants 1
21 11 5 Hugo 1
22 6 0 War Horse 1
24 6 0 Moneyball 1
26 3 0 The Tree of Life 1
27 12 4 The King's Speech 1
28 6 0 127 Hours 1
30 5 1 Black Swan 1
31 10 0 True Grit 1
33 8 3 The Social Network 1
36 8 4 Inception 1
40 6 0 Up in the Air 1
41 2 1 The Blind Side 1
43 8 1 Inglourious Basterds 1
47 5 0 Frost/Nixon 1
48 5 1 The Reader 1
51 13 3 The Curious Case of Benjamin Button 1
52 8 2 There Will Be Blood 1
53 7 1 Atonement 1
54 4 1 Juno 1
55 8 4 No Country for Old Men 1
56 7 1 Michael Clayton 1
57 4 1 Letters from Iwo Jima 1
58 6 1 The Queen 1
59 7 1 Babel 1
60 5 4 The Departed 1
61 4 2 Little Miss Sunshine 1
64 6 0 Good Night, and Good Luck. 1
65 5 0 Munich 1
66 6 3 Crash 1
67 7 4 Million Dollar Baby 1
68 5 1 Sideways 1
69 11 5 The Aviator 1
70 7 1 Finding Neverland 1
72 6 2 Mystic River 1
74 10 2 Master and Commander: The Far Side of the World 1
75 11 11 The Lord of the Rings: The Return of the King 1
76 7 0 Seabiscuit 1
77 9 1 The Hours 1
78 13 6 Chicago 1
80 10 0 Gangs of New York 1
81 6 2 The Lord of the Rings: The Two Towers 1
82 5 0 In the Bedroom 1
84 8 4 A Beautiful Mind 1
85 13 4 The Lord of the Rings: The Fellowship of the Ring 1
90 12 5 Gladiator 1
91 5 1 Erin Brockovich 1
In [405]:
#Graph Summary of nominations of top writer's films vs wins
actor_producer_distrib['Ones'] = 1.0 # add ones column for statsmodels regression
X = actor_producer_distrib[['Nominations', 'Ones']]
Y = actor_producer_distrib['Wins']
linmodel = sm.OLS(Y, X).fit()
linmodel.summary()
Out[405]:
OLS Regression Results
Dep. Variable: Wins R-squared: 0.406
Model: OLS Adj. R-squared: 0.396
Method: Least Squares F-statistic: 38.33
Date: Fri, 30 Jan 2015 Prob (F-statistic): 7.43e-08
Time: 03:25:50 Log-Likelihood: -111.99
No. Observations: 58 AIC: 228.0
Df Residuals: 56 BIC: 232.1
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Nominations 0.5096 0.082 6.191 0.000 0.345 0.675
Ones -1.5858 0.626 -2.534 0.014 -2.839 -0.332
Omnibus: 22.234 Durbin-Watson: 2.500
Prob(Omnibus): 0.000 Jarque-Bera (JB): 47.615
Skew: 1.128 Prob(JB): 4.58e-11
Kurtosis: 6.822 Cond. No. 21.7
In [406]:
sns.lmplot("Nominations", "Wins", actor_producer_distrib)
Out[406]:
<seaborn.axisgrid.FacetGrid at 0x117f80a50>

Top actor and distributor

In [409]:
top_titles_actors_list
top_titles_distributors_list


actor_distributor_list = list(set(top_titles_actors_list) | set(top_titles_distributors_list))
actor_distributor = actor_producer_distrib = original_df[original_df['Title'].isin(actor_distributor_list)]
In [410]:
#Graph Summary of nominations of top writer's films vs wins
actor_distributor['Ones'] = 1.0
X = actor_distributor[['Nominations', 'Ones']]
Y = actor_distributor['Wins']
linmodel = sm.OLS(Y, X).fit()
linmodel.summary()
Out[410]:
OLS Regression Results
Dep. Variable: Wins R-squared: 0.405
Model: OLS Adj. R-squared: 0.394
Method: Least Squares F-statistic: 36.10
Date: Fri, 30 Jan 2015 Prob (F-statistic): 1.76e-07
Time: 03:32:34 Log-Likelihood: -107.19
No. Observations: 55 AIC: 218.4
Df Residuals: 53 BIC: 222.4
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Nominations 0.5058 0.084 6.008 0.000 0.337 0.675
Ones -1.5321 0.641 -2.390 0.020 -2.818 -0.246
Omnibus: 20.766 Durbin-Watson: 2.427
Prob(Omnibus): 0.000 Jarque-Bera (JB): 41.690
Skew: 1.105 Prob(JB): 8.85e-10
Kurtosis: 6.648 Cond. No. 21.2
In [411]:
sns.lmplot("Nominations", "Wins", actor_distributor)
Out[411]:
<seaborn.axisgrid.FacetGrid at 0x116f73810>

Top director and distributor

In [414]:
top_titles_distributors_list
top_titles_directors_list

director_distributor_list = list(set(top_titles_directors_list) | set(top_titles_distributors_list))
director_distributor = original_df[original_df['Title'].isin(director_distributor_list)]
In [415]:
#Graph Summary of nominations of top writer's films vs wins
director_distributor['Ones'] = 1.0
X = director_distributor[['Nominations', 'Ones']]
Y = director_distributor['Wins']
linmodel = sm.OLS(Y, X).fit()
linmodel.summary()
Out[415]:
OLS Regression Results
Dep. Variable: Wins R-squared: 0.361
Model: OLS Adj. R-squared: 0.349
Method: Least Squares F-statistic: 30.45
Date: Fri, 30 Jan 2015 Prob (F-statistic): 9.96e-07
Time: 03:38:43 Log-Likelihood: -110.41
No. Observations: 56 AIC: 224.8
Df Residuals: 54 BIC: 228.9
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Nominations 0.4525 0.082 5.518 0.000 0.288 0.617
Ones -1.1830 0.638 -1.853 0.069 -2.463 0.097
Omnibus: 20.968 Durbin-Watson: 2.408
Prob(Omnibus): 0.000 Jarque-Bera (JB): 44.623
Skew: 1.075 Prob(JB): 2.04e-10
Kurtosis: 6.808 Cond. No. 21.3
In [416]:
sns.lmplot("Nominations", "Wins", director_distributor)
Out[416]:
<seaborn.axisgrid.FacetGrid at 0x117efef90>