# Best Picture nominees: scrape Box Office Mojo's Oscar history pages, then
# regress Oscar wins on nominations -- first for every scraped film, then for
# the films tied to the most frequently credited actors, directors, producers,
# writers and distributors.
#
# This was written as a notebook: the bare expressions below (for example
# ``top_actors``) are cell displays and do nothing when run as a script.

from bs4 import BeautifulSoup
try:
    import urllib2
except ImportError:
    # Python 3 ships the same urlopen() under urllib.request.
    import urllib.request as urllib2
import re
import dateutil.parser
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import seaborn as sns
import matplotlib as mpl

try:
    # Equivalent of the notebook magic ``%matplotlib inline``; the magic
    # itself is not valid Python, so it is only applied under IPython.
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

SITE_ROOT = "http://boxofficemojo.com"

url = "http://boxofficemojo.com/oscar/bestpichist.htm?view=bymovie&sort=year&order=DESC&p=.htm"
page = urllib2.urlopen(url)
soup = BeautifulSoup(page)


def _fetch_soup(address):
    """Download *address*, parse it with BeautifulSoup and close the response."""
    response = urllib2.urlopen(address)
    try:
        return BeautifulSoup(response)
    finally:
        response.close()


def get_movie_value(soup, field_name):
    """Return the text of the element that follows a label on a page.

    ``field_name`` is compiled as a regular expression and matched against the
    page's text nodes; the first match is used.

    Returns None when the label is not found, when no element follows it, or
    when the following element has empty text.
    """
    label = soup.find(text=re.compile(field_name))
    if not label:
        return None
    next_value = label.find_next()
    if next_value is None:
        return None
    return next_value.text or None


def _oscar_movie_hrefs(urlname):
    """Return the href of every link on *urlname* whose href contains 'oscar/movies'."""
    listing = _fetch_soup(urlname)
    movie_link = re.compile('oscar/movies')
    return [link.get('href') for link in listing.find_all('a', href=movie_link)]


def get_url_from_oscarlist(urlname):
    """Return absolute URLs of the Oscar pages linked from the list at *urlname*.

    Each matching href is appended unchanged to the site root.
    """
    return [SITE_ROOT + href for href in _oscar_movie_hrefs(urlname)]


def get_oscar_url_from_oscarlist(urlname):
    """Return absolute URLs with the '/oscar' prefix removed from each href.

    Only the part of each href after the first '/oscar' is kept, so
    '/oscar/movies/...' becomes '<site root>/movies/...'.
    """
    return [SITE_ROOT + href.split('/oscar')[1]
            for href in _oscar_movie_hrefs(urlname)]


def _parse_budget(raw_production):
    """Convert budget text to whole dollars, or NaN when absent or unparseable.

    '$15 million' becomes 15000000.  Text without the word 'million' (for
    example '$500,000') is read as plain dollars; previously such text either
    raised ValueError or was multiplied by one million regardless.
    """
    if raw_production is None:
        return np.nan
    text = raw_production.replace('$', '').replace(',', '').strip()
    try:
        if 'million' in text:
            return int(float(text.replace('million', '')) * 1000000)
        return int(float(text))
    except ValueError:
        return np.nan


def _strip_winner_mark(text):
    """Remove leading and trailing '*' characters from a credited name."""
    return text.strip('*')


def _strip_co_producer(text):
    """Remove the literal '(co-producer)' tag and surrounding whitespace.

    ``str.strip("(co-producer)")`` would instead strip any of those characters
    from both ends and so truncate names that end in letters such as 'e', 'r'
    or 'o'.
    """
    return text.replace("(co-producer)", "").strip()


def _collect_credits(soup, label, raw_title, column, names, records, clean=None):
    """Append every linked name found after *label* to *names* and *records*.

    The first text node matching *label* is located and the links inside the
    element that follows it are read.  Each link's raw text is appended to
    *names*, and ``{"Title": raw_title, column: cleaned_text}`` is appended to
    *records*.  When the label or the following element is missing, ``None``
    is printed and nothing is appended; other credit types for the same film
    are unaffected.
    """
    marker = soup.find(text=re.compile(label))
    section = marker.find_next() if marker is not None else None
    if section is None:
        print(None)
        return
    for row in section.find_all('a'):
        names.append(row.text)
        value = row.text if clean is None else clean(row.text)
        records.append({"Title": raw_title, column: value})


#"&adjust_yr=2015&p=.htm"
# Keep only the first 100 links (per the original note: nominees since 2000).
nominees_url = get_url_from_oscarlist(url)[:100]
# just print first few for example
nominees_url[:10]

# Same 100 films, addressed through their regular movie pages.
final_url = get_oscar_url_from_oscarlist(url)[:100]
final_url[:10]

# --- Nominations, wins and budget from each film's Oscar page ---------------
nominations = []
wins = []
productions = []
for movie_url in nominees_url:
    movie_soup = _fetch_soup(movie_url)
    nominations.append(int(get_movie_value(movie_soup, "Total Nominations:")))
    wins.append(int(get_movie_value(movie_soup, "Total Wins:")))
    productions.append(
        _parse_budget(get_movie_value(movie_soup, "Prod. Budget:")))

# --- Title and credits from each film's movie page --------------------------
title = []
directors = []
director_data = []
actors = []
actor_data = []
writers = []
writer_data = []
producers = []
producer_data = []
distributors = []
distributor_data = []
for movie_url in final_url:
    movie_soup = _fetch_soup(movie_url)
    title_string = movie_soup.find('title').text
    # The page title carries a parenthesised suffix; keep the text before it.
    raw_title = title_string.split('(')[0].strip()
    title.append(raw_title)

    _collect_credits(movie_soup, 'Director', raw_title, "Director",
                     directors, director_data, _strip_winner_mark)
    _collect_credits(movie_soup, 'Actor', raw_title, "Actor",
                     actors, actor_data, _strip_winner_mark)
    _collect_credits(movie_soup, 'Writer', raw_title, "Writer",
                     writers, writer_data, _strip_winner_mark)
    _collect_credits(movie_soup, 'Producer', raw_title, "Producer",
                     producers, producer_data, _strip_co_producer)
    _collect_credits(movie_soup, 'Distributor', raw_title, "Distributor",
                     distributors, distributor_data)

# --- Assemble the base table ------------------------------------------------
# The first 8 scraped films are held out of the analysis (original note:
# "using pre 2014 years to not skew data" -- presumably these are the most
# recent ceremony's nominees, since the list is sorted by year descending).
HELD_OUT_COUNT = 8

all_wins = pd.DataFrame(wins[:HELD_OUT_COUNT], columns=["Wins"])
all_noms = pd.DataFrame(nominations[:HELD_OUT_COUNT], columns=["Nominations"])
all_titles = pd.DataFrame(title[:HELD_OUT_COUNT], columns=["Title"])

pre_14_wins = pd.DataFrame(wins[HELD_OUT_COUNT:], columns=["Wins"])
pre_14_noms = pd.DataFrame(nominations[HELD_OUT_COUNT:], columns=["Nominations"])
pre_14_titles = pd.DataFrame(title[HELD_OUT_COUNT:], columns=["Title"])
all_titles

# Join the three single-column frames on their shared positional index.
df = pre_14_noms.join(pre_14_wins)
original_df = df.join(pre_14_titles)
original_df.head(15)


def _fit_wins_on_nominations(frame):
    """Fit OLS of Wins on Nominations plus an intercept column.

    Adds (or overwrites) a constant 'Ones' column on *frame* so the model has
    an intercept, and returns ``(X, Y, fitted_model)``.
    """
    frame['Ones'] = 1.0
    design = frame[['Nominations', 'Ones']]
    response = frame['Wins']
    return design, response, sm.OLS(response, design).fit()


def _top_by_title_count(grouped, how_many):
    """Return the *how_many* groups with the most rows, by 'Title' count.

    Uses ``DataFrame.sort_values`` where available and falls back to the older
    ``DataFrame.sort`` spelling on pandas releases that predate it.
    """
    counts = grouped.count()
    sort_desc = getattr(counts, 'sort_values', None) or counts.sort
    return sort_desc('Title', ascending=False).head(how_many)


def _titles_credited_to(credit_df, column, names):
    """Find the films credited to any of *names* in *column*.

    Returns ``(matching_rows, one_row_per_title, list_of_titles)``.
    """
    matches = credit_df[credit_df[column].isin(names)]
    unique = matches.drop_duplicates('Title')
    return matches, unique, list(unique['Title'])


def _films_titled(titles):
    """Return an independent copy of the ``original_df`` rows whose Title is in *titles*.

    Copying lets later column assignments act on the subset itself instead of
    on a slice of ``original_df``.
    """
    return original_df[original_df['Title'].isin(titles)].copy()


# --- All films: nominations vs wins -----------------------------------------
X, Y, linmodel = _fit_wins_on_nominations(original_df)
linmodel.summary()

# Use seaborn to plot linear regression w/ 95% confidence bands
sns.lmplot(x="Nominations", y="Wins", data=original_df)

# --- Top actors --------------------------------------------------------------
actor_data_df = pd.DataFrame(actor_data, columns=['Title', 'Actor'])
just_actors = actor_data_df.groupby('Actor')
top_actors = _top_by_title_count(just_actors, 8)
top_actors

l = top_actors
top_actors_list = list(l.index)

(top_titles_actors_dups,
 top_titles_actors,
 top_titles_actors_list) = _titles_credited_to(
    actor_data_df, 'Actor', top_actors_list)

# Base-table rows for the films the top actors appeared in.
top_actor_movies = _films_titled(top_titles_actors_list)
top_actor_movies

# Looking at specific titles for an actor
actor_data_df[actor_data_df['Actor'] == 'Benedict Cumberbatch']

# Nominations vs wins for the top actors' films
X, Y, linmodel = _fit_wins_on_nominations(top_actor_movies)
linmodel.summary()
sns.lmplot(x="Nominations", y="Wins", data=top_actor_movies)

# --- Top directors -----------------------------------------------------------
direc_df = pd.DataFrame(director_data, columns=['Title', 'Director'])
top_direcs = direc_df.groupby('Director')
# Find out who were the top 10 directors
top_directors = _top_by_title_count(top_direcs, 10)
top_directors
direc_df[direc_df['Director'] == 'Ethan Coen']

direc = top_directors
top_directors_list = list(direc.index)

(top_titles_directors_dups,
 top_titles_directors,
 top_titles_directors_list) = _titles_credited_to(
    direc_df, 'Director', top_directors_list)

# Base-table rows for the films the top directors made.
top_directors_movies = _films_titled(top_titles_directors_list)
top_directors_movies

# Nominations vs wins for the top directors' films
X, Y, linmodel = _fit_wins_on_nominations(top_directors_movies)
linmodel.summary()
sns.lmplot(x="Nominations", y="Wins", data=top_directors_movies)

# --- Top producers -----------------------------------------------------------
produce_df = pd.DataFrame(producer_data, columns=['Title', 'Producer'])
top_produces = produce_df.groupby('Producer')
top_producers = _top_by_title_count(top_produces, 3)
top_producers
produce_df[produce_df['Producer'] == 'Graham King']

prod = top_producers
top_producers_list = list(prod.index)

(top_titles_producers_dups,
 top_titles_producers,
 top_titles_producers_list) = _titles_credited_to(
    produce_df, 'Producer', top_producers_list)

# Base-table rows for the films the top producers worked on.
top_producers_movies = _films_titled(top_titles_producers_list)
top_producers_movies

# Nominations vs wins for the top producers' films
X, Y, linmodel = _fit_wins_on_nominations(top_producers_movies)
linmodel.summary()
sns.lmplot(x="Nominations", y="Wins", data=top_producers_movies)

# --- Top writers -------------------------------------------------------------
write_df = pd.DataFrame(writer_data, columns=['Title', 'Writer'])
top_writes = write_df.groupby('Writer')
top_writers = _top_by_title_count(top_writes, 5)
top_writers
write_df[write_df['Writer'] == 'Philippa Boyens']

writ = top_writers
top_writers_list = list(writ.index)

(top_titles_writers_dups,
 top_titles_writers,
 top_titles_writers_list) = _titles_credited_to(
    write_df, 'Writer', top_writers_list)

# Base-table rows for the films the top writers wrote.
top_writers_movies = _films_titled(top_titles_writers_list)
top_writers_movies

# Nominations vs wins for the top writers' films
X, Y, linmodel = _fit_wins_on_nominations(top_writers_movies)
linmodel.summary()
sns.lmplot(x="Nominations", y="Wins", data=top_writers_movies)

# --- Top distributors --------------------------------------------------------
distribute_df = pd.DataFrame(distributor_data, columns=['Title', 'Distributor'])
top_distributes = distribute_df.groupby('Distributor')
top_distributors = _top_by_title_count(top_distributes, 6)
top_distributors
distribute_df[distribute_df['Distributor'] == 'Universal']

# ``writ`` is reused here, as in the original notebook.
writ = top_distributors
top_distributors_list = list(writ.index)

(top_titles_distributors_dups,
 top_titles_distributors,
 top_titles_distributors_list) = _titles_credited_to(
    distribute_df, 'Distributor', top_distributors_list)

# Base-table rows for the films the top distributors released.
top_distributors_movies = _films_titled(top_titles_distributors_list)
top_distributors_movies

# Nominations vs wins for the top distributors' films
X, Y, linmodel = _fit_wins_on_nominations(top_distributors_movies)
linmodel.summary()
sns.lmplot(x="Nominations", y="Wins", data=top_distributors_movies)

# --- Top actors OR producers OR distributors ---------------------------------
top_titles_actors_list
top_titles_producers_list
top_titles_distributors_list
first = top_titles_actors_list
second = top_titles_producers_list
third = list(set(first) | set(second))
fourth = list(set(third) | set(top_titles_distributors_list))
fourth
actor_producer_distrib = _films_titled(fourth)
actor_producer_distrib

# Nominations vs wins for films with a top actor, producer or distributor
X, Y, linmodel = _fit_wins_on_nominations(actor_producer_distrib)
linmodel.summary()
sns.lmplot(x="Nominations", y="Wins", data=actor_producer_distrib)

# --- Top actors OR distributors ----------------------------------------------
top_titles_actors_list
top_titles_distributors_list
actor_distributor_list = list(
    set(top_titles_actors_list) | set(top_titles_distributors_list))
# Bound to its own name only; the original chained assignment also
# overwrote ``actor_producer_distrib`` with this narrower subset.
actor_distributor = _films_titled(actor_distributor_list)

# Nominations vs wins for films with a top actor or distributor
X, Y, linmodel = _fit_wins_on_nominations(actor_distributor)
linmodel.summary()
sns.lmplot(x="Nominations", y="Wins", data=actor_distributor)

# --- Top directors OR distributors -------------------------------------------
top_titles_distributors_list
top_titles_directors_list
director_distributor_list = list(
    set(top_titles_directors_list) | set(top_titles_distributors_list))
director_distributor = _films_titled(director_distributor_list)

# Nominations vs wins for films with a top director or distributor
X, Y, linmodel = _fit_wins_on_nominations(director_distributor)
linmodel.summary()
sns.lmplot(x="Nominations", y="Wins", data=director_distributor)