import numpy as np
import pandas as pd
import seaborn as sb
import json
import urllib2
import string
from itertools import product
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import statsmodels.formula.api as smf
from bs4 import BeautifulSoup
from scipy.stats import chisquare
from IPython.display import Image

# Display the takeaway figure up front
Image(filename='Takeaway.png')

# Scrape the budget and revenue tables from The Numbers, one index page per
# character (1-9, a-z). This may take a bit of time.
revenue_df_list = list()
for character in string.printable[1:36]:
    soup = BeautifulSoup(urllib2.urlopen('http://www.the-numbers.com/movies/letter/{0}'.format(character)).read())
    archive = pd.read_html(soup.findAll('table')[0].encode('latin1'),header=0,flavor='bs4',infer_types=False)[0]
    archive.columns = ['Released','Movie','Genre','Budget','Revenue','Trailer']
    archive['Released'] = pd.to_datetime(archive['Released'],format=u"%b\xa0%d,\xa0%Y",coerce=True)
    archive = archive.replace([u'\xa0',u'nan'],[np.nan,np.nan])
    # Strip the dollar signs and thousands separators, convert to ints
    archive['Budget'] = archive['Budget'].dropna().str.replace('$','').apply(lambda x:int(x.replace(',','')))
    archive['Revenue'] = archive['Revenue'].dropna().str.replace('$','').apply(lambda x:int(x.replace(',','')))
    archive['Year'] = archive['Released'].apply(lambda x:x.year)
    archive.dropna(subset=['Movie'],inplace=True)
    revenue_df_list.append(archive)

# Combine the per-page tables and cache them to disk
numbers_df = pd.concat(revenue_df_list)
numbers_df.reset_index(inplace=True,drop=True)
numbers_df.to_csv('revenue.csv',encoding='utf8')

numbers_df = pd.read_csv('revenue.csv',encoding='utf8',index_col=0)
numbers_df['Released'] = pd.to_datetime(numbers_df['Released'],unit='D')
numbers_df.tail()

# Build a CPI inflator that converts each year's dollars into 2014 dollars
cpi = pd.read_csv('cpi.csv',index_col='Year')
inflator = dict(cpi.ix[2014,'Annual']/cpi['Annual'])

# Get the list of movies rated on bechdeltest.com
movie_ids = json.loads(urllib2.urlopen('http://bechdeltest.com/api/v1/getAllMovieIds').read())
imdb_ids = [movie[u'imdbid'] for movie in movie_ids]
print u"There are {0} movies in the Bechdel test corpus".format(len(imdb_ids))

# Example of a single Bechdel API response
json.loads(urllib2.urlopen('http://bechdeltest.com/api/v1/getMovieByImdbId?imdbid=0000091').read())

# Download the Bechdel rating for every movie in the corpus
ratings = dict()
exceptions = list()
for num,imdb_id in enumerate(imdb_ids):
    try:
        # Print a progress marker roughly every 10% of the way through
        if num in range(0,len(imdb_ids),int(round(len(imdb_ids)/10,0))):
            print imdb_id
        ratings[imdb_id] = json.loads(urllib2.urlopen('http://bechdeltest.com/api/v1/getMovieByImdbId?imdbid={0}'.format(imdb_id)).read())
    except:
        exceptions.append(imdb_id)
        pass

with open('bechdel.json', 'wb') as f:
    json.dump(ratings.values(), f)

ratings_df = pd.read_json('bechdel.json')
ratings_df['imdbid'] = ratings_df['imdbid'].dropna().apply(int)
ratings_df = ratings_df.set_index('imdbid')
ratings_df.dropna(subset=['title'],inplace=True)
ratings_df2 = ratings_df[['rating','title']]
ratings_df2.head()

# Example of a single OMDB API response
json.loads(urllib2.urlopen('http://www.omdbapi.com/?i=tt0000091').read())

# Download the OMDB record for every movie in the corpus
imdb_data = dict()
exceptions = list()
for num,imdb_id in enumerate(imdb_ids):
    try:
        # Print a progress marker roughly every 10% of the way through
        if num in range(0,len(imdb_ids),int(round(len(imdb_ids)/10,0))):
            print imdb_id
        imdb_data[imdb_id] = json.loads(urllib2.urlopen('http://www.omdbapi.com/?i=tt{0}'.format(imdb_id)).read())
    except:
        exceptions.append(imdb_id)
        pass

with open('imdb_data.json', 'wb') as f:
    json.dump(imdb_data.values(), f)
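# Optional convenience (a sketch, not part of the original pipeline): the two API
# pulls above are slow, so on re-runs you may prefer to load the cached JSON files
# written above instead of hitting the APIs again. The file names match the ones
# used in this notebook; everything else here is an illustrative assumption.
import os

def load_cached_json(path):
    """Return the parsed JSON at `path`, or None if the file does not exist."""
    if os.path.exists(path):
        with open(path) as f:
            return json.load(f)
    return None

cached_bechdel = load_cached_json('bechdel.json')   # list of Bechdel API records, or None
cached_imdb = load_cached_json('imdb_data.json')    # list of OMDB records, or None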
# Read the data in
imdb_df = pd.read_json('imdb_data.json')
# Drop non-movies
imdb_df = imdb_df[imdb_df['Type'] == 'movie']
# Convert to datetime objects
imdb_df['Released'] = pd.to_datetime(imdb_df['Released'], format="%d %b %Y", unit='D', coerce=True)
# Drop errant identifying characters in the ID field
imdb_df['imdbID'] = imdb_df['imdbID'].str.slice(start=2)
# Remove the " min" at the end of Runtime entries so we can convert to ints
imdb_df['Runtime'] = imdb_df['Runtime'].str.slice(stop=-4).replace('',np.nan)

# Some errant runtimes have "h" in them. Commented-out code below identifies them.
#s = imdb_df['Runtime'].dropna()
#s[s.str.contains('h')]

# Manually recode these h-containing Runtimes to minutes
imdb_df.ix[946,'Runtime'] = '169'
imdb_df.ix[1192,'Runtime'] = '96'
imdb_df.ix[1652,'Runtime'] = '80'
imdb_df.ix[2337,'Runtime'] = '87'
imdb_df.ix[3335,'Runtime'] = '62'

# Blank out non-MPAA or minor ratings (NC-17, X)
imdb_df['Rated'] = imdb_df['Rated'].replace(to_replace=['N/A','Not Rated','Approved','Unrated','TV-PG','TV-G','TV-14','TV-MA','NC-17','X'],value=np.nan)

# Convert the Released datetime into new columns for year, month, and week
imdb_df['Year'] = imdb_df['Released'].apply(lambda x:x.year)
imdb_df['Month'] = imdb_df['Released'].apply(lambda x:x.month)
imdb_df['Week'] = imdb_df['Released'].apply(lambda x:x.week)

# Convert the Runtime series to float
imdb_df['Runtime'] = imdb_df['Runtime'].apply(float)

# Take the imdbVotes formatted as a string containing "N/A" and comma-delimited thousands, convert to float
imdb_df['imdbVotes'] = imdb_df['imdbVotes'].dropna().replace('N/A',np.nan).dropna().apply(lambda x:float(x.replace(',','')))

# Take the Metascore formatted as a string containing "N/A", convert to float
# Also divide by 10 to make effect sizes more comparable
imdb_df['Metascore'] = imdb_df['Metascore'].dropna().replace('N/A',np.nan).dropna().apply(float)/10.

# Take the imdbRating formatted as a string containing "N/A", convert to float
imdb_df['imdbRating'] = imdb_df['imdbRating'].dropna().replace('N/A',np.nan).dropna().apply(float)

# Create dummy variables for English-language and US-produced movies
imdb_df['English'] = (imdb_df['Language'] == u'English').astype(int)
imdb_df['USA'] = (imdb_df['Country'] == u'USA').astype(int)

# Convert imdbID to int, set it as the index
imdb_df['imdbID'] = imdb_df['imdbID'].dropna().apply(int)
imdb_df = imdb_df.set_index('imdbID')

# Join the Bechdel ratings to the OMDB data, then merge in the revenue data
df = imdb_df.join(ratings_df2,how='inner').reset_index()
df = pd.merge(df,numbers_df,left_on=['Title','Year'],right_on=['Movie','Year'])
df['Year'] = df['Released_x'].apply(lambda x:x.year)
# Inflation-adjust revenue and budget to 2014 dollars
df['Adj_Revenue'] = df.apply(lambda x:x['Revenue']*inflator[x['Year']],axis=1)
df['Adj_Budget'] = df.apply(lambda x:x['Budget']*inflator[x['Year']],axis=1)
df.tail(2).T

representative_df = pd.DataFrame()
# Group data by scores
representative_df['All Bechdel'] = ratings_df2.groupby('rating')['title'].agg(len)
representative_df['Analysis Subset'] = df.groupby('rating')['title'].agg(len)
# Convert values of data to fractions of total
representative_df2 = representative_df.apply(lambda x: x/x.sum(),axis=0)

# Make the plot
representative_df2.plot(kind='barh')
plt.legend(fontsize=15,loc='lower right')
plt.yticks(plt.yticks()[0],["Fewer than two women",
                            "Women don't talk to each other",
                            'Women only talk about men',
                            'Passes Bechdel Test'],fontsize=15)
plt.xticks(fontsize=15)
plt.ylabel('')
plt.xlabel('Fraction of movies',fontsize=18)
plt.title('Does the analyzed data differ from the original data?',fontsize=18)

# The observed data are the counts of movies in each category for the Analysis subset.
# The expected data are the category fractions in the original Bechdel data,
# multiplied by the number of movies in the Analysis subset.
chisquare(f_obs=representative_df['Analysis Subset'].as_matrix(),
          f_exp=representative_df2['All Bechdel'].as_matrix()*representative_df['Analysis Subset'].sum())
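# A minimal worked example of the same goodness-of-fit logic with made-up numbers
# (not the movie data): observed category counts are compared against expected
# counts built by scaling reference proportions to the observed sample size.
toy_observed = np.array([30, 20, 10, 40])            # hypothetical subset counts
toy_reference = np.array([0.35, 0.15, 0.10, 0.40])   # hypothetical corpus proportions
toy_expected = toy_reference * toy_observed.sum()    # expected counts under the reference
print(chisquare(f_obs=toy_observed, f_exp=toy_expected))   # (statistic, p-value)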
# Compute profit and return on investment in 2014 dollars
df['Profit'] = df['Adj_Revenue'] - df['Adj_Budget']
df['ROI'] = df['Profit']/df['Adj_Budget']

# Do some fancy re-indexing and groupby operations to get average scores and number of movies per year
avg_bechdel = df.set_index('Released_x').groupby(lambda x:x.year)['rating'].agg(np.mean)
num_movies = df.set_index('Released_x').groupby(lambda x:x.year)['rating'].agg(lambda x:np.log(len(x)+1))

# Changing line widths from: http://stackoverflow.com/questions/19862011/matplotlib-change-linewidth-on-line-segments-using-list
coords = sorted(dict(avg_bechdel).items(),key=lambda x:x[0])
lines = [(start, end) for start, end in zip(coords[:-1], coords[1:])]
# Width of each segment reflects the (log) number of movies in the segment's starting year
lines = LineCollection(lines, linewidths=[num_movies[year] for year, _ in coords[:-1]])

# Plot the figure
fig, ax = plt.subplots()
ax.add_collection(lines)
plt.autoscale()
plt.xlim((1912,2020))
plt.xlabel('Time',fontsize=18)
plt.ylabel('Avg. Bechdel score',fontsize=18)

df_rating_ct = pd.crosstab(df['Year'],df['rating'])

# Bakes in a lot here and is bad Pythonic form -- but I'm lazy
# http://stackoverflow.com/questions/17764619/pandas-dataframe-group-year-index-by-decade
# http://stackoverflow.com/questions/21247203/how-to-make-a-pandas-crosstab-with-percentages
# http://stackoverflow.com/questions/9938130/plotting-stacked-barplots-on-a-panda-data-frame
df_rating_ct.groupby((df_rating_ct.index//10)*10).sum().apply(lambda x: x/x.sum(),axis=1).ix[1930:].plot(kind='bar',stacked=True)

# Fix up the plot
plt.ylabel('Percentage of movies',fontsize=18)
plt.yticks(np.arange(0,1.1,.10),np.arange(0,110,10),fontsize=15)
plt.xlabel('Decade',fontsize=18)
plt.xticks(rotation=0,fontsize=15)
plt.legend(loc='center',bbox_to_anchor=(1.1,.5),fontsize=15)

# Reshape the yearly averages for the trend models below
avg_bechdel = df.set_index('Released_x').groupby(lambda x:x.year)['rating'].agg(np.mean)
avg_bechdel.index.name = 'date'
avg_bechdel.name = 'bechdel'
avg_bechdel = avg_bechdel.reset_index()

# Linear trend in the yearly average Bechdel score
l_model = smf.ols(formula='bechdel ~ date', data=avg_bechdel).fit()
#print l_model.summary()

# Create a DataFrame "X" that we can extrapolate into
X = pd.DataFrame({"date": np.linspace(start=1912.,stop=2100.,num=100)})
# Plot the observed data
plt.plot(avg_bechdel['date'],avg_bechdel['bechdel'],c='b',label='Observed')
# Plot the predictions from the model
plt.plot(X,l_model.predict(X),c='r',label='Linear model',lw=4)
plt.legend(loc='lower right')
plt.ylim((0,3))
plt.xlim((1912,2100))
plt.xlabel('Year',fontsize=18)
plt.ylabel('Avg. Bechdel Test',fontsize=18)

# Solve the linear model for the date at which the average score reaches 3
import datetime
year_est = (3 - l_model.params['Intercept'])/l_model.params['date']
year_int = int(year_est)
d = datetime.timedelta(days=(year_est - year_int)*365.25)
day_one = datetime.datetime(year_int,1,1)
date = d + day_one
print date, date.weekday()

# Quadratic trend in the yearly average Bechdel score
q_model = smf.ols(formula='bechdel ~ date + I(date**2)', data=avg_bechdel).fit()

# Create a DataFrame "X" that we can extrapolate into
X = pd.DataFrame({"date": np.linspace(start=1912.,stop=2112.,num=201)})
# Plot the observed data
plt.plot(avg_bechdel['date'],avg_bechdel['bechdel'],c='b',label='Observed')
# Plot the predictions from the model
plt.plot(X,q_model.predict(X),c='r',label='Quadratic model',lw=4)
plt.legend(loc='upper right')
plt.ylim((0,3))
plt.xlim((1912,2112))
plt.xlabel('Year',fontsize=18)
plt.ylabel('Avg. Bechdel Test',fontsize=18)
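# A quick side-by-side check of the two trend models above (not part of the
# original write-up): statsmodels results expose R-squared and AIC directly.
print('Linear:    R^2 = {0:.3f}, AIC = {1:.1f}'.format(l_model.rsquared, l_model.aic))
print('Quadratic: R^2 = {0:.3f}, AIC = {1:.1f}'.format(q_model.rsquared, q_model.aic))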
# Jitter the ratings so the points aren't overlapping
x = df['rating'].apply(lambda x:float(x)+np.random.normal(0, 0.05))
y = df['Adj_Budget']
# Plot with an alpha so the overlaps reveal something about relative density
plt.scatter(x,y,alpha=.2,label='Data',c='g')
plt.ylabel('Budgets (2014 $)',fontsize=18)
plt.xlabel('Bechdel dimensions',fontsize=18)
plt.xticks(np.arange(0,4))
plt.yscale('log')
plt.grid(False,which='minor')
#plt.ylim((2e0,1e10))

# Median budget by Bechdel score
df['Adj_Budget'].groupby(df['rating']).agg(np.median).plot(kind='barh')
plt.xticks(np.arange(0e7,7e7,1e7),np.arange(0,70,10),fontsize=15)
plt.yticks(plt.yticks()[0],["Fewer than two women",
                            "Women don't talk to each other",
                            'Women only talk about men',
                            'Passes Bechdel Test'],fontsize=18)
plt.xlabel('Budget (2014 $millions)',fontsize=18)
plt.title('Median Budget for Films',fontsize=24)
plt.ylabel('')

bd_m = smf.ols(formula='np.log(Adj_Budget+1) ~ C(rating)', data=df).fit()
print bd_m.summary()

# Back-transform from np.log(Adj_Budget+1): exp(prediction) - 1
print "\n\nMovies passing every Bechdel dimension have budgets of ${:,} on average.".format(round(np.exp(bd_m.params['Intercept']+bd_m.params['C(rating)[T.3.0]'])-1,2))
print "Movies failing every Bechdel dimension have budgets of ${:,} on average.".format(round(np.exp(bd_m.params['Intercept'])-1,2))

# Median ROI by Bechdel score
df['ROI'].groupby(df['rating']).agg(np.median).plot(kind='barh')
plt.yticks(plt.yticks()[0],["Fewer than two women",
                            "Women don't talk to each other",
                            'Women only talk about men',
                            'Passes Bechdel Test'],fontsize=18)
plt.xlabel('Return on Investment',fontsize=18)
plt.title('Median ROI for Films',fontsize=24)
plt.ylabel('')

# Estimate the model
ed_m1 = smf.ols(formula='ROI ~ C(rating) + np.log(Adj_Budget+1)', data=df).fit()
print ed_m1.summary()

# Median profit by Bechdel score
df['Profit'].groupby(df['rating']).agg(np.median).plot(kind='barh')
plt.yticks(plt.yticks()[0],["Fewer than two women",
                            "Women don't talk to each other",
                            'Women only talk about men',
                            'Passes Bechdel Test'],fontsize=18)
plt.xlabel('Gross Profit',fontsize=18)
plt.title('Median Profit for Films',fontsize=24)
plt.ylabel('')

# Estimate the model
ed_m2 = smf.ols(formula='Profit ~ C(rating) + np.log(Budget+1)', data=df).fit()
print ed_m2.summary()

# Make the bar-plot of median revenue by Bechdel score
df['Adj_Revenue'].groupby(df['rating']).agg(np.median).plot(kind='barh')
plt.yticks(plt.yticks()[0],["Fewer than two women",
                            "Women don't talk to each other",
                            'Women only talk about men',
                            'Passes Bechdel Test'],fontsize=18)
plt.xlabel('Revenue (2014 $)',fontsize=18)
plt.title('Median Revenue for Films',fontsize=24)
plt.ylabel('')

# Estimate the model
ed_m3 = smf.ols(formula='np.log(Adj_Revenue+1) ~ C(rating) + np.log(Adj_Budget+1)', data=df).fit()
print ed_m3.summary()

# Estimate a model without the Bechdel test IV, containing only the other IVs, to get residuals
ed_m3_bad_res = smf.ols(formula='np.log(Adj_Revenue+1) ~ np.log(Adj_Budget+1)', data=df[df['Adj_Revenue']>0]).fit()
# Put these residual estimates in a temporary DataFrame
df_temp = df
df_temp['residuals'] = ed_m3_bad_res.resid
# Estimate a bad model with a continuous Bechdel rating against the residuals
ed_m3_bad = smf.ols(formula='residuals ~ rating', data=df_temp).fit()
ed_m3_bad_x = pd.DataFrame({'rating':np.linspace(min(df['rating']),max(df['rating']),100)})

# Add a jitter so the points aren't overlapping
x = df['rating'].apply(lambda x:float(x)+np.random.normal(0, 0.05))
y = df_temp['residuals']
plt.plot(ed_m3_bad_x['rating'],ed_m3_bad.predict(ed_m3_bad_x),c='r',lw=2,label='Model')
# Plot with an alpha so the overlaps reveal something about relative density
plt.scatter(x,y,alpha=.2,label='Data',c='g')
plt.xticks(np.arange(0,4))
plt.autoscale()
plt.xlabel('Bechdel score',fontsize=18)
plt.ylabel('Residual Adj. Revenue',fontsize=18)
plt.ylim((-5,5))
plt.legend(loc='lower left')
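# How the categorical coefficients in the log-revenue model are read (an
# illustration of the back-transform this notebook uses for its log models,
# not new analysis): with np.log(Adj_Revenue+1) as the outcome, a coefficient b
# on a rating dummy corresponds to roughly a 100*(exp(b)-1) percent difference.
b = ed_m3.params['C(rating)[T.3.0]']
print('Passing all three Bechdel criteria is associated with a {0:.1f}% revenue '
      'difference, holding budget constant.'.format(100*(np.exp(b) - 1)))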
# Make the bar-plot of mean Metascore by Bechdel score
df['Metascore'].groupby(df['rating']).agg(np.mean).plot(kind='barh')
plt.yticks(plt.yticks()[0],["Fewer than two women",
                            "Women don't talk to each other",
                            'Women only talk about men',
                            'Passes Bechdel Test'],fontsize=18)
plt.xlabel('Rating',fontsize=18)
plt.title('Mean Metascore',fontsize=24)
plt.xlim((5,6))
plt.ylabel('')

m2 = smf.ols(formula='Metascore ~ C(rating) + np.log(Adj_Budget+1)', data=df).fit()
print m2.summary()

# Make the bar-plot of mean IMDB rating by Bechdel score
df['imdbRating'].groupby(df['rating']).agg(np.mean).plot(kind='barh')
plt.yticks(plt.yticks()[0],["Fewer than two women",
                            "Women don't talk to each other",
                            'Women only talk about men',
                            'Passes Bechdel Test'],fontsize=18)
plt.xlabel('Rating',fontsize=18)
plt.title('Mean IMDB rating',fontsize=24)
plt.xlim((6,7))
plt.ylabel('')

m3 = smf.ols(formula='imdbRating ~ C(rating) + np.log(Adj_Budget+1) + np.log(imdbVotes+1)', data=df).fit()
print m3.summary()

Image(url='http://i1.ytimg.com/vi/EYiZeszLosE/maxresdefault.jpg',height=200)

# Fully-specified models with controls for budget, revenue, ratings, votes,
# runtime, genre, year, release week, language, country, and MPAA rating
m4 = smf.ols(formula='imdbRating ~ C(rating) + np.log(Adj_Revenue+1) + np.log(Adj_Budget+1) + Metascore + np.log(imdbVotes+1) + Runtime + C(Genre_y) + Year + C(Week) + C(English) + C(USA) + C(Rated)', data=df).fit()
print m4.summary()

m5 = smf.ols(formula='Metascore ~ C(rating) + np.log(Adj_Revenue+1) + np.log(Adj_Budget+1) + imdbRating + np.log(imdbVotes+1) + Runtime + C(Genre_y) + Year + C(Week) + C(English) + C(USA) + C(Rated)', data=df).fit()
m6 = smf.ols(formula='np.log(Adj_Revenue+1) ~ C(rating) + Metascore + np.log(Adj_Budget+1) + imdbRating + np.log(imdbVotes+1) + Runtime + C(Genre_y) + Year + C(Week) + C(English) + C(USA) + C(Rated)', data=df).fit()
m7 = smf.ols(formula='np.log(Adj_Budget+1) ~ C(rating) + Metascore + np.log(Adj_Revenue+1) + imdbRating + np.log(imdbVotes+1) + Runtime + C(Genre_y) + Year + C(Week) + C(English) + C(USA) + C(Rated)', data=df).fit()

print "Difference in IMDB scores between movies that pass all and fail all requirements of the Bechdel test: {0}.".format(round(m4.params['C(rating)[T.3.0]'],2))
print "Difference in Metascores between movies that pass all and fail all requirements of the Bechdel test: {0}.".format(round(m5.params['C(rating)[T.3.0]']*10,2))
print "Difference in revenue between movies that pass all and fail all requirements of the Bechdel test: {0}%.".format(round(100*(np.exp(m6.params['C(rating)[T.3.0]']) - 1),2))
print "Difference in budget between movies that pass all and fail all requirements of the Bechdel test: {0}%.".format(round(100*(np.exp(m7.params['C(rating)[T.3.0]']) - 1),2))

# Collect the Bechdel coefficients and p-values from the four fully-specified models
results_df = pd.DataFrame({'m4_params':m4.params[['C(rating)[T.1.0]','C(rating)[T.2.0]','C(rating)[T.3.0]']],
                           'm4_pvalues':m4.pvalues[['C(rating)[T.1.0]','C(rating)[T.2.0]','C(rating)[T.3.0]']],
                           'm5_params':m5.params[['C(rating)[T.1.0]','C(rating)[T.2.0]','C(rating)[T.3.0]']],
                           'm5_pvalues':m5.pvalues[['C(rating)[T.1.0]','C(rating)[T.2.0]','C(rating)[T.3.0]']],
                           'm6_params':m6.params[['C(rating)[T.1.0]','C(rating)[T.2.0]','C(rating)[T.3.0]']],
                           'm6_pvalues':m6.pvalues[['C(rating)[T.1.0]','C(rating)[T.2.0]','C(rating)[T.3.0]']],
                           'm7_params':m7.params[['C(rating)[T.1.0]','C(rating)[T.2.0]','C(rating)[T.3.0]']],
                           'm7_pvalues':m7.pvalues[['C(rating)[T.1.0]','C(rating)[T.2.0]','C(rating)[T.3.0]']]})
results_df.index = ["Women don't talk to each other",
                    'Women only talk about men',
                    'Passes Bechdel Test']
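# Supplementary check (not in the original figure): 95% confidence intervals for
# the headline Bechdel coefficients in the ratings and revenue models.
rating_terms = ['C(rating)[T.1.0]', 'C(rating)[T.2.0]', 'C(rating)[T.3.0]']
print(m4.conf_int().loc[rating_terms])   # IMDB-rating model
print(m6.conf_int().loc[rating_terms])   # revenue model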
# Plot the four sets of Bechdel coefficients
ax = results_df[['m4_params','m5_params','m6_params','m7_params']].T.plot(kind='barh')
plt.legend(bbox_to_anchor=(1,1),fontsize=12)

# Change the alpha channel to correspond with the p-values:
# more transparent bars are less significant results
# http://stackoverflow.com/questions/22888607/changing-alpha-values-in-pandas-barplot-to-match-variable/
pvalues = results_df[['m4_pvalues','m5_pvalues','m6_pvalues','m7_pvalues']].T
# pandas draws the grouped bars series by series, so patch i corresponds to
# rating level (column) i // 4 and model (row) i % 4
for i, (j, k) in enumerate(product(range(3), range(4))):
    patch = ax.patches[i]
    alpha = 1 - pvalues.iloc[k, j]
    patch.set_alpha(max(alpha, .2))

plt.yticks(plt.yticks()[0],['IMDB Ratings','Metascore','Revenue','Budget'],fontsize=18)
plt.xlabel('Effect size compared to movies failing\nall elements of the Bechdel test',fontsize=18)
plt.title("Effects of women's roles in movies",fontsize=24)
plt.xticks(fontsize=15)
plt.autoscale()
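# The figure above appears to be the 'Takeaway.png' displayed at the top of this
# notebook (an assumption -- the original does not show the export step). If so,
# it can be regenerated with:
plt.savefig('Takeaway.png', bbox_inches='tight')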