import numpy as np
import pandas as pd
import seaborn as sb
import json
import urllib2
import string
from itertools import product
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import statsmodels.formula.api as smf
from bs4 import BeautifulSoup
from scipy.stats import chisquare
from IPython.display import Image

# Display the takeaway figure up front
Image(filename='Takeaway.png')

# Scrape the budget and revenue tables from The Numbers, one index page per
# character (1-9, a-z). This may take a bit of time.
revenue_df_list = list()
for character in string.printable[1:36]:
    soup = BeautifulSoup(urllib2.urlopen('http://www.the-numbers.com/movies/letter/{0}'.format(character)).read())
    archive = pd.read_html(soup.findAll('table')[0].encode('latin1'),header=0,flavor='bs4',infer_types=False)[0]
    archive.columns = ['Released','Movie','Genre','Budget','Revenue','Trailer']
    archive['Released'] = pd.to_datetime(archive['Released'],format=u"%b\xa0%d,\xa0%Y",coerce=True)
    archive = archive.replace([u'\xa0',u'nan'],[np.nan,np.nan])
    # Strip the dollar signs and thousands separators, convert to ints
    archive['Budget'] = archive['Budget'].dropna().str.replace('$','').apply(lambda x:int(x.replace(',','')))
    archive['Revenue'] = archive['Revenue'].dropna().str.replace('$','').apply(lambda x:int(x.replace(',','')))
    archive['Year'] = archive['Released'].apply(lambda x:x.year)
    archive.dropna(subset=['Movie'],inplace=True)
    revenue_df_list.append(archive)

# Combine the per-page tables and cache them to disk
numbers_df = pd.concat(revenue_df_list)
numbers_df.reset_index(inplace=True,drop=True)
numbers_df.to_csv('revenue.csv',encoding='utf8')

numbers_df = pd.read_csv('revenue.csv',encoding='utf8',index_col=0)
numbers_df['Released'] = pd.to_datetime(numbers_df['Released'],unit='D')
numbers_df.tail()

# Build a CPI inflator that converts each year's dollars into 2014 dollars
cpi = pd.read_csv('cpi.csv',index_col='Year')
inflator = dict(cpi.ix[2014,'Annual']/cpi['Annual'])

# Get the list of movies rated on bechdeltest.com
movie_ids = json.loads(urllib2.urlopen('http://bechdeltest.com/api/v1/getAllMovieIds').read())
imdb_ids = [movie[u'imdbid'] for movie in movie_ids]
print u"There are {0} movies in the Bechdel test corpus".format(len(imdb_ids))

# Example of a single Bechdel API response
json.loads(urllib2.urlopen('http://bechdeltest.com/api/v1/getMovieByImdbId?imdbid=0000091').read())

# Download the Bechdel rating for every movie in the corpus
ratings = dict()
exceptions = list()
for num,imdb_id in enumerate(imdb_ids):
    try:
        # Print a progress marker roughly every 10% of the way through
        if num in range(0,len(imdb_ids),int(round(len(imdb_ids)/10,0))):
            print imdb_id
        ratings[imdb_id] = json.loads(urllib2.urlopen('http://bechdeltest.com/api/v1/getMovieByImdbId?imdbid={0}'.format(imdb_id)).read())
    except:
        exceptions.append(imdb_id)
        pass

with open('bechdel.json', 'wb') as f:
    json.dump(ratings.values(), f)

ratings_df = pd.read_json('bechdel.json')
ratings_df['imdbid'] = ratings_df['imdbid'].dropna().apply(int)
ratings_df = ratings_df.set_index('imdbid')
ratings_df.dropna(subset=['title'],inplace=True)
ratings_df2 = ratings_df[['rating','title']]
ratings_df2.head()

# Example of a single OMDB API response
json.loads(urllib2.urlopen('http://www.omdbapi.com/?i=tt0000091').read())

# Download the OMDB record for every movie in the corpus
imdb_data = dict()
exceptions = list()
for num,imdb_id in enumerate(imdb_ids):
    try:
        # Print a progress marker roughly every 10% of the way through
        if num in range(0,len(imdb_ids),int(round(len(imdb_ids)/10,0))):
            print imdb_id
        imdb_data[imdb_id] = json.loads(urllib2.urlopen('http://www.omdbapi.com/?i=tt{0}'.format(imdb_id)).read())
    except:
        exceptions.append(imdb_id)
        pass

with open('imdb_data.json', 'wb') as f:
    json.dump(imdb_data.values(), f)
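# Optional convenience (a sketch, not part of the original pipeline): the two API
# pulls above are slow, so on re-runs you may prefer to load the cached JSON files
# written above instead of hitting the APIs again. The file names match the ones
# used in this notebook; everything else here is an illustrative assumption.
import os

def load_cached_json(path):
    """Return the parsed JSON at `path`, or None if the file does not exist."""
    if os.path.exists(path):
        with open(path) as f:
            return json.load(f)
    return None

cached_bechdel = load_cached_json('bechdel.json')   # list of Bechdel API records, or None
cached_imdb = load_cached_json('imdb_data.json')    # list of OMDB records, or None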
# Read the data in
imdb_df = pd.read_json('imdb_data.json')
# Drop non-movies
imdb_df = imdb_df[imdb_df['Type'] == 'movie']
# Convert to datetime objects
imdb_df['Released'] = pd.to_datetime(imdb_df['Released'], format="%d %b %Y", unit='D', coerce=True)
# Drop errant identifying characters in the ID field
imdb_df['imdbID'] = imdb_df['imdbID'].str.slice(start=2)
# Remove the " min" at the end of Runtime entries so we can convert to ints
imdb_df['Runtime'] = imdb_df['Runtime'].str.slice(stop=-4).replace('',np.nan)

# Some errant runtimes have "h" in them. Commented-out code below identifies them.
#s = imdb_df['Runtime'].dropna()
#s[s.str.contains('h')]

# Manually recode these h-containing Runtimes to minutes
imdb_df.ix[946,'Runtime'] = '169'
imdb_df.ix[1192,'Runtime'] = '96'
imdb_df.ix[1652,'Runtime'] = '80'
imdb_df.ix[2337,'Runtime'] = '87'
imdb_df.ix[3335,'Runtime'] = '62'

# Blank out non-MPAA or minor ratings (NC-17, X)
imdb_df['Rated'] = imdb_df['Rated'].replace(to_replace=['N/A','Not Rated','Approved','Unrated','TV-PG','TV-G','TV-14','TV-MA','NC-17','X'],value=np.nan)

# Convert the Released datetime into new columns for year, month, and week
imdb_df['Year'] = imdb_df['Released'].apply(lambda x:x.year)
imdb_df['Month'] = imdb_df['Released'].apply(lambda x:x.month)
imdb_df['Week'] = imdb_df['Released'].apply(lambda x:x.week)

# Convert the Runtime series to float
imdb_df['Runtime'] = imdb_df['Runtime'].apply(float)

# Take the imdbVotes formatted as a string containing "N/A" and comma-delimited thousands, convert to float
imdb_df['imdbVotes'] = imdb_df['imdbVotes'].dropna().replace('N/A',np.nan).dropna().apply(lambda x:float(x.replace(',','')))

# Take the Metascore formatted as a string containing "N/A", convert to float
# Also divide by 10 to make effect sizes more comparable
imdb_df['Metascore'] = imdb_df['Metascore'].dropna().replace('N/A',np.nan).dropna().apply(float)/10.

# Take the imdbRating formatted as a string containing "N/A", convert to float
imdb_df['imdbRating'] = imdb_df['imdbRating'].dropna().replace('N/A',np.nan).dropna().apply(float)

# Create dummy variables for English-language and US-produced movies
imdb_df['English'] = (imdb_df['Language'] == u'English').astype(int)
imdb_df['USA'] = (imdb_df['Country'] == u'USA').astype(int)

# Convert imdbID to int, set it as the index
imdb_df['imdbID'] = imdb_df['imdbID'].dropna().apply(int)
imdb_df = imdb_df.set_index('imdbID')

# Join the Bechdel ratings to the OMDB data, then merge in the revenue data
df = imdb_df.join(ratings_df2,how='inner').reset_index()
df = pd.merge(df,numbers_df,left_on=['Title','Year'],right_on=['Movie','Year'])
df['Year'] = df['Released_x'].apply(lambda x:x.year)
# Inflation-adjust revenue and budget to 2014 dollars
df['Adj_Revenue'] = df.apply(lambda x:x['Revenue']*inflator[x['Year']],axis=1)
df['Adj_Budget'] = df.apply(lambda x:x['Budget']*inflator[x['Year']],axis=1)
df.tail(2).T

representative_df = pd.DataFrame()
# Group data by scores
representative_df['All Bechdel'] = ratings_df2.groupby('rating')['title'].agg(len)
representative_df['Analysis Subset'] = df.groupby('rating')['title'].agg(len)
# Convert values of data to fractions of total
representative_df2 = representative_df.apply(lambda x: x/x.sum(),axis=0)

# Make the plot
representative_df2.plot(kind='barh')
plt.legend(fontsize=15,loc='lower right')
plt.yticks(plt.yticks()[0],["Fewer than two women",
                            "Women don't talk to each other",
                            'Women only talk about men',
                            'Passes Bechdel Test'],fontsize=15)
plt.xticks(fontsize=15)
plt.ylabel('')
plt.xlabel('Fraction of movies',fontsize=18)
plt.title('Does the analyzed data differ from the original data?',fontsize=18)

# The observed data are the counts of movies in each category for the Analysis subset.
# The expected data are the category fractions in the original Bechdel data,
# multiplied by the number of movies in the Analysis subset.
chisquare(f_obs=representative_df['Analysis Subset'].as_matrix(),
          f_exp=representative_df2['All Bechdel'].as_matrix()*representative_df['Analysis Subset'].sum())
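# A minimal worked example of the same goodness-of-fit logic with made-up numbers
# (not the movie data): observed category counts are compared against expected
# counts built by scaling reference proportions to the observed sample size.
toy_observed = np.array([30, 20, 10, 40])            # hypothetical subset counts
toy_reference = np.array([0.35, 0.15, 0.10, 0.40])   # hypothetical corpus proportions
toy_expected = toy_reference * toy_observed.sum()    # expected counts under the reference
print(chisquare(f_obs=toy_observed, f_exp=toy_expected))   # (statistic, p-value)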
# Compute profit and return on investment in 2014 dollars
df['Profit'] = df['Adj_Revenue'] - df['Adj_Budget']
df['ROI'] = df['Profit']/df['Adj_Budget']

# Do some fancy re-indexing and groupby operations to get average scores and number of movies per year
avg_bechdel = df.set_index('Released_x').groupby(lambda x:x.year)['rating'].agg(np.mean)
num_movies = df.set_index('Released_x').groupby(lambda x:x.year)['rating'].agg(lambda x:np.log(len(x)+1))

# Changing line widths from: http://stackoverflow.com/questions/19862011/matplotlib-change-linewidth-on-line-segments-using-list
coords = sorted(dict(avg_bechdel).items(),key=lambda x:x[0])
lines = [(start, end) for start, end in zip(coords[:-1], coords[1:])]
# Width of each segment reflects the (log) number of movies in the segment's starting year
lines = LineCollection(lines, linewidths=[num_movies[year] for year, _ in coords[:-1]])

# Plot the figure
fig, ax = plt.subplots()
ax.add_collection(lines)
plt.autoscale()
plt.xlim((1912,2020))
plt.xlabel('Time',fontsize=18)
plt.ylabel('Avg. Bechdel score',fontsize=18)

df_rating_ct = pd.crosstab(df['Year'],df['rating'])

# Bakes in a lot here and is bad Pythonic form -- but I'm lazy
# http://stackoverflow.com/questions/17764619/pandas-dataframe-group-year-index-by-decade
# http://stackoverflow.com/questions/21247203/how-to-make-a-pandas-crosstab-with-percentages
# http://stackoverflow.com/questions/9938130/plotting-stacked-barplots-on-a-panda-data-frame
df_rating_ct.groupby((df_rating_ct.index//10)*10).sum().apply(lambda x: x/x.sum(),axis=1).ix[1930:].plot(kind='bar',stacked=True)

# Fix up the plot
plt.ylabel('Percentage of movies',fontsize=18)
plt.yticks(np.arange(0,1.1,.10),np.arange(0,110,10),fontsize=15)
plt.xlabel('Decade',fontsize=18)
plt.xticks(rotation=0,fontsize=15)
plt.legend(loc='center',bbox_to_anchor=(1.1,.5),fontsize=15)

# Reshape the yearly averages for the trend models below
avg_bechdel = df.set_index('Released_x').groupby(lambda x:x.year)['rating'].agg(np.mean)
avg_bechdel.index.name = 'date'
avg_bechdel.name = 'bechdel'
avg_bechdel = avg_bechdel.reset_index()

# Linear trend in the yearly average Bechdel score
l_model = smf.ols(formula='bechdel ~ date', data=avg_bechdel).fit()
#print l_model.summary()

# Create a DataFrame "X" that we can extrapolate into
X = pd.DataFrame({"date": np.linspace(start=1912.,stop=2100.,num=100)})
# Plot the observed data
plt.plot(avg_bechdel['date'],avg_bechdel['bechdel'],c='b',label='Observed')
# Plot the predictions from the model
plt.plot(X,l_model.predict(X),c='r',label='Linear model',lw=4)
plt.legend(loc='lower right')
plt.ylim((0,3))
plt.xlim((1912,2100))
plt.xlabel('Year',fontsize=18)
plt.ylabel('Avg. Bechdel Test',fontsize=18)

# Solve the linear model for the date at which the average score reaches 3
import datetime
year_est = (3 - l_model.params['Intercept'])/l_model.params['date']
year_int = int(year_est)
d = datetime.timedelta(days=(year_est - year_int)*365.25)
day_one = datetime.datetime(year_int,1,1)
date = d + day_one
print date, date.weekday()

# Quadratic trend in the yearly average Bechdel score
q_model = smf.ols(formula='bechdel ~ date + I(date**2)', data=avg_bechdel).fit()

# Create a DataFrame "X" that we can extrapolate into
X = pd.DataFrame({"date": np.linspace(start=1912.,stop=2112.,num=201)})
# Plot the observed data
plt.plot(avg_bechdel['date'],avg_bechdel['bechdel'],c='b',label='Observed')
# Plot the predictions from the model
plt.plot(X,q_model.predict(X),c='r',label='Quadratic model',lw=4)
plt.legend(loc='upper right')
plt.ylim((0,3))
plt.xlim((1912,2112))
plt.xlabel('Year',fontsize=18)
plt.ylabel('Avg. Bechdel Test',fontsize=18)
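# A quick side-by-side check of the two trend models above (not part of the
# original write-up): statsmodels results expose R-squared and AIC directly.
print('Linear:    R^2 = {0:.3f}, AIC = {1:.1f}'.format(l_model.rsquared, l_model.aic))
print('Quadratic: R^2 = {0:.3f}, AIC = {1:.1f}'.format(q_model.rsquared, q_model.aic))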
# Jitter the ratings so the points aren't overlapping
x = df['rating'].apply(lambda x:float(x)+np.random.normal(0, 0.05))
y = df['Adj_Budget']
# Plot with an alpha so the overlaps reveal something about relative density
plt.scatter(x,y,alpha=.2,label='Data',c='g')
plt.ylabel('Budgets (2014 $)',fontsize=18)
plt.xlabel('Bechdel dimensions',fontsize=18)
plt.xticks(np.arange(0,4))
plt.yscale('log')
plt.grid(False,which='minor')
#plt.ylim((2e0,1e10))

# Median budget by Bechdel score
df['Adj_Budget'].groupby(df['rating']).agg(np.median).plot(kind='barh')
plt.xticks(np.arange(0e7,7e7,1e7),np.arange(0,70,10),fontsize=15)
plt.yticks(plt.yticks()[0],["Fewer than two women",
                            "Women don't talk to each other",
                            'Women only talk about men',
                            'Passes Bechdel Test'],fontsize=18)
plt.xlabel('Budget (2014 $millions)',fontsize=18)
plt.title('Median Budget for Films',fontsize=24)
plt.ylabel('')

bd_m = smf.ols(formula='np.log(Adj_Budget+1) ~ C(rating)', data=df).fit()
print bd_m.summary()

# Back-transform from np.log(Adj_Budget+1): exp(prediction) - 1
print "\n\nMovies passing every Bechdel dimension have budgets of ${:,} on average.".format(round(np.exp(bd_m.params['Intercept']+bd_m.params['C(rating)[T.3.0]'])-1,2))
print "Movies failing every Bechdel dimension have budgets of ${:,} on average.".format(round(np.exp(bd_m.params['Intercept'])-1,2))

# Median ROI by Bechdel score
df['ROI'].groupby(df['rating']).agg(np.median).plot(kind='barh')
plt.yticks(plt.yticks()[0],["Fewer than two women",
                            "Women don't talk to each other",
                            'Women only talk about men',
                            'Passes Bechdel Test'],fontsize=18)
plt.xlabel('Return on Investment',fontsize=18)
plt.title('Median ROI for Films',fontsize=24)
plt.ylabel('')

# Estimate the model
ed_m1 = smf.ols(formula='ROI ~ C(rating) + np.log(Adj_Budget+1)', data=df).fit()
print ed_m1.summary()

# Median profit by Bechdel score
df['Profit'].groupby(df['rating']).agg(np.median).plot(kind='barh')
plt.yticks(plt.yticks()[0],["Fewer than two women",
                            "Women don't talk to each other",
                            'Women only talk about men',
                            'Passes Bechdel Test'],fontsize=18)
plt.xlabel('Gross Profit',fontsize=18)
plt.title('Median Profit for Films',fontsize=24)
plt.ylabel('')

# Estimate the model
ed_m2 = smf.ols(formula='Profit ~ C(rating) + np.log(Budget+1)', data=df).fit()
print ed_m2.summary()

# Make the bar-plot of median revenue by Bechdel score
df['Adj_Revenue'].groupby(df['rating']).agg(np.median).plot(kind='barh')
plt.yticks(plt.yticks()[0],["Fewer than two women",
                            "Women don't talk to each other",
                            'Women only talk about men',
                            'Passes Bechdel Test'],fontsize=18)
plt.xlabel('Revenue (2014 $)',fontsize=18)
plt.title('Median Revenue for Films',fontsize=24)
plt.ylabel('')

# Estimate the model
ed_m3 = smf.ols(formula='np.log(Adj_Revenue+1) ~ C(rating) + np.log(Adj_Budget+1)', data=df).fit()
print ed_m3.summary()

# Estimate a model without the Bechdel test IV, containing only the other IVs, to get residuals
ed_m3_bad_res = smf.ols(formula='np.log(Adj_Revenue+1) ~ np.log(Adj_Budget+1)', data=df[df['Adj_Revenue']>0]).fit()
# Put these residual estimates in a temporary DataFrame
df_temp = df
df_temp['residuals'] = ed_m3_bad_res.resid
# Estimate a bad model with a continuous Bechdel rating against the residuals
ed_m3_bad = smf.ols(formula='residuals ~ rating', data=df_temp).fit()
ed_m3_bad_x = pd.DataFrame({'rating':np.linspace(min(df['rating']),max(df['rating']),100)})

# Add a jitter so the points aren't overlapping
x = df['rating'].apply(lambda x:float(x)+np.random.normal(0, 0.05))
y = df_temp['residuals']
plt.plot(ed_m3_bad_x['rating'],ed_m3_bad.predict(ed_m3_bad_x),c='r',lw=2,label='Model')
# Plot with an alpha so the overlaps reveal something about relative density
plt.scatter(x,y,alpha=.2,label='Data',c='g')
plt.xticks(np.arange(0,4))
plt.autoscale()
plt.xlabel('Bechdel score',fontsize=18)
plt.ylabel('Residual Adj. Revenue',fontsize=18)
plt.ylim((-5,5))
plt.legend(loc='lower left')
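# How the categorical coefficients in the log-revenue model are read (an
# illustration of the back-transform this notebook uses for its log models,
# not new analysis): with np.log(Adj_Revenue+1) as the outcome, a coefficient b
# on a rating dummy corresponds to roughly a 100*(exp(b)-1) percent difference.
b = ed_m3.params['C(rating)[T.3.0]']
print('Passing all three Bechdel criteria is associated with a {0:.1f}% revenue '
      'difference, holding budget constant.'.format(100*(np.exp(b) - 1)))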
# Make the bar-plot of mean Metascore by Bechdel score
df['Metascore'].groupby(df['rating']).agg(np.mean).plot(kind='barh')
plt.yticks(plt.yticks()[0],["Fewer than two women",
                            "Women don't talk to each other",
                            'Women only talk about men',
                            'Passes Bechdel Test'],fontsize=18)
plt.xlabel('Rating',fontsize=18)
plt.title('Mean Metascore',fontsize=24)
plt.xlim((5,6))
plt.ylabel('')

m2 = smf.ols(formula='Metascore ~ C(rating) + np.log(Adj_Budget+1)', data=df).fit()
print m2.summary()

# Make the bar-plot of mean IMDB rating by Bechdel score
df['imdbRating'].groupby(df['rating']).agg(np.mean).plot(kind='barh')
plt.yticks(plt.yticks()[0],["Fewer than two women",
                            "Women don't talk to each other",
                            'Women only talk about men',
                            'Passes Bechdel Test'],fontsize=18)
plt.xlabel('Rating',fontsize=18)
plt.title('Mean IMDB rating',fontsize=24)
plt.xlim((6,7))
plt.ylabel('')

m3 = smf.ols(formula='imdbRating ~ C(rating) + np.log(Adj_Budget+1) + np.log(imdbVotes+1)', data=df).fit()
print m3.summary()

Image(url='http://i1.ytimg.com/vi/EYiZeszLosE/maxresdefault.jpg',height=200)

# Fully-specified models with controls for budget, revenue, ratings, votes,
# runtime, genre, year, release week, language, country, and MPAA rating
m4 = smf.ols(formula='imdbRating ~ C(rating) + np.log(Adj_Revenue+1) + np.log(Adj_Budget+1) + Metascore + np.log(imdbVotes+1) + Runtime + C(Genre_y) + Year + C(Week) + C(English) + C(USA) + C(Rated)', data=df).fit()
print m4.summary()

m5 = smf.ols(formula='Metascore ~ C(rating) + np.log(Adj_Revenue+1) + np.log(Adj_Budget+1) + imdbRating + np.log(imdbVotes+1) + Runtime + C(Genre_y) + Year + C(Week) + C(English) + C(USA) + C(Rated)', data=df).fit()
m6 = smf.ols(formula='np.log(Adj_Revenue+1) ~ C(rating) + Metascore + np.log(Adj_Budget+1) + imdbRating + np.log(imdbVotes+1) + Runtime + C(Genre_y) + Year + C(Week) + C(English) + C(USA) + C(Rated)', data=df).fit()
m7 = smf.ols(formula='np.log(Adj_Budget+1) ~ C(rating) + Metascore + np.log(Adj_Revenue+1) + imdbRating + np.log(imdbVotes+1) + Runtime + C(Genre_y) + Year + C(Week) + C(English) + C(USA) + C(Rated)', data=df).fit()

print "Difference in IMDB scores between movies that pass all and fail all requirements of the Bechdel test: {0}.".format(round(m4.params['C(rating)[T.3.0]'],2))
print "Difference in Metascores between movies that pass all and fail all requirements of the Bechdel test: {0}.".format(round(m5.params['C(rating)[T.3.0]']*10,2))
print "Difference in revenue between movies that pass all and fail all requirements of the Bechdel test: {0}%.".format(round(100*(np.exp(m6.params['C(rating)[T.3.0]']) - 1),2))
print "Difference in budget between movies that pass all and fail all requirements of the Bechdel test: {0}%.".format(round(100*(np.exp(m7.params['C(rating)[T.3.0]']) - 1),2))

# Collect the Bechdel coefficients and p-values from the four fully-specified models
results_df = pd.DataFrame({'m4_params':m4.params[['C(rating)[T.1.0]','C(rating)[T.2.0]','C(rating)[T.3.0]']],
                           'm4_pvalues':m4.pvalues[['C(rating)[T.1.0]','C(rating)[T.2.0]','C(rating)[T.3.0]']],
                           'm5_params':m5.params[['C(rating)[T.1.0]','C(rating)[T.2.0]','C(rating)[T.3.0]']],
                           'm5_pvalues':m5.pvalues[['C(rating)[T.1.0]','C(rating)[T.2.0]','C(rating)[T.3.0]']],
                           'm6_params':m6.params[['C(rating)[T.1.0]','C(rating)[T.2.0]','C(rating)[T.3.0]']],
                           'm6_pvalues':m6.pvalues[['C(rating)[T.1.0]','C(rating)[T.2.0]','C(rating)[T.3.0]']],
                           'm7_params':m7.params[['C(rating)[T.1.0]','C(rating)[T.2.0]','C(rating)[T.3.0]']],
                           'm7_pvalues':m7.pvalues[['C(rating)[T.1.0]','C(rating)[T.2.0]','C(rating)[T.3.0]']]})
results_df.index = ["Women don't talk to each other",
                    'Women only talk about men',
                    'Passes Bechdel Test']
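# Supplementary check (not in the original figure): 95% confidence intervals for
# the headline Bechdel coefficients in the ratings and revenue models.
rating_terms = ['C(rating)[T.1.0]', 'C(rating)[T.2.0]', 'C(rating)[T.3.0]']
print(m4.conf_int().loc[rating_terms])   # IMDB-rating model
print(m6.conf_int().loc[rating_terms])   # revenue model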
# Plot the four sets of Bechdel coefficients
ax = results_df[['m4_params','m5_params','m6_params','m7_params']].T.plot(kind='barh')
plt.legend(bbox_to_anchor=(1,1),fontsize=12)

# Change the alpha channel to correspond with the p-values:
# more transparent bars are less significant results
# http://stackoverflow.com/questions/22888607/changing-alpha-values-in-pandas-barplot-to-match-variable/
pvalues = results_df[['m4_pvalues','m5_pvalues','m6_pvalues','m7_pvalues']].T
# pandas draws the grouped bars series by series, so patch i corresponds to
# rating level (column) i // 4 and model (row) i % 4
for i, (j, k) in enumerate(product(range(3), range(4))):
    patch = ax.patches[i]
    alpha = 1 - pvalues.iloc[k, j]
    patch.set_alpha(max(alpha, .2))

plt.yticks(plt.yticks()[0],['IMDB Ratings','Metascore','Revenue','Budget'],fontsize=18)
plt.xlabel('Effect size compared to movies failing\nall elements of the Bechdel test',fontsize=18)
plt.title("Effects of women's roles in movies",fontsize=24)
plt.xticks(fontsize=15)
plt.autoscale()
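# The figure above appears to be the 'Takeaway.png' displayed at the top of this
# notebook (an assumption -- the original does not show the export step). If so,
# it can be regenerated with:
plt.savefig('Takeaway.png', bbox_inches='tight')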