# Import necessary packages for data analysis
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats as stats
from pandas.tools.plotting import scatter_matrix
from matplotlib import pyplot as plt
# Render plot graphs inline
%matplotlib inline
# Use matplotlib's ggplot style sheet for the plots in this notebook.
# The pandas option `display.mpl_style` that used to provide this styling has
# been removed from pandas; assigning to it on current releases raises an
# OptionError, which would abort this cell before any data is loaded.
plt.style.use('ggplot')
# Read the header-less CSV into a Pandas dataframe, naming the columns on load
headers = ['title', 'distributor', 'gross', 'date']
gross_data = pd.read_csv(
    './dumps/china-box-office-grosses.csv',
    header=None,
    names=headers,
)
# Convert values in date column to datetime type
gross_data['date'] = pd.to_datetime(gross_data['date'])
# Preview the first rows
gross_data.head()
title | distributor | gross | date | |
---|---|---|---|---|
0 | Transformers: Age of Extinction | HuaXia | 301000000 | 2014-06-27 |
1 | The Monkey King | NaN | 167840000 | 2014-01-31 |
2 | X-Men: Days of Future Past | HuaXia | 116490000 | 2014-05-23 |
3 | Captain America: The Winter Soldier | China Film | 115620000 | 2014-04-04 |
4 | Dad, Where Are We Going? | NaN | 111870000 | 2014-01-31 |
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
gross_data.describe()
gross | |
---|---|
count | 5.420000e+02 |
mean | 1.427500e+07 |
std | 2.782816e+07 |
min | 2.829000e+03 |
25% | 8.029298e+05 |
50% | 3.039082e+06 |
75% | 1.371898e+07 |
max | 3.010000e+08 |
# Q-Q plot comparing the distribution of grosses with a normal distribution;
# line='s' asks statsmodels to draw its standardized reference line.
fig = sm.qqplot(gross_data['gross'], dist=stats.norm, line='s')
# Descriptive statistics of the numeric columns, computed per distributor
gross_data.groupby('distributor').describe()
gross | ||
---|---|---|
distributor | ||
Baoli Bona | count | 2.000000 |
mean | 7739391.500000 | |
std | 2251587.090324 | |
min | 6147279.000000 | |
25% | 6943335.250000 | |
50% | 7739391.500000 | |
75% | 8535447.750000 | |
max | 9331504.000000 | |
Beijing Chuangs | count | 2.000000 |
mean | 2221235.000000 | |
std | 947786.130513 | |
min | 1551049.000000 | |
25% | 1886142.000000 | |
50% | 2221235.000000 | |
75% | 2556328.000000 | |
max | 2891421.000000 | |
Beijing Jidongxing | count | 2.000000 |
mean | 758039.000000 | |
std | 428339.832199 | |
min | 455157.000000 | |
25% | 606598.000000 | |
50% | 758039.000000 | |
75% | 909480.000000 | |
max | 1060921.000000 | |
Beijing Shidai | count | 2.000000 |
mean | 1477969.000000 | |
std | 406604.783959 | |
min | 1190456.000000 | |
25% | 1334212.500000 | |
50% | 1477969.000000 | |
... | ... | ... |
Xiying Huayi | std | 91184.247861 |
min | 1418495.000000 | |
25% | 1450733.500000 | |
50% | 1482972.000000 | |
75% | 1515210.500000 | |
max | 1547449.000000 | |
Zijing | count | 1.000000 |
mean | 257931.000000 | |
std | NaN | |
min | 257931.000000 | |
25% | 257931.000000 | |
50% | 257931.000000 | |
75% | 257931.000000 | |
max | 257931.000000 | |
Zijing Sanlian | count | 3.000000 |
mean | 761744.333333 | |
std | 451443.828138 | |
min | 339156.000000 | |
25% | 523936.500000 | |
50% | 708717.000000 | |
75% | 973038.500000 | |
max | 1237360.000000 | |
Zinjing Cheng Sanlia | count | 1.000000 |
mean | 155205.000000 | |
std | NaN | |
min | 155205.000000 | |
25% | 155205.000000 | |
50% | 155205.000000 | |
75% | 155205.000000 | |
max | 155205.000000 |
280 rows × 1 columns
# Re-index the data by release date: the `date` column becomes a
# DatetimeIndex and is removed from the regular columns.
gd_over_time = (
    gross_data
    .drop('date', axis=1)
    .set_index(pd.DatetimeIndex(gross_data['date']))
)
# Preview the first rows
gd_over_time.head()
title | distributor | gross | |
---|---|---|---|
2014-06-27 | Transformers: Age of Extinction | HuaXia | 301000000 |
2014-01-31 | The Monkey King | NaN | 167840000 |
2014-05-23 | X-Men: Days of Future Past | HuaXia | 116490000 |
2014-04-04 | Captain America: The Winter Soldier | China Film | 115620000 |
2014-01-31 | Dad, Where Are We Going? | NaN | 111870000 |
# Mean gross per film for each calendar month, labelled by month end.
# The aggregation is spelled out explicitly: older pandas applied a mean
# implicitly inside resample(), whereas current pandas returns a Resampler
# object that must be aggregated (and has no dropna()).
# Only `gross` is selected because the other columns are text.
# NOTE: newer pandas releases spell the month-end alias 'ME'.
gd_by_month = gd_over_time[['gross']].resample('M').mean()
# Months without any release have no mean (NaN); leave them out
gd_by_month = gd_by_month.dropna()
# Each month appears exactly once in the index after resampling, so no
# further grouping is needed before previewing the result.
gd_by_month.head()
gross | |
---|---|
2007-01-31 | 1201837.285714 |
2007-02-28 | 3411974.000000 |
2007-03-31 | 1208611.083333 |
2007-04-30 | 1148178.769231 |
2007-05-31 | 6040519.500000 |
# Scatter plot of the monthly gross figures against time
plt.figure(figsize=(10, 5))
plt.scatter(gd_by_month.index, gd_by_month['gross'])
plt.xlabel('Date')
# gd_by_month comes from a monthly resample whose aggregation is the mean,
# so each point is the average gross of that month's films, not a total.
plt.ylabel('Mean Gross per Film')
plt.title('Box Office Grosses Over Time')
<matplotlib.text.Text at 0x10cf97410>