In [2]:

# Import necessary packages for data analysis

import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats as stats

from pandas.tools.plotting import scatter_matrix
from matplotlib import pyplot as plt

In [3]:

# Render matplotlib figures inline, directly beneath the cell that draws them

%matplotlib inline

In [4]:

# Read the header-less CSV dump into a pandas DataFrame.
#
# Plot styling: the old `pd.options.display.mpl_style` option was deprecated
# and subsequently removed from pandas, so assigning to it fails on current
# releases. matplotlib style sheets are the documented replacement; 'ggplot'
# is used here as the closest built-in look (swap for any other style name).
plt.style.use('ggplot')
gross_data = pd.read_csv('./dumps/china-box-office-grosses.csv', header=None)

In [5]:

# The CSV was loaded without a header row, so name the four columns here.

headers = [
    'title',
    'distributor',
    'gross',
    'date',
]
gross_data.columns = headers

In [6]:

# Parse the 'date' column into datetime values so it can drive time-based indexing

gross_data['date'] = pd.to_datetime(gross_data['date'])

In [7]:

# Peek at the first five rows to sanity-check the load and the date parsing
gross_data.head(n=5)

Out[7]:

	title	distributor	gross	date
0	Transformers: Age of Extinction	HuaXia	301000000	2014-06-27
1	The Monkey King	NaN	167840000	2014-01-31
2	X-Men: Days of Future Past	HuaXia	116490000	2014-05-23
3	Captain America: The Winter Soldier	China Film	115620000	2014-04-04
4	Dad, Where Are We Going?	NaN	111870000	2014-01-31

In [8]:

# Summary statistics (count, mean, std, min, quartiles, max); only the numeric
# 'gross' column is summarised.
gross_data.describe()

Out[8]:

	gross
count	5.420000e+02
mean	1.427500e+07
std	2.782816e+07
min	2.829000e+03
25%	8.029298e+05
50%	3.039082e+06
75%	1.371898e+07
max	3.010000e+08

In [9]:

# Q-Q plot of the grosses against a normal distribution; line='s' asks
# statsmodels for its standardized reference line.
fig = sm.qqplot(
    gross_data['gross'],
    dist=stats.norm,
    line='s'
)

In [10]:

# Summary statistics of the numeric columns, computed separately per distributor
gross_data.groupby(by='distributor').describe()

Out[10]:

		gross
distributor
Baoli Bona	count	2.000000
	mean	7739391.500000
	std	2251587.090324
	min	6147279.000000
	25%	6943335.250000
	50%	7739391.500000
	75%	8535447.750000
	max	9331504.000000
Beijing Chuangs	count	2.000000
	mean	2221235.000000
	std	947786.130513
	min	1551049.000000
	25%	1886142.000000
	50%	2221235.000000
	75%	2556328.000000
	max	2891421.000000
Beijing Jidongxing	count	2.000000
	mean	758039.000000
	std	428339.832199
	min	455157.000000
	25%	606598.000000
	50%	758039.000000
	75%	909480.000000
	max	1060921.000000
Beijing Shidai	count	2.000000
	mean	1477969.000000
	std	406604.783959
	min	1190456.000000
	25%	1334212.500000
	50%	1477969.000000
...	...	...
Xiying Huayi	std	91184.247861
	min	1418495.000000
	25%	1450733.500000
	50%	1482972.000000
	75%	1515210.500000
	max	1547449.000000
Zijing	count	1.000000
	mean	257931.000000
	std	NaN
	min	257931.000000
	25%	257931.000000
	50%	257931.000000
	75%	257931.000000
	max	257931.000000
Zijing Sanlian	count	3.000000
	mean	761744.333333
	std	451443.828138
	min	339156.000000
	25%	523936.500000
	50%	708717.000000
	75%	973038.500000
	max	1237360.000000
Zinjing Cheng Sanlia	count	1.000000
	mean	155205.000000
	std	NaN
	min	155205.000000
	25%	155205.000000
	50%	155205.000000
	75%	155205.000000
	max	155205.000000

280 rows × 1 columns

In [11]:

# Build a copy of the data indexed by the values of the 'date' column (as a
# DatetimeIndex) and remove the now-redundant 'date' column from that copy.
gd_over_time = (
    gross_data
    .set_index(pd.DatetimeIndex(gross_data['date']))
    .drop('date', axis=1)
)
gd_over_time.head()

Out[11]:

	title	distributor	gross
2014-06-27	Transformers: Age of Extinction	HuaXia	301000000
2014-01-31	The Monkey King	NaN	167840000
2014-05-23	X-Men: Days of Future Past	HuaXia	116490000
2014-04-04	Captain America: The Winter Soldier	China Film	115620000
2014-01-31	Dad, Where Are We Going?	NaN	111870000

In [22]:

# Monthly mean gross, one row per month-end bin that actually contains data.
#
# The aggregation is spelled out explicitly: a bare `.resample("M")` only
# yields an (implicitly mean-aggregated) DataFrame on legacy pandas, while on
# pandas >= 0.18 it returns a Resampler object on which the follow-up
# `dropna(inplace=True)` cannot work. Grouping on a monthly `pd.Grouper`
# expresses the same month-end binning in a form accepted by both old and new
# pandas. Selecting [['gross']] keeps the text columns out of the mean and
# preserves the single-column DataFrame shape the later cells rely on.
# `.dropna()` (not in-place) discards months with no rows, so re-running this
# cell always starts from gd_over_time rather than from mutated state.
gd_by_month = (
    gd_over_time
    .groupby(pd.Grouper(freq="M"))[['gross']]
    .mean()
    .dropna()
)

In [25]:

# Sum the rows sharing each index value and preview the first five results
gd_by_month.groupby(level=0).sum().head()

Out[25]:

	gross
2007-01-31	1201837.285714
2007-02-28	3411974.000000
2007-03-31	1208611.083333
2007-04-30	1148178.769231
2007-05-31	6040519.500000

In [24]:

# Scatter of the monthly aggregate against time.
# gd_by_month holds the monthly *mean* gross (its values are fractional
# averages, not sums), so the y-axis is labelled as a mean rather than a total.
plt.figure(figsize=(10, 5))
plt.scatter(gd_by_month.index, gd_by_month.gross)
plt.xlabel('Date')
plt.ylabel('Mean Monthly Gross')
# Trailing semicolon keeps the Text object's repr out of the cell output.
plt.title('Box Office Grosses Over Time');

Out[24]:

<matplotlib.text.Text at 0x10cf97410>

In [ ]: