import pymc3 as pm
import numpy as np
import pandas as pd
from theano import shared
import scipy.stats as stats
import matplotlib.pyplot as plt
import arviz as az
plt.style.use('ggplot')
df_movies = pd.read_excel("rundown2.xlsx", sheet_name="list")
df_movies_2018 = df_movies[df_movies.year == 2018]
df_movies_2018.head()
x = df_movies_2018.opening / 1e6
y = df_movies_2018.gross / 1e6
_, ax = plt.subplots(1, 2, figsize=(12, 6))
ax[0].plot(x, y, 'C0.')
ax[0].set_xlabel('opening 2018')
ax[0].set_ylabel('total 2018', rotation=90)
# ax[0].plot(x, y_real, 'k')
az.plot_kde(y, ax=ax[1])
ax[1].set_xlabel('total 2018')
plt.tight_layout()
with pm.Model() as model_g:
α = pm.Normal('α', mu=0, sd=10)
β = pm.Normal('β', mu=0, sd=1)
ϵ = pm.HalfCauchy('ϵ', 5)
μ = pm.Deterministic('μ', α + β * x)
y_pred = pm.Normal('y_pred', mu=μ, sd=ϵ, observed=y)
trace_g = pm.sample(2000, tune=1000)
varnames=['α', 'β', 'ϵ']
az.plot_trace(trace_g, varnames)
pm.autocorrplot(trace_g, varnames)
pm.summary(trace_g, varnames)
plt.figure(figsize=(8,8))
plt.plot(x, y, 'b.');
alpha_m = trace_g['α'].mean()
beta_m = trace_g['β'].mean()
plt.plot(x, alpha_m + beta_m * x, c='k', label='y = {:.2f} + {:.2f} * x'.format(alpha_m, beta_m))
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$y$', fontsize=16, rotation=0)
plt.legend(loc=2, fontsize=14)
ppc = pm.sample_posterior_predictive(trace_g, samples=1000, model=model_g)
idx = np.argsort(x)
x_ord = x[idx]
plt.figure(figsize=(8,8))
plt.plot(x, y, 'b.')
plt.plot(x, alpha_m + beta_m * x, c='k', label='y = {:.2f} + {:.2f} * x'.format(alpha_m, beta_m))
sig0 = pm.hpd(ppc['y_pred'], alpha=0.5)[idx]
sig1 = pm.hpd(ppc['y_pred'], alpha=0.05)[idx]
plt.fill_between(x_ord, sig0[:,0], sig0[:,1], color='gray', alpha=1)
plt.fill_between(x_ord, sig1[:,0], sig1[:,1], color='gray', alpha=0.5)
plt.xlabel('opening 2018', fontsize=16)
plt.ylabel('total 2018', fontsize=16, rotation=90)
x_st = ( x - x.mean() ) / x.std()
y_st = ( y - y.mean() ) / y.std()
_, ax = plt.subplots(1, 2, figsize=(12, 6))
ax[0].plot(x_st, y_st, 'C0.')
ax[0].set_xlabel('opening 2018 (st)')
ax[0].set_ylabel('total 2018 (st)', rotation=90)
# ax[0].plot(x, y_real, 'k')
az.plot_kde(y_st, ax=ax[1])
ax[1].set_xlabel('total 2018')
plt.tight_layout()
with pm.Model() as model_g_st:
α = pm.Normal('α', mu=0, sd=10)
β = pm.Normal('β', mu=0, sd=1)
ϵ = pm.HalfCauchy('ϵ', 5)
μ = pm.Deterministic('μ', α + β * x_st)
y_pred = pm.Normal('y_pred', mu=μ, sd=ϵ, observed=y_st)
trace_g_st = pm.sample(2000, tune=1000)
varnames=['α', 'β', 'ϵ']
az.plot_trace(trace_g_st, varnames)
pm.autocorrplot(trace_g_st, varnames)
pm.summary(trace_g_st, varnames)
plt.figure(figsize=(8,8))
plt.plot(x_st, y_st, 'b.');
alpha_m_st = trace_g_st['α'].mean()
beta_m_st = trace_g_st['β'].mean()
plt.plot(x_st, alpha_m_st + beta_m_st * x_st, c='k', label='y_st = {:.2f} + {:.2f} * x_st'.format(alpha_m_st, beta_m_st))
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$y$', fontsize=16, rotation=0)
plt.legend(loc=2, fontsize=14)
np.mean((y_hat, y_hat_dst), axis=0)
y_hat = alpha_m + beta_m * x
y_hat_dst = ((alpha_m_st + beta_m_st * x_st) * y.std()) + y.mean()
# mean absolute percentage diff bt standardized, non-standardized models
# not interested in identifying superior accuracy given ease of use of non-standardized model
np.mean(np.abs(y_hat - y_hat_dst) / np.mean((y_hat, y_hat_dst), axis=0))
plt.figure(figsize=(8,8))
plt.plot(x, y, 'b.');
alpha_m = trace_g['α'].mean()
beta_m = trace_g['β'].mean()
plt.plot(x, alpha_m + beta_m * x, c='k', label='y = {:.2f} + {:.2f} * x'.format(alpha_m, beta_m))
plt.plot(x, y_hat_dst, label='y de-standardized')
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$y$', fontsize=16, rotation=0)
plt.legend(loc=2, fontsize=14)
with pm.Model() as model_t:
alpha = pm.Normal('alpha', mu=0, sd=10)
beta = pm.Normal('beta', mu=0, sd=1)
epsilon = pm.HalfCauchy('epsilon', 5)
nu = pm.Deterministic('nu', pm.Exponential('nu_', 1/29) + 1)
y_pred = pm.StudentT('y_pred', mu=alpha + beta * x, sd=epsilon, nu=nu, observed=y)
# start = pm.find_MAP()
# step = pm.NUTS(scaling=start)
trace_t = pm.sample(2000, tune=1000)
varnames_t = ['alpha', 'beta', 'epsilon', 'nu']
pm.traceplot(trace_t, varnames_t)
pm.summary(trace_t, varnames_t)
df_movies_2017 = df_movies[df_movies.year == 2017]
df_movies_2017.head()
x_17 = df_movies_2017.opening / 1e6
y_17 = df_movies_2017.gross / 1e6
_, ax = plt.subplots(1, 2, figsize=(8, 4))
ax[0].plot(x_17, y_17, 'C0.')
ax[0].set_xlabel('opening 2017')
ax[0].set_ylabel('total 2017', rotation=90)
# ax[0].plot(x, y_real, 'k')
az.plot_kde(y, ax=ax[1])
ax[1].set_xlabel('total 2017')
plt.tight_layout()
alpha_mt = trace_t['alpha'].mean()
beta_mt = trace_t['beta'].mean()
plt.figure(figsize=(8,8))
plt.plot(x_17, y_17, '.')
plt.plot(x_17, alpha_m + beta_m * x_17, label="Gaussian")
plt.plot(x_17, alpha_mt + beta_mt * x_17, label='robust')
plt.legend(loc=2, fontsize=12)
plt.tight_layout()
def rmse(y_s, y_hat):
return np.sqrt(np.mean(np.square(y_s - y_hat)))
rmse(y_17, alpha_m + beta_m * x_17), rmse(y_17, alpha_mt + beta_mt * x_17)
df_movies.pivot_table('year_total', 'year').sort_index(ascending=False).head(12)
not_training = df_movies.year != 2018
ten_bill_plus = df_movies.year >= 2009
df_movies_test = df_movies[not_training & ten_bill_plus]
df_movies_test.describe()
x_test = df_movies_test.opening / 1e6
y_test = df_movies_test.gross / 1e6
_, ax = plt.subplots(1, 2, figsize=(12, 6))
ax[0].plot(x_test, y_test, 'C0.')
ax[0].set_xlabel('opening 2009 - 17')
ax[0].set_ylabel('total 2009 - 17', rotation=90)
# ax[0].plot(x, y_real, 'k')
az.plot_kde(y, ax=ax[1])
ax[1].set_xlabel('total 2009 - 17')
plt.tight_layout()
plt.plot(x_test, y_test, '.')
plt.plot(x_test, alpha_m + beta_m * x_test, label="Gaussian")
plt.plot(x_test, alpha_mt + beta_mt * x_test, label='robust')
plt.legend(loc=2, fontsize=12)
plt.tight_layout()
rmse(y_test, alpha_m + beta_m * x_test), rmse(y_test, alpha_mt + beta_mt * x_test)
df_movies_2019 = pd.read_excel("../../20190520_wta_movies/rundown2.xlsx", sheet_name="2019_YTD (Oct 16)")
df_movies_2019 = df_movies_2019[df_movies_2019.year == 2019]
print(df_movies_2019.shape)
df_movies_2019.head()
df_movies_2019_closed = df_movies_2019[df_movies_2019.close != "-"]
print(df_movies_2019_closed.shape)
df_movies_2019_closed.head(10)
x_19 = df_movies_2019_closed.opening / 1e6
y_19 = df_movies_2019_closed.gross / 1e6
_, ax = plt.subplots(1, 2, figsize=(12, 6))
ax[0].plot(x_19, y_19, 'C0.')
ax[0].set_xlabel('opening 2019')
ax[0].set_ylabel('total 2019', rotation=90)
# ax[0].plot(x, y_real, 'k')
az.plot_kde(y, ax=ax[1])
ax[1].set_xlabel('total 2019')
plt.tight_layout()
plt.figure(figsize=(8,8))
plt.plot(x_19, y_19, '.')
plt.plot(x_19, alpha_m + beta_m * x_19, label="Gaussian")
plt.legend(loc=2, fontsize=12)
plt.tight_layout()
jw3_opening = df_movies_2019_closed.loc[10, 'opening']
alpha_m + beta_m * jw3_opening
jw3_total = df_movies_2019_closed.loc[10, 'gross']
jw3_total