%matplotlib inline
import pymc3 as pm
import numpy as np
import pandas as pd
from scipy import stats
# statsmodels formula API gives an R-like interface; the base API is also imported below as sm
import statsmodels.formula.api as smf
import statsmodels.api as sm
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'
plt.style.use(['seaborn-colorblind', 'seaborn-darkgrid'])
data = {'species' : ['afarensis', 'africanus', 'habilis', 'boisei', 'rudolfensis', 'ergaster', 'sapiens'],
'brain' : [438, 452, 612, 521, 752, 871, 1350],
'mass' : [37., 35.5, 34.5, 41.5, 55.5, 61.0, 53.5]}
d = pd.DataFrame(data)
d
| | brain | mass | species |
|---|---|---|---|
| 0 | 438 | 37.0 | afarensis |
| 1 | 452 | 35.5 | africanus |
| 2 | 612 | 34.5 | habilis |
| 3 | 521 | 41.5 | boisei |
| 4 | 752 | 55.5 | rudolfensis |
| 5 | 871 | 61.0 | ergaster |
| 6 | 1350 | 53.5 | sapiens |
m_6_1 = smf.ols('brain ~ mass', data=d).fit()
1 - m_6_1.resid.var()/d.brain.var()
# m_6_1.summary() also reports this R-squared value
0.4901580479490838
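As a cross-check, the same quantity is available directly from the fitted results object (`rsquared` is a standard attribute of statsmodels OLS results):
m_6_1.rsquared  # ~0.490, identical to the manual computation above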
m_6_2 = smf.ols('brain ~ mass + I(mass**2)', data=d).fit()
m_6_3 = smf.ols('brain ~ mass + I(mass**2) + I(mass**3)', data=d).fit()
m_6_4 = smf.ols('brain ~ mass + I(mass**2) + I(mass**3) + I(mass**4)', data=d).fit()
m_6_5 = smf.ols('brain ~ mass + I(mass**2) + I(mass**3) + I(mass**4) + I(mass**5)', data=d).fit()
m_6_6 = smf.ols('brain ~ mass + I(mass**2) + I(mass**3) + I(mass**4) + I(mass**5) + I(mass**6)', data=d).fit()
m_6_7 = smf.ols('brain ~ 1', data=d).fit()
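As in the book, R-squared only ever goes up as terms are added. A quick sketch, reusing the models just fitted, prints R-squared for each fit from the straight line up to the sixth-degree polynomial, plus the intercept-only model:
# R-squared computed the same way as above for every model; it increases with
# polynomial degree even though the extra terms mostly chase noise, while the
# intercept-only model m_6_7 has an R-squared of essentially zero
for name, m in zip(['m_6_1', 'm_6_2', 'm_6_3', 'm_6_4', 'm_6_5', 'm_6_6', 'm_6_7'],
                   [m_6_1, m_6_2, m_6_3, m_6_4, m_6_5, m_6_6, m_6_7]):
    print(name, round(1 - m.resid.var() / d.brain.var(), 3))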
d_new = d.drop(d.index[-1])
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(8,3))
ax1.scatter(d.mass, d.brain, alpha=0.8)
ax2.scatter(d.mass, d.brain, alpha=0.8)
for i in range(len(d)):
    d_new = d.drop(d.index[i])  # drop one row at a time
    # first-order (linear) model
    m0 = smf.ols('brain ~ mass', d_new).fit()
    # build the regression line; statsmodels needs the intercept term added explicitly
    x = sm.add_constant(d_new.mass)  # design matrix with constant and mass
    x_pred = pd.DataFrame({'mass': np.linspace(x.mass.min() - 10, x.mass.max() + 10, 50)})  # prediction grid
    x_pred2 = sm.add_constant(x_pred)  # add constant to the prediction grid
    y_pred = m0.predict(x_pred2)  # predicted values
    ax1.plot(x_pred, y_pred, 'gray', alpha=.5)

    # fifth-order model on the same reduced data
    m1 = smf.ols('brain ~ mass + I(mass**2) + I(mass**3) + I(mass**4) + I(mass**5)', data=d_new).fit()
    x_pred = pd.DataFrame({'mass': np.linspace(x.mass.min() - 10, x.mass.max() + 10, 200)})  # finer grid for the wiggly fit
    x_pred2 = sm.add_constant(x_pred)
    y_pred = m1.predict(x_pred2)
    ax2.plot(x_pred, y_pred, 'gray', alpha=.5)

ax1.set_xlabel('body mass (kg)', fontsize=12)
ax1.set_ylabel('brain volume (cc)', fontsize=12)
ax1.set_title('Underfit model')
ax2.set_xlim(32, 62)
ax2.set_ylim(-250, 2200)
ax2.set_xlabel('body mass (kg)', fontsize=12)
ax2.set_title('Overfit model')
plt.show()
p = (0.3, 0.7)
-sum(p * np.log(p))
0.6108643020548935
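The same information-entropy computation, wrapped in a small helper so it can be reused for other distributions (a minimal sketch; the function name is just for illustration):
def entropy(p):
    """Information entropy H(p) = -sum_i p_i * log(p_i)."""
    p = np.asarray(p)
    return -np.sum(p * np.log(p))

entropy([0.3, 0.7])    # ~0.61, same as above
entropy([0.01, 0.99])  # ~0.056: a highly predictable distribution carries little entropy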
# fit model
m_6_1 = smf.ols('brain ~ mass', data=d).fit()
# compute the deviance by "cheating": read the log-likelihood off the fitted OLS model
-2 * m_6_1.llf
94.924989685887581
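The same deviance can be computed by hand from the OLS fit, which makes explicit what `llf` contains (a sketch; statsmodels' log-likelihood uses the maximum-likelihood estimate of sigma, i.e. the residual variance with no degrees-of-freedom correction):
# deviance = -2 * sum of Gaussian log-likelihoods at the fitted values
sigma_mle = np.sqrt(np.mean(m_6_1.resid ** 2))  # MLE of sigma (ddof=0)
-2 * np.sum(stats.norm.logpdf(d['brain'], loc=m_6_1.fittedvalues, scale=sigma_mle))
# ~94.92, matching -2 * m_6_1.llf above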
# standardize mass before fitting
d['mass_s'] = (d['mass'] - np.mean(d['mass'])) / np.std(d['mass'])
with pm.Model() as m_6_8:
    a = pm.Normal('a', mu=np.mean(d['brain']), sd=10)
    b = pm.Normal('b', mu=0, sd=10)
    sigma = pm.Uniform('sigma', 0, np.std(d['brain']) * 10)
    mu = pm.Deterministic('mu', a + b * d['mass_s'])
    brain = pm.Normal('brain', mu=mu, sd=sigma, observed=d['brain'])
    m_6_8 = pm.sample(1000, tune=1000)
theta = pm.summary(m_6_8)['mean'][:3]  # posterior means of a, b and sigma
#compute deviance
dev = - 2 * sum(stats.norm.logpdf(d['brain'], loc = theta[0] + theta[1] * d['mass_s'] , scale = theta[2]))
dev
100.48254797793284
The Overthinking section corresponding to code cells 6.12-6.14 is not ported because it requires an ad hoc function from the rethinking R package. Feel free to contribute code for this section.
data = pd.read_csv('Data/cars.csv', sep=',')
with pm.Model() as m_6_15:
    a = pm.Normal('a', mu=0, sd=100)
    b = pm.Normal('b', mu=0, sd=10)
    sigma = pm.Uniform('sigma', 0, 30)
    mu = pm.Deterministic('mu', a + b * data['speed'])
    dist = pm.Normal('dist', mu=mu, sd=sigma, observed=data['dist'])
    m_6_15 = pm.sample(1000, tune=1000)
n_samples = 1000
n_cases = data.shape[0]
ll = np.zeros((n_cases, n_samples))
for s in range(0, n_samples):
    mu = m_6_15['a'][s] + m_6_15['b'][s] * data['speed']
    p_ = stats.norm.logpdf(data['dist'], loc=mu, scale=m_6_15['sigma'][s])
    ll[:, s] = p_
from scipy.special import logsumexp  # scipy.misc.logsumexp is deprecated since SciPy 1.0
n_cases = data.shape[0]
lppd = np.zeros((n_cases))
for a in range(n_cases):
    lppd[a] = logsumexp(ll[a]) - np.log(n_samples)
pWAIC = np.zeros((n_cases))
for i in range(n_cases):
    pWAIC[i] = np.var(ll[i])
- 2 * (sum(lppd) - sum(pWAIC))
412.49046436448634
waic_vec = - 2 * (lppd - pWAIC)
np.sqrt(n_cases * np.var(waic_vec))
14.88328369492803
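The two loops above can also be written in vectorized form, which serves as a cross-check of the element-wise computation (a sketch reusing the same `ll` matrix; `np.var` defaults to ddof=0, matching the loop):
lppd_vec = logsumexp(ll, axis=1) - np.log(n_samples)  # log pointwise predictive density
p_waic_vec = np.var(ll, axis=1)                       # effective number of parameters
waic_manual = -2 * (lppd_vec.sum() - p_waic_vec.sum())
waic_se = np.sqrt(n_cases * np.var(-2 * (lppd_vec - p_waic_vec)))
waic_manual, waic_se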
d = pd.read_csv('Data/milk.csv', sep=';')
d['neocortex'] = d['neocortex.perc'] / 100
d.dropna(inplace=True)
d.shape
(17, 9)
a_start = d['kcal.per.g'].mean()
sigma_start = d['kcal.per.g'].std()
import theano
mass_shared = theano.shared(np.log(d['mass'].values))
neocortex_shared = theano.shared(d['neocortex'].values)
with pm.Model() as m6_11:
    alpha = pm.Normal('alpha', mu=0, sd=10, testval=a_start)
    mu = alpha + 0 * neocortex_shared
    sigma = pm.HalfCauchy('sigma', beta=10, testval=sigma_start)
    kcal = pm.Normal('kcal', mu=mu, sd=sigma, observed=d['kcal.per.g'])
    trace_m6_11 = pm.sample(1000, tune=1000)

with pm.Model() as m6_12:
    alpha = pm.Normal('alpha', mu=0, sd=10, testval=a_start)
    beta = pm.Normal('beta', mu=0, sd=10)
    sigma = pm.HalfCauchy('sigma', beta=10, testval=sigma_start)
    mu = alpha + beta * neocortex_shared
    kcal = pm.Normal('kcal', mu=mu, sd=sigma, observed=d['kcal.per.g'])
    trace_m6_12 = pm.sample(1000, tune=1000)

with pm.Model() as m6_13:
    alpha = pm.Normal('alpha', mu=0, sd=10, testval=a_start)
    beta = pm.Normal('beta', mu=0, sd=10)
    sigma = pm.HalfCauchy('sigma', beta=10, testval=sigma_start)
    mu = alpha + beta * mass_shared
    kcal = pm.Normal('kcal', mu=mu, sd=sigma, observed=d['kcal.per.g'])
    trace_m6_13 = pm.sample(1000, tune=1000)

with pm.Model() as m6_14:
    alpha = pm.Normal('alpha', mu=0, sd=10, testval=a_start)
    beta = pm.Normal('beta', mu=0, sd=10, shape=2)
    sigma = pm.HalfCauchy('sigma', beta=10, testval=sigma_start)
    mu = alpha + beta[0] * mass_shared + beta[1] * neocortex_shared
    kcal = pm.Normal('kcal', mu=mu, sd=sigma, observed=d['kcal.per.g'])
    trace_m6_14 = pm.sample(1000, tune=1000)
(NUTS sampling output: PyMC3 reported a few divergent samples after tuning for some of these models, and for one chain an acceptance probability below the 0.8 target, suggesting to increase `target_accept` or reparameterize.)
pm.waic(trace_m6_14, m6_14)
UserWarning: For one or more samples the posterior variance of the log predictive densities exceeds 0.4. This could be an indication of WAIC starting to fail; see http://arxiv.org/abs/1507.04544 for details.
WAIC_r(WAIC=-17.054227823698863, WAIC_se=4.8600477062566103, p_WAIC=2.9504889764816951)
compare_df = pm.compare([trace_m6_11, trace_m6_12, trace_m6_13, trace_m6_14],
                        [m6_11, m6_12, m6_13, m6_14],
                        method='pseudo-BMA')
compare_df.loc[:,'model'] = pd.Series(['m6.11', 'm6.12', 'm6.13', 'm6.14'])
compare_df = compare_df.set_index('model')
compare_df
| model | WAIC | pWAIC | dWAIC | weight | SE | dSE | warning |
|---|---|---|---|---|---|---|---|
| m6.14 | -17.05 | 2.95 | 0 | 0.96 | 4.86 | 0 | 1 |
| m6.13 | -9.22 | 1.88 | 7.84 | 0.02 | 3.99 | 3.25 | 1 |
| m6.11 | -8.63 | 1.36 | 8.43 | 0.01 | 3.58 | 4.61 | 1 |
| m6.12 | -7.18 | 1.93 | 9.88 | 0.01 | 3.15 | 4.76 | 1 |
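The weight column follows the Akaike-weight formula discussed in the book, w_i = exp(-0.5 dWAIC_i) / sum_j exp(-0.5 dWAIC_j). A quick sketch reproduces it from the dWAIC column above (ignoring any Bayesian-bootstrap refinement pm.compare may apply):
dwaic = np.array([0., 7.84, 8.43, 9.88])  # dWAIC values from the table above
w = np.exp(-0.5 * dwaic)
w / w.sum()  # ~ [0.96, 0.02, 0.01, 0.01], matching the weight column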
pm.compareplot(compare_df);
diff = np.random.normal(loc=6.7, scale=7.26, size=100000)
# proportion of the implied difference distribution that falls below zero
np.mean(diff < 0)  # roughly 0.18
The compare function already checks that the models were fit to the same number of observations.
coeftab = pd.DataFrame({'m6_11': pm.summary(trace_m6_11)['mean'],
'm6_12': pm.summary(trace_m6_12)['mean'],
'm6_13': pm.summary(trace_m6_13)['mean'],
'm6_14': pm.summary(trace_m6_14)['mean']})
coeftab
| | m6_11 | m6_12 | m6_13 | m6_14 |
|---|---|---|---|---|
| alpha | 0.657464 | 0.342740 | 0.706000 | -1.074019 |
| beta | NaN | 0.466021 | -0.031884 | NaN |
| beta__0 | NaN | NaN | NaN | -0.095749 |
| beta__1 | NaN | NaN | NaN | 2.774855 |
| sigma | 0.188901 | 0.192944 | 0.183208 | 0.139930 |
traces = [trace_m6_11, trace_m6_12, trace_m6_13, trace_m6_14]
models = [m6_11, m6_12, m6_13, m6_14]
plt.figure(figsize=(10, 8))
pm.forestplot(traces, plot_kwargs={'fontsize':14});
kcal_per_g = np.repeat(0, 30) # empty outcome
neocortex = np.linspace(0.5, 0.8, 30) # sequence of neocortex
mass = np.repeat(4.5, 30) # average mass
mass_shared.set_value(np.log(mass))
neocortex_shared.set_value(neocortex)
post_pred = pm.sample_ppc(trace_m6_14, samples=10000, model=m6_14)
milk_ensemble = pm.sample_ppc_w(traces, 10000, models,
                                weights=compare_df.weight.sort_index(ascending=True))  # weights reordered to match traces/models
plt.figure(figsize=(8, 6))
plt.plot(neocortex, post_pred['kcal'].mean(0), ls='--', color='C2')
hpd_post_pred = pm.hpd(post_pred['kcal'])
plt.plot(neocortex,hpd_post_pred[:,0], ls='--', color='C2')
plt.plot(neocortex, hpd_post_pred[:,1], ls='--', color='C2')
plt.plot(neocortex, milk_ensemble['kcal'].mean(0), color='C0')
hpd_av = pm.hpd(milk_ensemble['kcal'])
plt.fill_between(neocortex, hpd_av[:,0], hpd_av[:,1], alpha=0.1, color='C0')
plt.scatter(d['neocortex'], d['kcal.per.g'], facecolor='None', edgecolors='C0')
plt.ylim(0.3, 1)
plt.xlabel('neocortex', fontsize=16)
plt.ylabel('kcal.per.g', fontsize=16);
import sys, IPython, scipy, matplotlib, platform
print("This notebook was createad on a computer %s running %s and using:\nPython %s\nIPython %s\nPyMC3 %s\nNumPy %s\nPandas %s\nSciPy %s\nMatplotlib %s\n" % (platform.machine(), ' '.join(platform.linux_distribution()[:2]), sys.version[:5], IPython.__version__, pm.__version__, np.__version__, pd.__version__, scipy.__version__, matplotlib.__version__))
This notebook was createad on a computer x86_64 running debian stretch/sid and using: Python 3.5.4 IPython 4.1.2 PyMC3 3.2 NumPy 1.13.3 Pandas 0.21.0 SciPy 1.0.0 Matplotlib 2.0.2