# Chapter 6. Overfitting, Regularization, and Information Criteria¶

In [1]:
import pandas as pd
import seaborn as sns
import torch
from torch.distributions import constraints

import pyro
import pyro.distributions as dist
import pyro.ops.stats as stats

# NOTE(review): this import was truncated mid-parenthesis in the export;
# restored with the rethinking helpers referenced later in the notebook.
from rethinking import (LM, MAP, coef, compare, ensemble, extract_samples,
                        precis, replicate)


### Code 6.1¶

In [2]:
# Code 6.1: hominin species data — brain volume (cc) and body mass (kg).
species_data = {
    "afarensis": (438., 37.0),
    "africanus": (452., 35.5),
    "habilis": (612., 34.5),
    "boisei": (521., 41.5),
    "rudolfensis": (752., 55.5),
    "ergaster": (871., 61.0),
    "sapiens": (1350., 53.5),
}
sppnames = list(species_data)
brainvolcc = torch.tensor([brain for brain, _ in species_data.values()])
masskg = torch.tensor([mass for _, mass in species_data.values()])
d = pd.DataFrame({"species": sppnames, "brain": brainvolcc, "mass": masskg})


### Code 6.2¶

In [3]:
# Code 6.2: linear model brain ~ mass.
m6_1 = LM("brain ~ mass", data=d)
# Fix sigma at 1 via pyro.do so only the regression coefficients are
# optimized (mirrors R's lm, which has no sigma in the optimization).
m6_1.model = pyro.do(m6_1.model, data={"sigma": 1})
m6_1.run();


### Code 6.3¶

In [4]:
# Code 6.3: "R squared" by hand — 1 - var(residuals) / var(outcome).
resid = brainvolcc - (coef(m6_1)["Intercept"] + coef(m6_1)["mass"] * masskg)
1 - resid.var() / brainvolcc.var()

Out[4]:
tensor(0.4902)

### Code 6.4¶

In [5]:
# Code 6.4: second-degree polynomial in mass.
m6_2 = LM("brain ~ mass + I(mass ** 2)", data=d).run()


### Code 6.5¶

In [6]:
# Code 6.5: third- through sixth-degree polynomial models.
m6_3 = LM("brain ~ mass + I(mass ** 2) + I(mass ** 3)", data=d).run()
m6_4 = LM("brain ~ mass + I(mass ** 2) + I(mass ** 3) + I(mass ** 4)", data=d).run()
# Switch to double precision for the two highest-degree fits — presumably for
# numerical stability of the near-perfect fits; TODO confirm it is required.
torch.set_default_dtype(torch.double)
m6_5 = LM("brain ~ mass + I(mass ** 2) + I(mass ** 3) + I(mass ** 4) "
"+ I(mass ** 5)", data=d)
# sigma fixed at 1, as in Code 6.2
m6_5.model = pyro.do(m6_5.model, data={"sigma": 1})
m6_5.run()
m6_6 = LM("brain ~ mass + I(mass ** 2) + I(mass ** 3) + I(mass ** 4) "
"+ I(mass ** 5) + I(mass ** 6)",
data=d)
m6_6.model = pyro.do(m6_6.model, data={"sigma": 1})
m6_6.run()
# restore the default dtype for the rest of the notebook
torch.set_default_dtype(torch.float)


### Code 6.6¶

In [7]:
# Code 6.6: intercept-only (null) model — predicts the mean brain volume.
m6_7 = LM("brain ~ 1", data=d).run()


### Code 6.7¶

# Code 6.7: drop one row and reindex. The book applies this inside the
# loop of Code 6.8, where `i` is the loop variable.
i = 0  # NOTE(review): `i` was undefined here in the export; example row index
d_new = d.drop(index=i)
d_new.index = range(d_new.shape[0])


### Code 6.8¶

In [8]:
# Code 6.8: leave-one-out sensitivity of the straight-line fit.
# NOTE(review): the loop body lost its indentation in the export; restored.
sns.scatterplot("mass", "brain", data=d)
x = torch.linspace(34, 62, 101)  # hoisted: grid does not depend on i
for i in range(d.shape[0]):
    # refit the model with row i dropped
    d_new = d.drop(index=i)
    d_new.index = range(d_new.shape[0])
    m0 = LM("brain ~ mass", data=d_new)
    m0.model = pyro.do(m0.model, data={"sigma": 1})
    m0.run()
    # overlay the fitted line for this leave-one-out sample
    sns.lineplot(x, coef(m0)["Intercept"] + coef(m0)["mass"] * x,
                 color="k", alpha=0.5)


### Code 6.9¶

In [9]:
# Code 6.9: information entropy -sum(p * log p) of a two-event distribution.
p = torch.tensor([0.3, 0.7])
(p * p.log()).sum().neg()

Out[9]:
tensor(0.6109)

### Code 6.10¶

In [10]:
# fit model with lm
m6_1 = LM("brain ~ mass", data=d).run()
# refit with sigma fixed at 1, matching Code 6.2
m6_1.model = pyro.do(m6_1.model, data={"sigma": 1})
m6_1.run()

# compute deviance by cheating
coefs = coef(m6_1)
resid = brainvolcc - (coefs["Intercept"] + coefs["mass"] * masskg)
# ML estimate of sigma: root mean squared residual (not the n-1 variance)
sigma = resid.pow(2).mean().sqrt()
coefs["sigma"] = sigma
# condition the model on the fitted values and evaluate -2 * log-likelihood
model = pyro.do(LM("brain ~ mass", data=d, centering=False).model, data=coefs)
(-2) * pyro.poutine.trace(model).get_trace().log_prob_sum()

Out[10]:
tensor(94.9250)

### Code 6.11¶

In [11]:
# Code 6.11: fit brain ~ standardized mass, then compute the deviance.
# NOTE(review): the model body lost its indentation in the export; restored.

# standardize the mass before fitting
mass_s = (masskg - masskg.mean()) / masskg.std()

def model(mass, brain):
    """Gaussian regression with point (pyro.param) estimates for a, b, sigma."""
    a = pyro.param("a", brainvolcc.mean())
    b = pyro.param("b", torch.tensor(0.))
    mu = a + b * mass
    sigma = pyro.param("sigma", brainvolcc.std(), constraints.positive)
    with pyro.plate("plate"):
        pyro.sample("brain", dist.Normal(mu, sigma), obs=brain)

m6_8 = MAP(model).run(mass_s, brainvolcc)

# extract MAP estimates
theta = list(coef(m6_8).values())

# compute deviance: -2 * log-likelihood at the MAP estimate
dev = (-2) * dist.Normal(theta[0] + theta[1] * mass_s,
                         theta[2]).log_prob(brainvolcc).sum()
dev

Out[11]:
tensor(94.9536)

### Code 6.12¶

In [12]:
def model(mm, y, b_sigma):
    """Regress y on design matrix mm: flat prior on the intercept,
    Normal(0, b_sigma) priors on the k-1 slopes, sigma fixed at 1.
    NOTE(review): the body lost its indentation in the export; restored."""
    a = pyro.param("a", torch.tensor(0.))
    Bvec = a.unsqueeze(0)
    k = mm.size(1)
    if k > 1:
        with pyro.plate("plate_b", k - 1):
            b = pyro.sample("b", dist.Normal(0, b_sigma))
        Bvec = torch.cat([Bvec, b])
    mu = mm.matmul(Bvec)
    with pyro.plate("plate"):
        pyro.sample("y", dist.Normal(mu, 1), obs=y)

def sim_train_test(N=20, k=3, rho=(0.15, -0.4), b_sigma=100):
    """Simulate data, fit a k-parameter model, and return the in-sample and
    out-of-sample deviances as a length-2 tensor [dev_train, dev_test].

    NOTE(review): the export lost the body indentation AND the return
    statement; the return is reconstructed from how dev_fn stacks results.
    The mutable list default for rho was also replaced by a tuple.
    """
    n_dim = max(k, 3)
    # correlation matrix: first variable correlated rho with the next ones
    Rho = torch.eye(n_dim)
    Rho[1:len(rho) + 1, 0] = torch.tensor(rho)
    Rho[0, 1:len(rho) + 1] = torch.tensor(rho)

    X_train = dist.MultivariateNormal(torch.zeros(n_dim), Rho).sample(torch.Size([N]))
    mm_train = torch.ones(N, 1)
    if k > 1:
        mm_train = torch.cat([mm_train, X_train[:, 1:k]], dim=1)

    m = MAP(model, start={"b": torch.zeros(k - 1)}, num_samples=10)
    m.run(mm_train, X_train[:, 0], b_sigma)
    coefs = torch.cat([c.reshape(-1) for c in coef(m).values()])
    dev_train = (-2) * dist.Normal(mm_train.matmul(coefs),
                                   1).log_prob(X_train[:, 0]).sum()

    # fresh sample from the same process for the out-of-sample deviance
    X_test = dist.MultivariateNormal(torch.zeros(n_dim), Rho).sample(torch.Size([N]))
    mm_test = torch.ones(N, 1)
    if k > 1:
        mm_test = torch.cat([mm_test, X_test[:, 1:k]], dim=1)
    dev_test = (-2) * dist.Normal(mm_test.matmul(coefs),
                                  1).log_prob(X_test[:, 0]).sum()
    return torch.stack([dev_train, dev_test])

def dev_fn(N, k):
    """Replicate sim_train_test 1e4 times and summarize as a 4-vector:
    [train mean, test mean, train sd, test sd].

    NOTE(review): the export lost the return statement; reconstructed from
    Code 6.14, which indexes dev[0..3] as in/out means and then in/out sds.
    """
    print(k)  # progress indicator (appears as 1..5 in the cell output)
    r = torch.stack(replicate(int(1e4), sim_train_test, (N, k)), dim=1)
    return torch.cat([r.mean(1), r.std(1)])

N = 20
kseq = range(1, 6)
# one summary column per number of parameters k
dev = torch.stack([dev_fn(N, k) for k in kseq], dim=1)

1
2
3
4
5


### Code 6.13¶

# Code 6.13: same replication as dev_fn, parallelized over 4 cores.
r = torch.stack(replicate(int(1e4), sim_train_test, (N, k), mc_cores=4), dim=1)


### Code 6.14¶

In [13]:
# Code 6.14: in-sample (blue) vs out-of-sample (black) deviance by complexity.
ax = sns.scatterplot(torch.arange(1, 6), dev[0], s=80, color="b")
ax.set(xlabel="number of parameters", ylabel="deviance", title="N = {}".format(N))
# out-of-sample points shifted right by 0.1 so the pairs don't overlap
sns.scatterplot(torch.arange(1.1, 6), dev[1], s=80, color="k")
# +/- 1 standard deviation intervals for each deviance estimate
pts_int = (dev[0] - dev[2], dev[0] + dev[2])
pts_out = (dev[1] - dev[3], dev[1] + dev[3])
ax.vlines(torch.arange(1, 6), pts_int[0], pts_int[1], color="b")
ax.vlines(torch.arange(1.1, 6), pts_out[0], pts_out[1], color="k")
ax.annotate("in", (2, dev[0][1]), xytext=(-25, -5),
textcoords="offset pixels", color="b")
ax.annotate("out", (2.1, dev[1][1]), xytext=(10, -5), textcoords="offset pixels")
ax.annotate("-1SD", (2.1, pts_out[1][1]), xytext=(10, -5),
textcoords="offset pixels", fontsize=12)
ax.annotate("+1SD", (2.1, pts_out[0][1]), xytext=(10, -5),
textcoords="offset pixels", fontsize=12);


### Code 6.15¶

In [14]:
# Code 6.15: cars data and a MAP regression of stopping distance on speed.
# NOTE(review): the model body lost its indentation in the export; restored.
cars = pd.read_csv("../data/cars.csv", sep=",")
speed = torch.tensor(cars["speed"], dtype=torch.float)
distance = torch.tensor(cars["dist"], dtype=torch.float)

def model(speed, distance):
    """dist ~ Normal(a + b * speed, sigma) with weak priors."""
    a = pyro.sample("a", dist.Normal(0, 100))
    b = pyro.sample("b", dist.Normal(0, 10))
    mu = a + b * speed
    sigma = pyro.sample("sigma", dist.Uniform(0, 30))
    with pyro.plate("plate"):
        pyro.sample("dist", dist.Normal(mu, sigma), obs=distance)

m = MAP(model).run(speed, distance)
# draw 1000 samples from the fitted posterior approximation
idx = m._categorical.sample(torch.Size([1000]))
post = {latent: samples[idx] for latent, samples in extract_samples(m).items()}


### Code 6.16¶

In [15]:
# Code 6.16: pointwise log-likelihood matrix, one column per posterior draw.
# NOTE(review): the function body lost its indentation in the export; restored.
n_samples = 1000

def ll_fn(s):
    """Log-likelihood of every observation under posterior draw s."""
    mu = post["a"][s] + post["b"][s] * speed
    return dist.Normal(mu, post["sigma"][s]).log_prob(distance)

ll = torch.stack([ll_fn(s) for s in range(n_samples)], dim=1)


### Code 6.17¶

In [16]:
# Code 6.17: lppd — log pointwise predictive density per observation,
# i.e. log of the mean likelihood over posterior samples.
n_cases = cars.shape[0]
lppd = ll.logsumexp(dim=1) - torch.log(torch.tensor(float(n_samples)))


### Code 6.18¶

In [17]:
# Code 6.18: pWAIC — effective number of parameters: per-case variance
# of the log-likelihood across posterior samples.
pWAIC = ll.var(dim=1)


### Code 6.19¶

In [18]:
# Code 6.19: WAIC = -2 * (total lppd - total penalty).
elpd_waic = lppd.sum() - pWAIC.sum()
-2 * elpd_waic

Out[18]:
tensor(421.0318)

### Code 6.20¶

In [19]:
# Code 6.20: standard error of WAIC from the pointwise WAIC values.
waic_vec = -2 * (lppd - pWAIC)
(waic_vec.var() * n_cases).sqrt()

Out[19]:
tensor(14.3545)

### Code 6.21¶

In [20]:
# Code 6.21: milk data — keep complete cases and add neocortex as a proportion.
milk = pd.read_csv("../data/milk.csv", sep=";")
d = milk.dropna().assign(neocortex=lambda frame: frame["neocortex.perc"] / 100)
d.shape

Out[20]:
(17, 9)

### Code 6.22¶

In [21]:
# Code 6.22: four nested models of milk energy, fit by MAP with an
# lm-style log-sigma parameterization. Each `def model` is consumed
# immediately by MAP(model) before being redefined.
# NOTE(review): all four model bodies lost their indentation in the export.
kcal_per_g = torch.tensor(d["kcal.per.g"], dtype=torch.float)
a_start = kcal_per_g.mean()
sigma_start = kcal_per_g.std().log()

def model(kcal_per_g):
    """m6.11: intercept-only model."""
    a = pyro.param("a", a_start)
    log_sigma = pyro.param("log.sigma", sigma_start)
    with pyro.plate("plate"):
        pyro.sample("kcal.per.g", dist.Normal(a, log_sigma.exp()), obs=kcal_per_g)

m6_11 = MAP(model).run(kcal_per_g)

def model(neocortex, kcal_per_g):
    """m6.12: neocortex proportion as the single predictor."""
    a = pyro.param("a", a_start)
    bn = pyro.param("bn", torch.tensor(0.))
    mu = a + bn * neocortex
    log_sigma = pyro.param("log.sigma", sigma_start)
    with pyro.plate("plate"):
        pyro.sample("kcal.per.g", dist.Normal(mu, log_sigma.exp()), obs=kcal_per_g)

neocortex = torch.tensor(d["neocortex"], dtype=torch.float)
m6_12 = MAP(model).run(neocortex, kcal_per_g)

def model(mass, kcal_per_g):
    """m6.13: log body mass as the single predictor."""
    a = pyro.param("a", a_start)
    bm = pyro.param("bm", torch.tensor(0.))
    mu = a + bm * mass.log()
    log_sigma = pyro.param("log.sigma", sigma_start)
    with pyro.plate("plate"):
        pyro.sample("kcal.per.g", dist.Normal(mu, log_sigma.exp()), obs=kcal_per_g)

mass = torch.tensor(d["mass"], dtype=torch.float)
m6_13 = MAP(model).run(mass, kcal_per_g)

def model(neocortex, mass, kcal_per_g):
    """m6.14: both neocortex and log mass as predictors."""
    a = pyro.param("a", a_start)
    bn = pyro.param("bn", torch.tensor(0.))
    bm = pyro.param("bm", torch.tensor(0.))
    mu = a + bn * neocortex + bm * mass.log()
    log_sigma = pyro.param("log.sigma", sigma_start)
    with pyro.plate("plate"):
        pyro.sample("kcal.per.g", dist.Normal(mu, log_sigma.exp()), obs=kcal_per_g)

m6_14 = MAP(model).run(neocortex, mass, kcal_per_g)


### Code 6.23¶

In [22]:
# Code 6.23: WAIC of the full model.
# NOTE(review): the with-block body lost its indentation in the export.
with torch.no_grad():
    print(m6_14.information_criterion())

OrderedDict([('waic', tensor(-15.1608)), ('p_waic', tensor(4.7740))])


### Code 6.24¶

In [23]:
# Code 6.24: WAIC comparison table for the four milk models.
milk_models = compare({"m6.11": m6_11, "m6.12": m6_12, "m6.13": m6_13, "m6.14": m6_14})
milk_models.round(2)

Out[23]:
WAIC pWAIC dWAIC weight SE dSE
m6.14 -15.16 4.77 0.00 0.93 7.73 0.00
m6.11 -8.22 1.85 6.94 0.03 4.67 7.23
m6.13 -8.01 2.97 7.16 0.03 5.81 5.26
m6.12 -6.45 2.79 8.71 0.01 4.37 7.52

### Code 6.25¶

In [24]:
# Code 6.25: WAIC plot — open points are WAIC, black points are deviance
# (WAIC minus twice the penalty), triangles are the dSE intervals.
ax = sns.pointplot(milk_models["WAIC"], milk_models.index, join=False)
ax.errorbar(milk_models["WAIC"], milk_models.index, xerr=milk_models["SE"], fmt="none")
sns.pointplot(milk_models["WAIC"] - 2 * milk_models["pWAIC"],
milk_models.index, color="k", join=False)
# difference-SE bars for every model except the top-ranked one
ax.errorbar(milk_models["WAIC"][1:], [0.5, 1.5, 2.5],
xerr=milk_models["dSE"][1:], fmt="^")
ax.axvline(milk_models["WAIC"][0], c="k", alpha=0.2)
ax.yaxis.grid(True)


### Code 6.26¶

In [25]:
# Code 6.26: probability that m6.11 actually beats the top model, treating
# the WAIC difference as Normal(dWAIC, dSE).
dWAIC = milk_models.loc["m6.11", "dWAIC"]
dSE = milk_models.loc["m6.11", "dSE"]
# NOTE(review): no seed is set here, so this draw varies run to run.
diff = torch.empty(int(1e5)).normal_(dWAIC, dSE)
(diff < 0).float().sum() / 1e5

Out[25]:
tensor(0.1719)

### Code 6.27¶

In [26]:
# Code 6.27: coefficient means side by side for all four models
# (NaN marks parameters absent from a model).
pd.concat([precis(m6_11), precis(m6_12), precis(m6_13), precis(m6_14)],
keys=["m6.11", "m6.12", "m6.13", "m6.14"]).unstack(level=0)["Mean"]

Out[26]:
m6.11 m6.12 m6.13 m6.14
a 0.66 0.35 0.71 -1.09
bm NaN NaN -0.03 -0.10
bn NaN 0.45 NaN 2.80
log.sigma -1.78 -1.79 -1.85 -2.17

### Code 6.28¶

In [27]:
# Code 6.28: coefficient table plot across the four milk models.
# NOTE(review): the for-loop body lost its indentation in the export; restored.
coeftab = pd.concat([precis(m6_11), precis(m6_12), precis(m6_13), precis(m6_14)],
                    keys=["m6.11", "m6.12", "m6.13", "m6.14"]).unstack(level=0)
mean = coeftab["Mean"].iloc[:, ::-1].stack(dropna=False)
error = mean - coeftab["|0.89"].iloc[:, ::-1].stack(dropna=False)
ax = sns.pointplot(mean, mean.index, join=False, facecolors="none")
ax.errorbar(mean, range(mean.shape[0]), xerr=error, fmt="none", c="k")
ax.set(xlabel="Estimate")
# dashed separator lines between parameter groups
for i in range(1, coeftab.shape[0]):
    ax.axhline(i * coeftab["Mean"].shape[1] - 0.5, c="k", alpha=0.2, linestyle="--")
ax.yaxis.grid(True)


### Code 6.29¶

In [28]:
# compute counterfactual predictions
# neocortex from 0.5 to 0.8
nc_seq = torch.linspace(start=0.5, end=0.8, steps=30)
d_predict = {
"kcal_per_g": None,  # empty outcome
"neocortex": nc_seq,  # sequence of neocortex
"mass": torch.tensor(4.5).repeat(30)  # average mass
}
# NOTE(review): pred_m6_14 is undefined in this export — presumably a line
# like `pred_m6_14 = link(m6_14, data=d_predict)` was dropped; TODO restore.
mu = pred_m6_14.mean(0)
mu_PI = stats.pi(pred_m6_14, 0.89, dim=0)

# plot it all: data, mean prediction, and 89% interval (all dashed)
fig, ax = sns.mpl.pyplot.subplots()
sns.scatterplot("neocortex", "kcal.per.g", data=d, s=80)
sns.lineplot(nc_seq, mu, color="k", linestyle="--")
ax.lines[-1].set_linestyle("--")
sns.lineplot(nc_seq, mu_PI[0], color="k", linestyle="--")
ax.lines[-1].set_linestyle("--")
sns.lineplot(nc_seq, mu_PI[1], color="k", linestyle="--")
ax.lines[-1].set_linestyle("--")


### Code 6.30¶

In [29]:
# Code 6.30: WAIC-weighted ensemble predictions over the same grid.
milk_ensemble = ensemble({"m6.11": m6_11, "m6.12": m6_12,
"m6.13": m6_13, "m6.14": m6_14}, data=d_predict)
# NOTE(review): mu / mu_PI below still hold the m6.14-only values from
# Code 6.29 — presumably they should be recomputed from milk_ensemble
# before plotting; TODO confirm against the original notebook.
sns.lineplot(nc_seq, mu, color="k", ax=ax)
ax.fill_between(nc_seq, mu_PI[0], mu_PI[1], color="k", alpha=0.2)
fig

Out[29]:

### Code 6.31¶

In [30]:
# Code 6.31: Howell data — standardize age and split into random halves.
howell1 = pd.read_csv("../data/Howell1.csv", sep=";")
# NOTE(review): d aliases howell1, so the standardization below mutates
# howell1 as well.
d = howell1
d["age"] = (d["age"] - d["age"].mean()) / d["age"].std()
torch.manual_seed(1000)
# sample half of the row indices
i = torch.multinomial(torch.ones(d.shape[0]), num_samples=(d.shape[0] // 2)).tolist()
d1 = d.iloc[i]
d2 = d.drop(index=i)