Chapter 18 - Metric Predicted Variable with Multiple Metric Predictors

In [1]:
# %load ../../standard_import.txt
import pandas as pd
import numpy as np
import pymc3 as pm
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib import gridspec
from IPython.display import Image

%matplotlib inline
plt.style.use('seaborn-white')

color = '#87ceeb'
f_dict = {'size':16}

18.1 - Multiple Linear Regression

Data

In [2]:
df = pd.read_csv('data/Guber1999data.csv')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 8 columns):
State        50 non-null object
Spend        50 non-null float64
StuTeaRat    50 non-null float64
Salary       50 non-null float64
PrcntTake    50 non-null int64
SATV         50 non-null int64
SATM         50 non-null int64
SATT         50 non-null int64
dtypes: float64(3), int64(4), object(1)
memory usage: 3.2+ KB
In [3]:
df.head()
Out[3]:
State Spend StuTeaRat Salary PrcntTake SATV SATM SATT
0 Alabama 4.405 17.2 31.144 8 491 538 1029
1 Alaska 8.963 17.6 47.951 47 445 489 934
2 Arizona 4.778 19.3 32.175 27 448 496 944
3 Arkansas 4.459 17.1 28.934 6 482 523 1005
4 California 4.992 24.0 41.078 45 417 485 902
In [4]:
X = df[['Spend', 'PrcntTake']]
y = df['SATT']

meanx = X.mean().values
scalex = X.std().values
zX = ((X-meanx)/scalex).values

meany = y.mean()
scaley = y.std()
zy = ((y-meany)/scaley).values

Model (Kruschke, 2015)

In [5]:
Image('images/fig18_4.png', width=400)
Out[5]:
In [6]:
with pm.Model() as model:
    
    zbeta0 = pm.Normal('zbeta0', mu=0, sd=2)
    zbetaj = pm.Normal('zbetaj', mu=0, sd=2, shape=(2))
    zmu =  zbeta0 + pm.math.dot(zbetaj, zX.T)
        
    nu = pm.Exponential('nu', 1/29.)
    zsigma = pm.Uniform('zsigma', 10**-5, 10)
    
    likelihood = pm.StudentT('likelihood', nu=nu, mu=zmu, lam=1/zsigma**2, observed=zy)
In [7]:
with model:
    trace = pm.sample(10000)
Auto-assigning NUTS sampler...
Initializing NUTS using ADVI...
Average Loss = 42.951: 100%|██████████| 200000/200000 [00:25<00:00, 7746.77it/s]
Finished [100%]: Average Loss = 42.951
 99%|█████████▉| 10432/10500 [00:14<00:00, 726.14it/s]/Users/jordi/anaconda/envs/probalistic/lib/python3.5/site-packages/pymc3/step_methods/hmc/nuts.py:247: UserWarning: Chain 0 contains diverging samples after tuning. If increasing `target_accept` doesn't help, try to reparameterize.
  "try to reparameterize." % chain)
100%|██████████| 10500/10500 [00:14<00:00, 727.92it/s]
In [8]:
pm.traceplot(trace);

Figure 18.5

In [9]:
# Transform parameters back to original scale
burnin = 100
beta0 = trace['zbeta0']*scaley + meany - np.sum(trace['zbetaj']*meanx/scalex, axis=1)*scaley
betaj = (trace['zbetaj']/scalex)*scaley
scale = (trace['zsigma']*scaley)[burnin:]

intercept = beta0[burnin:]
spend = betaj[:,0][burnin:]
prcnttake =  betaj[:,1][burnin:]
normality = np.log10(trace['nu'][burnin:])

fig, ([ax1, ax2, ax3], [ax4, ax5, ax6]) = plt.subplots(2,3, figsize=(12,6))

for ax, estimate, title, xlabel in zip(fig.axes,
                               [intercept, spend, prcnttake, scale, normality],
                               ['Intercept', 'Spend', 'PrcntTake', 'Scale', 'Normality'],
                               [r'$\beta_0$', r'$\beta_1$', r'$\beta_2$', r'$\sigma$', r'log10($\nu$)']):
    pm.plot_posterior(estimate, point_estimate='mode', ax=ax, color=color)
    ax.set_title(title, fontdict=f_dict)
    ax.set_xlabel(xlabel, fontdict=f_dict)

fig.tight_layout()
ax6.set_visible(False)

Below we create the scatterplots of figure 18.5 using pairplot() in seaborn and then tweak the lower triangle.

In [10]:
# DataFrame with the columns in correct order
pair_plt = pd.DataFrame({'Intercept':intercept,
                         'Spend':spend,
                         'PrcntTake': prcnttake,
                         'Scale':scale,
                         'Normality': normality},
                        columns=['Intercept', 'Spend', 'PrcntTake', 'Scale', 'Normality'])

# Correlation coefficients
corr = np.round(np.corrcoef(pair_plt, rowvar=0), decimals=3)
# Indexes of the lower triangle, below the diagonal
lower_idx = np.tril(corr, -1).nonzero()

# The seaborn pairplot
pgrid = sns.pairplot(pair_plt, plot_kws={'edgecolor':color, 'facecolor':'none'})

# Replace the plots on the diagonal with the parameter names
for i, ax in enumerate(pgrid.diag_axes):
    ax.clear()
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    ax.text(.5,.5, pair_plt.columns[i], transform=ax.transAxes, fontdict={'size':16, 'weight':'bold'}, ha='center') 

# Replace the lower triangle with the correlation coefficients
for i, ax in enumerate(pgrid.axes[lower_idx]):
    ax.clear()
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    ax.text(.5,.5, corr[lower_idx][i], transform=ax.transAxes, fontdict=f_dict, ha='center')

18.1.4 - Redundant predictors

In [11]:
X2 = X.assign(PropNotTake = lambda x: (100-x.PrcntTake)/100)

meanx2 = X2.mean().values
scalex2 = X2.std().values
zX2 = ((X2-meanx2)/scalex2).values
In [12]:
with pm.Model() as model2:
    
    zbeta0 = pm.Normal('zbeta0', mu=0, sd=2)
    zbetaj = pm.Normal('zbetaj', mu=0, sd=2, shape=(3))
    zmu =  zbeta0 + pm.math.dot(zbetaj, zX2.T)
        
    nu = pm.Exponential('nu', 1/29.)
    zsigma = pm.Uniform('zsigma', 10**-5, 10)
    
    likelihood = pm.StudentT('likelihood', nu=nu, mu=zmu, lam=1/zsigma**2, observed=zy)    
In [13]:
with model2:
    trace2 = pm.sample(5000)
Auto-assigning NUTS sampler...
Initializing NUTS using ADVI...
Average Loss = 46.382: 100%|██████████| 200000/200000 [00:25<00:00, 7984.08it/s]
Finished [100%]: Average Loss = 46.369
100%|██████████| 5500/5500 [00:24<00:00, 221.48it/s]
In [14]:
pm.traceplot(trace2);

Figure 18.6

In [15]:
# Transform parameters back to original scale
burnin = 200
beta0 = trace2['zbeta0']*scaley + meany - np.sum(trace2['zbetaj']*meanx2/scalex2, axis=1)*scaley
betaj = (trace2['zbetaj']/scalex2)*scaley
scale = (trace2['zsigma']*scaley)[burnin:]

intercept = beta0[burnin:]
spend = betaj[:,0][burnin:]
prcnttake =  betaj[:,1][burnin:]
propnottake =  betaj[:,2][burnin:]
normality = np.log10(trace2['nu'][burnin:])

fig, ([ax1, ax2, ax3], [ax4, ax5, ax6]) = plt.subplots(2,3, figsize=(12,6))

for ax, estimate, title, xlabel in zip(fig.axes,
                               [intercept, spend, prcnttake, propnottake, scale, normality],
                               ['Intercept', 'Spend', 'PrcntTake', 'PropNotTake', 'Scale', 'Normality'],
                               [r'$\beta_0$', r'$\beta_1$', r'$\beta_2$', r'$\beta_3$', r'$\sigma$', r'log10($\nu$)']):
    pm.plot_posterior(estimate, point_estimate='mode', ax=ax, color=color)
    ax.set_title(title, fontdict=f_dict)
    ax.set_xlabel(xlabel, fontdict=f_dict)

plt.tight_layout()
In [17]:
# DataFrame with the columns in correct order
pair_plt = pd.DataFrame({'Intercept':intercept,
                         'Spend':spend,
                         'PrcntTake': prcnttake,
                         'PropNotTake': propnottake,
                         'Scale':scale,
                         'Normality': normality},
                        columns=['Intercept', 'Spend', 'PrcntTake', 'PropNotTake', 'Scale', 'Normality'])

# Correlation coefficients
corr = np.round(np.corrcoef(pair_plt, rowvar=0), decimals=3)
# Indexes of the lower triangle, below the diagonal
lower_idx = np.tril(corr, -1).nonzero()

# The seaborn pairplot
pgrid = sns.pairplot(pair_plt, plot_kws={'edgecolor':color, 'facecolor':'none'})

# Replace the plots on the diagonal with the parameter names
for i, ax in enumerate(pgrid.diag_axes):
    ax.clear()
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    ax.text(.5,.5, pair_plt.columns[i], transform=ax.transAxes, fontdict={'size':16, 'weight':'bold'}, ha='center') 

# Replace the lower triangle with the correlation coefficients
for i, ax in enumerate(pgrid.axes[lower_idx]):
    ax.clear()
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    ax.text(.5,.5, corr[lower_idx][i], transform=ax.transAxes, fontdict=f_dict, ha='center')