%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymc as pm
vlbw = pd.read_csv("vlbw.csv", index_col=0)
vlbw.head()
birth | exit | hospstay | lowph | pltct | race | bwt | gest | inout | twn | ... | vent | pneumo | pda | cld | pvh | ivh | ipe | year | sex | dead | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 81.511002 | 81.603996 | 34 | NaN | 100 | white | 1250 | 35 | born at Duke | 0 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 81.511963 | female | 0 |
2 | 81.514000 | 81.539001 | 9 | 7.250000 | 244 | white | 1370 | 32 | born at Duke | 0 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 81.514709 | female | 0 |
3 | 81.552002 | 81.552002 | -2 | 7.059998 | 114 | black | 620 | 23 | born at Duke | 0 | ... | 1 | 0 | 0 | NaN | NaN | NaN | NaN | 81.553040 | female | 1 |
4 | 81.557999 | 81.667000 | 40 | 7.250000 | 182 | black | 1480 | 32 | born at Duke | 0 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 81.558472 | male | 0 |
5 | 81.593002 | 81.598999 | 2 | 6.969997 | 54 | black | 925 | 28 | born at Duke | 0 | ... | 1 | 1 | 0 | 0 | definite | definite | NaN | 81.594055 | female | 1 |
5 rows × 26 columns
vlbw = vlbw.dropna(subset=['ivh', 'gest'])
vlbw.ivh.value_counts()
absent 439 definite 74 possible 10 dtype: int64
vlbw['ivh_outcome'] = vlbw.ivh.isin(['definite', 'possible'])
vlbw.ivh_outcome.value_counts()
False 439 True 84 dtype: int64
vlbw['weeks_premature'] = np.maximum(0, 37 - vlbw.gest)
from pymc.gp import *
from pymc.gp.cov_funs import matern
# Unique observed gestational ages as mesh
age_mesh = np.sort(vlbw.weeks_premature.unique())
def model_q2():
# The mean function's parameters
beta_0 = pm.Normal('beta_0', 0, 0.001, value=1)
beta_1 = pm.Normal('beta_1', 0, 0.001, value=0)
# GP hyperpriors
amp = pm.Exponential('amp', 1, value=1)
scale = pm.Uniform('scale' , 0, 10, value=1)
diff_degree = pm.Uniform('diff_degree', 0, 10, value=1)
@pm.deterministic
def C(diff_degree=diff_degree, amp=amp, scale=scale):
"""
The Matern covariance function
"""
return Covariance(matern.euclidean, diff_degree=diff_degree, amp=amp, scale=scale)
@pm.deterministic
def M(beta_0 = beta_0, beta_1 = beta_1):
"""
The mean function is a linear function of age
"""
return Mean(lambda x: beta_0 + beta_1*x)
alpha = GPSubmodel('alpha', M, C, mesh=age_mesh)
theta = pm.Lambda('theta', lambda a=alpha.f(vlbw.weeks_premature): pm.invlogit(a))
ivh = pm.Bernoulli('ivh', theta, value=vlbw.ivh_outcome, observed=True)
return(locals())
M = pm.MCMC(model_q2())
M.sample(50000, 40000)
[-----------------100%-----------------] 50000 of 50000 complete in 170.3 sec
Draw samples of age effect from posterior
x = np.linspace(vlbw.weeks_premature.min(), vlbw.weeks_premature.max(), 100)
plt.figure(figsize=(18, 5))
ax = plt.subplot(1, 2, 1)
for i in range(0, 100):
f = np.random.choice(M.alpha.f.trace()[-1000:])(x)
plt.plot(x, pm.invlogit(f), 'k-', linewidth=0.5, alpha=0.5)
plt.xlabel('Weeks premature (< 37 weeks)')
plt.ylabel('Age effect on p(IVH)')
plt.title('Posterior samples of age effect')
<matplotlib.text.Text at 0x1193d76d0>
pm.Matplot.summary_plot([M.amp, M.scale, M.diff_degree])
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
Adding birth weight, sex and pneumo. Birth weight will be a GP, while the others are indicators.
vlbw.head()
birth | exit | hospstay | lowph | pltct | race | bwt | gest | inout | twn | ... | pda | cld | pvh | ivh | ipe | year | sex | dead | ivh_outcome | weeks_premature | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5 | 81.593002 | 81.598999 | 2 | 6.969997 | 54 | black | 925 | 28 | born at Duke | 0 | ... | 0 | 0 | definite | definite | NaN | 81.594055 | female | 1 | True | 9 |
6 | 81.601997 | 81.771004 | 62 | 7.189999 | NaN | white | 940 | 28 | born at Duke | 0 | ... | 0 | 0 | absent | absent | absent | 81.602295 | female | 0 | False | 9 |
9 | NaN | NaN | NaN | NaN | NaN | NaN | 700 | 24 | born at Duke | NaN | ... | NaN | NaN | absent | absent | absent | NaN | NaN | 1 | False | 13 |
13 | 81.683998 | 81.853996 | 62 | 7.179996 | 182 | black | 1110 | 28 | born at Duke | 0 | ... | 0 | 1 | absent | absent | absent | 81.684448 | male | 0 | False | 9 |
14 | 81.689003 | 81.877998 | 69 | 7.419998 | 361 | white | 1180 | 28 | born at Duke | 0 | ... | 0 | 0 | absent | absent | absent | 81.689880 | male | 0 | False | 9 |
5 rows × 28 columns
vlbw = vlbw.dropna(subset=['ivh', 'gest', 'bwt', 'pneumo', 'sex'])
vlbw['male'] = vlbw.sex.replace({'female':0, 'male':1})
-c:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Normalize birth weights
vlbw['bwt_norm'] = (vlbw.bwt - vlbw.bwt.mean()) / vlbw.bwt.std()
X = vlbw[['male', 'pneumo']].values
from pymc.gp import *
from pymc.gp.cov_funs import matern
# Unique observed gestational ages as mesh
obs_mesh = list({tuple(pair) for pair in vlbw[['weeks_premature','bwt_norm']].values})
def model_q3():
# Overall mean
mu = pm.Normal('mu', 0, 1e-5, value=0)
# Mean function, which is constant
def constant(x, val):
return np.zeros(x.shape[:-1], dtype=float) + val
# Wrap mean function for GP submodel
@pm.deterministic
def M(mu=mu):
return Mean(constant, val=mu)
# GP hyperpriors
amp = pm.Exponential('amp', 1, value=1)
scale = pm.Uniform('scale' , 0, 10, value=1)
diff_degree = pm.Uniform('diff_degree', 0, 10, value=1)
@pm.deterministic
def C(diff_degree=diff_degree, amp=amp, scale=scale):
"""
The Matern covariance function
"""
return Covariance(matern.euclidean, diff_degree=diff_degree, amp=amp, scale=scale)
alpha = GPSubmodel('alpha', M, C, mesh=obs_mesh)
beta = pm.Normal('beta', 0, 0.001, value=[0,0])
theta = pm.Lambda('theta', lambda a=alpha.f(vlbw[['weeks_premature', 'bwt_norm']].values),
b=beta: pm.invlogit(a + X.dot(b)))
ivh = pm.Bernoulli('ivh', theta, value=vlbw.ivh_outcome.values, observed=True)
return(locals())
M2 = pm.MCMC(model_q3())
M2.sample(10000, 5000)
[-----------------100%-----------------] 10000 of 10000 complete in 584.1 sec
Useful to visualize mean and variance across landscape of two variables
n = 1000
m = 100
xmin, xmax = vlbw.weeks_premature.min(), vlbw.weeks_premature.max()
ymin, ymax = vlbw.bwt_norm.min(), vlbw.bwt_norm.max()
xplot = np.linspace(xmin, xmax, m)
yplot = np.linspace(ymin, ymax, m)
dplot = np.dstack(np.meshgrid(xplot, yplot))
Msurf = np.zeros(dplot.shape[:2])
E2surf = np.zeros(dplot.shape[:2])
from pymc.gp import point_eval
# Get E[v] and E[v**2] over the entire posterior
for i in xrange(n):
# Reset all variables to their values at frame i of the trace
M2.remember(0, i)
# Evaluate the observed mean
Msurf_i, Vsurf_i = point_eval(M2.alpha.M_obs.value, M2.alpha.C_obs.value, dplot)
Msurf += Msurf_i/n
# Evaluate the observed covariance with one argument
E2surf += (Vsurf_i + Msurf_i**2)/n
# Get the posterior variance and standard deviation
Vsurf = E2surf - Msurf**2
SDsurf = np.sqrt(Vsurf)
Surface of means
plt.figure(figsize=(10,6))
plt.imshow(-1*Msurf, extent=[xmin, xmax, ymin, ymax], interpolation='nearest', origin='lower', cmap='autumn')
plt.plot(vlbw.weeks_premature, vlbw.bwt_norm,'ko',markersize=4, alpha=0.4)
plt.axis([xmin, xmax, ymin, ymax])
plt.title('Posterior predictive mean surface')
plt.ylabel('Weeks premature')
plt.xlabel('Standardized birth weight')
plt.colorbar();
Surface of standard deviations
plt.figure(figsize=(10,6))
plt.imshow(SDsurf, extent=[xmin, xmax, ymin, ymax],interpolation='nearest', origin='lower', cmap='autumn')
plt.plot(vlbw.weeks_premature, vlbw.bwt_norm,'ko',markersize=4)
plt.axis([xmin, xmax, ymin, ymax])
plt.title('Posterior predictive standard deviation surface')
plt.ylabel('Weeks premature')
plt.xlabel('Standardized birth weight')
plt.colorbar();
Covariates for sex and pneumo
pm.Matplot.plot(M2.beta)
Plotting beta_0 Plotting beta_1