%matplotlib inline
import numpy as np
import pandas as pd
import pymc3 as pm
import seaborn as sns
kawasaki_raw = pd.read_csv('kd_treatment_response.csv', na_values=[' '],
parse_dates=['dob', 'Date_admission'])
kawasaki_raw.assign(responder=(kawasaki_raw.Responder==1)).groupby('Race2').responder.mean()
kawasaki_raw.head()
Calculate age from admission year and birth year
kawasaki_raw['age_months'] = (kawasaki_raw.Date_admission - kawasaki_raw.dob).dt.days/30.
kawasaki_raw['age_months'].head()
kawasaki_raw['age_years'] = (kawasaki_raw.age_months/12.).astype(int)
kawasaki_raw['age_norm'] = ((kawasaki_raw.age_months - kawasaki_raw.age_months.mean())
/kawasaki_raw.age_months.std())
kawasaki_raw['age_norm2'] = kawasaki_raw['age_norm']**2
relevant_cols = ['StudyID', 'Responder', 'sex', 'age_norm', 'age_norm2', 'age_years',
'Race2', 'ethnicity', 'KD_complete', 'Tier']
kawasaki_subset = (kawasaki_raw[relevant_cols]
.rename(columns={'sex':'male'})
.assign(responder=(kawasaki_raw.Responder==1).astype(int),
black=(kawasaki_raw.Race2==2).astype(int),
other=(kawasaki_raw.Race2==3).astype(int))
.drop(['Responder', 'Race2'], axis=1))
kawasaki_subset.head()
kawasaki_subset.groupby('black').responder.mean()
Probability of responder does not seem to vary with age. Wide variation at the end is mostly a sample size artefact.
ax = (kawasaki_subset[kawasaki_subset.age_years<10]
.assign(age_int=kawasaki_subset.age_years.astype(int))
.groupby('age_int')).responder.mean().plot()
ax.set_xlabel('age')
responders = kawasaki_subset[kawasaki_subset.responder==1]
ax.plot(responders.age_years, np.random.normal(0.9, 0.005, size=len(responders)),
'go', alpha=0.3)
nonresponders = kawasaki_subset[kawasaki_subset.responder==0]
ax.plot(nonresponders.age_years, np.random.normal(0.6, 0.005, size=len(nonresponders)),
'ro', alpha=0.3)
kawasaki_subset.isnull().sum()
def interpolate(x0,y0, x):
x = np.array(x)
idx = np.searchsorted(x0, x)
dl = np.array(x - x0[idx - 1])
dr = np.array(x0[idx] - x)
d=dl+dr
wl = dr/d
return wl*y0[idx-1] + (1-wl)*y0[idx]
import theano.tensor as tt
GaussianRandomWalk = pm.distributions.timeseries.GaussianRandomWalk
covs = ['male', 'black', 'other']
k = len(covs)
nknots = 7
knots = np.linspace(kawasaki_subset.age_norm.min(), kawasaki_subset.age_norm.max(), nknots)
with pm.Model() as model:
# Baseline probability of response
θ = pm.Normal('θ', 0, sd=10)
# Coefficients for covariates
β = pm.Normal('β', 0, sd=10, shape=k)
odds = pm.Deterministic('odds', tt.exp(β))
# Age effect (non-linear)
σ = pm.HalfCauchy('σ', 2.5)
y = GaussianRandomWalk('y', sd=σ, shape=nknots)
α = pm.Deterministic('α', interpolate(knots, y, kawasaki_subset.age_norm))
# Probabilities
p_race_female = pm.Deterministic('p_race_female', pm.invlogit(θ))
p_race_male = pm.Deterministic('p_race_male', pm.invlogit(θ + β[1]))
p_race_black_female = pm.Deterministic('p_race_black_female',
pm.invlogit(θ + β[1]))
p_race_black_male = pm.Deterministic('p_race_black_male',
pm.invlogit(θ + β[0] + β[1]))
π = pm.invlogit(θ + tt.dot(kawasaki_subset[covs].values, β)+ α*kawasaki_subset.age_norm.values)
responder = pm.Bernoulli('responder', π, observed=kawasaki_subset.responder)
with model:
advi_fit = pm.variational.advi(n=100000)
iterations = 2000
burn = 1000
with model:
trace = pm.sample(iterations, start=advi_fit[0], njobs=2)
Odds ratios for responder. Tabulated values below plot.
pm.forestplot(trace[burn:], varnames=['odds'], ylabels=covs, vline=1,
main='Odds ratios of covariates')
Age effect
pm.forestplot(trace[burn:], varnames=['y'])
pm.summary(trace[burn:], varnames=['odds'])