#!/usr/bin/env python # coding: utf-8 # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import numpy as np import pandas as pd import pymc3 as pm import seaborn as sns # ## Import and clean data # In[2]: kawasaki_raw = pd.read_csv('kd_treatment_response.csv', na_values=[' '], parse_dates=['dob', 'Date_admission']) # In[27]: kawasaki_raw.assign(responder=(kawasaki_raw.Responder==1)).groupby('Race2').responder.mean() # In[28]: kawasaki_raw.head() # Calculate age from admission year and birth year # In[4]: kawasaki_raw['age_months'] = (kawasaki_raw.Date_admission - kawasaki_raw.dob).dt.days/30. kawasaki_raw['age_months'].head() # In[5]: kawasaki_raw['age_years'] = (kawasaki_raw.age_months/12.).astype(int) # In[6]: kawasaki_raw['age_norm'] = ((kawasaki_raw.age_months - kawasaki_raw.age_months.mean()) /kawasaki_raw.age_months.std()) # In[7]: kawasaki_raw['age_norm2'] = kawasaki_raw['age_norm']**2 # In[8]: relevant_cols = ['StudyID', 'Responder', 'sex', 'age_norm', 'age_norm2', 'age_years', 'Race2', 'ethnicity', 'KD_complete', 'Tier'] # In[9]: kawasaki_subset = (kawasaki_raw[relevant_cols] .rename(columns={'sex':'male'}) .assign(responder=(kawasaki_raw.Responder==1).astype(int), black=(kawasaki_raw.Race2==2).astype(int), other=(kawasaki_raw.Race2==3).astype(int)) .drop(['Responder', 'Race2'], axis=1)) kawasaki_subset.head() # In[23]: kawasaki_subset.groupby('black').responder.mean() # Probability of responder does not seem to vary with age. Wide variation at the end is mostly a sample size artefact. # In[10]: ax = (kawasaki_subset[kawasaki_subset.age_years<10] .assign(age_int=kawasaki_subset.age_years.astype(int)) .groupby('age_int')).responder.mean().plot() ax.set_xlabel('age') responders = kawasaki_subset[kawasaki_subset.responder==1] ax.plot(responders.age_years, np.random.normal(0.9, 0.005, size=len(responders)), 'go', alpha=0.3) nonresponders = kawasaki_subset[kawasaki_subset.responder==0] ax.plot(nonresponders.age_years, np.random.normal(0.6, 0.005, size=len(nonresponders)), 'ro', alpha=0.3) # In[11]: kawasaki_subset.isnull().sum() # ## Model specification # In[12]: def interpolate(x0,y0, x): x = np.array(x) idx = np.searchsorted(x0, x) dl = np.array(x - x0[idx - 1]) dr = np.array(x0[idx] - x) d=dl+dr wl = dr/d return wl*y0[idx-1] + (1-wl)*y0[idx] # In[13]: import theano.tensor as tt GaussianRandomWalk = pm.distributions.timeseries.GaussianRandomWalk covs = ['male', 'black', 'other'] k = len(covs) nknots = 7 knots = np.linspace(kawasaki_subset.age_norm.min(), kawasaki_subset.age_norm.max(), nknots) with pm.Model() as model: # Baseline probability of response θ = pm.Normal('θ', 0, sd=10) # Coefficients for covariates β = pm.Normal('β', 0, sd=10, shape=k) odds = pm.Deterministic('odds', tt.exp(β)) # Age effect (non-linear) σ = pm.HalfCauchy('σ', 2.5) y = GaussianRandomWalk('y', sd=σ, shape=nknots) α = pm.Deterministic('α', interpolate(knots, y, kawasaki_subset.age_norm)) # Probabilities p_race_female = pm.Deterministic('p_race_female', pm.invlogit(θ)) p_race_male = pm.Deterministic('p_race_male', pm.invlogit(θ + β[1])) p_race_black_female = pm.Deterministic('p_race_black_female', pm.invlogit(θ + β[1])) p_race_black_male = pm.Deterministic('p_race_black_male', pm.invlogit(θ + β[0] + β[1])) π = pm.invlogit(θ + tt.dot(kawasaki_subset[covs].values, β)+ α*kawasaki_subset.age_norm.values) responder = pm.Bernoulli('responder', π, observed=kawasaki_subset.responder) # In[14]: with model: advi_fit = pm.variational.advi(n=100000) # In[17]: iterations = 2000 burn = 1000 # In[19]: with model: trace = pm.sample(iterations, start=advi_fit[0], njobs=2) # Odds ratios for responder. Tabulated values below plot. # In[20]: pm.forestplot(trace[burn:], varnames=['odds'], ylabels=covs, vline=1, main='Odds ratios of covariates') # Age effect # In[21]: pm.forestplot(trace[burn:], varnames=['y']) # In[22]: pm.summary(trace[burn:], varnames=['odds'])