# Notebook setup: render matplotlib figures inline.
%matplotlib inline
import pandas as pd
import numpy as np
from numpy.ma import masked_values
# PyMC3 (v3.0-era API): model construction, samplers, and plotting helpers.
# NOTE(review): several imported names (Metropolis, NUTS, find_MAP, Bernoulli,
# Uniform, invlogit, Dirichlet, Categorical, Poisson, Lognormal, Gamma,
# masked_values) are not used in the visible cells — possibly used later.
from pymc3 import Model, sample, Metropolis, NUTS, traceplot, forestplot, find_MAP
from pymc3 import Bernoulli, Normal, Uniform, invlogit, log, exp, Deterministic, DensityDist
from pymc3 import Dirichlet, Categorical, Poisson, Lognormal, HalfCauchy, Gamma
# Return-to-work dataset; drop the stored index and renumber rows 0..n-1.
dataset = pd.read_csv("rtw.csv", index_col=0).reset_index(drop=True)
dataset.head()
age1 | age2 | age3 | bmi1 | bmi2 | bmi3 | gender | race | hispanic | education | ... | los | rtw | day | practice.factor | pracind | practice | select2 | smoke | day0b | day1 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -1.097827 | 0.031377 | 0.000000 | 1.074295 | 1.327160 | 0.279160 | 0 | 2 | -1 | 4 | ... | 1.0 | 1 | 51 | AHN | 0 | AHN | False | 2 | 51 | 0 |
1 | -0.304202 | 0.309668 | 0.000002 | 0.934801 | 1.142968 | 0.224315 | 1 | 0 | -1 | 3 | ... | 1.0 | 1 | 43 | AHN | 0 | AHN | False | 2 | 43 | 0 |
2 | -1.160901 | 0.023567 | 0.000000 | -0.620389 | 0.043296 | 0.000000 | 0 | 0 | -1 | 2 | ... | 1.0 | 1 | 7 | AHN | 0 | AHN | False | 2 | 7 | 0 |
3 | -1.188626 | 0.020591 | 0.000000 | -0.029360 | 0.240722 | 0.008388 | 0 | 0 | -1 | 1 | ... | 1.0 | 1 | 83 | AHN | 0 | AHN | False | 2 | 83 | 0 |
4 | -0.988314 | 0.048724 | 0.000000 | 0.093370 | 0.312118 | 0.017463 | 0 | 0 | -1 | 3 | ... | 1.0 | 1 | 46 | AHN | 0 | AHN | False | 2 | 46 | 0 |
5 rows × 53 columns
# Event indicator: 1 where day1 == 0, else 0.
# NOTE(review): presumably day1 == 0 marks subjects whose event (return to
# work) was observed, with the observed time recorded in day0b — confirm
# against the data dictionary.
failure = (dataset.day1.values==0).astype(int)
# Observed/censored time: take day0b where available (500 appears to be a
# sentinel code, replaced with NaN so combine_first falls back to day1),
# then shift by +1 so all times are strictly positive for the Weibull
# likelihood below.
t = dataset.day0b.replace(500, np.nan).combine_first(dataset.day1).values + 1
# Covariates entering the log-hazard linear predictor.
beta_names = ['age1', 'age2', 'age3', 'bmi1', 'bmi2', 'bmi3', 'osteoporosis', 'cad', 'gender',
'hispanic', 'compensation', 'pastsurgery', 'diabetes', 'anxiety', 'depression',
'motordef', 'duration', 'arthrodesis', 'interbody', 'approach']
# Number of distinct practices (groups for the random intercept).
n_prac = dataset.pracind.unique().shape[0]
# Sanity checks: the combine_first above must leave no missing times.
# NOTE(review): assert is stripped under `python -O`; fine for a notebook.
assert not np.isnan(t).sum()
assert not np.isnan(failure).sum()
# Hierarchical Weibull survival model with a per-practice random intercept
# and right-censoring handled via a custom log-likelihood.
with Model() as weibull_surv:
    # Hyperpriors for the practice-level intercepts (partial pooling).
    mu_int = Normal('mu_int', 0, sd=100)
    sigma_int = HalfCauchy('sigma_int', 5)
    # One intercept per practice, drawn from the shared hyperprior.
    intercept = Normal('intercept', mu_int, sigma_int, shape=n_prac)
    # Weakly-informative prior on the covariate coefficients.
    beta = Normal('beta', 0, sd=1000, shape=len(beta_names))
    # Covariate matrix, transposed so beta.dot(X) gives one value per subject.
    X = dataset[beta_names].values.T
    # Weibull rate parameter per subject: log-linear in intercept + covariates.
    theta = exp(intercept[dataset.pracind.values] + beta.dot(X))
    # Weibull shape parameter.
    alpha = HalfCauchy("alpha", 2.5)
    # Weibull survival likelihood, accounting for censoring
    def logp(failure, value):
        """Censored Weibull log-likelihood.

        Observed events (failure == 1) contribute log hazard + log survival;
        censored subjects (failure == 0) contribute only the log survival
        term -(theta * value)**alpha.
        """
        # Hazard h(t) = alpha * theta**alpha * t**(alpha - 1); log S(t) = -(theta*t)**alpha.
        hazard = alpha * theta**alpha * value**(alpha-1)
        return (failure * log(hazard) - (theta * value)**alpha).sum()
    # Custom observed density over (failure, time) pairs built above.
    day = DensityDist('day', logp, observed={'failure':failure, 'value':t})
Applied log-transform to sigma_int and added transformed sigma_int_log_ to model. Applied log-transform to alpha and added transformed alpha_log_ to model.
# Fit by mean-field ADVI, then draw posterior samples from the variational
# approximation (pymc3 v3.0-era variational API).
from pymc3.variational import advi, sample_vp
with weibull_surv:
    # 100k ADVI iterations; `fit` holds the variational parameters and ELBO trace.
    fit = advi(n=100000)
    # Sample from the fitted approximation to get a trace for plotting.
    trace = sample_vp(fit)
Iteration 0 [0%]: ELBO = -3.2675045052572436e+18 Iteration 10000 [10%]: Average ELBO = -1.6609694956981292e+208 Iteration 20000 [20%]: Average ELBO = -31807.88 Iteration 30000 [30%]: Average ELBO = -28720.5 Iteration 40000 [40%]: Average ELBO = -28061.28 Iteration 50000 [50%]: Average ELBO = -27823.17 Iteration 60000 [60%]: Average ELBO = -27716.09 Iteration 70000 [70%]: Average ELBO = -27672.36 Iteration 80000 [80%]: Average ELBO = -27652.08 Iteration 90000 [90%]: Average ELBO = -27642.41 Finished [100%]: Average ELBO = -27637.48
# Forest plot of the covariate-coefficient posteriors, labeled by covariate name.
forestplot(trace, varnames=['beta'], ylabels=beta_names)
<matplotlib.gridspec.GridSpec at 0x1264ef780>
# Trace/density plots for the intercept hyperparameters.
traceplot(trace, varnames=['mu_int', 'sigma_int'])
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x126891438>, <matplotlib.axes._subplots.AxesSubplot object at 0x1268c9b70>], [<matplotlib.axes._subplots.AxesSubplot object at 0x126b17710>, <matplotlib.axes._subplots.AxesSubplot object at 0x126b4eef0>]], dtype=object)