In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from numpy.ma import masked_values

from pymc3 import Model, sample, Metropolis, NUTS, traceplot, forestplot, find_MAP
from pymc3 import Bernoulli, Normal, Uniform, invlogit, log, exp, Deterministic, DensityDist
from pymc3 import Dirichlet, Categorical, Poisson, Lognormal, HalfCauchy, Gamma
In [2]:
dataset = pd.read_csv("rtw.csv", index_col=0).reset_index(drop=True)
dataset.head()
Out[2]:
age1 age2 age3 bmi1 bmi2 bmi3 gender race hispanic education ... los rtw day practice.factor pracind practice select2 smoke day0b day1
0 -1.097827 0.031377 0.000000 1.074295 1.327160 0.279160 0 2 -1 4 ... 1.0 1 51 AHN 0 AHN False 2 51 0
1 -0.304202 0.309668 0.000002 0.934801 1.142968 0.224315 1 0 -1 3 ... 1.0 1 43 AHN 0 AHN False 2 43 0
2 -1.160901 0.023567 0.000000 -0.620389 0.043296 0.000000 0 0 -1 2 ... 1.0 1 7 AHN 0 AHN False 2 7 0
3 -1.188626 0.020591 0.000000 -0.029360 0.240722 0.008388 0 0 -1 1 ... 1.0 1 83 AHN 0 AHN False 2 83 0
4 -0.988314 0.048724 0.000000 0.093370 0.312118 0.017463 0 0 -1 3 ... 1.0 1 46 AHN 0 AHN False 2 46 0

5 rows × 53 columns

In [3]:
failure = (dataset.day1.values==0).astype(int)
t = dataset.day0b.replace(500, np.nan).combine_first(dataset.day1).values + 1
In [4]:
beta_names = ['age1', 'age2', 'age3', 'bmi1', 'bmi2', 'bmi3', 'osteoporosis', 'cad',  'gender', 
       'hispanic', 'compensation', 'pastsurgery', 'diabetes', 'anxiety', 'depression', 
        'motordef', 'duration', 'arthrodesis', 'interbody', 'approach']
In [5]:
n_prac = dataset.pracind.unique().shape[0]
In [6]:
assert not np.isnan(t).sum()
assert not np.isnan(failure).sum()
In [7]:
with Model() as weibull_surv:
    
    mu_int = Normal('mu_int', 0, sd=100)
    sigma_int = HalfCauchy('sigma_int', 5)
    intercept = Normal('intercept', mu_int, sigma_int, shape=n_prac)
    
    beta = Normal('beta', 0, sd=1000, shape=len(beta_names))
    X = dataset[beta_names].values.T
    
    theta = exp(intercept[dataset.pracind.values] + beta.dot(X))
    
    alpha = HalfCauchy("alpha", 2.5)
    
    
    # Weibull survival likelihood, accounting for censoring
    def logp(failure, value):
        hazard = alpha * theta**alpha * value**(alpha-1)
        return (failure * log(hazard) - (theta * value)**alpha).sum()

    day = DensityDist('day', logp, observed={'failure':failure, 'value':t})
Applied log-transform to sigma_int and added transformed sigma_int_log_ to model.
Applied log-transform to alpha and added transformed alpha_log_ to model.
In [13]:
from pymc3.variational import advi, sample_vp

with weibull_surv:
    
    fit = advi(n=100000)
    trace = sample_vp(fit)
Iteration 0 [0%]: ELBO = -3.2675045052572436e+18
Iteration 10000 [10%]: Average ELBO = -1.6609694956981292e+208
Iteration 20000 [20%]: Average ELBO = -31807.88
Iteration 30000 [30%]: Average ELBO = -28720.5
Iteration 40000 [40%]: Average ELBO = -28061.28
Iteration 50000 [50%]: Average ELBO = -27823.17
Iteration 60000 [60%]: Average ELBO = -27716.09
Iteration 70000 [70%]: Average ELBO = -27672.36
Iteration 80000 [80%]: Average ELBO = -27652.08
Iteration 90000 [90%]: Average ELBO = -27642.41
Finished [100%]: Average ELBO = -27637.48
In [15]:
forestplot(trace, varnames=['beta'], ylabels=beta_names)
Out[15]:
<matplotlib.gridspec.GridSpec at 0x1264ef780>
In [16]:
traceplot(trace, varnames=['mu_int', 'sigma_int'])
Out[16]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x126891438>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1268c9b70>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x126b17710>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x126b4eef0>]], dtype=object)