#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
import pandas as pd
import pymc as pm
from scipy.misc import comb


# In[2]:


np.random.seed(42)


# ## Data import and cleaning

# In[3]:


excel_sucks = True

if excel_sucks:

    study_char = pd.read_csv("study_characteristics.csv", 
                             index_col='RefID', na_values=['-', 'NR'])
    outcomes = pd.read_csv("outcomes.csv", na_values=['ND', 'NR'])
    
else:
    
    data_file = 'DBD Data for Meta Analyses_4-7-15_withNRCTs_RAE.xlsx'

    study_char = pd.read_excel(data_file, "Study Characteristics", 
                               index_col='RefID', na_values=['-', 'NR'])
    outcomes = pd.read_excel(data_file, "Outcomes", 
                               na_values=['ND', 'NR'])


# In[4]:


study_char.tail()


# Data cleaning

# In[5]:


# Cast outcomes variables to floats
for col in ('Last FU Mean', 'Last FU SD',):
    outcomes[col] = outcomes[col].astype(float)


# In[6]:


# Recode age category
study_char['age_cat'] = study_char.AgeCat.replace({'PRE-K':1, 'SCHOOL':0, 'TEEN':2})


# In[7]:


# Fix data label typo
outcomes['Measure Instrument'] = outcomes['Measure Instrument'].replace({'Eyberg Child Behaviour Inventory, problem Subscale': 
                                        'Eyberg Child Behaviour Inventory, Problem Subscale'})
outcomes.Units = outcomes.Units.replace({'scale': 'Scale'})


# In[8]:


# Parse followup times and convert to months
split_fut = outcomes.loc[outcomes['Last FU Time'].notnull(), 'Last FU Time'].apply(lambda x: str(x).split(' ')[:2])
fut_months = [float(time)/52.*(unit=='weeks') or float(time) for time, unit in split_fut]
outcomes.loc[outcomes['Last FU Time'].notnull(), 'Last FU Time'] = fut_months


# We are assumung all CBC Externalizing values over 50 are T-scores, and those under 50 are raw scores. This recodes those observations.

# In[9]:


cbce_ind = outcomes['Measure Instrument'].apply(lambda x: x.startswith('Child Behavior Checklist, Externalizing'))
under_50 = outcomes['BL Mean']<50
outcomes.loc[cbce_ind & (under_50^True), 'Measure Instrument'] = 'Child Behavior Checklist, Externalizing (T Score)'
outcomes.loc[cbce_ind & under_50, 'Measure Instrument'] = 'Child Behavior Checklist, Externalizing'


# Recode measure instrument variables

# In[10]:


instrument = []
subtype = []
units = []

for i,row in outcomes.iterrows():
    separator = row['Measure Instrument'].find(',')
    if separator == -1:
        separator = row['Measure Instrument'].find('-')
    instrument.append(row['Measure Instrument'][:separator])
    s = row['Measure Instrument'][separator+2:]
    paren = s.find('(')
    if paren > -1:
        subtype.append(s[:paren-1])
        units.append(s[paren+1:-1])
    else:
        subtype.append(s)
        if s.endswith('scale'):
            units.append('Scale')
        else:
            units.append('Score')
            
new_cols = pd.DataFrame({'instrument': instrument, 'subtype': subtype, 
                         'units': units}, index=outcomes.index)


# In[11]:


outcomes['Measure Instrument'].value_counts()


# In[12]:


new_cols.head()


# In[13]:


# Append new columns
outcomes = outcomes.join(new_cols)


# In[14]:


outcomes.intvn.value_counts()


# In[15]:


outcomes['Ref ID'].unique()


# ## Data summaries
# 
# Cross-tabulation of the outcome counts by measure instrument

# In[16]:


pd.crosstab(outcomes['instrument'], outcomes['Outcome'])


# Distribution of age categories

# In[17]:


study_char.AgeCat.value_counts()


# Frequencies of various intervention types

# In[18]:


study_char['Intervention Type'].value_counts()


# ## Extract variables of interest and merge tables

# In[19]:


KQ1 = study_char[study_char.KQ=='KQ1']


# In[20]:


study_varnames = ['Year', 'age_cat', 'Geographic setting', 'Age mean (years) ', 'Age SD (years)', 
              'Age min (years)', 'Age max (years)', 'Proportion Male (%)', 'Design']

study_vars = KQ1[study_varnames].rename(columns={'Geographic setting': 'country', 
                           'Age mean (years) ': 'age_mean', 
                           'Age SD (years)': 'age_sd', 
                           'Age min (years)': 'age_min', 
                           'Age max (years)': 'age_max', 
                           'Proportion Male (%)': 'p_male'})


# In[21]:


study_vars.head()


# In[22]:


study_vars.p_male.hist()


# Proportion missing

# In[23]:


study_vars.isnull().mean(0).round(2)


# Will assume the mean age for those which are missing is simply the midpoint between minimum and maximum values

# In[24]:


est_means = study_vars.apply(lambda x: x.age_min + (x.age_max - x.age_min) / 2, axis=1)[study_vars.age_mean.isnull()]
study_vars.loc[study_vars.age_mean.isnull(), 'age_mean'] = est_means

study_vars.age_mean.isnull().sum()


# In[25]:


outcomes_varnames = ['Ref ID', 'Measure Instrument', 'instrument', 'subtype', 'units', 
                     'intvn', 'cc', 'pc', 'fc',
                     'BL N', 'BL Mean', 'BL SD', 
                     'EOT \nN', 'EOT Mean', 'EOT \nSD', 'Last FU Time', 'Last FU N', 
                     'Last FU Mean', 'Last FU SD', 'CS Group N', 'CS Mean', 'CS SD']


# In[26]:


outcomes_vars = outcomes[outcomes_varnames].rename(columns={'Ref ID': 'RefID', 
                                                                       'Measure Instrument': 'measure_instrument',
                                                                       'cc': 'child_component',
                                                                       'pc': 'parent_component',
                                                                       'fc': 'family_component',
                                                                       'oc': 'other_component',
                                                                       'BL N': 'baseline_n',
                                                                       'BL Mean': 'baseline_mean',
                                                                       'BL SD': 'baseline_sd', 
                                                                       'EOT \nN': 'end_treat_n', 
                                                                       'EOT Mean': 'end_treat_mean', 
                                                                       'EOT \nSD': 'end_treat_sd', 
                                                                       'Last FU Time': 'followup_time', 
                                                                       'Last FU N': 'followup_n',
                                                                       'Last FU Mean': 'followup_mean', 
                                                                       'Last FU SD': 'followup_sd', 
                                                                       'CS Group N': 'change_n',
                                                                       'CS Mean': 'change_mean',
                                                                       'CS SD': 'change_sd'})


# Recode intervention clasification

# In[27]:


outcomes_vars.isnull().sum()


# In[28]:


control = ((outcomes_vars.child_component^True) & 
           (outcomes_vars.parent_component^True) & 
           (outcomes_vars.family_component^True)).astype(int)
child_only = ((outcomes_vars.child_component) & 
              (outcomes_vars.parent_component^True) & 
              (outcomes_vars.family_component^True)).astype(int)
parent_only = ((outcomes_vars.child_component^True) & 
               (outcomes_vars.parent_component) & 
               (outcomes_vars.family_component^True)).astype(int)
outcomes_vars.ix[child_only.astype(bool), ['child_component', 'parent_component', 'family_component']]


# In[29]:


multi_component = ((parent_only^True) & (child_only^True) & (control^True)).astype(int)

outcomes_vars['child_only'] = child_only
outcomes_vars['parent_only'] = parent_only
outcomes_vars['multi_component'] = multi_component


# If requested, change PCIT to parent-only.

# In[30]:


pcit_as_multicomponent = True

if not pcit_as_multicomponent:
    outcomes_vars.loc[outcomes_vars.intvn=='pcit', 'parenr_only'] = 1
    outcomes_vars.loc[outcomes_vars.intvn=='pcit', 'multi_component'] = 0


# Obtain subset with non-missing EOT data

# In[31]:


eot_subset = outcomes_vars[outcomes_vars.end_treat_mean.notnull() & outcomes_vars.end_treat_sd.notnull()].copy()


# Calculate EOT difference

# In[32]:


eot_subset['eot_diff_mean'] = eot_subset.end_treat_mean - eot_subset.baseline_mean


# In[33]:


eot_subset['eot_diff_sd'] = eot_subset.baseline_sd + eot_subset.end_treat_sd


# In[34]:


eot_subset['eot_diff_n'] = eot_subset[['baseline_n', 'end_treat_n']].min(1)


# Distribution of baseline means among outcome metrics

# In[35]:


for instrument in ('Eyberg Child Behaviour Inventory', 
                   'Child Behavior Checklist', 
                   'Strengths and Difficulties Questionnaire'):
    eot_subset[eot_subset.instrument==instrument]['baseline_mean'].hist(by=eot_subset['subtype'], 
                                                                              sharex=True)
    plt.suptitle(instrument);


# In[36]:


eot_subset.instrument.value_counts()


# In[37]:


eot_subset.instrument.value_counts()


# In[38]:


eot_subset[eot_subset.RefID==441]


# Several studies use multiple instruments and metrics within instruments

# In[39]:


eot_subset.groupby(['RefID', 'instrument'])['subtype'].value_counts()


# In[40]:


pd.crosstab(eot_subset.instrument, eot_subset.subtype)


# In[41]:


x = eot_subset[eot_subset.instrument=='Eyberg Child Behaviour Inventory']
pd.crosstab(x.instrument, x.subtype)


# In[42]:


x = eot_subset[eot_subset.instrument=='Child Behavior Checklist']
pd.crosstab(x.instrument, x.subtype)


# In[43]:


x = eot_subset[eot_subset.instrument=='Strengths and Difficulties Questionnaire']
pd.crosstab(x.instrument, x.subtype)


# Filter out non-RCT studies, if requested.

# In[44]:


rct_only = True

rct_study_vars = study_vars[study_vars.Design=='RCT']


# Merge study variables and outcomes

# In[45]:


merged_vars = rct_study_vars.merge(eot_subset, left_index=True, right_on='RefID')
merged_vars.shape


# For now, restrict to the three most prevalent metrics.

# In[46]:


merged_vars.measure_instrument.value_counts()


# Take only the top 3 measures

# In[47]:


top_measures = 3


# In[48]:


merged_vars.RefID.unique()


# In[49]:


analysis_subset = merged_vars[merged_vars.measure_instrument.isin(merged_vars.measure_instrument.value_counts().index[:top_measures])]
analysis_subset.groupby('measure_instrument')['baseline_mean'].max()


# In[50]:


analysis_subset.groupby('measure_instrument').baseline_mean.hist()


# In[51]:


analysis_subset['baseline_mean'].hist(by=analysis_subset['measure_instrument'],sharex=True)


# In[52]:


analysis_subset['eot_diff_mean'].hist(by=analysis_subset['measure_instrument'],sharex=True)


# In[53]:


for x in analysis_subset.measure_instrument.unique():
    plt.figure()
    analysis_subset[analysis_subset.measure_instrument==x].baseline_mean.plot(kind='kde', alpha=0.4, grid=False)
    analysis_subset[analysis_subset.measure_instrument==x].end_treat_mean.plot(kind='kde', alpha=0.4, grid=False)
    plt.gca().set_xlabel(x)


# Number of TAU/Control arms in subset

# In[54]:


analysis_subset.shape[0] - analysis_subset.multi_component.sum() - analysis_subset.child_only.sum() - analysis_subset.parent_only.sum()


# In[55]:


set(analysis_subset[analysis_subset.multi_component==1].RefID.values)


# In[56]:


set(analysis_subset[analysis_subset.child_only==1].RefID.values)


# In[57]:


set(analysis_subset[analysis_subset.parent_only==1].RefID.values)


# In[58]:


set(analysis_subset[(analysis_subset.parent_only==0) &
               (analysis_subset.child_only==0) &
               (analysis_subset.multi_component==0)].RefID.values)


# Total arm sample sizes

# In[59]:


analysis_subset.loc[analysis_subset.child_only==1, 'baseline_n'].sum()


# In[60]:


analysis_subset.loc[analysis_subset.parent_only==1, 'baseline_n'].sum()


# In[61]:


analysis_subset.loc[analysis_subset.multi_component==1, 'baseline_n'].sum()


# In[62]:


analysis_subset.loc[(analysis_subset.multi_component==0) & (analysis_subset.parent_only==0) & (analysis_subset.child_only==0), 'baseline_n'].sum()


# ## Meta-analysis

# Number of studies in analysis subset

# In[63]:


unique_studies = analysis_subset.RefID.unique().tolist()
len(unique_studies)


# In[64]:


analysis_subset.RefID.unique()


# In[65]:


analysis_subset.groupby(['RefID', 'measure_instrument'])['measure_instrument'].count().unstack().fillna(0)


# Study counts by 

# In[66]:


data_by_refid = analysis_subset.groupby('RefID')
data_by_refid.apply(lambda x: (x[['child_component', 'parent_component', 'multi_component']].sum()>1)).sum()


# In[67]:


data_by_refid.apply(lambda x: (x[['child_component', 'parent_component', 'multi_component']].sum(1)==0).sum()>0).astype(int)


# In[68]:


data_by_refid.apply(lambda x: x.age_cat.unique()[0]).value_counts()


# We are restricting the analysis to the 4 most prevalent measure instruments in the database.

# In[69]:


unique_measures = analysis_subset.measure_instrument.unique().tolist()
k = len(unique_measures)
unique_measures


# Three intervention components were coded:
# 
# * `child_component`
# * `parent_component`
# * `multi_component`

# In[70]:


p_male, age_cat, intvn = analysis_subset[['p_male', 'age_cat', 'intvn']].values.T
child_only, parent_only, multi_component = analysis_subset[['child_only', 'parent_only', 
                                                                       'multi_component']].values.T

change_n, change_mean, change_sd = analysis_subset[['eot_diff_n', 'eot_diff_mean', 'eot_diff_sd']].values.T


# In[71]:


school = (analysis_subset.age_cat.values==0).astype(int)
pre_k = (analysis_subset.age_cat.values==1).astype(int)
teen = (analysis_subset.age_cat.values==2).astype(int)


# The response variable is a multivariate normal of dimension k=3, for each of the measure instruments:
# 
# 1. Eyberg Child Behaviour Inventory, Intensity Subscale
# 2. Eyberg Child Behaviour Inventory, Problem Subscale
# 3. Child Behavior Checklist, Externalizing (T Score)
# 
# $$\left({
# \begin{array}{c}
#   {m_1}\\
#   {m_2}\\
#   {m_3}
# \end{array}
# }\right)_i \sim\text{MVN}(\mathbf{\mu},\Sigma)$$

# Means for each study are a draw from a multivariate normal.

# In[72]:


wishart = False

mu = pm.Normal('mu', 0, 0.001, value=[0]*k)

if wishart:
    T = pm.Wishart('T', k+1, np.eye(k), value=np.eye(k))
    
    m = [pm.MvNormal('m_{}'.format(i), mu, T, value=[0]*k) for i in range(len(unique_studies))]
else:
    sigmas = pm.Uniform('sigmas', 0, 100, value=[10]*k)
    rhos = pm.Uniform('rhos', -1, 1, value=[0]*int(comb(k, 2)))

    Sigma = pm.Lambda('Sigma', lambda s=sigmas, r=rhos: np.array([[s[0]**2, s[0]*s[1]*r[0], s[0]*s[2]*r[1]],
                                                                 [s[0]*s[1]*r[0], s[1]**2, s[1]*s[2]*r[2]],
                                                                 [s[0]*s[2]*r[1], s[1]*s[2]*r[2], s[2]**2]]))
    
    m = [pm.MvNormalCov('m_{}'.format(i), mu, Sigma, value=[0]*k) for i in range(len(unique_studies))]


# Unique intervention labels for each component; we will use these for component random effects.

# In[73]:


unique_child_intvn = np.unique(intvn[child_only.astype(bool)]).tolist()
unique_parent_intvn = np.unique(intvn[parent_only.astype(bool)]).tolist()
unique_multi_intvn = np.unique(intvn[multi_component.astype(bool)]).tolist()


# In[74]:


# Indices to random effect labels
child_component_index = [unique_child_intvn.index(x) for x in intvn[child_only.astype(bool)]]
parent_component_index = [unique_parent_intvn.index(x) for x in intvn[parent_only.astype(bool)]]
multi_component_index = [unique_multi_intvn.index(x) for x in intvn[multi_component.astype(bool)]]


# Treatment component random effects
# 
# $$X_i = \left[{
# \begin{array}{c}
#   {x_c}\\
#   {x_p}\\
#   {x_m}\\
# \end{array}
# }\right]_i$$
# 
# $$\begin{aligned}
# \beta_j^{(c)} &\sim N(\mu_{\beta}^{(c)},\tau_{\beta}^{(c)}) \\
# \beta_j^{(p)} &\sim N(\mu_{\beta}^{(p)},\tau_{\beta}^{(p)}) \\
# \beta_j^{(m)} &\sim N(\mu_{\beta}^{(m)},\tau_{\beta}^{(m)})
# \end{aligned}$$

# In[75]:


mu_beta = pm.Normal('mu_beta', 0, 0.001, value=[0]*3)
# sig_beta = pm.Uniform('sig_beta', 0, 100, value=1)
# tau_beta = sig_beta ** -2
tau_beta = pm.Gamma('tau_beta', 1, 0.1, value=1)

beta_c = pm.Normal('beta_c', mu_beta[0], tau_beta, value=[0]*len(unique_child_intvn))
beta_p = pm.Normal('beta_p', mu_beta[1], tau_beta, value=[0]*len(unique_parent_intvn))
beta_m = pm.Normal('beta_m', mu_beta[2], tau_beta, value=[0]*len(unique_multi_intvn))

b_c = pm.Lambda('b_c', lambda b=beta_c: 
             np.array([b[unique_child_intvn.index(x)] if child_only[i] else 0 for i,x in enumerate(intvn)]))
b_p = pm.Lambda('b_p', lambda b=beta_p: 
             np.array([b[unique_parent_intvn.index(x)] if parent_only[i] else 0 for i,x in enumerate(intvn)]))
b_m = pm.Lambda('b_m', lambda b=beta_m: 
             np.array([b[unique_multi_intvn.index(x)] if multi_component[i] else 0 for i,x in enumerate(intvn)]))


# Calculate the probability of being the best intervention.

# In[76]:


best = pm.Lambda('best', lambda b=mu_beta:  (b==b.min()).astype(int))


# Interaction of parent and multi-component with pre-k children.

# In[77]:


interaction = False

if interaction:
    beta_pk_p = pm.Normal('beta_pk_p', 0, 1e-5, value=0)
    beta_pk_m = pm.Normal('beta_pk_m', 0, 1e-5, value=0)
    b_pk_p = pm.Lambda('b_pk_p', lambda b=beta_pk_p: b * parent_only * pre_k)
    b_pk_m = pm.Lambda('b_pk_m', lambda b=beta_pk_m: b * multi_component * pre_k)


# In[78]:


betas = b_c + b_p + b_m 

if interaction:
    betas = betas + b_pk_p + b_pk_m


# Covariate effects of age and percent female.
# 
# $$\alpha \sim N(0, 1e5)$$

# In[79]:


alpha_age = pm.Normal('alpha_age', 0, 1e-5, value=[1,2])


# Unique study ID (`RefID`) and measure ID (`measure_instrument`) values.

# In[80]:


study_id = [unique_studies.index(x) for x in analysis_subset.RefID]
measure_id = [unique_measures.index(x) for x in analysis_subset.measure_instrument]


# Calculate expected response (treatment difference) as a function of treatment and covariates.
# 
# $$\theta_i = m_{j[i]k} + X_i \beta + \alpha x_{age}$$ 

# In[81]:


baseline_sd = analysis_subset.baseline_sd.values

@pm.deterministic
def theta(m=m, betas=betas, alpha_age=alpha_age):  

    mi = [m[i][j] for i,j in zip(study_id, measure_id)]
    
    age_effect = np.array([alpha_age[a-1] if a else 0 for a in age_cat])
    
    return(mi + baseline_sd*(betas + age_effect))


# Expected treatment effect for pre-K undergoing multi-component intervention, measused by Eyberg Child Behaviour Inventory, Intensity Subscale

# In[82]:


if wishart:
    baseline = pm.MvNormal('baseline', mu, T, value=[0]*k)
else:
    baseline = pm.MvNormalCov('baseline', mu, Sigma, value=[0]*k)


# Calculation of expected differences from baseline for each instrument.

# In[83]:


# ECBI intensity subscale
ecbi_intensity_sd = baseline_sd[np.array(measure_id)==0].mean()

prek_intensity_pred = pm.Lambda('prek_intensity_pred', 
                            lambda mu=baseline, a=alpha_age, b=mu_beta: mu[0] + ecbi_intensity_sd*(b + a[0]) )
school_intensity_pred = pm.Lambda('school_intensity_pred', 
                            lambda mu=baseline, a=alpha_age, b=mu_beta: mu[0] + ecbi_intensity_sd*b )
teen_intensity_pred = pm.Lambda('teen_intensity_pred', 
                            lambda mu=baseline, a=alpha_age, b=mu_beta: mu[0] + ecbi_intensity_sd*(b + a[1]) )

intensity_diff = pm.Lambda('intensity_diff', 
                           lambda base=baseline, prek=prek_intensity_pred, school=school_intensity_pred, 
                           teen=teen_intensity_pred: np.array([prek, school, teen])-base[0])
                            

# ECBI problem subscale
ecbi_problem_sd = baseline_sd[np.array(measure_id)==1].mean()

prek_problem_pred = pm.Lambda('prek_problem_pred', 
                            lambda mu=baseline, a=alpha_age, b=mu_beta: mu[1] + ecbi_problem_sd*(b + a[0]) )
school_problem_pred = pm.Lambda('school_problem_pred', 
                            lambda mu=baseline, a=alpha_age, b=mu_beta: mu[1] + ecbi_problem_sd*b )
teen_problem_pred = pm.Lambda('teen_problem_pred', 
                            lambda mu=baseline, a=alpha_age, b=mu_beta: mu[1] + ecbi_problem_sd*(b + a[1]) )

problem_diff = pm.Lambda('problem_diff', 
                           lambda base=baseline, prek=prek_problem_pred, school=school_problem_pred, 
                           teen=teen_problem_pred: np.array([prek, school, teen])-base[1])

# CBC T-score
cbct_sd = baseline_sd[np.array(measure_id)==2].mean()

prek_tscore_pred = pm.Lambda('prek_tscore_pred', 
                            lambda mu=baseline, a=alpha_age, b=mu_beta: mu[2] + cbct_sd*(b + a[0]) )
school_tscore_pred = pm.Lambda('school_tscore_pred', 
                            lambda mu=baseline, b=mu_beta: mu[2] + cbct_sd*b )
teen_tscore_pred = pm.Lambda('teen_tscore_pred', 
                            lambda mu=baseline, a=alpha_age, b=mu_beta: mu[2] + cbct_sd*(b + a[1]) )

tscore_diff = pm.Lambda('tscore_diff', 
                           lambda base=baseline, prek=prek_tscore_pred, school=school_tscore_pred, 
                           teen=teen_tscore_pred: np.array([prek, school, teen])-base[2])


# Finally, the likelihood is just a normal distribution, with the observed standard error of the treatment effect as the standard deviation of the estimates.
# 
# $$d_i \sim N(\theta_i, \hat{\sigma}^2)$$

# In[84]:


change_se = change_sd/np.sqrt(change_n)


# In[85]:


d = pm.Normal('d', theta, change_se**-2, observed=True, value=change_mean)


# Posterior predictive samples, if desired.

# In[86]:


include_gof_samples = False

if include_gof_samples:
    d_sim = pm.Normal('d_sim', theta, change_se**-2, size=len(change_mean))


# Instantiate MCMC model, and use Adaptive Metropolis for some vector-valued variables.

# In[87]:


M = pm.MCMC(locals())
M.use_step_method(pm.AdaptiveMetropolis, m)
M.use_step_method(pm.AdaptiveMetropolis, rhos)
M.use_step_method(pm.AdaptiveMetropolis, sigmas)
M.use_step_method(pm.AdaptiveMetropolis, mu_beta)
M.use_step_method(pm.AdaptiveMetropolis, [beta_c, beta_p, beta_m])


# Run two chains to allow for convergence evaluation.

# In[88]:


M.sample(500000, 490000)


# In[89]:


M.sample(500000, 490000)


# Summary of estimates of intervention components

# In[90]:


if wishart:
    pm.Matplot.plot(T)


# In[91]:


pm.Matplot.plot(mu_beta)


# In[92]:


pm.Matplot.summary_plot([mu_beta], custom_labels=['Child component', 'Parent component', 
                                                              'Multi-component'], chain=0, xlab="Effect size (standard deviations)")


# In[93]:


pm.Matplot.summary_plot([mu_beta], custom_labels=['Child component', 'Parent component', 
                                                              'Multi-component'])


# In[94]:


mu_beta.summary()


# In[95]:


mu_trace = M.mu_beta.trace()


# In[96]:


multi_child = mu_trace[:, 2] - mu_trace[:, 0]
multi_parent = mu_trace[:, 2] - mu_trace[:, 1]
parent_child = mu_trace[:, 1] - mu_trace[:, 0]


# In[97]:


'Multi-component vs child-only: mean={0:.2f}, 95% HPD=({1:.2}, {2:.2})'.format(multi_child.mean(), 
                                                                               *pm.utils.hpd(multi_child, 0.05))


# In[98]:


'Multi-component vs parent-only: mean={0:.2f}, 95% HPD=({1:.2}, {2:.2})'.format(multi_parent.mean(), 
                                                                               *pm.utils.hpd(multi_parent, 0.05))


# In[99]:


'Parent vs child-only: mean={0:.2f}, 95% HPD=({1:.2}, {2:.2})'.format(parent_child.mean(), 
                                                                               *pm.utils.hpd(parent_child, 0.05))


# In[100]:


pm.Matplot.plot(mu_beta)


# In[101]:


if interaction:
    pm.Matplot.summary_plot([beta_pk_m, beta_pk_p])


# Difference means by measure instrument.

# In[102]:


plt.figure(figsize=(24,4))
pm.Matplot.summary_plot([mu], custom_labels=unique_measures)


# In[103]:


pm.Matplot.plot(mu)


# In[104]:


if not wishart:
    plt.figure(figsize=(24,4))
    pm.Matplot.summary_plot([sigmas], custom_labels=unique_measures)


# In[105]:


sigmas.summary()


# In[106]:


pm.Matplot.plot(sigmas)


# In[107]:


if not wishart:
    pm.Matplot.summary_plot([rhos], custom_labels=['rho12', 'rho13', 'rho23'])


# Age effects for pre-k (top) and teen (bottom) groups, relative to pre-teen.

# In[108]:


pm.Matplot.summary_plot([alpha_age], custom_labels=['pre-K', 'teen'])


# In[109]:


alpha_age.summary()


# In[110]:


pm.Matplot.plot(alpha_age)


# ## Outcome Plots

# In[111]:


traces = [[school_intensity_pred, prek_intensity_pred, teen_intensity_pred],
          [school_problem_pred, prek_problem_pred, teen_problem_pred],
          [school_tscore_pred, prek_tscore_pred, teen_tscore_pred]]


# In[112]:


sb.set(style="white", palette="hot")
sb.despine(left=True)

colors = '#7fc97f','#beaed4','#fdc086','#386cb0'
#colors = '#a6611a','#dfc27d','#80cdc1','#018571'
# colors = ('#fef0d9','#fdcc8a','#fc8d59','#d7301f')[::-1]
# colors = sb.cubehelix_palette(4, start=2, rot=0, dark=.25, light=.75, reverse=False)

titles = ['Pre-K Children', 'School Children', 'Teenage Children']

for i,measure in enumerate(unique_measures):
    
    f, axes = plt.subplots(1, 3, figsize=(16, 6), sharex=True, sharey=True)
    measure_traces = traces[i]
    
    for j, trace in enumerate(np.array(measure_traces)[[1,0,2]]):
        
        x = np.random.choice(analysis_subset[analysis_subset.measure_instrument==measure].baseline_mean, 10000)
        
        c1, p1, m1 = trace.trace().T
        
        g = sb.distplot(x + c1, color=colors[0], ax=axes[j])
        sb.distplot(x + p1, color=colors[1], ax=axes[j])
        sb.distplot(x + m1, color=colors[2], ax=axes[j])
        if j:
            age_effect = alpha_age.trace()[:, j-1]
        else:
            age_effect = 0
        sb.distplot(x + baseline.trace()[:, i] + age_effect, color=colors[3], ax=axes[j]);
        g.set_title(titles[j])

        
        if not j:
            g.set_ylabel('Frequency')
            g.legend(g.lines, ['Child-only', 'Parent-only', 'Multi-component', 'Control/TAU'], loc='upper left')
        if j==1:
            g.set_xlabel(measure)


# Generate threshold proportions

# In[113]:


thresholds = [127, # ECBI, intensity
              11,  # ECBI, problem
              60]  # CBC t-score

age_labels = ['school', 'pre-k', 'teen']

estimates = []

for i,measure in enumerate(unique_measures):
    
    measure_traces = traces[i]
    cutoff = thresholds[i]
    
    for j, trace in enumerate(measure_traces):
        
        x = np.random.choice(analysis_subset[analysis_subset.measure_instrument==measure].baseline_mean, 10000)
        
        c1, p1, m1 = trace.trace().T
        
        _child_only = ((x + c1) > cutoff).mean().round(2)
        _parent_only = ((x + p1) > cutoff).mean().round(2)
        _multi_component = ((x + m1) > cutoff).mean().round(2)
        if j:
            age_effect = alpha_age.trace()[:, j-1]
        else:
            age_effect = 0
        _control = ((x + baseline.trace()[:, i] + age_effect) > cutoff).mean().round(2)
        
        print('\n{0}, {1}'.format(measure, age_labels[j]))
        print('Child-only: {0}\nParent-only: {1}\nMulti-component: {2}\nControl/TAU: {3}'.format(_child_only, 
                                                                                                 _parent_only,
                                                                                                 _multi_component,
                                                                                                 _control))
        
        estimates.append((measure, age_labels[j], _child_only, _parent_only, _multi_component, _control))
        

# In[114]:


thresholds = [127, # ECBI, intensity
              11,  # ECBI, problem
              60]  # CBC t-score

age_labels = ['school', 'pre-k', 'teen']

estimates = []

for i,measure in enumerate(unique_measures):
    
    measure_traces = traces[i]
    cutoff = thresholds[i]
    
    for j, trace in enumerate(measure_traces):
        
        x = np.random.choice(analysis_subset[analysis_subset.measure_instrument==measure].baseline_mean, 10000)
        
        c1, p1, m1 = trace.trace().T
        
        _child_only = ((x + c1) > cutoff).mean().round(2)
        _parent_only = ((x + p1) > cutoff).mean().round(2)
        _multi_component = ((x + m1) > cutoff).mean().round(2)
        if j:
            age_effect = alpha_age.trace()[:, j-1]
        else:
            age_effect = 0
        _control = ((x + baseline.trace()[:, i] + age_effect) > cutoff).mean().round(2)
        
        print('\n{0}, {1}'.format(measure, age_labels[j]))
        print('Child-only: {0}\nParent-only: {1}\nMulti-component: {2}\nControl/TAU: {3}'.format(_child_only, 
                                                                                                 _parent_only,
                                                                                                 _multi_component,
                                                                                                 _control))
        
        estimates.append((measure, age_labels[j], _child_only, _parent_only, _multi_component, _control))
        

# In[115]:


estimates


# In[116]:


estimates_filename = 'estimates%s.csv' % (bool(~pcit_as_multicomponent)*'_pcit')
np.savetxt(estimates_filename, np.array([e[2:] for e in estimates]), delimiter='\t', fmt='%1.2f')


# In[117]:


pd.DataFrame(estimates, columns=['Instrument', 'Age group', 'Child-only', 
                                 'Parent-only', 'Multi-component', 'TAU/Control']).to_csv('outcome_proportions.csv')


# Summary statistics for TAU/control

# In[118]:


baseline.summary()


# Summary statistics for intensity subscale

# In[119]:


prek_intensity_pred.summary()
school_intensity_pred.summary()
teen_intensity_pred.summary()


# Summary statistics for problem subscale

# In[120]:


prek_problem_pred.summary()
school_problem_pred.summary()
teen_problem_pred.summary()


# Summary statistics for t scores

# In[121]:


prek_tscore_pred.summary()
school_tscore_pred.summary()
teen_tscore_pred.summary()


# #### Summary statistics for differences.
# 
# Each summary corresponds to teh differences between baseline and age x component categories for each outcome. The order is:
# 
# - child-only, pre-k
# - child-only, school
# - child-only, teen
# - parent-only, pre-k
# - parent-only, school
# - parent-only, teen
# - multi, pre-k
# - multi, school
# - multi, teen
# 

# In[122]:


intensity_diff.summary()
problem_diff.summary()
tscore_diff.summary()


# In[124]:


M.best.summary()