#!/usr/bin/env python # coding: utf-8 # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import matplotlib.pyplot as plt import seaborn as sb import numpy as np import pandas as pd import pymc as pm from scipy.misc import comb # In[2]: np.random.seed(42) # ## Data import and cleaning # In[3]: excel_sucks = True if excel_sucks: study_char = pd.read_csv("study_characteristics.csv", index_col='RefID', na_values=['-', 'NR']) outcomes = pd.read_csv("outcomes.csv", na_values=['ND', 'NR']) else: data_file = 'DBD Data for Meta Analyses_4-7-15_withNRCTs_RAE.xlsx' study_char = pd.read_excel(data_file, "Study Characteristics", index_col='RefID', na_values=['-', 'NR']) outcomes = pd.read_excel(data_file, "Outcomes", na_values=['ND', 'NR']) # In[4]: study_char.tail() # Data cleaning # In[5]: # Cast outcomes variables to floats for col in ('Last FU Mean', 'Last FU SD',): outcomes[col] = outcomes[col].astype(float) # In[6]: # Recode age category study_char['age_cat'] = study_char.AgeCat.replace({'PRE-K':1, 'SCHOOL':0, 'TEEN':2}) # In[7]: # Fix data label typo outcomes['Measure Instrument'] = outcomes['Measure Instrument'].replace({'Eyberg Child Behaviour Inventory, problem Subscale': 'Eyberg Child Behaviour Inventory, Problem Subscale'}) outcomes.Units = outcomes.Units.replace({'scale': 'Scale'}) # In[8]: # Parse followup times and convert to months split_fut = outcomes.loc[outcomes['Last FU Time'].notnull(), 'Last FU Time'].apply(lambda x: str(x).split(' ')[:2]) fut_months = [float(time)/52.*(unit=='weeks') or float(time) for time, unit in split_fut] outcomes.loc[outcomes['Last FU Time'].notnull(), 'Last FU Time'] = fut_months # We are assumung all CBC Externalizing values over 50 are T-scores, and those under 50 are raw scores. This recodes those observations. # In[9]: cbce_ind = outcomes['Measure Instrument'].apply(lambda x: x.startswith('Child Behavior Checklist, Externalizing')) under_50 = outcomes['BL Mean']<50 outcomes.loc[cbce_ind & (under_50^True), 'Measure Instrument'] = 'Child Behavior Checklist, Externalizing (T Score)' outcomes.loc[cbce_ind & under_50, 'Measure Instrument'] = 'Child Behavior Checklist, Externalizing' # Recode measure instrument variables # In[10]: instrument = [] subtype = [] units = [] for i,row in outcomes.iterrows(): separator = row['Measure Instrument'].find(',') if separator == -1: separator = row['Measure Instrument'].find('-') instrument.append(row['Measure Instrument'][:separator]) s = row['Measure Instrument'][separator+2:] paren = s.find('(') if paren > -1: subtype.append(s[:paren-1]) units.append(s[paren+1:-1]) else: subtype.append(s) if s.endswith('scale'): units.append('Scale') else: units.append('Score') new_cols = pd.DataFrame({'instrument': instrument, 'subtype': subtype, 'units': units}, index=outcomes.index) # In[11]: outcomes['Measure Instrument'].value_counts() # In[12]: new_cols.head() # In[13]: # Append new columns outcomes = outcomes.join(new_cols) # In[14]: outcomes.intvn.value_counts() # In[15]: outcomes['Ref ID'].unique() # ## Data summaries # # Cross-tabulation of the outcome counts by measure instrument # In[16]: pd.crosstab(outcomes['instrument'], outcomes['Outcome']) # Distribution of age categories # In[17]: study_char.AgeCat.value_counts() # Frequencies of various intervention types # In[18]: study_char['Intervention Type'].value_counts() # ## Extract variables of interest and merge tables # In[19]: KQ1 = study_char[study_char.KQ=='KQ1'] # In[20]: study_varnames = ['Year', 'age_cat', 'Geographic setting', 'Age mean (years) ', 'Age SD (years)', 'Age min (years)', 'Age max (years)', 'Proportion Male (%)', 'Design'] study_vars = KQ1[study_varnames].rename(columns={'Geographic setting': 'country', 'Age mean (years) ': 'age_mean', 'Age SD (years)': 'age_sd', 'Age min (years)': 'age_min', 'Age max (years)': 'age_max', 'Proportion Male (%)': 'p_male'}) # In[21]: study_vars.head() # In[22]: study_vars.p_male.hist() # Proportion missing # In[23]: study_vars.isnull().mean(0).round(2) # Will assume the mean age for those which are missing is simply the midpoint between minimum and maximum values # In[24]: est_means = study_vars.apply(lambda x: x.age_min + (x.age_max - x.age_min) / 2, axis=1)[study_vars.age_mean.isnull()] study_vars.loc[study_vars.age_mean.isnull(), 'age_mean'] = est_means study_vars.age_mean.isnull().sum() # In[25]: outcomes_varnames = ['Ref ID', 'Measure Instrument', 'instrument', 'subtype', 'units', 'intvn', 'cc', 'pc', 'fc', 'BL N', 'BL Mean', 'BL SD', 'EOT \nN', 'EOT Mean', 'EOT \nSD', 'Last FU Time', 'Last FU N', 'Last FU Mean', 'Last FU SD', 'CS Group N', 'CS Mean', 'CS SD'] # In[26]: outcomes_vars = outcomes[outcomes_varnames].rename(columns={'Ref ID': 'RefID', 'Measure Instrument': 'measure_instrument', 'cc': 'child_component', 'pc': 'parent_component', 'fc': 'family_component', 'oc': 'other_component', 'BL N': 'baseline_n', 'BL Mean': 'baseline_mean', 'BL SD': 'baseline_sd', 'EOT \nN': 'end_treat_n', 'EOT Mean': 'end_treat_mean', 'EOT \nSD': 'end_treat_sd', 'Last FU Time': 'followup_time', 'Last FU N': 'followup_n', 'Last FU Mean': 'followup_mean', 'Last FU SD': 'followup_sd', 'CS Group N': 'change_n', 'CS Mean': 'change_mean', 'CS SD': 'change_sd'}) # Recode intervention clasification # In[27]: outcomes_vars.isnull().sum() # In[28]: control = ((outcomes_vars.child_component^True) & (outcomes_vars.parent_component^True) & (outcomes_vars.family_component^True)).astype(int) child_only = ((outcomes_vars.child_component) & (outcomes_vars.parent_component^True) & (outcomes_vars.family_component^True)).astype(int) parent_only = ((outcomes_vars.child_component^True) & (outcomes_vars.parent_component) & (outcomes_vars.family_component^True)).astype(int) outcomes_vars.ix[child_only.astype(bool), ['child_component', 'parent_component', 'family_component']] # In[29]: multi_component = ((parent_only^True) & (child_only^True) & (control^True)).astype(int) outcomes_vars['child_only'] = child_only outcomes_vars['parent_only'] = parent_only outcomes_vars['multi_component'] = multi_component # If requested, change PCIT to parent-only. # In[30]: pcit_as_multicomponent = True if not pcit_as_multicomponent: outcomes_vars.loc[outcomes_vars.intvn=='pcit', 'parenr_only'] = 1 outcomes_vars.loc[outcomes_vars.intvn=='pcit', 'multi_component'] = 0 # Obtain subset with non-missing EOT data # In[31]: eot_subset = outcomes_vars[outcomes_vars.end_treat_mean.notnull() & outcomes_vars.end_treat_sd.notnull()].copy() # Calculate EOT difference # In[32]: eot_subset['eot_diff_mean'] = eot_subset.end_treat_mean - eot_subset.baseline_mean # In[33]: eot_subset['eot_diff_sd'] = eot_subset.baseline_sd + eot_subset.end_treat_sd # In[34]: eot_subset['eot_diff_n'] = eot_subset[['baseline_n', 'end_treat_n']].min(1) # Distribution of baseline means among outcome metrics # In[35]: for instrument in ('Eyberg Child Behaviour Inventory', 'Child Behavior Checklist', 'Strengths and Difficulties Questionnaire'): eot_subset[eot_subset.instrument==instrument]['baseline_mean'].hist(by=eot_subset['subtype'], sharex=True) plt.suptitle(instrument); # In[36]: eot_subset.instrument.value_counts() # In[37]: eot_subset.instrument.value_counts() # In[38]: eot_subset[eot_subset.RefID==441] # Several studies use multiple instruments and metrics within instruments # In[39]: eot_subset.groupby(['RefID', 'instrument'])['subtype'].value_counts() # In[40]: pd.crosstab(eot_subset.instrument, eot_subset.subtype) # In[41]: x = eot_subset[eot_subset.instrument=='Eyberg Child Behaviour Inventory'] pd.crosstab(x.instrument, x.subtype) # In[42]: x = eot_subset[eot_subset.instrument=='Child Behavior Checklist'] pd.crosstab(x.instrument, x.subtype) # In[43]: x = eot_subset[eot_subset.instrument=='Strengths and Difficulties Questionnaire'] pd.crosstab(x.instrument, x.subtype) # Filter out non-RCT studies, if requested. # In[44]: rct_only = True rct_study_vars = study_vars[study_vars.Design=='RCT'] # Merge study variables and outcomes # In[45]: merged_vars = rct_study_vars.merge(eot_subset, left_index=True, right_on='RefID') merged_vars.shape # For now, restrict to the three most prevalent metrics. # In[46]: merged_vars.measure_instrument.value_counts() # Take only the top 3 measures # In[47]: top_measures = 3 # In[48]: merged_vars.RefID.unique() # In[49]: analysis_subset = merged_vars[merged_vars.measure_instrument.isin(merged_vars.measure_instrument.value_counts().index[:top_measures])] analysis_subset.groupby('measure_instrument')['baseline_mean'].max() # In[50]: analysis_subset.groupby('measure_instrument').baseline_mean.hist() # In[51]: analysis_subset['baseline_mean'].hist(by=analysis_subset['measure_instrument'],sharex=True) # In[52]: analysis_subset['eot_diff_mean'].hist(by=analysis_subset['measure_instrument'],sharex=True) # In[53]: for x in analysis_subset.measure_instrument.unique(): plt.figure() analysis_subset[analysis_subset.measure_instrument==x].baseline_mean.plot(kind='kde', alpha=0.4, grid=False) analysis_subset[analysis_subset.measure_instrument==x].end_treat_mean.plot(kind='kde', alpha=0.4, grid=False) plt.gca().set_xlabel(x) # Number of TAU/Control arms in subset # In[54]: analysis_subset.shape[0] - analysis_subset.multi_component.sum() - analysis_subset.child_only.sum() - analysis_subset.parent_only.sum() # In[55]: set(analysis_subset[analysis_subset.multi_component==1].RefID.values) # In[56]: set(analysis_subset[analysis_subset.child_only==1].RefID.values) # In[57]: set(analysis_subset[analysis_subset.parent_only==1].RefID.values) # In[58]: set(analysis_subset[(analysis_subset.parent_only==0) & (analysis_subset.child_only==0) & (analysis_subset.multi_component==0)].RefID.values) # Total arm sample sizes # In[59]: analysis_subset.loc[analysis_subset.child_only==1, 'baseline_n'].sum() # In[60]: analysis_subset.loc[analysis_subset.parent_only==1, 'baseline_n'].sum() # In[61]: analysis_subset.loc[analysis_subset.multi_component==1, 'baseline_n'].sum() # In[62]: analysis_subset.loc[(analysis_subset.multi_component==0) & (analysis_subset.parent_only==0) & (analysis_subset.child_only==0), 'baseline_n'].sum() # ## Meta-analysis # Number of studies in analysis subset # In[63]: unique_studies = analysis_subset.RefID.unique().tolist() len(unique_studies) # In[64]: analysis_subset.RefID.unique() # In[65]: analysis_subset.groupby(['RefID', 'measure_instrument'])['measure_instrument'].count().unstack().fillna(0) # Study counts by # In[66]: data_by_refid = analysis_subset.groupby('RefID') data_by_refid.apply(lambda x: (x[['child_component', 'parent_component', 'multi_component']].sum()>1)).sum() # In[67]: data_by_refid.apply(lambda x: (x[['child_component', 'parent_component', 'multi_component']].sum(1)==0).sum()>0).astype(int) # In[68]: data_by_refid.apply(lambda x: x.age_cat.unique()[0]).value_counts() # We are restricting the analysis to the 4 most prevalent measure instruments in the database. # In[69]: unique_measures = analysis_subset.measure_instrument.unique().tolist() k = len(unique_measures) unique_measures # Three intervention components were coded: # # * `child_component` # * `parent_component` # * `multi_component` # In[70]: p_male, age_cat, intvn = analysis_subset[['p_male', 'age_cat', 'intvn']].values.T child_only, parent_only, multi_component = analysis_subset[['child_only', 'parent_only', 'multi_component']].values.T change_n, change_mean, change_sd = analysis_subset[['eot_diff_n', 'eot_diff_mean', 'eot_diff_sd']].values.T # In[71]: school = (analysis_subset.age_cat.values==0).astype(int) pre_k = (analysis_subset.age_cat.values==1).astype(int) teen = (analysis_subset.age_cat.values==2).astype(int) # The response variable is a multivariate normal of dimension k=3, for each of the measure instruments: # # 1. Eyberg Child Behaviour Inventory, Intensity Subscale # 2. Eyberg Child Behaviour Inventory, Problem Subscale # 3. Child Behavior Checklist, Externalizing (T Score) # # $$\left({ # \begin{array}{c} # {m_1}\\ # {m_2}\\ # {m_3} # \end{array} # }\right)_i \sim\text{MVN}(\mathbf{\mu},\Sigma)$$ # Means for each study are a draw from a multivariate normal. # In[72]: wishart = False mu = pm.Normal('mu', 0, 0.001, value=[0]*k) if wishart: T = pm.Wishart('T', k+1, np.eye(k), value=np.eye(k)) m = [pm.MvNormal('m_{}'.format(i), mu, T, value=[0]*k) for i in range(len(unique_studies))] else: sigmas = pm.Uniform('sigmas', 0, 100, value=[10]*k) rhos = pm.Uniform('rhos', -1, 1, value=[0]*int(comb(k, 2))) Sigma = pm.Lambda('Sigma', lambda s=sigmas, r=rhos: np.array([[s[0]**2, s[0]*s[1]*r[0], s[0]*s[2]*r[1]], [s[0]*s[1]*r[0], s[1]**2, s[1]*s[2]*r[2]], [s[0]*s[2]*r[1], s[1]*s[2]*r[2], s[2]**2]])) m = [pm.MvNormalCov('m_{}'.format(i), mu, Sigma, value=[0]*k) for i in range(len(unique_studies))] # Unique intervention labels for each component; we will use these for component random effects. # In[73]: unique_child_intvn = np.unique(intvn[child_only.astype(bool)]).tolist() unique_parent_intvn = np.unique(intvn[parent_only.astype(bool)]).tolist() unique_multi_intvn = np.unique(intvn[multi_component.astype(bool)]).tolist() # In[74]: # Indices to random effect labels child_component_index = [unique_child_intvn.index(x) for x in intvn[child_only.astype(bool)]] parent_component_index = [unique_parent_intvn.index(x) for x in intvn[parent_only.astype(bool)]] multi_component_index = [unique_multi_intvn.index(x) for x in intvn[multi_component.astype(bool)]] # Treatment component random effects # # $$X_i = \left[{ # \begin{array}{c} # {x_c}\\ # {x_p}\\ # {x_m}\\ # \end{array} # }\right]_i$$ # # $$\begin{aligned} # \beta_j^{(c)} &\sim N(\mu_{\beta}^{(c)},\tau_{\beta}^{(c)}) \\ # \beta_j^{(p)} &\sim N(\mu_{\beta}^{(p)},\tau_{\beta}^{(p)}) \\ # \beta_j^{(m)} &\sim N(\mu_{\beta}^{(m)},\tau_{\beta}^{(m)}) # \end{aligned}$$ # In[75]: mu_beta = pm.Normal('mu_beta', 0, 0.001, value=[0]*3) # sig_beta = pm.Uniform('sig_beta', 0, 100, value=1) # tau_beta = sig_beta ** -2 tau_beta = pm.Gamma('tau_beta', 1, 0.1, value=1) beta_c = pm.Normal('beta_c', mu_beta[0], tau_beta, value=[0]*len(unique_child_intvn)) beta_p = pm.Normal('beta_p', mu_beta[1], tau_beta, value=[0]*len(unique_parent_intvn)) beta_m = pm.Normal('beta_m', mu_beta[2], tau_beta, value=[0]*len(unique_multi_intvn)) b_c = pm.Lambda('b_c', lambda b=beta_c: np.array([b[unique_child_intvn.index(x)] if child_only[i] else 0 for i,x in enumerate(intvn)])) b_p = pm.Lambda('b_p', lambda b=beta_p: np.array([b[unique_parent_intvn.index(x)] if parent_only[i] else 0 for i,x in enumerate(intvn)])) b_m = pm.Lambda('b_m', lambda b=beta_m: np.array([b[unique_multi_intvn.index(x)] if multi_component[i] else 0 for i,x in enumerate(intvn)])) # Calculate the probability of being the best intervention. # In[76]: best = pm.Lambda('best', lambda b=mu_beta: (b==b.min()).astype(int)) # Interaction of parent and multi-component with pre-k children. # In[77]: interaction = False if interaction: beta_pk_p = pm.Normal('beta_pk_p', 0, 1e-5, value=0) beta_pk_m = pm.Normal('beta_pk_m', 0, 1e-5, value=0) b_pk_p = pm.Lambda('b_pk_p', lambda b=beta_pk_p: b * parent_only * pre_k) b_pk_m = pm.Lambda('b_pk_m', lambda b=beta_pk_m: b * multi_component * pre_k) # In[78]: betas = b_c + b_p + b_m if interaction: betas = betas + b_pk_p + b_pk_m # Covariate effects of age and percent female. # # $$\alpha \sim N(0, 1e5)$$ # In[79]: alpha_age = pm.Normal('alpha_age', 0, 1e-5, value=[1,2]) # Unique study ID (`RefID`) and measure ID (`measure_instrument`) values. # In[80]: study_id = [unique_studies.index(x) for x in analysis_subset.RefID] measure_id = [unique_measures.index(x) for x in analysis_subset.measure_instrument] # Calculate expected response (treatment difference) as a function of treatment and covariates. # # $$\theta_i = m_{j[i]k} + X_i \beta + \alpha x_{age}$$ # In[81]: baseline_sd = analysis_subset.baseline_sd.values @pm.deterministic def theta(m=m, betas=betas, alpha_age=alpha_age): mi = [m[i][j] for i,j in zip(study_id, measure_id)] age_effect = np.array([alpha_age[a-1] if a else 0 for a in age_cat]) return(mi + baseline_sd*(betas + age_effect)) # Expected treatment effect for pre-K undergoing multi-component intervention, measused by Eyberg Child Behaviour Inventory, Intensity Subscale # In[82]: if wishart: baseline = pm.MvNormal('baseline', mu, T, value=[0]*k) else: baseline = pm.MvNormalCov('baseline', mu, Sigma, value=[0]*k) # Calculation of expected differences from baseline for each instrument. # In[83]: # ECBI intensity subscale ecbi_intensity_sd = baseline_sd[np.array(measure_id)==0].mean() prek_intensity_pred = pm.Lambda('prek_intensity_pred', lambda mu=baseline, a=alpha_age, b=mu_beta: mu[0] + ecbi_intensity_sd*(b + a[0]) ) school_intensity_pred = pm.Lambda('school_intensity_pred', lambda mu=baseline, a=alpha_age, b=mu_beta: mu[0] + ecbi_intensity_sd*b ) teen_intensity_pred = pm.Lambda('teen_intensity_pred', lambda mu=baseline, a=alpha_age, b=mu_beta: mu[0] + ecbi_intensity_sd*(b + a[1]) ) intensity_diff = pm.Lambda('intensity_diff', lambda base=baseline, prek=prek_intensity_pred, school=school_intensity_pred, teen=teen_intensity_pred: np.array([prek, school, teen])-base[0]) # ECBI problem subscale ecbi_problem_sd = baseline_sd[np.array(measure_id)==1].mean() prek_problem_pred = pm.Lambda('prek_problem_pred', lambda mu=baseline, a=alpha_age, b=mu_beta: mu[1] + ecbi_problem_sd*(b + a[0]) ) school_problem_pred = pm.Lambda('school_problem_pred', lambda mu=baseline, a=alpha_age, b=mu_beta: mu[1] + ecbi_problem_sd*b ) teen_problem_pred = pm.Lambda('teen_problem_pred', lambda mu=baseline, a=alpha_age, b=mu_beta: mu[1] + ecbi_problem_sd*(b + a[1]) ) problem_diff = pm.Lambda('problem_diff', lambda base=baseline, prek=prek_problem_pred, school=school_problem_pred, teen=teen_problem_pred: np.array([prek, school, teen])-base[1]) # CBC T-score cbct_sd = baseline_sd[np.array(measure_id)==2].mean() prek_tscore_pred = pm.Lambda('prek_tscore_pred', lambda mu=baseline, a=alpha_age, b=mu_beta: mu[2] + cbct_sd*(b + a[0]) ) school_tscore_pred = pm.Lambda('school_tscore_pred', lambda mu=baseline, b=mu_beta: mu[2] + cbct_sd*b ) teen_tscore_pred = pm.Lambda('teen_tscore_pred', lambda mu=baseline, a=alpha_age, b=mu_beta: mu[2] + cbct_sd*(b + a[1]) ) tscore_diff = pm.Lambda('tscore_diff', lambda base=baseline, prek=prek_tscore_pred, school=school_tscore_pred, teen=teen_tscore_pred: np.array([prek, school, teen])-base[2]) # Finally, the likelihood is just a normal distribution, with the observed standard error of the treatment effect as the standard deviation of the estimates. # # $$d_i \sim N(\theta_i, \hat{\sigma}^2)$$ # In[84]: change_se = change_sd/np.sqrt(change_n) # In[85]: d = pm.Normal('d', theta, change_se**-2, observed=True, value=change_mean) # Posterior predictive samples, if desired. # In[86]: include_gof_samples = False if include_gof_samples: d_sim = pm.Normal('d_sim', theta, change_se**-2, size=len(change_mean)) # Instantiate MCMC model, and use Adaptive Metropolis for some vector-valued variables. # In[87]: M = pm.MCMC(locals()) M.use_step_method(pm.AdaptiveMetropolis, m) M.use_step_method(pm.AdaptiveMetropolis, rhos) M.use_step_method(pm.AdaptiveMetropolis, sigmas) M.use_step_method(pm.AdaptiveMetropolis, mu_beta) M.use_step_method(pm.AdaptiveMetropolis, [beta_c, beta_p, beta_m]) # Run two chains to allow for convergence evaluation. # In[88]: M.sample(500000, 490000) # In[89]: M.sample(500000, 490000) # Summary of estimates of intervention components # In[90]: if wishart: pm.Matplot.plot(T) # In[91]: pm.Matplot.plot(mu_beta) # In[92]: pm.Matplot.summary_plot([mu_beta], custom_labels=['Child component', 'Parent component', 'Multi-component'], chain=0, xlab="Effect size (standard deviations)") # In[93]: pm.Matplot.summary_plot([mu_beta], custom_labels=['Child component', 'Parent component', 'Multi-component']) # In[94]: mu_beta.summary() # In[95]: mu_trace = M.mu_beta.trace() # In[96]: multi_child = mu_trace[:, 2] - mu_trace[:, 0] multi_parent = mu_trace[:, 2] - mu_trace[:, 1] parent_child = mu_trace[:, 1] - mu_trace[:, 0] # In[97]: 'Multi-component vs child-only: mean={0:.2f}, 95% HPD=({1:.2}, {2:.2})'.format(multi_child.mean(), *pm.utils.hpd(multi_child, 0.05)) # In[98]: 'Multi-component vs parent-only: mean={0:.2f}, 95% HPD=({1:.2}, {2:.2})'.format(multi_parent.mean(), *pm.utils.hpd(multi_parent, 0.05)) # In[99]: 'Parent vs child-only: mean={0:.2f}, 95% HPD=({1:.2}, {2:.2})'.format(parent_child.mean(), *pm.utils.hpd(parent_child, 0.05)) # In[100]: pm.Matplot.plot(mu_beta) # In[101]: if interaction: pm.Matplot.summary_plot([beta_pk_m, beta_pk_p]) # Difference means by measure instrument. # In[102]: plt.figure(figsize=(24,4)) pm.Matplot.summary_plot([mu], custom_labels=unique_measures) # In[103]: pm.Matplot.plot(mu) # In[104]: if not wishart: plt.figure(figsize=(24,4)) pm.Matplot.summary_plot([sigmas], custom_labels=unique_measures) # In[105]: sigmas.summary() # In[106]: pm.Matplot.plot(sigmas) # In[107]: if not wishart: pm.Matplot.summary_plot([rhos], custom_labels=['rho12', 'rho13', 'rho23']) # Age effects for pre-k (top) and teen (bottom) groups, relative to pre-teen. # In[108]: pm.Matplot.summary_plot([alpha_age], custom_labels=['pre-K', 'teen']) # In[109]: alpha_age.summary() # In[110]: pm.Matplot.plot(alpha_age) # ## Outcome Plots # In[111]: traces = [[school_intensity_pred, prek_intensity_pred, teen_intensity_pred], [school_problem_pred, prek_problem_pred, teen_problem_pred], [school_tscore_pred, prek_tscore_pred, teen_tscore_pred]] # In[112]: sb.set(style="white", palette="hot") sb.despine(left=True) colors = '#7fc97f','#beaed4','#fdc086','#386cb0' #colors = '#a6611a','#dfc27d','#80cdc1','#018571' # colors = ('#fef0d9','#fdcc8a','#fc8d59','#d7301f')[::-1] # colors = sb.cubehelix_palette(4, start=2, rot=0, dark=.25, light=.75, reverse=False) titles = ['Pre-K Children', 'School Children', 'Teenage Children'] for i,measure in enumerate(unique_measures): f, axes = plt.subplots(1, 3, figsize=(16, 6), sharex=True, sharey=True) measure_traces = traces[i] for j, trace in enumerate(np.array(measure_traces)[[1,0,2]]): x = np.random.choice(analysis_subset[analysis_subset.measure_instrument==measure].baseline_mean, 10000) c1, p1, m1 = trace.trace().T g = sb.distplot(x + c1, color=colors[0], ax=axes[j]) sb.distplot(x + p1, color=colors[1], ax=axes[j]) sb.distplot(x + m1, color=colors[2], ax=axes[j]) if j: age_effect = alpha_age.trace()[:, j-1] else: age_effect = 0 sb.distplot(x + baseline.trace()[:, i] + age_effect, color=colors[3], ax=axes[j]); g.set_title(titles[j]) if not j: g.set_ylabel('Frequency') g.legend(g.lines, ['Child-only', 'Parent-only', 'Multi-component', 'Control/TAU'], loc='upper left') if j==1: g.set_xlabel(measure) # Generate threshold proportions # In[113]: thresholds = [127, # ECBI, intensity 11, # ECBI, problem 60] # CBC t-score age_labels = ['school', 'pre-k', 'teen'] estimates = [] for i,measure in enumerate(unique_measures): measure_traces = traces[i] cutoff = thresholds[i] for j, trace in enumerate(measure_traces): x = np.random.choice(analysis_subset[analysis_subset.measure_instrument==measure].baseline_mean, 10000) c1, p1, m1 = trace.trace().T _child_only = ((x + c1) > cutoff).mean().round(2) _parent_only = ((x + p1) > cutoff).mean().round(2) _multi_component = ((x + m1) > cutoff).mean().round(2) if j: age_effect = alpha_age.trace()[:, j-1] else: age_effect = 0 _control = ((x + baseline.trace()[:, i] + age_effect) > cutoff).mean().round(2) print('\n{0}, {1}'.format(measure, age_labels[j])) print('Child-only: {0}\nParent-only: {1}\nMulti-component: {2}\nControl/TAU: {3}'.format(_child_only, _parent_only, _multi_component, _control)) estimates.append((measure, age_labels[j], _child_only, _parent_only, _multi_component, _control)) # In[114]: thresholds = [127, # ECBI, intensity 11, # ECBI, problem 60] # CBC t-score age_labels = ['school', 'pre-k', 'teen'] estimates = [] for i,measure in enumerate(unique_measures): measure_traces = traces[i] cutoff = thresholds[i] for j, trace in enumerate(measure_traces): x = np.random.choice(analysis_subset[analysis_subset.measure_instrument==measure].baseline_mean, 10000) c1, p1, m1 = trace.trace().T _child_only = ((x + c1) > cutoff).mean().round(2) _parent_only = ((x + p1) > cutoff).mean().round(2) _multi_component = ((x + m1) > cutoff).mean().round(2) if j: age_effect = alpha_age.trace()[:, j-1] else: age_effect = 0 _control = ((x + baseline.trace()[:, i] + age_effect) > cutoff).mean().round(2) print('\n{0}, {1}'.format(measure, age_labels[j])) print('Child-only: {0}\nParent-only: {1}\nMulti-component: {2}\nControl/TAU: {3}'.format(_child_only, _parent_only, _multi_component, _control)) estimates.append((measure, age_labels[j], _child_only, _parent_only, _multi_component, _control)) # In[115]: estimates # In[116]: estimates_filename = 'estimates%s.csv' % (bool(~pcit_as_multicomponent)*'_pcit') np.savetxt(estimates_filename, np.array([e[2:] for e in estimates]), delimiter='\t', fmt='%1.2f') # In[117]: pd.DataFrame(estimates, columns=['Instrument', 'Age group', 'Child-only', 'Parent-only', 'Multi-component', 'TAU/Control']).to_csv('outcome_proportions.csv') # Summary statistics for TAU/control # In[118]: baseline.summary() # Summary statistics for intensity subscale # In[119]: prek_intensity_pred.summary() school_intensity_pred.summary() teen_intensity_pred.summary() # Summary statistics for problem subscale # In[120]: prek_problem_pred.summary() school_problem_pred.summary() teen_problem_pred.summary() # Summary statistics for t scores # In[121]: prek_tscore_pred.summary() school_tscore_pred.summary() teen_tscore_pred.summary() # #### Summary statistics for differences. # # Each summary corresponds to teh differences between baseline and age x component categories for each outcome. The order is: # # - child-only, pre-k # - child-only, school # - child-only, teen # - parent-only, pre-k # - parent-only, school # - parent-only, teen # - multi, pre-k # - multi, school # - multi, teen # # In[122]: intensity_diff.summary() problem_diff.summary() tscore_diff.summary() # In[124]: M.best.summary()