# Import modules and set options
# (IPython magic: render matplotlib figures inline in the notebook)
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Plain-text frame reprs and generous display limits for this wide dataset.
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 100)
Import data
# Load the LSL-DR dataset; the first CSV column is the row index.
lsl_dr = pd.read_csv('lsl_dr.csv', index_col=0)
/usr/local/lib/python3.4/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (31,33,41) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows)
# Preview the first rows of the loaded data.
lsl_dr.head()
redcap_event_name academic_year hl male _race prim_lang sib \ 0 initial_assessment_arm_1 2009-2010 0 0 0 0 0 1 initial_assessment_arm_1 2010-2011 0 0 7 0 2 2 initial_assessment_arm_1 2010-2011 0 0 7 0 2 3 initial_assessment_arm_1 2009-2010 0 1 7 0 1 4 initial_assessment_arm_1 2005-2006 0 1 0 0 3 _mother_ed father_ed premature_age ... known_synd \ 0 5 4 9 ... 0 1 6 6 7 ... 0 2 6 6 7 ... 1 3 6 6 8 ... 0 4 2 2 8 ... 0 synd_or_disab race age_test domain school score test_name \ 0 1 0 NaN NaN NaN NaN NaN 1 0 NaN NaN NaN NaN NaN NaN 2 1 NaN NaN NaN NaN NaN NaN 3 0 NaN NaN NaN NaN NaN NaN 4 NaN 0 48 Articulation 521 41 NaN test_type academic_year_start 0 NaN 2009 1 NaN 2010 2 NaN 2010 3 NaN 2009 4 Goldman 2005 [5 rows x 74 columns]
Indicator for non-profound hearing loss
# Indicator for non-profound hearing loss (degree_hl codes below 6);
# unknown degree propagates as NaN.
below_profound = lsl_dr['degree_hl'] < 6
lsl_dr['deg_hl_below6'] = below_profound
lsl_dr.loc[lsl_dr['degree_hl'].isnull(), 'deg_hl_below6'] = np.nan
Indicator for first intervention outside OPTION
# Indicator: first intervention happened before entering OPTION (entry age
# exceeds age at first intervention).
lsl_dr['int_outside_option'] = lsl_dr.age > lsl_dr.age_int
# NOTE(review): rows with age < age_int are deliberately set to NaN here —
# confirm that "intervention after entry" is meant to be unknown, not False.
lsl_dr.loc[lsl_dr.age < lsl_dr.age_int, 'int_outside_option'] = np.nan
# Fix: when either age is missing, the comparison above silently yields
# False. Mark those rows NaN instead, consistent with the other derived
# indicators (deg_hl_below6, mother_hs).
lsl_dr.loc[lsl_dr.age.isnull() | lsl_dr.age_int.isnull(), 'int_outside_option'] = np.nan
Indicator for high school graduation of mother
# Indicator: mother completed at least high school (mother_ed codes > 1);
# unknown maternal education propagates as NaN.
lsl_dr['mother_hs'] = lsl_dr.mother_ed > 1
# Use np.nan (as for deg_hl_below6 above) rather than None for the missing
# marker, so all derived indicators use the same sentinel.
lsl_dr.loc[lsl_dr.mother_ed.isnull(), 'mother_hs'] = np.nan
Joint commission benchmark
# Distinct values of age at identification (onset_1, in months).
lsl_dr.onset_1.unique()
array([ 1. , 18. , 17. , 22. , nan, 3. , 2. , 5. , 13. , 36. , 28. , 21. , 41. , 26. , 0. , 67. , 20. , 24. , 4. , 40. , 60. , 10. , 6. , 25. , 7. , 27. , 15. , 35. , 14. , 42. , 34. , 12. , 9. , 32. , 50. , 8. , 33. , 23. , 11. , 31. , 30. , 49. , 48. , 1.5, 19. , 2.5, 39. , 52. , 16. , 38. , 29. , 51. , 46. , 45. , 54. , 88. , 65. , 44. , 81. , 116. , 72. , 57. , 62. , 43. , 78. , 83. , 61. , 107. , 64. , 74. , 37. , 77. , 96. , 97. , 79. , 47. , 53. , 59. , 84. , 95. , 80. , 0.5, 58. , 56. , 86. , 98. , 85. , 75. , 119. , 66. , 70. , 63. , 140. , 126. , 133. , 103. , 87. , 76. , 55. , 68. , 92. , 71. , 154. , 89. , 152. ])
# JCIH benchmark: identified by 3 months AND in program by 6 months.
ident_by_3 = lsl_dr.onset_1 <= 3
program_by_6 = lsl_dr.age <= 6
lsl_dr['ident_by_3'] = ident_by_3
lsl_dr['program_by_6'] = program_by_6
lsl_dr['jcih'] = ident_by_3 & program_by_6
# Either component missing -> benchmark status unknown.
missing_component = lsl_dr.onset_1.isnull() | lsl_dr.age.isnull()
lsl_dr.loc[missing_component, 'jcih'] = None
Create age in years variable
# Convert enrollment age from months to years.
lsl_dr['age_years'] = lsl_dr['age'] / 12.0
Retain only 4-year-olds
# Restrict to records tested at age four (48-59 months).
tested_at_four = (lsl_dr.age_test >= 48) & (lsl_dr.age_test < 60)
lsl_dr = lsl_dr[tested_at_four]
lsl_dr.shape
(6873, 81)
# Number of unique students among the retained records.
lsl_dr.study_id.unique().shape
(1613,)
# One row per student (first occurrence kept) for descriptive statistics.
unique_students = lsl_dr.drop_duplicates(subset='study_id')
unique_students.shape
(1613, 81)
# Summary of enrollment age (months) per student.
unique_students.age.describe()
count 1593.000000 mean 29.700565 std 15.676539 min 1.000000 25% 18.000000 50% 32.000000 75% 41.000000 max 68.000000 Name: age, dtype: float64
# Proportion of the male indicator.
unique_students.male.mean()
0.5236612702366127
# Counts of the male indicator (1/0).
unique_students.male.value_counts()
1 841 0 765 dtype: int64
# Summary of the sibling-count variable.
unique_students.sib.describe()
count 1501.000000 mean 1.161226 std 0.928434 min 0.000000 25% 0.000000 50% 1.000000 75% 2.000000 max 3.000000 Name: sib, dtype: float64
# Share of students with profound hearing loss (degree_hl code 6).
(unique_students.degree_hl == 6).describe()
count 1613 mean 0.4575325 std 0.4983478 min False 25% 0 50% 0 75% 1 max True Name: degree_hl, dtype: object
# Missing values in the non-profound hearing-loss indicator.
unique_students.deg_hl_below6.isnull().sum()
220
# Counts of the non-profound hearing-loss indicator.
unique_students.deg_hl_below6.value_counts()
0 738 1 655 dtype: int64
# Share with degree_hl codes strictly between 1 and 6.
((unique_students.degree_hl < 6) & (unique_students.degree_hl > 1)).describe()
count 1613 mean 0.3787973 std 0.485238 min False 25% 0 50% 0 75% 1 max True Name: degree_hl, dtype: object
# Summary of the maternal high-school-completion indicator.
unique_students.mother_hs.describe()
count 1006.000000 mean 0.580517 std 0.493720 min 0.000000 25% 0.000000 50% 1.000000 75% 1.000000 max 1.000000 Name: mother_hs, dtype: float64
# Summary of the syndrome/additional-disability indicator.
unique_students.synd_or_disab.describe()
count 1413.000000 mean 0.213022 std 0.409588 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 1.000000 Name: synd_or_disab, dtype: float64
# Program-by-6-months benchmark split by sex.
unique_students.groupby('male').program_by_6.value_counts()
male 0 False 670 True 95 1 False 750 True 91 dtype: int64
# Program-by-6-months benchmark split by sibling count.
unique_students.groupby('sib').program_by_6.value_counts()
sib 0 False 346 True 38 1 False 562 True 90 2 False 267 True 37 3 False 145 True 16 dtype: int64
# Program-by-6-months benchmark split by hearing-loss severity indicator.
unique_students.groupby('deg_hl_below6').program_by_6.value_counts()
deg_hl_below6 0 False 638 True 100 1 False 574 True 81 dtype: int64
# Program-by-6-months benchmark split by maternal education indicator.
unique_students.groupby('mother_hs').program_by_6.value_counts()
mother_hs 0 False 367 True 55 1 False 489 True 95 dtype: int64
# Program-by-6-months benchmark split by disability status.
unique_students.groupby('synd_or_disab').program_by_6.value_counts()
synd_or_disab 0 False 976 True 136 1 False 273 True 28 dtype: int64
# Program-by-6-months benchmark split by identified-by-3-months status.
unique_students.groupby('ident_by_3').program_by_6.value_counts()
ident_by_3 False False 1013 True 34 True False 414 True 152 dtype: int64
# Program-by-6-months benchmark split by JCIH benchmark status.
unique_students.groupby('jcih').program_by_6.value_counts()
jcih 0 False 928 True 17 1 True 152 dtype: int64
# Overlaid age histograms for JCIH vs non-JCIH students.
grouped_age = unique_students.replace({'jcih':{1:'JCIH', 0:'non-JCIH'}}).groupby('jcih')['age_years']
fig, ax = plt.subplots()
for k, v in grouped_age:
    v.hist(label=k, alpha=.75, ax=ax, grid=False)
ax.legend()
plt.xlabel('Age (years)'); plt.ylabel('Frequency');
# Map each school code to a small integer id for the school random effect.
students_data = lsl_dr.copy()
# NOTE(review): int(s) assumes school codes are numeric and never missing —
# confirm against the data before reuse.
students_data['school_id'] = students_data.school.replace({int(s):i for i,s in enumerate(students_data.school.unique())})
# Restrict to Language-domain test records.
students_data = students_data[students_data.domain=='Language']
students_data.shape
(2594, 82)
# One Language score per student (the last record kept) with its school id
# and key covariates, written out for downstream use.
# Fix: take_last=True was deprecated in pandas 0.17 and removed later;
# keep='last' is the supported spelling (requires pandas >= 0.17).
option_scores = students_data.drop_duplicates('study_id', keep='last')[['score', 'school_id',
'deg_hl_below6', 'mother_hs']].dropna()
option_scores = option_scores.astype(int)
option_scores.to_csv('option_scores.csv', index=False)
Create technology indicators
# Derive unimodal-technology indicators: a record is unimodal when none of
# the bilateral/bimodal flags is set, split by device type.
bilateral_cols = [u'bilateral_ci', u'bilateral_ha', u'bimodal']
unimodal = ~lsl_dr[bilateral_cols].sum(1).astype(bool)
lsl_dr['unimodal_ci'] = unimodal & lsl_dr.cochlear
lsl_dr['unimodal_ha'] = unimodal & lsl_dr.hearing_aid
# Totals per technology category.
tech_cols = bilateral_cols + ['unimodal_ci', 'unimodal_ha']
lsl_dr[tech_cols].sum(0)
bilateral_ci 2136 bilateral_ha 2261 bimodal 791 unimodal_ci 311 unimodal_ha 320 dtype: int64
Since bilateral hearing aid is the most prevalent category, it will be used as the baseline category.
# Keep only records with at least one technology indicator set.
tech_cols = [u'bilateral_ci', u'bilateral_ha', u'bimodal', 'unimodal_ci', 'unimodal_ha']
tech_index = lsl_dr[tech_cols].sum(1).astype(bool)
technology_subset = lsl_dr[tech_index]
technology_subset.score.max()
146.0
# Students with a cochlear implant and a recorded amplification age.
((unique_students.cochlear>0) & (unique_students.age_amp.notnull())).sum()
572
# Filter domain and English-only, and appropriate bilateral subset
# Receptive-language sample: Language domain, receptive test type,
# English-speaking records only.
receptive_language_dataset = technology_subset[(technology_subset.domain=='Language')
                                               & (technology_subset.test_type=='receptive')
                                               & (technology_subset.non_english==False)]
receptive_language_dataset.head()
redcap_event_name academic_year hl male _race prim_lang sib \ 6 initial_assessment_arm_1 2005-2006 0 1 0 0 3 250 initial_assessment_arm_1 2012-2013 0 1 0 0 2 272 initial_assessment_arm_1 2014-2015 0 0 2 0 2 293 initial_assessment_arm_1 2014-2015 0 0 0 0 1 302 initial_assessment_arm_1 2011-2012 0 1 0 0 2 _mother_ed father_ed premature_age ... academic_year_start \ 6 2 2 8 ... 2005 250 4 6 7 ... 2012 272 6 6 8 ... 2014 293 5 6 7 ... 2014 302 6 6 8 ... 2011 deg_hl_below6 int_outside_option mother_hs ident_by_3 program_by_6 \ 6 0 1 0 False False 250 1 1 1 False False 272 1 1 NaN False False 293 1 1 1 False False 302 0 1 NaN False False jcih age_years unimodal_ci unimodal_ha 6 0 4.500000 False False 250 0 4.666667 True False 272 0 3.583333 False False 293 0 4.083333 False False 302 0 4.416667 False False [5 rows x 83 columns]
This is the size of the resulting dataset to be used in this analysis
# Size of the receptive-language analysis sample.
receptive_language_dataset.shape
(941, 83)
# Collapse to one row per student, carrying the student's mean score.
mean_score = receptive_language_dataset.groupby('study_id')['score'].mean()
receptive_language_dataset = (receptive_language_dataset
                              .drop_duplicates('study_id')
                              .set_index('study_id'))
receptive_language_dataset['mean_score'] = mean_score
Specify the criterion to use: one of `jcih`, `ident_by_3`, or `program_by_6`.
# Human-readable labels for each candidate inclusion criterion, and the
# criterion chosen for this analysis.
criterion_labels = dict(jcih='JCIH',
                        ident_by_3='Diagnosed by 3 months',
                        program_by_6='Intervention by 6 months')
criterion = 'jcih'
Proportion of missing values for each variable
# Model covariates plus the chosen criterion; report each one's
# missing-value fraction.
covs = ['age_years', 'school', 'score', 'male', 'time',
        'family_inv', 'sib', 'deg_hl_below6', 'mother_hs', 'synd_or_disab']
covs = covs + [criterion]
(receptive_language_dataset[covs].isnull().sum() / receptive_language_dataset.shape[0]).round(2)
age_years 0.00 school 0.00 score 0.00 male 0.00 time 0.04 family_inv 0.16 sib 0.06 deg_hl_below6 0.01 mother_hs 0.33 synd_or_disab 0.14 jcih 0.20 dtype: float64
Drop individuals with missing age, since they make up less than 1% of the sample.
# Drop rows with missing age (under 1% of the sample).
receptive_language_dataset = receptive_language_dataset[receptive_language_dataset.age.notnull()]
# Distribution of standard scores.
receptive_language_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
# Distribution of maternal-education codes.
receptive_language_dataset.mother_ed.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x10c0d9ac8>
Final analysis dataset size
# Final receptive-language dataset size after exclusions.
receptive_language_dataset.shape
(871, 83)
# Percent missing per covariate, largest first.
missing_pct = (receptive_language_dataset[covs].isnull().mean()*100).round(1)
# Fix: Series.sort() (in-place) is deprecated/removed; use sort_values and
# reassign (requires pandas >= 0.17).
missing_pct = missing_pct.sort_values(ascending=False)
missing_pct
mother_hs 33.3 jcih 19.3 family_inv 15.8 synd_or_disab 14.0 sib 5.5 time 3.6 deg_hl_below6 1.3 male 0.0 score 0.0 school 0.0 age_years 0.0 dtype: float64
# Compare covariate means between rows with and without a recorded
# amplification age, to assess missingness patterns.
receptive_language_dataset['age_amp_missing'] = receptive_language_dataset.age_amp.isnull()
by_age_amp_missing = receptive_language_dataset.groupby('age_amp_missing')
# Fix: select multiple columns from a GroupBy with a list; tuple-style
# selection is deprecated (removed in pandas 1.0).
by_age_amp_missing[['male', 'synd_or_disab', 'deg_hl_below6', 'mother_hs',
                    'non_english']].mean()
male synd_or_disab deg_hl_below6 mother_hs age_amp_missing False 0.510885 0.221106 0.469298 0.616034 True 0.478022 0.250000 0.386364 0.654206
# Compare covariate means between rows with and without JCIH status.
receptive_language_dataset['jcih_missing'] = receptive_language_dataset.jcih.isnull()
by_jcih_missing = receptive_language_dataset.groupby('jcih_missing')
# Fix: select multiple columns from a GroupBy with a list; tuple-style
# selection is deprecated (removed in pandas 1.0).
by_jcih_missing[['male', 'synd_or_disab', 'deg_hl_below6', 'mother_hs',
                 'non_english']].mean()
male synd_or_disab deg_hl_below6 mother_hs jcih_missing False 0.513514 0.219110 0.467811 0.626033 True 0.464286 0.260563 0.385093 0.608247
# Mean enrollment age in years for the analysis sample.
receptive_language_dataset.age_years.mean()
2.2484691924990403
# PyMC 2 model-building imports.
from pymc import Bernoulli, Normal, Uniform, invlogit, logit, deterministic, MCMC, Matplot, MAP
from pymc import Lambda, Dirichlet, Categorical, Beta, NegativeBinomial, Exponential, T
from numpy.ma import masked_values
# Number of fixed-effect coefficients (length of beta below).
n_covs = 11
def generate_model(dataset):
    """Build a PyMC2 normal model for test scores with school random
    intercepts, eleven fixed effects, and Bayesian imputation of four
    partially-observed covariates. Returns locals() so MCMC() can collect
    every stochastic/deterministic node defined here.
    """
    # Extract data to local variables
    age_years = dataset.age_years.values
    school = dataset.school.values
    score = dataset.score.values
    non_english = dataset.non_english.astype(int).values
    male = dataset.male.values
    sib = dataset.sib.values
    synd_or_disab = dataset.synd_or_disab.values
    int_outside_option = dataset.int_outside_option.values
    # NOTE(review): duplicate of the synd_or_disab assignment above.
    synd_or_disab = dataset.synd_or_disab.values
    study_id = dataset.index.values
    ident_by_3 = dataset.ident_by_3
    program_by_6 = dataset.program_by_6
    bilateral_ci = dataset.bilateral_ci
    bimodal = dataset.bimodal
    unimodal_ci = dataset.unimodal_ci
    unimodal_ha = dataset.unimodal_ha
    # Transform some data
    # NOTE(review): age_std is not referenced below — confirm it was meant
    # to enter the predictor list.
    age_std = age_years - age_years.mean()
    # Imputation of family involvement: categorical with a Dirichlet prior;
    # missing entries are filled with the sentinel 0.5 and masked so PyMC
    # samples (imputes) them.
    n_family_inv_cats = len(dataset.family_inv.unique())
    p_family_inv = Dirichlet("p_family_inv", np.ones(n_family_inv_cats),
                             value=[1./n_family_inv_cats]*(n_family_inv_cats-1))
    family_inv_masked = masked_values(dataset.family_inv.fillna(0.5), value=0.5)
    x_family_inv = Categorical('x_family_inv', p_family_inv, value=family_inv_masked,
                               observed=True)
    # Imputation of maternal education (Bernoulli with a flat Beta prior).
    # NOTE(review): masked_values(..., value=np.nan) depends on NaN comparing
    # equal to NaN, which it never does — confirm the missing entries are
    # actually masked here (contrast with the fillna-sentinel pattern above).
    p_mother_hs = Beta("p_mother_hs", 1, 1, value=0.5)
    mother_hs_masked = masked_values(dataset.mother_hs.values, value=np.nan)
    x_mother_hs = Bernoulli('x_mother_hs', p_mother_hs, value=mother_hs_masked, observed=True)
    # Imputation of previous disability (same NaN-masking caveat as above).
    p_synd_or_disab = Beta("p_synd_or_disab", 1, 1, value=0.5)
    synd_or_disab_masked = masked_values(dataset.synd_or_disab.values, value=np.nan)
    x_synd_or_disab = Bernoulli('x_synd_or_disab', p_synd_or_disab, value=synd_or_disab_masked,
                                observed=True)
    # Imputation of siblings (categorical; missing filled with 0.5 and masked).
    n_sib_cats = len(dataset.sib.unique())
    p_sib = Dirichlet("p_sib", np.ones(n_sib_cats), value=[1./n_sib_cats]*(n_sib_cats-1))
    sib_masked = masked_values(dataset.sib.fillna(0.5), value=0.5)
    x_sib = Categorical('x_sib', p_sib, value=sib_masked, observed=True)
    # Grand-mean score with a vague normal prior.
    mean_score = Normal("mean_score", 100, 0.001, value=80)
    # Indices to school random effects
    unique_schools = np.unique(school)
    school_index = [list(unique_schools).index(s) for s in school]
    # School random effect
    sigma_school = Uniform("sigma_school", 0, 1000, value=10)
    tau_school = sigma_school**-2
    alpha_school = Normal("alpha_school", mu=0, tau=tau_school,
                          value=np.zeros(len(unique_schools)))
    @deterministic
    def intercept(a0=mean_score, a1=alpha_school):
        """Random intercepts"""
        return a0 + a1[school_index]
    # Fixed effects
    beta = Normal("beta", 0, 0.001, value=[0]*n_covs)
    @deterministic
    def theta(b0=intercept, b=beta, x_family_inv=x_family_inv, x_sib=x_sib,
              x_mother_hs=x_mother_hs, x_synd_or_disab=x_synd_or_disab):
        # Complete predictors
        X = [male, ident_by_3, program_by_6, bilateral_ci, bimodal, unimodal_ci, unimodal_ha]
        # Partially-imputed predictors
        X += [x_family_inv, x_sib, x_mother_hs, x_synd_or_disab]
        return b0 + np.dot(b, X) #+ a[child_index]
    # Residual scale and precision.
    sigma = Uniform("sigma", 0, 1000, value=10)
    tau = sigma**-2
    # Data likelihood
    score_like = Normal("score_like", mu=theta, tau=tau, value=score, observed=True)
    #score_like_pred = Normal("score_like_pred", mu=theta, tau=tau, value=score)
    return locals()
# Fit the receptive-language model, storing the trace in SQLite.
M_reclang = MCMC(generate_model(receptive_language_dataset),
                 db='sqlite', name='reclang', dbmode='w')
# 100k iterations with 90k burn-in -> 10k retained samples.
iterations = 100000
burn = 90000
M_reclang.sample(iterations, burn)
[-----------------100%-----------------] 100000 of 100000 complete in 1195.6 sec
# Display labels for the fixed-effect coefficients.
# NOTE(review): this list has 12 entries but beta has n_covs == 11
# coefficients ('Non-profound Hearing Loss' has no matching predictor in
# theta's X) — confirm the label/coefficient alignment in the plots.
labels = ['Male', 'Identified by 3 mo.',
          'In Program by 6 mo.',
          'Bilateral CI', 'Bimodal', 'Unimodal CI', 'Unimodal HA',
          'Reduced Family Involvement',
          'Sibling Count', 'Non-profound Hearing Loss',
          'Mother with HS Diploma','Additional Disability']
# criterion_labels[criterion]]
# Forest plot and text summary of the fixed-effect posteriors.
Matplot.summary_plot([M_reclang.beta], rhat=False,
                     custom_labels=labels, x_range=(-15,15), main='Receptive Language')
M_reclang.beta.summary()
beta: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ -0.014 1.153 0.077 [-2.202 2.113] 4.213 1.211 0.087 [ 1.945 6.66 ] 7.079 1.719 0.141 [ 3.793 10.512] -8.003 1.359 0.109 [-10.81 -5.494] -6.393 2.042 0.179 [-10.309 -2.882] -14.416 3.097 0.286 [-19.959 -8.341] 5.658 2.812 0.26 [ 0.512 11.589] -5.67 0.538 0.03 [-6.697 -4.584] -0.513 0.613 0.033 [-1.719 0.659] 5.077 1.368 0.109 [ 2.226 7.522] -6.634 1.231 0.083 [-9.046 -4.23 ] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| -2.172 -0.861 -0.023 0.836 2.18 2.023 3.355 4.142 5.015 6.779 3.7 5.866 7.092 8.216 10.464 -10.652 -8.854 -7.942 -7.134 -5.244 -9.999 -7.739 -6.349 -5.291 -2.133 -20.392 -16.583 -14.364 -12.336 -8.494 0.461 3.575 5.635 7.481 11.589 -6.756 -6.031 -5.669 -5.296 -4.631 -1.702 -0.914 -0.513 -0.102 0.726 2.454 4.112 5.03 5.943 7.884 -9.211 -7.44 -6.597 -5.8 -4.343
# Expressive-language sample: Language domain, expressive tests, English-only.
expressive_language_dataset = technology_subset[(technology_subset.domain=='Language') & (technology_subset.test_type=='expressive') &
                                                (technology_subset.non_english==False)]
expressive_language_dataset.head()
redcap_event_name academic_year hl male _race prim_lang sib \ 7 initial_assessment_arm_1 2005-2006 0 1 0 0 3 251 initial_assessment_arm_1 2012-2013 0 1 0 0 2 273 initial_assessment_arm_1 2014-2015 0 0 2 0 2 294 initial_assessment_arm_1 2014-2015 0 0 0 0 1 303 initial_assessment_arm_1 2011-2012 0 1 0 0 2 _mother_ed father_ed premature_age ... academic_year_start \ 7 2 2 8 ... 2005 251 4 6 7 ... 2012 273 6 6 8 ... 2014 294 5 6 7 ... 2014 303 6 6 8 ... 2011 deg_hl_below6 int_outside_option mother_hs ident_by_3 program_by_6 \ 7 0 1 0 False False 251 1 1 1 False False 273 1 1 NaN False False 294 1 1 1 False False 303 0 1 NaN False False jcih age_years unimodal_ci unimodal_ha 7 0 4.500000 False False 251 0 4.666667 True False 273 0 3.583333 False False 294 0 4.083333 False False 303 0 4.416667 False False [5 rows x 83 columns]
# Collapse to one row per student with the student's mean score, then
# report the missing-value fraction per covariate.
mean_score = expressive_language_dataset.groupby('study_id')['score'].mean()
expressive_language_dataset = expressive_language_dataset.drop_duplicates('study_id')
expressive_language_dataset = expressive_language_dataset.set_index('study_id')
expressive_language_dataset['mean_score'] = mean_score
(expressive_language_dataset[covs].isnull().sum() / expressive_language_dataset.shape[0]).round(2)
age_years 0.00 school 0.00 score 0.00 male 0.00 time 0.04 family_inv 0.16 sib 0.06 deg_hl_below6 0.01 mother_hs 0.34 synd_or_disab 0.15 jcih 0.20 dtype: float64
# Drop rows missing age or the intervention-location indicator, then
# plot the score distribution.
expressive_language_dataset = expressive_language_dataset[expressive_language_dataset.age.notnull()
                                                          & expressive_language_dataset.int_outside_option.notnull()]
expressive_language_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
# Fit the expressive-language model with the same sampler settings.
M_explang = MCMC(generate_model(expressive_language_dataset), db='sqlite', name='explang', dbmode='w')
M_explang.sample(iterations, burn)
[-----------------100%-----------------] 100000 of 100000 complete in 1161.6 sec
# Text summary of the expressive-language fixed-effect posteriors.
M_explang.beta.summary()
beta: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ -0.608 1.162 0.079 [-2.866 1.539] 6.014 1.234 0.094 [ 3.551 8.321] 7.239 1.917 0.161 [ 2.9 10.575] -10.202 1.172 0.081 [-12.343 -7.766] -7.481 1.778 0.148 [-10.897 -4.018] -15.643 2.393 0.214 [-20.17 -11.248] 4.27 2.725 0.252 [-1.022 9.669] -4.916 0.596 0.035 [-6.123 -3.857] -0.634 0.643 0.036 [-1.86 0.603] 6.517 1.323 0.102 [ 4.018 9.17 ] -6.32 1.251 0.088 [-8.696 -3.707] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| -2.76 -1.45 -0.615 0.184 1.713 3.51 5.197 6.054 6.853 8.295 3.238 5.981 7.282 8.577 10.96 -12.577 -11.039 -10.232 -9.396 -7.899 -10.895 -8.753 -7.463 -6.288 -3.999 -20.448 -17.373 -15.479 -13.926 -11.352 -0.93 2.321 4.061 6.167 9.799 -6.09 -5.328 -4.932 -4.505 -3.77 -1.845 -1.078 -0.645 -0.184 0.652 4.082 5.683 6.403 7.402 9.273 -8.829 -7.166 -6.327 -5.518 -3.79
# Forest plot of the expressive-language fixed effects.
Matplot.summary_plot([M_explang.beta], custom_labels=labels, rhat=False, main='Expressive Language', x_range=(-15,15))
# Articulation sample: Articulation domain, English-only.
articulation_dataset = technology_subset[(technology_subset.domain=='Articulation') &
                                         (technology_subset.non_english==False)]
articulation_dataset.head()
redcap_event_name academic_year hl male _race prim_lang sib \ 4 initial_assessment_arm_1 2005-2006 0 1 0 0 3 269 initial_assessment_arm_1 2014-2015 0 0 2 0 2 290 initial_assessment_arm_1 2014-2015 0 0 0 0 1 371 initial_assessment_arm_1 2014-2015 0 0 0 0 1 504 initial_assessment_arm_1 2014-2015 0 1 0 0 0 _mother_ed father_ed premature_age ... academic_year_start \ 4 2 2 8 ... 2005 269 6 6 8 ... 2014 290 5 6 7 ... 2014 371 6 6 8 ... 2014 504 4 4 8 ... 2014 deg_hl_below6 int_outside_option mother_hs ident_by_3 program_by_6 \ 4 0 1 0 False False 269 1 1 NaN False False 290 1 1 1 False False 371 0 1 NaN True False 504 1 0 1 False False jcih age_years unimodal_ci unimodal_ha 4 0 4.500000 False False 269 0 3.583333 False False 290 0 4.083333 False False 371 0 3.916667 False False 504 0 4.500000 False False [5 rows x 83 columns]
# Collapse to one row per student with the student's mean score, then
# report the missing-value fraction per covariate.
mean_score = articulation_dataset.groupby('study_id')['score'].mean()
articulation_dataset = articulation_dataset.drop_duplicates('study_id')
articulation_dataset = articulation_dataset.set_index('study_id')
articulation_dataset['mean_score'] = mean_score
(articulation_dataset[covs].isnull().sum() / articulation_dataset.shape[0]).round(2)
age_years 0.00 school 0.00 score 0.00 male 0.00 time 0.04 family_inv 0.17 sib 0.05 deg_hl_below6 0.01 mother_hs 0.32 synd_or_disab 0.14 jcih 0.21 dtype: float64
# Drop rows missing age or the intervention-location indicator, then
# plot the score distribution.
articulation_dataset = articulation_dataset[articulation_dataset.age.notnull()
                                            & articulation_dataset.int_outside_option.notnull()]
articulation_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
# Fit the articulation model with the same sampler settings.
M_articulation = MCMC(generate_model(articulation_dataset), db='sqlite', name='articulation', dbmode='w')
M_articulation.sample(iterations, burn)
[-----------------100%-----------------] 100000 of 100000 complete in 1115.2 sec
# Forest plot and text summary of the articulation fixed effects.
Matplot.summary_plot([M_articulation.beta],
                     custom_labels=labels, rhat=False, main='Articulation', x_range=(-15,15))
M_articulation.beta.summary()
beta: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 1.025 1.393 0.099 [-1.822 3.482] 6.268 1.601 0.125 [ 3.039 9.445] 4.507 2.406 0.211 [-0.181 9.25 ] -5.6 1.654 0.132 [-8.819 -2.345] -3.846 2.327 0.205 [-8.176 0.582] -7.661 3.711 0.349 [-15.473 -1.608] 3.64 2.489 0.217 [-1.082 9.486] -3.331 0.614 0.034 [-4.58 -2.136] 0.007 0.714 0.039 [-1.352 1.393] 5.354 1.584 0.124 [ 2.196 8.587] -7.866 1.433 0.103 [-10.788 -5.272] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| -1.798 0.111 1.118 1.982 3.537 2.927 5.227 6.344 7.344 9.399 -0.165 2.872 4.555 6.079 9.415 -8.918 -6.669 -5.574 -4.553 -2.411 -8.119 -5.561 -3.958 -2.179 0.696 -15.325 -10.014 -7.205 -4.965 -1.325 -1.852 2.085 3.707 5.221 8.908 -4.549 -3.748 -3.339 -2.93 -2.1 -1.391 -0.483 0.023 0.495 1.365 2.12 4.372 5.339 6.402 8.558 -10.712 -8.899 -7.858 -6.919 -5.118
# Expressive-vocabulary sample: Expressive Vocabulary domain, English-only.
expressive_vocab_dataset = technology_subset[(technology_subset.domain=='Expressive Vocabulary') &
                                             (technology_subset.non_english==False)]
expressive_vocab_dataset.head()
redcap_event_name academic_year hl male _race prim_lang sib \ 5 initial_assessment_arm_1 2005-2006 0 1 0 0 3 19 initial_assessment_arm_1 2007-2008 0 0 0 0 3 246 initial_assessment_arm_1 2012-2013 0 0 0 0 2 270 initial_assessment_arm_1 2014-2015 0 0 2 0 2 291 initial_assessment_arm_1 2014-2015 0 0 0 0 1 _mother_ed father_ed premature_age ... academic_year_start \ 5 2 2 8 ... 2005 19 4 4 8 ... 2007 246 6 6 8 ... 2012 270 6 6 8 ... 2014 291 5 6 7 ... 2014 deg_hl_below6 int_outside_option mother_hs ident_by_3 program_by_6 \ 5 0 1 0 False False 19 0 1 1 False False 246 1 1 NaN False False 270 1 1 NaN False False 291 1 1 1 False False jcih age_years unimodal_ci unimodal_ha 5 0 4.500000 False False 19 0 4.583333 False False 246 0 4.000000 False False 270 0 3.583333 False False 291 0 4.083333 False False [5 rows x 83 columns]
# Collapse to one row per student with the student's mean score, then
# report missing-value counts per covariate.
mean_score = expressive_vocab_dataset.groupby('study_id')['score'].mean()
expressive_vocab_dataset = expressive_vocab_dataset.drop_duplicates('study_id')
expressive_vocab_dataset = expressive_vocab_dataset.set_index('study_id')
expressive_vocab_dataset['mean_score'] = mean_score
expressive_vocab_dataset[covs].isnull().sum()
age_years 4 school 0 score 0 male 0 time 40 family_inv 153 sib 49 deg_hl_below6 8 mother_hs 312 synd_or_disab 124 jcih 200 dtype: int64
# Drop rows missing age, sex, or the intervention-location indicator,
# then plot the score distribution.
expressive_vocab_dataset = expressive_vocab_dataset[expressive_vocab_dataset.age.notnull()
                                                    & expressive_vocab_dataset.male.notnull()
                                                    & expressive_vocab_dataset.int_outside_option.notnull()]
expressive_vocab_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
# Fit the expressive-vocabulary model with the same sampler settings.
M_expressive_vocab = MCMC(generate_model(expressive_vocab_dataset), db='sqlite', name='expressive_vocab', dbmode='w')
M_expressive_vocab.sample(iterations, burn)
[-----------------100%-----------------] 100000 of 100000 complete in 1208.7 sec
# Forest plots (two axis ranges) and text summary of the fixed effects.
Matplot.summary_plot([M_expressive_vocab.beta],
                     custom_labels=labels, rhat=False, main='Expressive Vocabulary', x_range=(-15,15))
Matplot.summary_plot([M_expressive_vocab.beta],
                     custom_labels=labels, rhat=False, main='Expressive Vocabulary', x_range=(-10,10))
M_expressive_vocab.beta.summary()
beta: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.975 1.16 0.08 [-1.414 3.056] 6.812 1.28 0.098 [ 4.052 9.213] 5.863 2.085 0.177 [ 1.974 10.077] -5.692 1.275 0.094 [-8.147 -3.308] -5.099 1.806 0.148 [-8.481 -1.372] -15.035 3.036 0.282 [-21.84 -9.861] 6.66 2.365 0.209 [ 2.757 11.604] -5.144 0.532 0.029 [-6.177 -4.129] -1.395 0.684 0.045 [-2.696 -0.007] 6.207 1.391 0.111 [ 3.567 8.881] -5.2 1.222 0.093 [-7.604 -2.894] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| -1.268 0.213 0.98 1.75 3.277 4.152 6.004 6.826 7.654 9.359 1.191 4.751 5.903 7.196 9.834 -8.101 -6.568 -5.659 -4.84 -3.217 -8.628 -6.282 -5.102 -3.921 -1.378 -21.495 -16.902 -14.941 -12.865 -9.248 2.254 5.042 6.594 8.32 11.314 -6.209 -5.481 -5.156 -4.793 -4.141 -2.719 -1.868 -1.406 -0.943 -0.013 3.658 5.193 6.143 7.181 9.14 -7.714 -6.039 -5.138 -4.33 -2.985
# School random-effect posteriors from the expressive-language model
# (blank labels suppress per-school names).
Matplot.summary_plot(M_explang.alpha_school, custom_labels=[' ']*42, main='Expressive language school effects',
                     x_range=(-25, 25))
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
# Receptive-vocabulary sample: Receptive Vocabulary domain, English-only.
receptive_vocab_dataset = technology_subset[(technology_subset.domain=='Receptive Vocabulary') &
                                            (technology_subset.non_english==False)]
receptive_vocab_dataset.head()
redcap_event_name academic_year hl male _race prim_lang sib \ 20 initial_assessment_arm_1 2007-2008 0 0 0 0 3 82 initial_assessment_arm_1 2009-2010 0 0 0 0 2 247 initial_assessment_arm_1 2012-2013 0 0 0 0 2 271 initial_assessment_arm_1 2014-2015 0 0 2 0 2 292 initial_assessment_arm_1 2014-2015 0 0 0 0 1 _mother_ed father_ed premature_age ... academic_year_start \ 20 4 4 8 ... 2007 82 4 4 8 ... 2009 247 6 6 8 ... 2012 271 6 6 8 ... 2014 292 5 6 7 ... 2014 deg_hl_below6 int_outside_option mother_hs ident_by_3 program_by_6 \ 20 0 1 1 False False 82 0 1 1 False False 247 1 1 NaN False False 271 1 1 NaN False False 292 1 1 1 False False jcih age_years unimodal_ci unimodal_ha 20 0 4.583333 False False 82 0 3.833333 False False 247 0 4.000000 False False 271 0 3.583333 False False 292 0 4.083333 False False [5 rows x 83 columns]
# Collapse to one row per student with the student's mean score, then
# report missing-value counts per covariate.
mean_score = receptive_vocab_dataset.groupby('study_id')['score'].mean()
receptive_vocab_dataset = receptive_vocab_dataset.drop_duplicates('study_id')
receptive_vocab_dataset = receptive_vocab_dataset.set_index('study_id')
receptive_vocab_dataset['mean_score'] = mean_score
receptive_vocab_dataset[covs].isnull().sum()
age_years 4 school 0 score 0 male 0 time 37 family_inv 158 sib 49 deg_hl_below6 10 mother_hs 318 synd_or_disab 125 jcih 202 dtype: int64
# Drop rows missing age, sex, or the intervention-location indicator,
# then plot the intervention-age distribution.
receptive_vocab_dataset = receptive_vocab_dataset[receptive_vocab_dataset.age.notnull() &
                                                  receptive_vocab_dataset.male.notnull() &
                                                  receptive_vocab_dataset.int_outside_option.notnull()]
receptive_vocab_dataset.age_int.hist(bins=40)
<matplotlib.axes._subplots.AxesSubplot at 0x108826da0>
# Final receptive-vocabulary sample size.
receptive_vocab_dataset.shape
(922, 83)
# Distribution of receptive-vocabulary standard scores.
receptive_vocab_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
# Fit the receptive-vocabulary model with the same sampler settings.
M_receptive_vocab = MCMC(generate_model(receptive_vocab_dataset), db='sqlite', name='receptive_vocab', dbmode='w')
M_receptive_vocab.sample(iterations, burn)
[-----------------100%-----------------] 100000 of 100000 complete in 1169.1 sec
#order = np.argsort(M.beta.stats()['mean'])[::-1]
# Forest plot and text summary of the receptive-vocabulary fixed effects.
Matplot.summary_plot([M_receptive_vocab.beta], rhat=False,
                     custom_labels=labels, main='Receptive Vocabulary', x_range=(-15,15))
M_receptive_vocab.beta.summary()
beta: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.385 1.001 0.06 [-1.625 2.257] 4.984 1.173 0.087 [ 2.65 7.07] 5.367 1.732 0.145 [ 1.841 8.586] -5.109 1.221 0.095 [-7.366 -2.551] -6.193 1.888 0.166 [-9.57 -2.332] -14.416 2.514 0.232 [-18.62 -9.371] 6.853 2.164 0.194 [ 2.9 11.565] -4.353 0.525 0.031 [-5.261 -3.238] -1.132 0.564 0.031 [-2.225 -0.051] 5.11 1.187 0.088 [ 2.992 7.523] -5.47 1.163 0.086 [-7.699 -2.975] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| -1.611 -0.278 0.38 1.033 2.308 2.806 4.194 4.921 5.804 7.326 1.725 4.307 5.374 6.584 8.488 -7.774 -5.886 -5.104 -4.253 -2.817 -9.926 -7.507 -6.054 -4.984 -2.559 -18.693 -16.396 -14.435 -12.781 -9.379 2.521 5.466 6.854 8.135 11.346 -5.348 -4.725 -4.348 -4.007 -3.295 -2.252 -1.522 -1.125 -0.751 -0.052 2.763 4.226 5.077 5.989 7.354 -7.777 -6.235 -5.483 -4.715 -3.03
import sqlite3
# Trace databases written by the MCMC runs above (IPython shell magic).
db_files = !ls *sqlite
# Read the sigma_school trace (chain 0) from the articulation run and
# report its posterior mean.
articulation = sqlite3.connect("articulation.sqlite")
c_artic = articulation.cursor()
c_artic.execute('SELECT v1 FROM sigma_school WHERE trace==0')
sigma_school_artic = np.ravel(c_artic.fetchall())
sigma_school_artic.mean()
5.7923510709000006
# Posterior median and central 95% interval of sigma_school for each model.
for dbname in db_files:
    db = sqlite3.connect(dbname)
    cur = db.cursor()
    cur.execute('SELECT v1 FROM sigma_school WHERE trace==0')
    trace = np.ravel(cur.fetchall())
    trace.sort()
    print('\n'+dbname)
    print(np.median(trace))
    # 2.5th / 97.5th percentiles via order statistics of the sorted trace.
    print(trace[[int(len(trace)*0.025), int(len(trace)*0.975)]])
articulation.sqlite 5.739025 [ 3.090618 8.920649] explang.sqlite 4.945318 [ 3.390639 7.225201] expressive_vocab.sqlite 8.008659 [ 6.058134 10.607475] receptive_vocab.sqlite 6.532987 [ 4.738706 9.103903] reclang.sqlite 5.1217285 [ 3.251533 7.165217]