All studies with successful outcomes reported for early-identified children who are deaf or hard of hearing have intervention provided by specialists who are trained in parent-infant intervention services.[12,90,97] Early intervention programs should develop mechanisms to ensure that early intervention professionals have special skills necessary for providing families with the highest quality of service specific to children with hearing loss. Professionals with a background in deaf education, audiology, and speech-language pathology will typically have the skills needed for providing intervention services. Professionals should be highly qualified in their respective fields and should be skilled communicators who are knowledgeable and sensitive to the importance of enhancing families' strengths and supporting their priorities. When early intervention professionals have knowledge of the principles of adult learning, it increases their success with parents and other professionals.
Providers must be competent to provide services (a Joint Commission benchmark).
References:
The purpose of this study is to examine the relationship between highly qualified providers, age of enrollment in intervention, and speech and language outcomes at 4 years of age in a group of children enrolled in school programs that specialize in the development of listening and spoken language.
# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Render DataFrames as plain text (not HTML) and widen the display limits
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 100)
Import data
# Load the LSL-DR extract; the first CSV column is used as the row index
lsl_dr = pd.read_csv('lsl_dr.csv', index_col=0)
/usr/local/lib/python3.4/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (31,33,41) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows)
lsl_dr.head()
redcap_event_name academic_year hl male _race prim_lang sib \ 0 initial_assessment_arm_1 2005-2006 0 0 0 0 0 1 initial_assessment_arm_1 2007-2008 0 0 0 0 2 2 initial_assessment_arm_1 NaN 0 NaN NaN NaN NaN 3 initial_assessment_arm_1 2014 0 NaN NaN NaN NaN 4 initial_assessment_arm_1 2008-2009 0 0 0 0 1 _mother_ed father_ed premature_age ... known_synd \ 0 6 6 8 ... 0 1 6 6 2 ... NaN 2 NaN NaN NaN ... 0 3 NaN NaN NaN ... 0 4 2 4 9 ... NaN synd_or_disab race age_test domain school score test_name \ 0 1 0 NaN NaN NaN NaN NaN 1 NaN 0 NaN NaN NaN NaN NaN 2 0 NaN NaN NaN NaN NaN NaN 3 0 NaN NaN NaN NaN NaN NaN 4 NaN 0 62 Articulation 522 40 NaN test_type academic_year_start 0 NaN 2005 1 NaN 2007 2 NaN NaN 3 NaN 2014 4 Goldman 2008 [5 rows x 73 columns]
Indicator for non-profound hearing loss
# Indicator for a degree of hearing loss below profound (degree_hl < 6);
# rows with an unknown degree are set to missing rather than False.
below_profound = lsl_dr.degree_hl < 6
lsl_dr['deg_hl_below6'] = below_profound
degree_unknown = lsl_dr.degree_hl.isnull()
lsl_dr.loc[degree_unknown, 'deg_hl_below6'] = np.nan
Indicator for first intervention outside OPTION
# Indicator that the first intervention occurred outside an OPTION program:
# enrollment age greater than age at first intervention.
lsl_dr['int_outside_option'] = lsl_dr.age > lsl_dr.age_int
lsl_dr.loc[lsl_dr.age < lsl_dr.age_int, 'int_outside_option'] = np.nan
# Fix: NaN comparisons evaluate to False, so rows missing either age would
# silently become False. Propagate missingness explicitly, mirroring the
# deg_hl_below6 and mother_hs handling elsewhere in this notebook.
lsl_dr.loc[lsl_dr.age.isnull() | lsl_dr.age_int.isnull(), 'int_outside_option'] = np.nan
Indicator for high school graduation of mother
# Indicator for high school graduation of mother (mother_ed > 1)
lsl_dr['mother_hs'] = lsl_dr.mother_ed > 1
# Use np.nan (not None) for missing, consistent with deg_hl_below6 above;
# assigning None into a boolean column forces an object-dtype upcast.
lsl_dr.loc[lsl_dr.mother_ed.isnull(), 'mother_hs'] = np.nan
Joint commission benchmark
lsl_dr.onset_1.unique()
array([ 21. , 0. , nan, 25. , 1. , 3. , 12. , 24. , 7. , 46. , 9. , 2. , 15. , 17. , 36. , 18. , 28. , 10. , 48. , 29. , 23. , 30. , 6. , 4. , 20. , 19. , 11. , 8. , 14. , 31. , 16. , 27. , 33. , 22. , 35. , 5. , 13. , 26. , 60. , 32. , 44. , 40. , 1.5, 52. , 43. , 38. , 34. , 54. , 41. , 50. , 51. , 116. , 72. , 42. , 57. , 65. , 78. , 56. , 59. , 83. , 61. , 107. , 64. , 74. , 37. , 77. , 62. , 119. , 88. , 84. , 0.5, 66. , 80. , 58. , 53. , 47. , 81. , 39. , 63. , 140. , 45. , 67. , 49. , 2.5, 86. , 126. , 85. , 133. , 103. , 98. , 87. , 76. , 68. , 92. , 97. , 71. , 55. , 70. , 75. , 152. , 89. , 154. ])
# JCIH component indicators: identified by 3 months, in program by 6 months
ident_by_3 = lsl_dr.onset_1 <= 3
program_by_6 = lsl_dr.age <= 6
lsl_dr['ident_by_3'] = ident_by_3
lsl_dr['program_by_6'] = program_by_6
# A child meets the JCIH benchmark only if both components hold
lsl_dr['jcih'] = ident_by_3 & program_by_6
missing_either = lsl_dr.onset_1.isnull() | lsl_dr.age.isnull()
lsl_dr.loc[missing_either, 'jcih'] = None
Create age in years variable
# Convert enrollment age from months to years
lsl_dr['age_years'] = lsl_dr['age'] / 12.0
Retain only 4-year-olds
# Restrict the sample to tests administered between 48 and 59 months of age
four_year_olds = (lsl_dr.age_test >= 48) & (lsl_dr.age_test < 60)
lsl_dr = lsl_dr[four_year_olds]
lsl_dr.shape
(5847, 80)
lsl_dr.study_id.unique().shape
(1369,)
# One row per child: keep the first record for each study_id
unique_students = lsl_dr.drop_duplicates(subset='study_id')
unique_students.shape
(1369, 80)
unique_students.age.describe()
count 1351.00000 mean 29.73131 std 15.62684 min 1.00000 25% 18.00000 50% 32.00000 75% 40.50000 max 68.00000 Name: age, dtype: float64
unique_students.male.mean()
0.52126099706744866
unique_students.sib.describe()
count 1261.000000 mean 1.175258 std 0.930211 min 0.000000 25% 1.000000 50% 1.000000 75% 2.000000 max 3.000000 Name: sib, dtype: float64
(unique_students.degree_hl == 6).describe()
count 1369 mean 0.4704164 std 0.4993064 min False 25% 0 50% 0 75% 1 max True Name: degree_hl, dtype: object
((unique_students.degree_hl < 6) & (unique_students.degree_hl > 1)).describe()
count 1369 mean 0.3798393 std 0.485524 min False 25% 0 50% 0 75% 1 max True Name: degree_hl, dtype: object
unique_students.mother_hs.describe()
count 846.000000 mean 0.569740 std 0.495405 min 0.000000 25% 0.000000 50% 1.000000 75% 1.000000 max 1.000000 Name: mother_hs, dtype: float64
unique_students.synd_or_disab.describe()
count 1207.000000 mean 0.218724 std 0.413552 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 1.000000 Name: synd_or_disab, dtype: float64
# Compare age-at-enrollment distributions for JCIH vs non-JCIH children
grouped_age = (unique_students
               .replace({'jcih': {1: 'JCIH', 0: 'non-JCIH'}})
               .groupby('jcih')['age_years'])
fig, ax = plt.subplots()
for group_label, ages in grouped_age:
    ages.hist(label=group_label, alpha=0.75, ax=ax, grid=False)
ax.legend()
plt.xlabel('Age (years)'); plt.ylabel('Frequency');
# Filter domain and English-only, and appropriate bilateral subset
receptive_language_dataset = lsl_dr[
    (lsl_dr.domain == 'Language')
    & (lsl_dr.test_type == 'receptive')
    & (lsl_dr.non_english == False)
]
receptive_language_dataset.head()
redcap_event_name academic_year hl male _race prim_lang sib \ 24 initial_assessment_arm_1 2007-2008 0 0 0 0 0 77 initial_assessment_arm_1 2010-2011 0 1 0 0 0 126 initial_assessment_arm_1 2011-2012 0 1 0 0 2 139 initial_assessment_arm_1 2011-2012 0 1 0 0 0 156 initial_assessment_arm_1 2011-2012 0 1 1 0 2 _mother_ed father_ed premature_age ... test_name test_type \ 24 6 6 8 ... PLS receptive 77 3 1 8 ... PLS receptive 126 6 6 8 ... CELF-P2 receptive 139 6 6 3 ... PLS receptive 156 0 6 8 ... PLS receptive academic_year_start deg_hl_below6 int_outside_option mother_hs \ 24 2007 0 1 NaN 77 2010 0 1 0 126 2011 0 1 NaN 139 2011 0 1 NaN 156 2011 0 1 0 ident_by_3 program_by_6 jcih age_years 24 False False 0 2.250000 77 False False 0 4.166667 126 True False 0 4.166667 139 False False 0 3.666667 156 True False 0 3.333333 [5 rows x 80 columns]
This is the size of the resulting dataset to be used in this analysis
receptive_language_dataset.shape
(901, 80)
# Average each child's scores, then reduce to one row per child keyed by study_id
mean_score = receptive_language_dataset.groupby('study_id')['score'].mean()
receptive_language_dataset = (receptive_language_dataset
                              .drop_duplicates('study_id')
                              .set_index('study_id'))
receptive_language_dataset['mean_score'] = mean_score
Specify criterion to use: `jcih`, `ident_by_3`, or `program_by_6`.
# Human-readable labels for each candidate inclusion criterion
criterion_labels = {'jcih': 'JCIH',
                    'ident_by_3': 'Diagnosed by 3 months',
                    'program_by_6': 'Intervention by 6 months'}
# Criterion used in the models below
criterion = 'jcih'
Proportion of missing values for each variable
# Model covariates, plus the chosen enrollment criterion
covs = ['age_years', 'school', 'score', 'male', 'time',
        'family_inv', 'sib', 'deg_hl_below6', 'mother_hs', 'synd_or_disab'] + [criterion]
# Proportion of missing values for each variable
(receptive_language_dataset[covs].isnull().sum() / receptive_language_dataset.shape[0]).round(2)
age_years 0.01 school 0.00 score 0.00 male 0.00 time 0.06 family_inv 0.19 sib 0.07 deg_hl_below6 0.05 mother_hs 0.32 synd_or_disab 0.13 jcih 0.27 dtype: float64
Drop individuals with missing age, since they make up less than 1% of the sample.
# Keep only rows with a recorded age (<1% are missing)
receptive_language_dataset = receptive_language_dataset[receptive_language_dataset.age.notnull()]
# Distribution of the outcome (standard scores) and of maternal education
receptive_language_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
receptive_language_dataset.mother_ed.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x10d35d320>
Final analysis dataset size
receptive_language_dataset.shape
(823, 80)
# Percentage of missing values per covariate, largest first
missing_pct = (receptive_language_dataset[covs].isnull().mean()*100).round(1)
# Series.sort() was deprecated and later removed from pandas; sort_values
# returns the sorted Series rather than mutating in place.
missing_pct = missing_pct.sort_values(ascending=False)
missing_pct
mother_hs 32.1 jcih 27.0 family_inv 19.4 synd_or_disab 13.1 sib 6.3 time 5.7 deg_hl_below6 4.6 male 0.0 score 0.0 school 0.0 age_years 0.0 dtype: float64
# Do children missing age_amp differ systematically on key covariates?
receptive_language_dataset['age_amp_missing'] = receptive_language_dataset.age_amp.isnull()
by_age_amp_missing = receptive_language_dataset.groupby('age_amp_missing')
# Select columns with a list: tuple-style groupby indexing is deprecated
# (and later removed) in pandas.
by_age_amp_missing[['male', 'synd_or_disab', 'deg_hl_below6', 'mother_hs',
                    'non_english']].mean()
male synd_or_disab deg_hl_below6 mother_hs age_amp_missing False 0.511986 0.238748 0.483304 0.600985 True 0.514644 0.308824 0.458333 0.607843
# Best-effort boxplots of covariates by age_amp missingness; columns that
# cannot be plotted are skipped.
for col in ['age', 'score', 'degree_hl', 'family_inv', 'age_int']:
    try:
        _ = by_age_amp_missing.boxplot(column=col)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate
        pass
/usr/local/lib/python3.4/site-packages/pandas/tools/plotting.py:2625: FutureWarning: The default value for 'return_type' will change to 'axes' in a future release. To use the future behavior now, set return_type='axes'. To keep the previous behavior and silence this warning, set return_type='dict'. warnings.warn(msg, FutureWarning)
# Do children missing the JCIH indicator differ systematically on key covariates?
receptive_language_dataset['jcih_missing'] = receptive_language_dataset.jcih.isnull()
by_jcih_missing = receptive_language_dataset.groupby('jcih_missing')
# Select columns with a list: tuple-style groupby indexing is deprecated
# (and later removed) in pandas.
by_jcih_missing[['male', 'synd_or_disab', 'deg_hl_below6', 'mother_hs',
                 'non_english']].mean()
male synd_or_disab deg_hl_below6 mother_hs jcih_missing False 0.517471 0.237094 0.477002 0.610048 True 0.500000 0.317708 0.474747 0.581560
# Boxplots of covariates split by JCIH missingness (one figure per column)
for col in ['age', 'score', 'degree_hl', 'family_inv', 'age_int']:
    _ = by_jcih_missing.boxplot(column=col)
/usr/local/lib/python3.4/site-packages/pandas/tools/plotting.py:2625: FutureWarning: The default value for 'return_type' will change to 'axes' in a future release. To use the future behavior now, set return_type='axes'. To keep the previous behavior and silence this warning, set return_type='dict'. warnings.warn(msg, FutureWarning)
receptive_language_dataset.age_years.mean()
2.3378898339408671
from pymc import Bernoulli, Normal, Uniform, invlogit, logit, deterministic, MCMC, Matplot, MAP
from pymc import Lambda, Dirichlet, Categorical, Beta, NegativeBinomial, Exponential, T
from numpy.ma import masked_values
# Number of fixed-effect covariates (4 fully observed + 5 partially imputed)
n_covs = 9
def generate_model(dataset):
    """Construct a PyMC2 hierarchical model of standard scores for `dataset`.

    The model comprises school random intercepts, a robust (Student-t) child
    random effect, nine fixed effects -- four fully observed (centered age,
    male, ident_by_3, program_by_6) and five partially imputed (family
    involvement, siblings, hearing loss, maternal education, prior
    disability) -- a normal data likelihood on `score`, and posterior
    predictions for four hypothetical student profiles.

    Returns locals() so the node dict can be passed directly to MCMC().
    """
    # Extract data to local variables
    age_years = dataset.age_years.values
    school = dataset.school.values
    score = dataset.score.values
    non_english = dataset.non_english.astype(int).values  # NOTE(review): unused below
    male = dataset.male.values
    sib = dataset.sib.values  # NOTE(review): unused; imputation reads dataset.sib directly
    # (a duplicated extraction of synd_or_disab was removed here)
    synd_or_disab = dataset.synd_or_disab.values
    int_outside_option = dataset.int_outside_option.values  # NOTE(review): unused below
    study_id = dataset.index.values
    # NOTE(review): these stay as pandas Series (no .values), unlike the arrays above
    ident_by_3 = dataset.ident_by_3
    program_by_6 = dataset.program_by_6
    # Transform some data: center age so the intercept sits at the mean age
    age_std = age_years - age_years.mean()
    # Imputation of family involvement (categorical, Dirichlet prior);
    # missing values are filled with the sentinel 0.5 and then masked
    n_family_inv_cats = len(dataset.family_inv.unique())
    p_family_inv = Dirichlet("p_family_inv", np.ones(n_family_inv_cats), value=[1./n_family_inv_cats]*(n_family_inv_cats-1))
    family_inv_masked = masked_values(dataset.family_inv.fillna(0.5), value=0.5)
    x_family_inv = Categorical('x_family_inv', p_family_inv, value=family_inv_masked, observed=True)
    # Imputation of the enrollment criterion (disabled)
    # p_crit = Beta("p_%s" % criterion, 1, 1, value=0.5)
    # crit_masked = masked_values(dataset[criterion].values, value=np.nan)
    # x_crit = Bernoulli('x_%s' % criterion, p_crit, value=crit_masked, observed=True)
    # Imputation of hearing loss
    # NOTE(review): masked_values with value=np.nan may not mask NaNs, since
    # NaN != NaN -- confirm, or use the fillna-with-sentinel approach above.
    p_hl = Beta("p_hl", 1, 1, value=0.5)
    hl_masked = masked_values(dataset.deg_hl_below6.values, value=np.nan)
    x_hl = Bernoulli('x_hl', p_hl, value=hl_masked, observed=True)
    # Imputation of maternal education
    p_mother_hs = Beta("p_mother_hs", 1, 1, value=0.5)
    mother_hs_masked = masked_values(dataset.mother_hs.values, value=np.nan)
    x_mother_hs = Bernoulli('x_mother_hs', p_mother_hs, value=mother_hs_masked, observed=True)
    # Imputation of previous disability
    p_synd_or_disab = Beta("p_synd_or_disab", 1, 1, value=0.5)
    synd_or_disab_masked = masked_values(dataset.synd_or_disab.values, value=np.nan)
    x_synd_or_disab = Bernoulli('x_synd_or_disab', p_synd_or_disab, value=synd_or_disab_masked, observed=True)
    # Imputation of siblings (categorical, Dirichlet prior)
    n_sib_cats = len(dataset.sib.unique())
    p_sib = Dirichlet("p_sib", np.ones(n_sib_cats), value=[1./n_sib_cats]*(n_sib_cats-1))
    sib_masked = masked_values(dataset.sib.fillna(0.5), value=0.5)
    x_sib = Categorical('x_sib', p_sib, value=sib_masked, observed=True)
    # Imputation of time (disabled)
    # n_time_cats = len(dataset.time.unique())
    # p_time = Dirichlet("p_time", np.ones(n_time_cats), value=[1./n_time_cats]*(n_time_cats-1))
    # time_masked = masked_values(dataset.time.fillna(0.5), value=0.5)
    # x_time = Categorical('x_time', p_time, value=time_masked, observed=True)
    # Grand mean score (diffuse normal prior)
    mean_score = Normal("mean_score", 100, 0.001, value=80)
    # Indices to school random effects
    unique_schools = np.unique(school)
    school_index = [list(unique_schools).index(s) for s in school]
    # School random effect
    sigma_school = Uniform("sigma_school", 0, 1000, value=10)
    tau_school = sigma_school**-2
    alpha_school = Normal("alpha_school", mu=0, tau=tau_school, value=np.zeros(len(unique_schools)))
    # Indices to child random effects
    unique_child = np.unique(study_id)
    child_index = [list(unique_child).index(s) for s in study_id]
    # Robust individual random effect: Student-t with estimated degrees of freedom
    nu_child = Exponential("nu_child", 0.1, value=10)
    alpha_child = T("alpha_child", nu=nu_child, value=np.zeros(len(unique_child)))
    @deterministic
    def intercept(a0=mean_score, a1=alpha_school):
        """Random intercepts"""
        return a0 + a1[school_index]
    # Fixed effects
    beta = Normal("beta", 0, 0.001, value=[0]*n_covs)
    @deterministic
    def theta(b0=intercept, b=beta, x_family_inv=x_family_inv, x_sib=x_sib,
              x_hl=x_hl, x_mother_hs=x_mother_hs, x_synd_or_disab=x_synd_or_disab, a=alpha_child):
        # Complete predictors
        X = [age_std, male, ident_by_3, program_by_6]
        # Partially-imputed predictors
        X += [x_family_inv, x_sib, x_hl, x_mother_hs, x_synd_or_disab]
        return b0 + np.dot(b, X) + a[child_index]
    # Residual error
    sigma = Uniform("sigma", 0, 1000, value=10)
    tau = sigma**-2
    # Data likelihood
    score_like = Normal("score_like", mu=theta, tau=tau, value=score, observed=True)
    #score_like_pred = Normal("score_like_pred", mu=theta, tau=tau, value=score)
    # Predictions:
    # Girl, average age, ident by 3 prog by 6, normal family involvement,
    # 2 siblings, non-profound hearing loss,
    # mother with HS, no previous disability, JCIH
    x1 = [0, 0, 1, 1, 0, 2, 1, 1, 0]
    # Boy, 6 months younger than average, ident by 3 prog by 6, normal family involvement,
    # ident by 3 prog by 6, no siblings, profound hearing loss,
    # mother without HS diploma, previous disability
    x2 = [-0.5, 1, 1, 1, 0, 0, 0, 0, 1]
    # Boy, 1.5 years older than average, not ident by 3 or prog by 6, poor family involvement,
    # 1 sibling, non-profound hearing loss,
    # mother without HS diploma, previous disability
    x3 = [1.5, 1, 0, 0, 2, 1, 1, 0, 1]
    # Girl, 6 months older than average, ident by 3 prog by 6, impaired family involvement,
    # 3 siblings, profound hearing loss,
    # mother with HS diploma, no previous disability
    x4 = [0.5, 1, 1, 1, 1, 3, 0, 1, 0]
    X = np.c_[x1, x2, x3, x4]
    # Posterior predictive means for the four profiles (random effects at 0)
    theta_pred = Lambda('theta_pred', lambda b=beta, mu=mean_score: mu + np.dot(X.T, b))
    predictions = Normal('predictions', mu=theta_pred, tau=tau)
    return locals()
# Fit the receptive-language model, storing the trace in a SQLite database
M_reclang = MCMC(generate_model(receptive_language_dataset),
                 db='sqlite', name='reclang', dbmode='w')
# 100k iterations with a 90k burn-in leaves 10k posterior samples
iterations = 100000
burn = 90000
M_reclang.sample(iterations, burn)
[-----------------100%-----------------] 100000 of 100000 complete in 732.0 sec
# Fixed-effect labels, in the same order as the design matrix X inside theta
labels = ['Age of Enrollment', 'Male', 'Identified by 3 mo.',
          'In Program by 6 mo.', 'Reduced Family Involvement',
          'Sibling Count', 'Non-profound Hearing Loss',
          'Mother with HS Diploma','Additional Disability', ]
# criterion_labels[criterion]]
Matplot.summary_plot([M_reclang.beta], rhat=False,
                     custom_labels=labels, x_range=(-10,10), main='Receptive Language')
# NOTE(review): the call below exactly duplicates the plot above — confirm intentional
Matplot.summary_plot([M_reclang.beta], rhat=False,
                     custom_labels=labels, x_range=(-10,10), main='Receptive Language')
M_reclang.beta.summary()
beta: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ -3.572 0.631 0.045 [-4.74 -2.369] -0.187 1.215 0.091 [-2.634 2.285] 3.164 1.322 0.104 [ 0.57 5.665] -0.648 2.703 0.252 [-5.237 4.756] -5.278 0.586 0.036 [-6.401 -4.159] -0.436 0.628 0.037 [-1.64 0.795] 7.596 1.195 0.088 [ 5.329 9.906] 5.091 1.325 0.104 [ 2.395 7.535] -5.846 1.286 0.101 [-8.533 -3.454] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| -4.738 -4.024 -3.57 -3.125 -2.317 -2.676 -0.978 -0.127 0.59 2.271 0.464 2.319 3.214 4.052 5.612 -5.503 -2.643 -0.755 1.232 4.599 -6.414 -5.68 -5.281 -4.875 -4.167 -1.635 -0.857 -0.468 0.005 0.822 5.369 6.757 7.566 8.435 9.991 2.43 4.228 5.126 5.948 7.609 -8.533 -6.648 -5.817 -5.0 -3.441
Matplot.summary_plot([M_reclang.predictions], rhat=False, custom_labels=['Student {}'.format(i) for i in range(1,5)])
# Expressive-language subset: Language domain, expressive tests, English-only
expressive_language_dataset = lsl_dr[
    (lsl_dr.domain == 'Language')
    & (lsl_dr.test_type == 'expressive')
    & (lsl_dr.non_english == False)
]
expressive_language_dataset.head()
redcap_event_name academic_year hl male _race prim_lang sib \ 25 initial_assessment_arm_1 2007-2008 0 0 0 0 0 78 initial_assessment_arm_1 2010-2011 0 1 0 0 0 127 initial_assessment_arm_1 2011-2012 0 1 0 0 2 140 initial_assessment_arm_1 2011-2012 0 1 0 0 0 157 initial_assessment_arm_1 2011-2012 0 1 1 0 2 _mother_ed father_ed premature_age ... test_name test_type \ 25 6 6 8 ... PLS expressive 78 3 1 8 ... PLS expressive 127 6 6 8 ... CELF-P2 expressive 140 6 6 3 ... PLS expressive 157 0 6 8 ... PLS expressive academic_year_start deg_hl_below6 int_outside_option mother_hs \ 25 2007 0 1 NaN 78 2010 0 1 0 127 2011 0 1 NaN 140 2011 0 1 NaN 157 2011 0 1 0 ident_by_3 program_by_6 jcih age_years 25 False False 0 2.250000 78 False False 0 4.166667 127 True False 0 4.166667 140 False False 0 3.666667 157 True False 0 3.333333 [5 rows x 80 columns]
# Per-child mean score, then one row per child indexed by study_id
mean_score = expressive_language_dataset.groupby('study_id')['score'].mean()
deduped = expressive_language_dataset.drop_duplicates('study_id')
expressive_language_dataset = deduped.set_index('study_id')
expressive_language_dataset['mean_score'] = mean_score
(expressive_language_dataset[covs].isnull().sum() / expressive_language_dataset.shape[0]).round(2)
age_years 0.01 school 0.00 score 0.00 male 0.00 time 0.06 family_inv 0.19 sib 0.07 deg_hl_below6 0.05 mother_hs 0.32 synd_or_disab 0.14 jcih 0.27 dtype: float64
# Drop rows missing age or the outside-OPTION indicator, then plot the outcome
has_age = expressive_language_dataset.age.notnull()
has_int = expressive_language_dataset.int_outside_option.notnull()
expressive_language_dataset = expressive_language_dataset[has_age & has_int]
expressive_language_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
# Fit the expressive-language model (same sampler settings as above)
M_explang = MCMC(generate_model(expressive_language_dataset), db='sqlite', name='explang', dbmode='w')
M_explang.sample(iterations, burn)
[-----------------100%-----------------] 100000 of 100000 complete in 733.9 sec
M_explang.beta.summary()
beta: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ -4.011 0.622 0.043 [-5.216 -2.829] -0.51 1.154 0.088 [-2.802 1.763] 4.319 1.44 0.117 [ 1.876 7.414] -1.337 2.391 0.22 [-5.54 3.443] -4.682 0.577 0.036 [-5.806 -3.566] -0.774 0.671 0.042 [-2.203 0.412] 8.344 1.297 0.102 [ 6.004 10.943] 6.726 1.453 0.121 [ 4.156 9.883] -6.165 1.268 0.098 [-8.719 -3.751] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| -5.142 -4.449 -4.041 -3.589 -2.666 -2.91 -1.289 -0.462 0.268 1.721 1.332 3.385 4.271 5.303 7.144 -5.446 -3.096 -1.529 0.257 3.607 -5.79 -5.083 -4.696 -4.312 -3.496 -2.129 -1.222 -0.755 -0.309 0.506 6.029 7.371 8.28 9.269 11.006 3.971 5.717 6.671 7.642 9.825 -8.588 -7.028 -6.221 -5.338 -3.553
Matplot.summary_plot([M_explang.beta], custom_labels=labels, rhat=False, main='Expressive Language', x_range=(-10,10))
Though these models were constructed primarily for inference, that is, to provide insights on the relative effects of a suite of factors on expected test scores, there is the potential for using them to predict speech-language outcomes. To explore this, we contrived four predictive scenarios, whereby children with different values of the set of predictor variables were specified, and predictions for them were generated by our models.
The Bayesian hierarchical modeling approach makes generating predictions straightforward, via the posterior predictive distribution, which generates predicted values of outcomes conditional on the outcomes observed in the dataset. The variation of these predictions includes both the residual uncertainty in the model parameters and the aleatory sampling variability of the test outcomes. Our test cases were specified as follows:
Case 1 👧
Case 2 👦
Case 3 👶
Case 4 🙎
M_reclang.predictions.summary()
predictions: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 101.062 16.35 0.276 [ 70.02 134.248] 84.974 16.468 0.315 [ 52.47 116.848] 71.965 16.203 0.221 [ 40.308 103.237] 85.678 16.497 0.312 [ 53.156 117.479] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 68.889 90.112 101.054 111.965 133.313 52.577 73.922 84.952 96.241 117.034 40.89 60.91 71.838 83.054 104.141 53.344 74.444 85.7 96.975 117.713
Matplot.summary_plot([M_reclang.predictions], rhat=False, custom_labels=['Student {}'.format(i) for i in range(1,5)])
M_explang.predictions.summary()
predictions: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 96.338 16.855 0.274 [ 62.645 128.481] 77.998 16.99 0.33 [ 43.675 110.062] 65.469 16.694 0.205 [ 33.268 97.793] 79.681 16.86 0.326 [ 46.457 112.541] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 63.171 84.967 96.431 107.714 129.437 44.753 66.441 77.97 89.619 111.348 33.318 53.8 65.582 76.824 97.859 46.426 68.564 79.686 91.087 112.511
# Articulation subset: Articulation domain, English-only
articulation_dataset = lsl_dr[
    (lsl_dr.domain == 'Articulation')
    & (lsl_dr.non_english == False)
]
articulation_dataset.head()
redcap_event_name academic_year hl male _race prim_lang sib \ 15 initial_assessment_arm_1 2011-2012 1 1 0 0 NaN 21 initial_assessment_arm_1 2007-2008 0 0 0 0 0 26 initial_assessment_arm_1 2011-2012 0 0 0 0 2 123 initial_assessment_arm_1 2011-2012 0 1 0 0 2 136 initial_assessment_arm_1 2011-2012 0 1 0 0 0 _mother_ed father_ed premature_age ... test_name test_type \ 15 6 6 9 ... NaN Goldman 21 6 6 8 ... NaN Goldman 26 6 6 9 ... NaN Goldman 123 6 6 8 ... NaN Goldman 136 6 6 3 ... NaN Goldman academic_year_start deg_hl_below6 int_outside_option mother_hs \ 15 2011 NaN 0 NaN 21 2007 0 1 NaN 26 2011 1 1 NaN 123 2011 0 1 NaN 136 2011 0 1 NaN ident_by_3 program_by_6 jcih age_years 15 False False NaN 4.166667 21 False False 0 2.250000 26 True False 0 4.833333 123 True False 0 4.166667 136 False False 0 3.666667 [5 rows x 80 columns]
# Per-child mean articulation score; one row per child indexed by study_id
mean_score = articulation_dataset.groupby('study_id')['score'].mean()
articulation_dataset = (articulation_dataset
                        .drop_duplicates('study_id')
                        .set_index('study_id'))
articulation_dataset['mean_score'] = mean_score
# Missingness proportions for the model covariates
(articulation_dataset[covs].isnull().sum() / articulation_dataset.shape[0]).round(2)
age_years 0.00 school 0.00 score 0.00 male 0.00 time 0.06 family_inv 0.22 sib 0.07 deg_hl_below6 0.13 mother_hs 0.36 synd_or_disab 0.12 jcih 0.36 dtype: float64
# Drop rows missing age or the outside-OPTION indicator; plot the outcome
keep = (articulation_dataset.age.notnull()
        & articulation_dataset.int_outside_option.notnull())
articulation_dataset = articulation_dataset[keep]
articulation_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
# Fit the articulation model (same sampler settings as above)
M_articulation = MCMC(generate_model(articulation_dataset), db='sqlite', name='articulation', dbmode='w')
M_articulation.sample(iterations, burn)
[-----------------100%-----------------] 100000 of 100000 complete in 722.3 sec
# Coefficient forest plot and posterior summaries for the articulation model
Matplot.summary_plot([M_articulation.beta],
                     custom_labels=labels, rhat=False, main='Articulation', x_range=(-10,10))
# Posterior predictions for the four hypothetical students
Matplot.summary_plot([M_articulation.predictions], rhat=False, custom_labels=['Student {}'.format(i) for i in range(1,5)])
M_articulation.beta.summary()
beta: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ -3.511 0.666 0.042 [-4.892 -2.244] 0.903 1.399 0.107 [-1.989 3.376] 2.571 1.533 0.127 [-0.36 5.526] -3.006 2.355 0.208 [-7.835 0.979] -4.149 0.653 0.043 [-5.357 -2.845] -0.025 0.709 0.041 [-1.453 1.301] 6.457 1.514 0.125 [ 3.639 9.264] 5.163 1.88 0.163 [ 1.422 8.984] -7.967 1.365 0.103 [-10.579 -5.177] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| -4.883 -3.939 -3.494 -3.045 -2.217 -1.891 0.005 0.947 1.851 3.56 -0.292 1.488 2.548 3.624 5.624 -7.558 -4.515 -2.966 -1.584 1.804 -5.366 -4.601 -4.18 -3.674 -2.846 -1.376 -0.482 -0.046 0.423 1.389 3.532 5.371 6.509 7.567 9.192 1.398 3.92 5.215 6.388 8.984 -10.856 -8.796 -7.883 -7.076 -5.407
# Expressive-vocabulary subset, English-only
expressive_vocab_dataset = lsl_dr[
    (lsl_dr.domain == 'Expressive Vocabulary')
    & (lsl_dr.non_english == False)
]
expressive_vocab_dataset.head()
redcap_event_name academic_year hl male _race prim_lang sib \ 16 initial_assessment_arm_1 2011-2012 1 1 0 0 NaN 22 initial_assessment_arm_1 2007-2008 0 0 0 0 0 27 initial_assessment_arm_1 2011-2012 0 0 0 0 2 51 initial_assessment_arm_1 2012-2013 0 0 0 0 2 75 initial_assessment_arm_1 2010-2011 0 1 0 0 0 _mother_ed father_ed premature_age ... test_name test_type \ 16 6 6 9 ... NaN EVT 22 6 6 8 ... NaN EOWPVT 27 6 6 9 ... NaN EVT 51 6 6 8 ... NaN EVT 75 3 1 8 ... NaN EVT academic_year_start deg_hl_below6 int_outside_option mother_hs \ 16 2011 NaN 0 NaN 22 2007 0 1 NaN 27 2011 1 1 NaN 51 2012 1 1 NaN 75 2010 0 1 0 ident_by_3 program_by_6 jcih age_years 16 False False NaN 4.166667 22 False False 0 2.250000 27 True False 0 4.833333 51 False False 0 4.000000 75 False False 0 4.166667 [5 rows x 80 columns]
# Per-child mean score; one row per child indexed by study_id
mean_score = expressive_vocab_dataset.groupby('study_id')['score'].mean()
expressive_vocab_dataset = (expressive_vocab_dataset
                            .drop_duplicates('study_id')
                            .set_index('study_id'))
expressive_vocab_dataset['mean_score'] = mean_score
# Count of missing values per covariate
expressive_vocab_dataset[covs].isnull().sum()
age_years 6 school 0 score 0 male 0 time 61 family_inv 219 sib 70 deg_hl_below6 123 mother_hs 351 synd_or_disab 115 jcih 344 dtype: int64
# Drop rows missing age, sex, or the outside-OPTION indicator; plot the outcome
keep = (expressive_vocab_dataset.age.notnull()
        & expressive_vocab_dataset.male.notnull()
        & expressive_vocab_dataset.int_outside_option.notnull())
expressive_vocab_dataset = expressive_vocab_dataset[keep]
expressive_vocab_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
# Fit the expressive-vocabulary model (same sampler settings as above)
M_expressive_vocab = MCMC(generate_model(expressive_vocab_dataset), db='sqlite', name='expressive_vocab', dbmode='w')
M_expressive_vocab.sample(iterations, burn)
[-----------------100%-----------------] 100000 of 100000 complete in 774.0 sec
# Coefficient forest plot and posterior summaries (expressive vocabulary)
Matplot.summary_plot([M_expressive_vocab.beta],
                     custom_labels=labels, rhat=False, main='Expressive Vocabulary', x_range=(-10,10))
M_expressive_vocab.beta.summary()
beta: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ -3.351 0.557 0.033 [-4.512 -2.339] 1.915 1.106 0.081 [-0.095 4.035] 3.932 1.413 0.114 [ 1.381 6.62 ] -1.507 2.12 0.192 [-5.212 3.226] -5.021 0.626 0.042 [-6.091 -3.698] -1.189 0.646 0.038 [-2.531 -0.007] 7.019 1.187 0.089 [ 4.722 9.439] 6.978 1.432 0.118 [ 4.374 9.931] -5.563 1.333 0.103 [-8.11 -2.944] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| -4.488 -3.726 -3.362 -2.975 -2.286 -0.177 1.162 1.868 2.664 4.025 1.263 2.919 3.919 4.954 6.551 -5.384 -2.985 -1.613 -0.231 3.167 -6.236 -5.475 -5.032 -4.566 -3.79 -2.474 -1.623 -1.192 -0.741 0.097 4.546 6.239 7.06 7.79 9.345 4.266 6.055 6.918 7.953 9.876 -8.304 -6.462 -5.52 -4.668 -3.062
# School random effects for the expressive-language model
# NOTE(review): the label count (42) is hard-coded — confirm it matches the number of schools
Matplot.summary_plot(M_explang.alpha_school, custom_labels=[' ']*42, main='Expressive language school effects',
                     x_range=(-25, 25))
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
# School random effects for the expressive-vocabulary model
# NOTE(review): the label count (42) is hard-coded — confirm it matches the number of schools
Matplot.summary_plot(M_expressive_vocab.alpha_school, custom_labels=[' ']*42, main='Expressive vocabulary school effects',
                     x_range=(-25, 25))
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
# Receptive-vocabulary subset, English-only
receptive_vocab_dataset = lsl_dr[
    (lsl_dr.domain == 'Receptive Vocabulary')
    & (lsl_dr.non_english == False)
]
receptive_vocab_dataset.head()
redcap_event_name academic_year hl male _race prim_lang sib \ 17 initial_assessment_arm_1 2011-2012 1 1 0 0 NaN 23 initial_assessment_arm_1 2007-2008 0 0 0 0 0 28 initial_assessment_arm_1 2011-2012 0 0 0 0 2 52 initial_assessment_arm_1 2012-2013 0 0 0 0 2 76 initial_assessment_arm_1 2010-2011 0 1 0 0 0 _mother_ed father_ed premature_age ... test_name test_type \ 17 6 6 9 ... NaN PPVT 23 6 6 8 ... NaN PPVT 28 6 6 9 ... NaN PPVT 52 6 6 8 ... NaN PPVT 76 3 1 8 ... NaN PPVT academic_year_start deg_hl_below6 int_outside_option mother_hs \ 17 2011 NaN 0 NaN 23 2007 0 1 NaN 28 2011 1 1 NaN 52 2012 1 1 NaN 76 2010 0 1 0 ident_by_3 program_by_6 jcih age_years 17 False False NaN 4.166667 23 False False 0 2.250000 28 True False 0 4.833333 52 False False 0 4.000000 76 False False 0 4.166667 [5 rows x 80 columns]
# Per-child mean score; one row per child indexed by study_id
mean_score = receptive_vocab_dataset.groupby('study_id')['score'].mean()
receptive_vocab_dataset = (receptive_vocab_dataset
                           .drop_duplicates('study_id')
                           .set_index('study_id'))
receptive_vocab_dataset['mean_score'] = mean_score
# Count of missing values per covariate
receptive_vocab_dataset[covs].isnull().sum()
age_years 5 school 0 score 0 male 0 time 56 family_inv 224 sib 69 deg_hl_below6 126 mother_hs 357 synd_or_disab 115 jcih 347 dtype: int64
# Drop rows missing age, sex, or the outside-OPTION indicator
keep = (receptive_vocab_dataset.age.notnull()
        & receptive_vocab_dataset.male.notnull()
        & receptive_vocab_dataset.int_outside_option.notnull())
receptive_vocab_dataset = receptive_vocab_dataset[keep]
# Distribution of age at first intervention
receptive_vocab_dataset.age_int.hist(bins=40)
<matplotlib.axes._subplots.AxesSubplot at 0x10e3917f0>
receptive_vocab_dataset.shape
(961, 80)
# Outcome distribution, then fit the receptive-vocabulary model
receptive_vocab_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
M_receptive_vocab = MCMC(generate_model(receptive_vocab_dataset), db='sqlite', name='receptive_vocab', dbmode='w')
M_receptive_vocab.sample(iterations, burn)
[-----------------100%-----------------] 100000 of 100000 complete in 781.0 sec
#order = np.argsort(M.beta.stats()['mean'])[::-1]
# Coefficient forest plot and posterior summaries (receptive vocabulary)
Matplot.summary_plot([M_receptive_vocab.beta], rhat=False,
                     custom_labels=labels, main='Receptive Vocabulary', x_range=(-10,10))
M_receptive_vocab.beta.summary()
beta: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ -3.496 0.582 0.041 [-4.591 -2.317] 0.549 1.039 0.077 [-1.466 2.593] 2.375 1.271 0.103 [-0.181 4.735] -1.439 2.046 0.182 [-5.365 2.937] -4.668 0.567 0.04 [-5.715 -3.521] -1.288 0.641 0.04 [-2.64 -0.143] 6.2 1.18 0.096 [ 4.156 8.845] 5.286 1.279 0.104 [ 2.847 7.767] -6.816 1.149 0.088 [-8.899 -4.483] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| -4.62 -3.9 -3.502 -3.103 -2.333 -1.442 -0.134 0.491 1.188 2.636 -0.042 1.51 2.391 3.226 4.948 -5.3 -2.77 -1.454 -0.21 3.009 -5.699 -5.073 -4.675 -4.299 -3.489 -2.619 -1.694 -1.276 -0.828 -0.103 3.92 5.394 6.179 6.977 8.723 2.909 4.431 5.277 6.134 7.852 -9.075 -7.636 -6.799 -5.97 -4.567
import sqlite3
db_files = !ls *sqlite
# Open the articulation trace database directly
# NOTE(review): this connection is never closed in this session
articulation = sqlite3.connect("articulation.sqlite")
c_artic = articulation.cursor()
# trace==0 selects the first chain; v1 appears to hold the sampled values
c_artic.execute('SELECT v1 FROM sigma_school WHERE trace==0')
<sqlite3.Cursor at 0x10cad7500>
# Flatten the fetched rows into a 1-D array of sigma_school samples
sigma_school_artic = np.ravel(c_artic.fetchall())
sigma_school_artic.mean()
6.1150142553000002
# Summarize the school-level random-effect SD from each model's SQLite trace
# (chain 0): median and an empirical 95% interval from the sorted samples.
for dbname in db_files:
    db = sqlite3.connect(dbname)
    try:
        cur = db.cursor()
        cur.execute('SELECT v1 FROM sigma_school WHERE trace==0')
        trace = np.ravel(cur.fetchall())
    finally:
        # Fix: connections were previously opened in the loop and never closed
        db.close()
    trace.sort()
    print('\n'+dbname)
    print(np.median(trace))
    print(trace[[int(len(trace)*0.025), int(len(trace)*0.975)]])
articulation.sqlite 6.101879 [ 3.601835 8.942658] explang.sqlite 5.363171 [ 3.465128 7.299587] expressive_vocab.sqlite 8.680978 [ 6.714001 11.222219] receptive_vocab.sqlite 6.7558135 [ 5.057159 8.978382] reclang.sqlite 5.214563 [ 3.906563 7.246824]