# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 100)
Import data
# Load the LSL-DR dataset; the first CSV column is used as the row index.
lsl_dr = pd.read_csv('lsl_dr.csv', index_col=0)
/usr/local/lib/python3.4/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (31,33,41,74) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows)
lsl_dr.head()
redcap_event_name academic_year hl male _race prim_lang sib \ 0 initial_assessment_arm_1 2005-2006 0 1 0 0 3 1 initial_assessment_arm_1 2005-2006 0 1 0 0 3 2 initial_assessment_arm_1 2005-2006 0 1 0 0 3 3 initial_assessment_arm_1 2005-2006 0 1 0 0 3 4 initial_assessment_arm_1 2010-2011 0 0 7 0 2 _mother_ed father_ed premature_age ... known_synd \ 0 2 2 8 ... 0 1 2 2 8 ... 0 2 2 2 8 ... 0 3 2 2 8 ... 0 4 6 6 7 ... 0 synd_or_disab race age_test domain school score \ 0 NaN 0 48 Articulation 521 41 1 NaN 0 48 Expressive Vocabulary 521 71 2 NaN 0 48 Language 521 56 3 NaN 0 48 Language 521 55 4 0 NaN NaN NaN NaN NaN test_name test_type academic_year_start 0 NaN Goldman 2005 1 NaN EVT 2005 2 PLS receptive 2005 3 PLS expressive 2005 4 NaN NaN 2010 [5 rows x 74 columns]
Indicator for non-profound hearing loss
# Indicator for non-profound hearing loss: degree_hl codes below 6.
# Rows with a missing degree_hl get a missing indicator rather than False.
lsl_dr['deg_hl_below6'] = lsl_dr['degree_hl'] < 6
lsl_dr.loc[lsl_dr['degree_hl'].isnull(), 'deg_hl_below6'] = np.nan
Indicator for first intervention outside OPTION
# Indicator that the first intervention occurred outside OPTION:
# True when enrollment age exceeds age at first intervention.
lsl_dr['int_outside_option'] = lsl_dr.age > lsl_dr.age_int
# NOTE(review): this nulls out rows where age < age_int (i.e., the False cases),
# not rows where age_int is missing — unlike the sibling indicators above,
# which null on missing source values. Confirm this asymmetry is intended.
lsl_dr.loc[lsl_dr.age < lsl_dr.age_int, 'int_outside_option'] = np.nan
Indicator for high school graduation of mother
# Indicator for the mother having at least a high-school education
# (mother_ed codes > 1); missing mother_ed yields a missing indicator.
lsl_dr['mother_hs'] = lsl_dr.mother_ed > 1
# np.nan (not None) for consistency with the other derived indicators above.
lsl_dr.loc[lsl_dr.mother_ed.isnull(), 'mother_hs'] = np.nan
Joint commission benchmark
lsl_dr.onset_1.unique()
array([ 22. , 18. , 17. , 3. , 5. , nan, 2. , 1. , 13. , 10. , 36. , 28. , 26. , 21. , 41. , 0. , 6. , 25. , 7. , 35. , 34. , 27. , 15. , 24. , 4. , 14. , 40. , 42. , 12. , 9. , 50. , 33. , 8. , 23. , 32. , 60. , 31. , 11. , 30. , 20. , 67. , 49. , 48. , 1.5, 19. , 2.5, 39. , 52. , 16. , 38. , 29. , 51. , 46. , 45. , 54. , 44. , 88. , 65. , 81. , 116. , 72. , 57. , 62. , 43. , 78. , 83. , 61. , 107. , 64. , 74. , 37. , 77. , 96. , 97. , 79. , 47. , 53. , 59. , 84. , 95. , 80. , 0.5, 58. , 56. , 86. , 98. , 85. , 75. , 119. , 66. , 70. , 63. , 140. , 126. , 133. , 103. , 87. , 76. , 55. , 68. , 92. , 71. , 154. , 89. , 152. ])
# JCIH benchmark components: hearing loss identified by 3 months of age, and
# enrolled in a program by 6 months of age.
lsl_dr['ident_by_3'] = ident_by_3 = lsl_dr.onset_1 <= 3
lsl_dr['program_by_6'] = program_by_6 = lsl_dr.age <= 6
# Joint JCIH indicator; unknown when either source value is missing.
lsl_dr['jcih'] = ident_by_3 & program_by_6
# np.nan (not None) for consistency with the other derived indicators above.
lsl_dr.loc[lsl_dr.onset_1.isnull() | lsl_dr.age.isnull(), 'jcih'] = np.nan
Create age in years variable
# Convert enrollment age from months to years.
lsl_dr['age_years'] = lsl_dr.age/12.
Retain only 4-year-olds
# Keep only records tested at 48-59 months of age (i.e., 4-year-olds).
tested_at_four = (lsl_dr.age_test >= 48) & (lsl_dr.age_test < 60)
lsl_dr = lsl_dr[tested_at_four]
lsl_dr.shape
(6873, 81)
lsl_dr.study_id.unique().shape
(1613,)
# One row per student (first record retained for each study_id).
unique_students = lsl_dr.drop_duplicates(subset='study_id')
unique_students.shape
(1613, 81)
unique_students.age.describe()
count 1593.000000 mean 29.700565 std 15.676539 min 1.000000 25% 18.000000 50% 32.000000 75% 41.000000 max 68.000000 Name: age, dtype: float64
unique_students.male.mean()
0.5236612702366127
unique_students.male.value_counts()
1 841 0 765 dtype: int64
unique_students.sib.describe()
count 1501.000000 mean 1.161226 std 0.928434 min 0.000000 25% 0.000000 50% 1.000000 75% 2.000000 max 3.000000 Name: sib, dtype: float64
(unique_students.degree_hl == 6).describe()
count 1613 mean 0.4575325 std 0.4983478 min False 25% 0 50% 0 75% 1 max True Name: degree_hl, dtype: object
unique_students.deg_hl_below6.isnull().sum()
220
unique_students.deg_hl_below6.value_counts()
0 738 1 655 dtype: int64
((unique_students.degree_hl < 6) & (unique_students.degree_hl > 1)).describe()
count 1613 mean 0.3787973 std 0.485238 min False 25% 0 50% 0 75% 1 max True Name: degree_hl, dtype: object
unique_students.mother_hs.describe()
count 1006.000000 mean 0.580517 std 0.493720 min 0.000000 25% 0.000000 50% 1.000000 75% 1.000000 max 1.000000 Name: mother_hs, dtype: float64
unique_students.synd_or_disab.describe()
count 1413.000000 mean 0.213022 std 0.409588 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 1.000000 Name: synd_or_disab, dtype: float64
unique_students.groupby('male').program_by_6.value_counts()
male 0 False 670 True 95 1 False 750 True 91 dtype: int64
unique_students.groupby('sib').program_by_6.value_counts()
sib 0 False 346 True 38 1 False 562 True 90 2 False 267 True 37 3 False 145 True 16 dtype: int64
unique_students.groupby('deg_hl_below6').program_by_6.value_counts()
deg_hl_below6 0 False 638 True 100 1 False 574 True 81 dtype: int64
unique_students.groupby('mother_hs').program_by_6.value_counts()
mother_hs 0 False 367 True 55 1 False 489 True 95 dtype: int64
unique_students.groupby('synd_or_disab').program_by_6.value_counts()
synd_or_disab 0 False 976 True 136 1 False 273 True 28 dtype: int64
unique_students.groupby('ident_by_3').program_by_6.value_counts()
ident_by_3 False False 1013 True 34 True False 414 True 152 dtype: int64
unique_students.groupby('jcih').program_by_6.value_counts()
jcih 0 False 928 True 17 1 True 152 dtype: int64
# Overlaid histograms of age (years) for JCIH vs. non-JCIH students.
grouped_age = unique_students.replace({'jcih':{1:'JCIH', 0:'non-JCIH'}}).groupby('jcih')['age_years']
fig, ax = plt.subplots()
for k, v in grouped_age:
    # One translucent histogram per group, sharing the same axes.
    v.hist(label=k, alpha=.75, ax=ax, grid=False)
ax.legend()
plt.xlabel('Age (years)'); plt.ylabel('Frequency');
students_data = lsl_dr.copy()
# Recode school identifiers to consecutive integer codes for modeling.
# NOTE(review): int(s) raises if a school value is NaN or non-numeric —
# confirm `school` is a clean numeric column at this point.
students_data['school_id'] = students_data.school.replace({int(s):i for i,s in enumerate(students_data.school.unique())})
# Restrict to Language-domain test records.
students_data = students_data[students_data.domain=='Language']
students_data.shape
(2594, 82)
# Last language-score record per student with school id and covariates,
# complete cases only, exported for external use.
# `keep='last'` replaces the long-removed `take_last=True` argument of
# drop_duplicates.
option_scores = students_data.drop_duplicates('study_id', keep='last')[['score', 'school_id',
                                                                        'deg_hl_below6', 'mother_hs']].dropna()
option_scores = option_scores.astype(int)
option_scores.to_csv('option_scores.csv', index=False)
Create technology indicators
# A record is "unimodal" when it has none of the bilateral/bimodal technology flags.
unimodal = ~lsl_dr[[u'bilateral_ci',
                    u'bilateral_ha',
                    u'bimodal']].sum(1).astype(bool)
# Split unimodal users by device type: cochlear implant vs. hearing aid.
lsl_dr['unimodal_ci'] = unimodal & lsl_dr.cochlear
lsl_dr['unimodal_ha'] = unimodal & lsl_dr.hearing_aid
# Record counts per technology category.
lsl_dr[[u'bilateral_ci',
        u'bilateral_ha',
        u'bimodal', 'unimodal_ci', 'unimodal_ha']].sum(0)
bilateral_ci 2136 bilateral_ha 2261 bimodal 791 unimodal_ci 311 unimodal_ha 320 dtype: int64
Since bilateral hearing aid is the most prevalent category, it will be used as the baseline category.
# Keep only records falling into one of the five technology categories.
tech_index = lsl_dr[[u'bilateral_ci',
                     u'bilateral_ha',
                     u'bimodal', 'unimodal_ci', 'unimodal_ha']].sum(1).astype(bool)
technology_subset = lsl_dr[tech_index]
# Filter domain and English-only, and appropriate bilateral subset
receptive_language_dataset = technology_subset[(technology_subset.domain=='Language')
                                               & (technology_subset.test_type=='receptive')
                                               & (technology_subset.non_english==False)]
receptive_language_dataset.head()
redcap_event_name academic_year hl male _race prim_lang sib \ 2 initial_assessment_arm_1 2005-2006 0 1 0 0 3 187 initial_assessment_arm_1 2014-2015 0 0 2 0 2 210 initial_assessment_arm_1 2014-2015 0 0 0 0 1 220 initial_assessment_arm_1 2012-2013 0 1 0 0 2 253 initial_assessment_arm_1 2011-2012 0 1 0 0 2 _mother_ed father_ed premature_age ... academic_year_start \ 2 2 2 8 ... 2005 187 6 6 8 ... 2014 210 5 6 7 ... 2014 220 4 6 7 ... 2012 253 6 6 8 ... 2011 deg_hl_below6 int_outside_option mother_hs ident_by_3 program_by_6 \ 2 0 1 0 False False 187 1 1 NaN False False 210 1 1 1 False False 220 1 1 1 False False 253 0 1 NaN False False jcih age_years unimodal_ci unimodal_ha 2 0 4.500000 False False 187 0 3.583333 False False 210 0 4.083333 False False 220 0 4.666667 True False 253 0 4.416667 False False [5 rows x 83 columns]
This is the size of the resulting dataset to be used in this analysis
receptive_language_dataset.shape
(941, 83)
# Average score across each student's receptive-language records, then reduce
# the dataset to one row per student indexed by study_id.
mean_score = receptive_language_dataset.groupby('study_id')['score'].mean()
receptive_language_dataset = receptive_language_dataset.drop_duplicates('study_id')
receptive_language_dataset = receptive_language_dataset.set_index('study_id')
receptive_language_dataset['mean_score'] = mean_score
Specify the benchmark criterion to use: jcih, ident_by_3, or program_by_6.
# Human-readable plot labels for each candidate benchmark criterion.
criterion_labels = dict(jcih='JCIH',
                        ident_by_3='Diagnosed by 3 months',
                        program_by_6='Intervention by 6 months')
# Criterion used for this analysis.
criterion = 'jcih'
Proportion of missing values for each variable
# Model covariates plus the chosen benchmark criterion.
covs = ['age_years', 'school', 'score', 'male', 'time',
        'family_inv', 'sib', 'deg_hl_below6', 'mother_hs', 'synd_or_disab'] + [criterion]
# Proportion of missing values for each covariate.
(receptive_language_dataset[covs].isnull().sum() / receptive_language_dataset.shape[0]).round(2)
age_years 0.00 school 0.00 score 0.00 male 0.00 time 0.04 family_inv 0.16 sib 0.06 deg_hl_below6 0.01 mother_hs 0.33 synd_or_disab 0.14 jcih 0.20 dtype: float64
Drop individuals with missing age, since they make up less than 1% of the sample.
# Drop the handful of rows (<1%) with missing age, then inspect distributions.
receptive_language_dataset = receptive_language_dataset[receptive_language_dataset.age.notnull()]
receptive_language_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
receptive_language_dataset.mother_ed.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x10f2cfe80>
Final analysis dataset size
receptive_language_dataset.shape
(871, 83)
# Percentage of missing values per covariate, largest first.
missing_pct = (receptive_language_dataset[covs].isnull().mean()*100).round(1)
# Series.sort() was removed from pandas; sort_values(inplace=True) is the
# equivalent in-place descending sort.
missing_pct.sort_values(ascending=False, inplace=True)
missing_pct
mother_hs 33.3 jcih 19.3 family_inv 15.8 synd_or_disab 14.0 sib 5.5 time 3.6 deg_hl_below6 1.3 male 0.0 score 0.0 school 0.0 age_years 0.0 dtype: float64
# Compare covariate means between rows with and without a recorded
# amplification age, as an informal missingness-mechanism check.
receptive_language_dataset['age_amp_missing'] = receptive_language_dataset.age_amp.isnull()
by_age_amp_missing = receptive_language_dataset.groupby('age_amp_missing')
# Select columns with a list: tuple-style multi-column selection on a
# groupby is deprecated and removed in modern pandas.
by_age_amp_missing[['male', 'synd_or_disab', 'deg_hl_below6', 'mother_hs',
                    'non_english']].mean()
male synd_or_disab deg_hl_below6 mother_hs age_amp_missing False 0.510885 0.221106 0.469298 0.616034 True 0.478022 0.250000 0.386364 0.654206
# Same missingness check for the JCIH indicator.
receptive_language_dataset['jcih_missing'] = receptive_language_dataset.jcih.isnull()
by_jcih_missing = receptive_language_dataset.groupby('jcih_missing')
# List selection instead of the removed tuple-style groupby selection.
by_jcih_missing[['male', 'synd_or_disab', 'deg_hl_below6', 'mother_hs',
                 'non_english']].mean()
male synd_or_disab deg_hl_below6 mother_hs jcih_missing False 0.513514 0.219110 0.467811 0.626033 True 0.464286 0.260563 0.385093 0.608247
receptive_language_dataset.age_years.mean()
2.2484691924990403
from pymc import Bernoulli, Normal, Uniform, invlogit, logit, deterministic, MCMC, Matplot, MAP
from pymc import Lambda, Dirichlet, Categorical, Beta, NegativeBinomial, Exponential, T
from numpy.ma import masked_values
# Number of fixed-effect predictors (7 complete + 5 imputed; see theta below).
n_covs = 12
def generate_model(dataset):
    """Construct a PyMC 2.x hierarchical model of test scores for `dataset`.

    The model combines a school-level random intercept, 12 fixed effects
    (n_covs), and Bayesian imputation of partially-observed covariates.
    Returns locals() so that MCMC() collects every model node.
    """
    # Extract data to local variables
    age_years = dataset.age_years.values
    school = dataset.school.values
    score = dataset.score.values
    non_english = dataset.non_english.astype(int).values
    male = dataset.male.values
    sib = dataset.sib.values
    synd_or_disab = dataset.synd_or_disab.values
    int_outside_option = dataset.int_outside_option.values
    # NOTE(review): duplicate of the synd_or_disab assignment above — harmless
    # but probably unintended.
    synd_or_disab = dataset.synd_or_disab.values
    study_id = dataset.index.values
    ident_by_3 = dataset.ident_by_3
    program_by_6 = dataset.program_by_6
    bilateral_ci = dataset.bilateral_ci
    bimodal = dataset.bimodal
    unimodal_ci = dataset.unimodal_ci
    unimodal_ha = dataset.unimodal_ha
    # Transform some data: center age on its mean
    age_std = age_years - age_years.mean()
    # Imputation of family involvement: categorical with a Dirichlet prior on
    # category probabilities. Missing entries are filled with the sentinel 0.5
    # (not a valid category code) and masked so PyMC imputes them.
    n_family_inv_cats = len(dataset.family_inv.unique())
    p_family_inv = Dirichlet("p_family_inv", np.ones(n_family_inv_cats),
                             value=[1./n_family_inv_cats]*(n_family_inv_cats-1))
    family_inv_masked = masked_values(dataset.family_inv.fillna(0.5), value=0.5)
    x_family_inv = Categorical('x_family_inv', p_family_inv, value=family_inv_masked,
                               observed=True)
    # Imputation of hearing loss
    p_hl = Beta("p_hl", 1, 1, value=0.5)
    # NOTE(review): masked_values(..., value=np.nan) compares by floating-point
    # equality, and NaN never equals NaN — so missing entries here are likely
    # NOT masked (np.ma.masked_invalid would mask them). Verify that the
    # imputation of deg_hl_below6 / mother_hs / synd_or_disab actually occurs.
    hl_masked = masked_values(dataset.deg_hl_below6.values, value=np.nan)
    x_hl = Bernoulli('x_hl', p_hl, value=hl_masked, observed=True)
    # Imputation of maternal education (same NaN-masking caveat as above)
    p_mother_hs = Beta("p_mother_hs", 1, 1, value=0.5)
    mother_hs_masked = masked_values(dataset.mother_hs.values, value=np.nan)
    x_mother_hs = Bernoulli('x_mother_hs', p_mother_hs, value=mother_hs_masked, observed=True)
    # Imputation of previous disability (same NaN-masking caveat as above)
    p_synd_or_disab = Beta("p_synd_or_disab", 1, 1, value=0.5)
    synd_or_disab_masked = masked_values(dataset.synd_or_disab.values, value=np.nan)
    x_synd_or_disab = Bernoulli('x_synd_or_disab', p_synd_or_disab, value=synd_or_disab_masked,
                                observed=True)
    # Imputation of siblings (categorical, 0.5 sentinel as for family_inv)
    n_sib_cats = len(dataset.sib.unique())
    p_sib = Dirichlet("p_sib", np.ones(n_sib_cats), value=[1./n_sib_cats]*(n_sib_cats-1))
    sib_masked = masked_values(dataset.sib.fillna(0.5), value=0.5)
    x_sib = Categorical('x_sib', p_sib, value=sib_masked, observed=True)
    # Grand mean score with a vague Normal prior
    mean_score = Normal("mean_score", 100, 0.001, value=80)
    # Indices to school random effects
    unique_schools = np.unique(school)
    school_index = [list(unique_schools).index(s) for s in school]
    # School random effect: Normal with a Uniform prior on its SD
    sigma_school = Uniform("sigma_school", 0, 1000, value=10)
    tau_school = sigma_school**-2
    alpha_school = Normal("alpha_school", mu=0, tau=tau_school,
                          value=np.zeros(len(unique_schools)))
    @deterministic
    def intercept(a0=mean_score, a1=alpha_school):
        """Random intercepts"""
        return a0 + a1[school_index]
    # Fixed effects
    beta = Normal("beta", 0, 0.001, value=[0]*n_covs)
    @deterministic
    def theta(b0=intercept, b=beta, x_family_inv=x_family_inv, x_sib=x_sib,
              x_hl=x_hl, x_mother_hs=x_mother_hs, x_synd_or_disab=x_synd_or_disab):
        # Complete predictors (closed over from the enclosing scope, so not
        # tracked as PyMC parents — they are fixed data)
        X = [male, ident_by_3, program_by_6, bilateral_ci, bimodal, unimodal_ci, unimodal_ha]
        # Partially-imputed predictors
        X += [x_family_inv, x_sib, x_hl, x_mother_hs, x_synd_or_disab]
        return b0 + np.dot(b, X) #+ a[child_index]
    # Residual SD with a Uniform prior
    sigma = Uniform("sigma", 0, 1000, value=10)
    tau = sigma**-2
    # Data likelihood
    score_like = Normal("score_like", mu=theta, tau=tau, value=score, observed=True)
    #score_like_pred = Normal("score_like_pred", mu=theta, tau=tau, value=score)
    # Return all locals so the MCMC constructor can collect the model nodes.
    return locals()
# Fit the receptive-language model, persisting traces to a SQLite backend.
M_reclang = MCMC(generate_model(receptive_language_dataset),
                 db='sqlite', name='reclang', dbmode='w')
# Shared MCMC settings for all outcome models: 100k iterations, 90k burn-in.
iterations = 100000
burn = 90000
M_reclang.sample(iterations, burn)
[-----------------100%-----------------] 100000 of 100000 complete in 1140.6 sec
# Fixed-effect labels, in the same order as the predictor matrix X in theta.
labels = ['Male', 'Identified by 3 mo.',
          'In Program by 6 mo.',
          'Bilateral CI', 'Bimodal', 'Unimodal CI', 'Unimodal HA',
          'Reduced Family Involvement',
          'Sibling Count', 'Non-profound Hearing Loss',
          'Mother with HS Diploma','Additional Disability']
# criterion_labels[criterion]]
# Forest plots of the receptive-language fixed effects at two x-axis scales,
# then the numeric posterior summary.
Matplot.summary_plot([M_reclang.beta], rhat=False,
                     custom_labels=labels, x_range=(-15,15), main='Receptive Language')
Matplot.summary_plot([M_reclang.beta], rhat=False,
                     custom_labels=labels, x_range=(-10,10), main='Receptive Language')
M_reclang.beta.summary()
beta: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ -0.68 1.176 0.08 [-2.799 1.596] 4.252 1.453 0.114 [ 1.429 7.149] 7.113 2.032 0.173 [ 3.463 11.173] -5.486 0.646 0.044 [-6.73 -4.247] -0.373 0.648 0.035 [-1.568 0.994] 6.699 1.233 0.086 [ 4.563 9.32 ] 5.317 1.432 0.117 [ 2.44 7.92] -6.645 1.238 0.086 [-9.174 -4.309] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| -2.889 -1.506 -0.727 0.177 1.559 1.461 3.244 4.191 5.207 7.248 2.998 5.68 7.118 8.611 10.876 -6.735 -5.94 -5.487 -5.046 -4.247 -1.657 -0.813 -0.367 0.055 0.92 4.527 5.905 6.608 7.481 9.303 2.347 4.342 5.442 6.319 7.868 -8.994 -7.506 -6.679 -5.81 -4.12
# Expressive-language analysis subset: Language domain, expressive test type,
# English-only records.
expressive_language_dataset = technology_subset[(technology_subset.domain=='Language') & (technology_subset.test_type=='expressive') &
                                                (technology_subset.non_english==False)]
expressive_language_dataset.head()
redcap_event_name academic_year hl male _race prim_lang sib \ 3 initial_assessment_arm_1 2005-2006 0 1 0 0 3 188 initial_assessment_arm_1 2014-2015 0 0 2 0 2 211 initial_assessment_arm_1 2014-2015 0 0 0 0 1 221 initial_assessment_arm_1 2012-2013 0 1 0 0 2 254 initial_assessment_arm_1 2011-2012 0 1 0 0 2 _mother_ed father_ed premature_age ... academic_year_start \ 3 2 2 8 ... 2005 188 6 6 8 ... 2014 211 5 6 7 ... 2014 221 4 6 7 ... 2012 254 6 6 8 ... 2011 deg_hl_below6 int_outside_option mother_hs ident_by_3 program_by_6 \ 3 0 1 0 False False 188 1 1 NaN False False 211 1 1 1 False False 221 1 1 1 False False 254 0 1 NaN False False jcih age_years unimodal_ci unimodal_ha 3 0 4.500000 False False 188 0 3.583333 False False 211 0 4.083333 False False 221 0 4.666667 True False 254 0 4.416667 False False [5 rows x 83 columns]
# Mean expressive score per student, then one row per student (study_id index).
mean_score = expressive_language_dataset.groupby('study_id')['score'].mean()
expressive_language_dataset = expressive_language_dataset.drop_duplicates('study_id').set_index('study_id')
expressive_language_dataset['mean_score'] = mean_score
# Proportion of missing values for each covariate.
(expressive_language_dataset[covs].isnull().sum() / expressive_language_dataset.shape[0]).round(2)
age_years 0.00 school 0.00 score 0.00 male 0.00 time 0.04 family_inv 0.16 sib 0.06 deg_hl_below6 0.01 mother_hs 0.34 synd_or_disab 0.15 jcih 0.20 dtype: float64
# Drop rows missing age or the outside-OPTION indicator, plot the score
# distribution, then fit the expressive-language model with the shared settings.
expressive_language_dataset = expressive_language_dataset[expressive_language_dataset.age.notnull()
                                                          & expressive_language_dataset.int_outside_option.notnull()]
expressive_language_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
M_explang = MCMC(generate_model(expressive_language_dataset), db='sqlite', name='explang', dbmode='w')
M_explang.sample(iterations, burn)
[-----------------100%-----------------] 100000 of 100000 complete in 1111.8 sec
M_explang.beta.summary()
beta: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ -1.001 1.226 0.09 [-3.454 1.158] 5.961 1.332 0.096 [ 3.41 8.534] 6.543 2.242 0.198 [ 2.959 11.565] -5.016 0.669 0.044 [-6.33 -3.655] -0.745 0.69 0.041 [-2.149 0.568] 7.448 1.205 0.085 [ 5.245 9.872] 7.015 1.297 0.101 [ 4.598 9.655] -6.837 1.442 0.109 [-9.326 -3.917] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| -3.349 -1.826 -0.976 -0.189 1.341 3.423 5.013 5.938 6.874 8.551 2.68 4.801 6.358 8.081 11.348 -6.33 -5.485 -5.022 -4.579 -3.655 -2.125 -1.212 -0.729 -0.273 0.609 5.23 6.616 7.425 8.267 9.869 4.459 6.145 7.04 7.88 9.585 -9.64 -7.75 -6.82 -5.854 -4.071
# Forest plots of expressive-language fixed effects at two x-axis scales.
Matplot.summary_plot([M_explang.beta], custom_labels=labels, rhat=False, main='Expressive Language', x_range=(-15,15))
Matplot.summary_plot([M_explang.beta], custom_labels=labels, rhat=False, main='Expressive Language', x_range=(-10,10))
# Articulation analysis subset: Articulation domain, English-only records.
articulation_dataset = technology_subset[(technology_subset.domain=='Articulation') &
                                         (technology_subset.non_english==False)]
articulation_dataset.head()
redcap_event_name academic_year hl male _race prim_lang sib \ 0 initial_assessment_arm_1 2005-2006 0 1 0 0 3 184 initial_assessment_arm_1 2014-2015 0 0 2 0 2 207 initial_assessment_arm_1 2014-2015 0 0 0 0 1 286 initial_assessment_arm_1 2014-2015 0 0 0 0 1 412 initial_assessment_arm_1 NaN 0 0 0 0 1 _mother_ed father_ed premature_age ... academic_year_start \ 0 2 2 8 ... 2005 184 6 6 8 ... 2014 207 5 6 7 ... 2014 286 6 6 8 ... 2014 412 6 6 9 ... NaN deg_hl_below6 int_outside_option mother_hs ident_by_3 program_by_6 \ 0 0 1 0 False False 184 1 1 NaN False False 207 1 1 1 False False 286 0 1 NaN True False 412 0 0 NaN False False jcih age_years unimodal_ci unimodal_ha 0 0 4.500000 False False 184 0 3.583333 False False 207 0 4.083333 False False 286 0 3.916667 False False 412 NaN 3.750000 False False [5 rows x 83 columns]
# Mean articulation score per student; reduce to one row per study_id.
mean_score = articulation_dataset.groupby('study_id')['score'].mean()
articulation_dataset = articulation_dataset.drop_duplicates('study_id').set_index('study_id')
articulation_dataset['mean_score'] = mean_score
# Proportion of missing values for each covariate.
(articulation_dataset[covs].isnull().sum() / articulation_dataset.shape[0]).round(2)
age_years 0.00 school 0.00 score 0.00 male 0.00 time 0.04 family_inv 0.17 sib 0.05 deg_hl_below6 0.01 mother_hs 0.32 synd_or_disab 0.14 jcih 0.21 dtype: float64
# Drop rows missing age or the outside-OPTION indicator, plot the score
# distribution, then fit the articulation model.
articulation_dataset = articulation_dataset[articulation_dataset.age.notnull()
                                            & articulation_dataset.int_outside_option.notnull()]
articulation_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
M_articulation = MCMC(generate_model(articulation_dataset), db='sqlite', name='articulation', dbmode='w')
M_articulation.sample(iterations, burn)
[-----------------100%-----------------] 100000 of 100000 complete in 1003.0 sec
# Forest plots and numeric summary of articulation fixed effects.
Matplot.summary_plot([M_articulation.beta],
                     custom_labels=labels, rhat=False, main='Articulation', x_range=(-15,15))
Matplot.summary_plot([M_articulation.beta],
                     custom_labels=labels, rhat=False, main='Articulation', x_range=(-10,10))
M_articulation.beta.summary()
beta: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.842 1.276 0.087 [-1.699 3.333] 4.066 1.714 0.142 [ 0.955 7.342] 3.812 2.447 0.214 [-0.984 8.727] -4.393 0.66 0.042 [-5.713 -3.201] 0.048 0.77 0.047 [-1.486 1.531] 5.537 1.507 0.116 [ 2.681 8.384] 5.125 1.572 0.123 [ 2.074 8.148] -8.13 1.489 0.117 [-11.165 -5.307] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| -1.565 -0.043 0.826 1.679 3.548 0.955 2.834 3.962 5.29 7.373 -1.285 2.392 3.897 5.403 8.56 -5.698 -4.85 -4.384 -3.921 -3.179 -1.532 -0.442 0.065 0.556 1.486 2.69 4.476 5.591 6.588 8.406 2.171 3.999 5.132 6.195 8.345 -11.128 -9.092 -8.05 -7.171 -5.227
# Expressive-vocabulary analysis subset: matching domain, English-only records.
expressive_vocab_dataset = technology_subset[(technology_subset.domain=='Expressive Vocabulary') &
                                             (technology_subset.non_english==False)]
expressive_vocab_dataset.head()
redcap_event_name academic_year hl male _race prim_lang sib \ 1 initial_assessment_arm_1 2005-2006 0 1 0 0 3 10 initial_assessment_arm_1 2007-2008 0 0 0 0 3 185 initial_assessment_arm_1 2014-2015 0 0 2 0 2 208 initial_assessment_arm_1 2014-2015 0 0 0 0 1 228 initial_assessment_arm_1 2012-2013 0 0 0 0 2 _mother_ed father_ed premature_age ... academic_year_start \ 1 2 2 8 ... 2005 10 4 4 8 ... 2007 185 6 6 8 ... 2014 208 5 6 7 ... 2014 228 6 6 8 ... 2012 deg_hl_below6 int_outside_option mother_hs ident_by_3 program_by_6 \ 1 0 1 0 False False 10 0 1 1 False False 185 1 1 NaN False False 208 1 1 1 False False 228 1 1 NaN False False jcih age_years unimodal_ci unimodal_ha 1 0 4.500000 False False 10 0 4.583333 False False 185 0 3.583333 False False 208 0 4.083333 False False 228 0 4.000000 False False [5 rows x 83 columns]
# Mean expressive-vocabulary score per student; reduce to one row per study_id.
mean_score = expressive_vocab_dataset.groupby('study_id')['score'].mean()
expressive_vocab_dataset = expressive_vocab_dataset.drop_duplicates('study_id').set_index('study_id')
expressive_vocab_dataset['mean_score'] = mean_score
# Count of missing values for each covariate.
expressive_vocab_dataset[covs].isnull().sum()
age_years 4 school 0 score 0 male 0 time 40 family_inv 153 sib 49 deg_hl_below6 8 mother_hs 312 synd_or_disab 124 jcih 200 dtype: int64
# Drop rows missing age, sex, or the outside-OPTION indicator; plot the score
# distribution; then fit the expressive-vocabulary model.
expressive_vocab_dataset = expressive_vocab_dataset[expressive_vocab_dataset.age.notnull()
                                                    & expressive_vocab_dataset.male.notnull()
                                                    & expressive_vocab_dataset.int_outside_option.notnull()]
expressive_vocab_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
M_expressive_vocab = MCMC(generate_model(expressive_vocab_dataset), db='sqlite', name='expressive_vocab', dbmode='w')
M_expressive_vocab.sample(iterations, burn)
[-----------------100%-----------------] 100000 of 100000 complete in 1128.2 sec
# Forest plots and numeric summary of expressive-vocabulary fixed effects.
Matplot.summary_plot([M_expressive_vocab.beta],
                     custom_labels=labels, rhat=False, main='Expressive Vocabulary', x_range=(-15,15))
Matplot.summary_plot([M_expressive_vocab.beta],
                     custom_labels=labels, rhat=False, main='Expressive Vocabulary', x_range=(-10,10))
M_expressive_vocab.beta.summary()
beta: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 1.343 1.236 0.088 [-0.896 3.726] 4.361 1.337 0.099 [ 1.641 6.673] 6.004 2.078 0.182 [ 2.042 10.622] -5.315 0.611 0.039 [-6.396 -3.969] -1.217 0.677 0.039 [-2.495 0.116] 5.996 1.353 0.104 [ 3.464 8.742] 5.991 1.717 0.15 [ 2.596 9.108] -5.925 1.275 0.093 [-8.425 -3.711] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| -1.027 0.481 1.334 2.205 3.651 1.677 3.446 4.394 5.334 6.802 1.964 4.643 5.97 7.341 10.556 -6.473 -5.72 -5.344 -4.942 -4.004 -2.54 -1.672 -1.213 -0.765 0.083 3.464 5.052 5.995 6.892 8.742 2.632 4.831 6.057 7.19 9.215 -8.301 -6.813 -5.973 -4.975 -3.486
# School random effects for expressive language; blank labels hide school IDs.
# NOTE(review): the hard-coded 42 must match the number of schools — verify.
Matplot.summary_plot(M_explang.alpha_school, custom_labels=[' ']*42, main='Expressive language school effects',
                     x_range=(-25, 25))
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
# School random effects for expressive vocabulary; blank labels hide school IDs.
# NOTE(review): the hard-coded 42 must match the number of schools — verify.
Matplot.summary_plot(M_expressive_vocab.alpha_school, custom_labels=[' ']*42, main='Expressive vocabulary school effects',
                     x_range=(-25, 25))
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
# Receptive-vocabulary analysis subset: matching domain, English-only records.
receptive_vocab_dataset = technology_subset[(technology_subset.domain=='Receptive Vocabulary') &
                                            (technology_subset.non_english==False)]
receptive_vocab_dataset.head()
redcap_event_name academic_year hl male _race prim_lang sib \ 11 initial_assessment_arm_1 2007-2008 0 0 0 0 3 186 initial_assessment_arm_1 2014-2015 0 0 2 0 2 209 initial_assessment_arm_1 2014-2015 0 0 0 0 1 229 initial_assessment_arm_1 2012-2013 0 0 0 0 2 288 initial_assessment_arm_1 2014-2015 0 0 0 0 1 _mother_ed father_ed premature_age ... academic_year_start \ 11 4 4 8 ... 2007 186 6 6 8 ... 2014 209 5 6 7 ... 2014 229 6 6 8 ... 2012 288 6 6 8 ... 2014 deg_hl_below6 int_outside_option mother_hs ident_by_3 program_by_6 \ 11 0 1 1 False False 186 1 1 NaN False False 209 1 1 1 False False 229 1 1 NaN False False 288 0 1 NaN True False jcih age_years unimodal_ci unimodal_ha 11 0 4.583333 False False 186 0 3.583333 False False 209 0 4.083333 False False 229 0 4.000000 False False 288 0 3.916667 False False [5 rows x 83 columns]
# Mean receptive-vocabulary score per student; reduce to one row per study_id.
mean_score = receptive_vocab_dataset.groupby('study_id')['score'].mean()
receptive_vocab_dataset = receptive_vocab_dataset.drop_duplicates('study_id').set_index('study_id')
receptive_vocab_dataset['mean_score'] = mean_score
# Count of missing values for each covariate.
receptive_vocab_dataset[covs].isnull().sum()
age_years 4 school 0 score 0 male 0 time 37 family_inv 158 sib 49 deg_hl_below6 10 mother_hs 318 synd_or_disab 125 jcih 202 dtype: int64
# Drop rows missing age, sex, or the outside-OPTION indicator, then inspect
# the distribution of age at first intervention.
receptive_vocab_dataset = receptive_vocab_dataset[receptive_vocab_dataset.age.notnull() &
                                                  receptive_vocab_dataset.male.notnull() &
                                                  receptive_vocab_dataset.int_outside_option.notnull()]
receptive_vocab_dataset.age_int.hist(bins=40)
<matplotlib.axes._subplots.AxesSubplot at 0x10bc0dc18>
receptive_vocab_dataset.shape
(922, 83)
# Score distribution, then fit the receptive-vocabulary model.
receptive_vocab_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
M_receptive_vocab = MCMC(generate_model(receptive_vocab_dataset), db='sqlite', name='receptive_vocab', dbmode='w')
M_receptive_vocab.sample(iterations, burn)
[-----------------100%-----------------] 100000 of 100000 complete in 1251.2 sec
# Forest plots and numeric summary of receptive-vocabulary fixed effects.
#order = np.argsort(M.beta.stats()['mean'])[::-1]
Matplot.summary_plot([M_receptive_vocab.beta], rhat=False,
                     custom_labels=labels, main='Receptive Vocabulary', x_range=(-15,15))
#order = np.argsort(M.beta.stats()['mean'])[::-1]
Matplot.summary_plot([M_receptive_vocab.beta], rhat=False,
                     custom_labels=labels, main='Receptive Vocabulary', x_range=(-10,10))
M_receptive_vocab.beta.summary()
beta: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.309 1.085 0.076 [-1.857 2.387] 3.26 1.406 0.113 [ 0.746 6.212] 6.076 2.06 0.181 [ 2.363 10.008] -4.999 0.563 0.035 [-6.112 -3.92 ] -1.181 0.578 0.031 [-2.359 -0.128] 5.245 1.139 0.083 [ 2.797 7.29 ] 4.356 1.361 0.108 [ 1.676 7.031] -6.92 1.236 0.088 [-9.163 -4.642] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| -1.809 -0.41 0.28 1.029 2.451 0.402 2.339 3.268 4.256 5.909 2.378 4.543 5.985 7.562 10.045 -6.064 -5.394 -5.019 -4.635 -3.857 -2.344 -1.57 -1.165 -0.815 -0.078 2.974 4.471 5.269 6.044 7.483 1.475 3.504 4.359 5.265 6.956 -9.03 -7.824 -7.002 -6.073 -4.409
import sqlite3
from glob import glob

# Trace databases written by the MCMC runs above (one per outcome model).
# The original used the IPython shell magic `!ls *sqlite`, which is not valid
# plain Python; sorted(glob(...)) yields the same ordered file list.
db_files = sorted(glob('*sqlite'))
# Pull the sigma_school trace from the articulation run (trace==0 selects
# the first — and only — chain).
articulation = sqlite3.connect("articulation.sqlite")
c_artic = articulation.cursor()
c_artic.execute('SELECT v1 FROM sigma_school WHERE trace==0')
<sqlite3.Cursor at 0x1151881f0>
# Flatten the fetched rows to a 1-D array of posterior samples, then report
# the posterior mean of the articulation school-effect SD.
sigma_school_artic = np.ravel(c_artic.fetchall())
sigma_school_artic.mean()
5.3893053284999999
# Posterior median and 95% interval of sigma_school for every model's trace DB.
for dbname in db_files:
    db = sqlite3.connect(dbname)
    try:
        cur = db.cursor()
        # First (only) chain's samples of the school-effect SD
        cur.execute('SELECT v1 FROM sigma_school WHERE trace==0')
        trace = np.ravel(cur.fetchall())
    finally:
        # Close each connection (the original leaked one handle per database).
        db.close()
    trace.sort()
    print('\n'+dbname)
    print(np.median(trace))
    # Empirical 2.5% / 97.5% quantiles via sorted-index lookup
    print(trace[[int(len(trace)*0.025), int(len(trace)*0.975)]])
articulation.sqlite 5.468853 [ 2.593277 8.103165] explang.sqlite 5.3118985 [ 3.703665 7.366459] expressive_vocab.sqlite 8.231873 [ 6.286766 11.080385] receptive_vocab.sqlite 6.559168 [ 4.745076 9.006935] reclang.sqlite 5.470786 [ 3.816387 7.789314]