#!/usr/bin/env python # coding: utf-8 # # Pediatrics Paper Analysis # > All studies with successful outcomes reported for early-identified children who are deaf or hard of hearing have intervention provided by specialists who are trained in parent-infant intervention services.[12,90,97] Early intervention programs should develop mechanisms to ensure that early intervention professionals have special skills necessary for providing families with the highest quality of service specific to children with hearing loss. Professionals with a background in deaf education, audiology, and speech-language pathology will typically have the skills needed for providing intervention services. Professionals should be highly qualified in their respective fields and should be skilled communicators who are knowledgeable and sensitive to the importance of enhancing families' strengths and supporting their priorities. When early intervention professionals have knowledge of the principles of adult learning, it increases their success with parents and other professionals. # # TJC must be competent to provide services. # # **References**: # 1. Yoshinaga-Itano C, Sedey AL, Coulter DK, Mehl AL. Language of early- and later-identified children with hearing loss. Pediatrics.1998;102 :1161โ€“ 1171 # 2. Moeller MP. Early intervention and language development in children who are deaf and hard of hearing. Pediatrics.2000;106(3) :e43 . Available at: www.pediatrics.org/cgi/content/full/106/3/e43 # 3. Calderon R. Parental involvement in deaf children's education programs as a predictor of child's language, early reading, and social-emotional development. J Deaf Stud Deaf Educ.2000;5 :140โ€“ 155 # # # Purpose of the study is to examine the relationship between highly qualified providers, age of enrollment in intervention, and speech and language outcomes at 4 years of age in a group of children enrolled in school programs that specialize in the development of listening and spoken language. # # - Does age of enrollment in intervention impact articulation, vocabulary, and language scores? (age_int) # - Does place of enrollment in intervention impact articulation, vocabulary, and language scores? (age and age_int) # - Is there an interaction effect between the place and age? # - What other factors contribute to the variance? # - Parental involvement # - Degree of hearing loss # - Motherโ€™s education level (mother_ed) # - Prematurity (premature_weeks) # - Gender (gender) # - Language (prim_lang) # - Number of children in the home (sib) # - Age when first amplified (age_amp) # - Additional disability (secondary_diagnosis, known_synd) # In[1]: # Import modules and set options get_ipython().run_line_magic('matplotlib', 'inline') import matplotlib.pyplot as plt import pandas as pd import numpy as np pd.set_option('display.notebook_repr_html', False) pd.set_option('display.max_columns', 20) pd.set_option('display.max_rows', 100) # Import data # In[2]: lsl_dr = pd.read_csv('lsl_dr.csv', index_col=0) # In[3]: lsl_dr.head() # Indicator for non-profound hearing loss # In[4]: lsl_dr['deg_hl_below6'] = lsl_dr.degree_hl<6 lsl_dr.loc[lsl_dr.degree_hl.isnull(), 'deg_hl_below6'] = np.nan # Indicator for first intervention outside OPTION # In[5]: lsl_dr['int_outside_option'] = lsl_dr.age > lsl_dr.age_int lsl_dr.loc[lsl_dr.age < lsl_dr.age_int, 'int_outside_option'] = np.nan # Indicator for high school graduation of mother # In[6]: lsl_dr['mother_hs'] = lsl_dr.mother_ed > 1 lsl_dr.loc[lsl_dr.mother_ed.isnull(), 'mother_hs'] = None # Joint commission benchmark # In[7]: lsl_dr.onset_1.unique() # In[8]: lsl_dr['ident_by_3'] = ident_by_3 = lsl_dr.onset_1 <= 3 lsl_dr['program_by_6'] = program_by_6 = lsl_dr.age <= 6 lsl_dr['jcih'] = ident_by_3 & program_by_6 lsl_dr.loc[lsl_dr.onset_1.isnull() | lsl_dr.age.isnull(), 'jcih'] = None # Create age in years variable # In[9]: lsl_dr['age_years'] = lsl_dr.age/12. # Retain only 4-year-olds # In[10]: lsl_dr = lsl_dr[(lsl_dr.age_test>=48) & (lsl_dr.age_test<60)] # In[11]: lsl_dr.shape # In[12]: lsl_dr.study_id.unique().shape # ## Summary Statistics for Paper # In[13]: unique_students = lsl_dr.drop_duplicates(subset='study_id') # In[14]: unique_students.shape # In[15]: unique_students.age.describe() # In[16]: unique_students.male.mean() # In[17]: unique_students.sib.describe() # In[18]: (unique_students.degree_hl == 6).describe() # In[19]: ((unique_students.degree_hl < 6) & (unique_students.degree_hl > 1)).describe() # In[20]: unique_students.mother_hs.describe() # In[21]: unique_students.synd_or_disab.describe() # In[22]: grouped_age = unique_students.replace({'jcih':{1:'JCIH', 0:'non-JCIH'}}).groupby('jcih')['age_years'] fig, ax = plt.subplots() for k, v in grouped_age: v.hist(label=k, alpha=.75, ax=ax, grid=False) ax.legend() plt.xlabel('Age (years)'); plt.ylabel('Frequency'); # ## Receptive Language Test Score Model # In[23]: # Filter domain and English-only, and appropriate bilateral subset receptive_language_dataset = lsl_dr[(lsl_dr.domain=='Language') & (lsl_dr.test_type=='receptive') & (lsl_dr.non_english==False)] receptive_language_dataset.head() # This is the size of the resulting dataset to be used in this analysis # In[24]: receptive_language_dataset.shape # In[25]: mean_score = receptive_language_dataset.groupby('study_id')['score'].mean() # In[26]: receptive_language_dataset = receptive_language_dataset.drop_duplicates('study_id') # In[27]: receptive_language_dataset = receptive_language_dataset.set_index('study_id') # In[28]: receptive_language_dataset['mean_score'] = mean_score # Specify criterion to use: `jcih`, `ident_by_3`, `program_by_6` # In[29]: criterion_labels = {'jcih': 'JCIH', 'ident_by_3': 'Diagnosed by 3 months', 'program_by_6': 'Intervention by 6 months'} criterion = 'jcih' # Proportion of missing values for each variable # In[30]: covs = ['age_years', 'school', 'score', 'male', 'time', 'family_inv', 'sib', 'deg_hl_below6', 'mother_hs', 'synd_or_disab'] + [criterion] (receptive_language_dataset[covs].isnull().sum() / receptive_language_dataset.shape[0]).round(2) # Drop individuals with missing `age`, since there is <1% of them # In[31]: receptive_language_dataset = receptive_language_dataset[receptive_language_dataset.age.notnull()] # In[32]: receptive_language_dataset.score.hist(grid=False, bins=25, figsize=(10, 6)) plt.xlabel('Standard Scores'); plt.ylabel('Frequency'); # In[33]: receptive_language_dataset.mother_ed.hist() # Final analysis dataset size # In[34]: receptive_language_dataset.shape # In[35]: missing_pct = (receptive_language_dataset[covs].isnull().mean()*100).round(1) missing_pct.sort(ascending=False) missing_pct # In[36]: receptive_language_dataset['age_amp_missing'] = receptive_language_dataset.age_amp.isnull() by_age_amp_missing = receptive_language_dataset.groupby('age_amp_missing') # In[37]: by_age_amp_missing['male', 'synd_or_disab', 'deg_hl_below6', 'mother_hs', 'non_english'].mean() # In[38]: for col in ['age', 'score', 'degree_hl', 'family_inv', 'age_int']: try: _ = by_age_amp_missing.boxplot(column=col) except: pass # In[39]: receptive_language_dataset['jcih_missing'] = receptive_language_dataset.jcih.isnull() by_jcih_missing = receptive_language_dataset.groupby('jcih_missing') # In[40]: by_jcih_missing['male', 'synd_or_disab', 'deg_hl_below6', 'mother_hs', 'non_english'].mean() # In[41]: for col in ['age', 'score', 'degree_hl', 'family_inv', 'age_int']: _ = by_jcih_missing.boxplot(column=col) # In[43]: receptive_language_dataset.age_years.mean() # In[57]: from pymc import Bernoulli, Normal, Uniform, invlogit, logit, deterministic, MCMC, Matplot, MAP from pymc import Lambda, Dirichlet, Categorical, Beta, NegativeBinomial, Exponential, T from numpy.ma import masked_values n_covs = 9 def generate_model(dataset): # Extract data to local variables age_years = dataset.age_years.values school = dataset.school.values score = dataset.score.values non_english = dataset.non_english.astype(int).values male = dataset.male.values sib = dataset.sib.values synd_or_disab = dataset.synd_or_disab.values int_outside_option = dataset.int_outside_option.values synd_or_disab = dataset.synd_or_disab.values study_id = dataset.index.values ident_by_3 = dataset.ident_by_3 program_by_6 = dataset.program_by_6 # Transform some data age_std = age_years - age_years.mean() # Imputation of family involvement n_family_inv_cats = len(dataset.family_inv.unique()) p_family_inv = Dirichlet("p_family_inv", np.ones(n_family_inv_cats), value=[1./n_family_inv_cats]*(n_family_inv_cats-1)) family_inv_masked = masked_values(dataset.family_inv.fillna(0.5), value=0.5) x_family_inv = Categorical('x_family_inv', p_family_inv, value=family_inv_masked, observed=True) # Imputation of hearing loss # p_crit = Beta("p_%s" % criterion, 1, 1, value=0.5) # crit_masked = masked_values(dataset[criterion].values, value=np.nan) # x_crit = Bernoulli('x_%s' % criterion, p_crit, value=crit_masked, observed=True) # Imputation of hearing loss p_hl = Beta("p_hl", 1, 1, value=0.5) hl_masked = masked_values(dataset.deg_hl_below6.values, value=np.nan) x_hl = Bernoulli('x_hl', p_hl, value=hl_masked, observed=True) # Imputation of maternal education p_mother_hs = Beta("p_mother_hs", 1, 1, value=0.5) mother_hs_masked = masked_values(dataset.mother_hs.values, value=np.nan) x_mother_hs = Bernoulli('x_mother_hs', p_mother_hs, value=mother_hs_masked, observed=True) # Imputation of previous disability p_synd_or_disab = Beta("p_synd_or_disab", 1, 1, value=0.5) synd_or_disab_masked = masked_values(dataset.synd_or_disab.values, value=np.nan) x_synd_or_disab = Bernoulli('x_synd_or_disab', p_synd_or_disab, value=synd_or_disab_masked, observed=True) # Imputation of siblings n_sib_cats = len(dataset.sib.unique()) p_sib = Dirichlet("p_sib", np.ones(n_sib_cats), value=[1./n_sib_cats]*(n_sib_cats-1)) sib_masked = masked_values(dataset.sib.fillna(0.5), value=0.5) x_sib = Categorical('x_sib', p_sib, value=sib_masked, observed=True) # Imputation of time # n_time_cats = len(dataset.time.unique()) # p_time = Dirichlet("p_time", np.ones(n_time_cats), value=[1./n_time_cats]*(n_time_cats-1)) # time_masked = masked_values(dataset.time.fillna(0.5), value=0.5) # x_time = Categorical('x_time', p_time, value=time_masked, observed=True) mean_score = Normal("mean_score", 100, 0.001, value=80) # Indices to school random effects unique_schools = np.unique(school) school_index = [list(unique_schools).index(s) for s in school] # School random effect sigma_school = Uniform("sigma_school", 0, 1000, value=10) tau_school = sigma_school**-2 alpha_school = Normal("alpha_school", mu=0, tau=tau_school, value=np.zeros(len(unique_schools))) # Indices to child random effects unique_child = np.unique(study_id) child_index = [list(unique_child).index(s) for s in study_id] # Robust individual random effect nu_child = Exponential("nu_child", 0.1, value=10) alpha_child = T("alpha_child", nu=nu_child, value=np.zeros(len(unique_child))) @deterministic def intercept(a0=mean_score, a1=alpha_school): """Random intercepts""" return a0 + a1[school_index] # Fixed effects beta = Normal("beta", 0, 0.001, value=[0]*n_covs) @deterministic def theta(b0=intercept, b=beta, x_family_inv=x_family_inv, x_sib=x_sib, x_hl=x_hl, x_mother_hs=x_mother_hs, x_synd_or_disab=x_synd_or_disab, a=alpha_child): # Complete predictors X = [age_std, male, ident_by_3, program_by_6] # Partially-imputed predictors X += [x_family_inv, x_sib, x_hl, x_mother_hs, x_synd_or_disab] return b0 + np.dot(b, X) + a[child_index] sigma = Uniform("sigma", 0, 1000, value=10) tau = sigma**-2 # Data likelihood score_like = Normal("score_like", mu=theta, tau=tau, value=score, observed=True) #score_like_pred = Normal("score_like_pred", mu=theta, tau=tau, value=score) # Predictions: # Girl, average age, ident by 3 prog by 6, normal family involvement, # 2 siblings, non-profound hearing loss, # mother with HS, no previous disability, JCIH x1 = [0, 0, 1, 1, 0, 2, 1, 1, 0] # Boy, 6 months younger than average, ident by 3 prog by 6, normal family involvement, # ident by 3 prog by 6, no siblings, profound hearing loss, # mother without HS diploma, previous disability x2 = [-0.5, 1, 1, 1, 0, 0, 0, 0, 1] # Boy, 1.5 years older than average, not ident by 3 or prog by 6, poor family involvement, # 1 sibling, non-profound hearing loss, # mother without HS diploma, previous disability x3 = [1.5, 1, 0, 0, 2, 1, 1, 0, 1] # Girl, 6 months older than average, ident by 3 prog by 6, impaired family involvement, # 3 siblings, profound hearing loss, # mother with HS diploma, no previous disability x4 = [0.5, 1, 1, 1, 1, 3, 0, 1, 0] X = np.c_[x1, x2, x3, x4] theta_pred = Lambda('theta_pred', lambda b=beta, mu=mean_score: mu + np.dot(X.T, b)) predictions = Normal('predictions', mu=theta_pred, tau=tau) return locals() # In[58]: M_reclang = MCMC(generate_model(receptive_language_dataset), db='sqlite', name='reclang', dbmode='w') # In[59]: iterations = 100000 burn = 90000 # In[60]: M_reclang.sample(iterations, burn) # In[65]: labels = ['Age of Enrollment', 'Male', 'Identified by 3 mo.', 'In Program by 6 mo.', 'Reduced Family Involvement', 'Sibling Count', 'Non-profound Hearing Loss', 'Mother with HS Diploma','Additional Disability', ] # criterion_labels[criterion]] # In[66]: Matplot.summary_plot([M_reclang.beta], rhat=False, custom_labels=labels, x_range=(-10,10), main='Receptive Language') # In[49]: Matplot.summary_plot([M_reclang.beta], rhat=False, custom_labels=labels, x_range=(-10,10), main='Receptive Language') # In[63]: M_reclang.beta.summary() # In[64]: Matplot.summary_plot([M_reclang.predictions], rhat=False, custom_labels=['Student {}'.format(i) for i in range(1,5)]) # ## Expressive Language Model # In[67]: expressive_language_dataset = lsl_dr[(lsl_dr.domain=='Language') & (lsl_dr.test_type=='expressive') & (lsl_dr.non_english==False)] expressive_language_dataset.head() # In[68]: mean_score = expressive_language_dataset.groupby('study_id')['score'].mean() expressive_language_dataset = expressive_language_dataset.drop_duplicates('study_id').set_index('study_id') expressive_language_dataset['mean_score'] = mean_score # In[69]: (expressive_language_dataset[covs].isnull().sum() / expressive_language_dataset.shape[0]).round(2) # In[70]: expressive_language_dataset = expressive_language_dataset[expressive_language_dataset.age.notnull() & expressive_language_dataset.int_outside_option.notnull()] # In[71]: expressive_language_dataset.score.hist(grid=False, bins=25, figsize=(10, 6)) plt.xlabel('Standard Scores'); plt.ylabel('Frequency'); # In[72]: M_explang = MCMC(generate_model(expressive_language_dataset), db='sqlite', name='explang', dbmode='w') # In[73]: M_explang.sample(iterations, burn) # In[74]: M_explang.beta.summary() # In[75]: Matplot.summary_plot([M_explang.beta], custom_labels=labels, rhat=False, main='Expressive Language', x_range=(-10,10)) # ### Making predictions # # Though these models were constructed primarily for inference, that is, to provide insights on the relative effects of a suite of factors on expected test scores, there is the potential for using them to predict speech-language outcomes. To explore this, we contrived four predictive scenarios, whereby children with different values of the set of predictor variables were specified, and predictions for them were generated by our models. # # The Bayesian hierarchical modeling approach makes generating predictions straightforward, via the posterior predictive distribution, which generates predicted values of outcomes conditional on the outcomes observed in the dataset. The variation of these predictions includes both the residual uncertainty in the model parameters and the aleatory sampling variability of the test outcomes. Our test cases were specified as follows: # # **Case 1** ๐Ÿ‘ง # # * female # * average age # * normal family involvement # * 2 siblings # * non-profound hearing loss # * mother with HS # * no previous disability # * JCIH criterion met # # **Case 2** ๐Ÿ‘ฆ # # * male # * 6 months younger than average # * normal family involvement # * no siblings # * profound hearing loss # * mother without HS diploma # * previous disability # * JCIH criterion met # # **Case 3** ๐Ÿ‘ถ # # * male # * 1.5 years older than average # * poor family involvement # * 1 sibling # * non-profound hearing loss # * mother without HS diploma # * previous disability # * no JCIH # # **Case 4** ๐Ÿ™Ž # # * female # * 6 months older than average # * impaired family involvement # * 3 siblings # * profound hearing loss # * mother with HS diploma # * no previous disability # * JCIH criterion met # In[76]: M_reclang.predictions.summary() # In[77]: Matplot.summary_plot([M_reclang.predictions], rhat=False, custom_labels=['Student {}'.format(i) for i in range(1,5)]) # In[78]: M_explang.predictions.summary() # ## Articulation Model # In[80]: articulation_dataset = lsl_dr[(lsl_dr.domain=='Articulation') & (lsl_dr.non_english==False)] articulation_dataset.head() # In[81]: mean_score = articulation_dataset.groupby('study_id')['score'].mean() articulation_dataset = articulation_dataset.drop_duplicates('study_id').set_index('study_id') articulation_dataset['mean_score'] = mean_score # In[82]: (articulation_dataset[covs].isnull().sum() / articulation_dataset.shape[0]).round(2) # In[83]: articulation_dataset = articulation_dataset[articulation_dataset.age.notnull() & articulation_dataset.int_outside_option.notnull()] # In[84]: articulation_dataset.score.hist(grid=False, bins=25, figsize=(10, 6)) plt.xlabel('Standard Scores'); plt.ylabel('Frequency'); # In[85]: M_articulation = MCMC(generate_model(articulation_dataset), db='sqlite', name='articulation', dbmode='w') # In[86]: M_articulation.sample(iterations, burn) # In[87]: Matplot.summary_plot([M_articulation.beta], custom_labels=labels, rhat=False, main='Articulation', x_range=(-10,10)) # In[88]: Matplot.summary_plot([M_articulation.predictions], rhat=False, custom_labels=['Student {}'.format(i) for i in range(1,5)]) # In[89]: M_articulation.beta.summary() # ## Expressive Vocabulary Model # In[90]: expressive_vocab_dataset = lsl_dr[(lsl_dr.domain=='Expressive Vocabulary') & (lsl_dr.non_english==False)] expressive_vocab_dataset.head() # In[91]: mean_score = expressive_vocab_dataset.groupby('study_id')['score'].mean() expressive_vocab_dataset = expressive_vocab_dataset.drop_duplicates('study_id').set_index('study_id') expressive_vocab_dataset['mean_score'] = mean_score # In[92]: expressive_vocab_dataset[covs].isnull().sum() # In[93]: expressive_vocab_dataset = expressive_vocab_dataset[expressive_vocab_dataset.age.notnull() & expressive_vocab_dataset.male.notnull() & expressive_vocab_dataset.int_outside_option.notnull()] # In[94]: expressive_vocab_dataset.score.hist(grid=False, bins=25, figsize=(10, 6)) plt.xlabel('Standard Scores'); plt.ylabel('Frequency'); # In[95]: M_expressive_vocab = MCMC(generate_model(expressive_vocab_dataset), db='sqlite', name='expressive_vocab', dbmode='w') # In[96]: M_expressive_vocab.sample(iterations, burn) # In[97]: Matplot.summary_plot([M_expressive_vocab.beta], custom_labels=labels, rhat=False, main='Expressive Vocabulary', x_range=(-10,10)) # In[98]: M_expressive_vocab.beta.summary() # In[99]: Matplot.summary_plot(M_explang.alpha_school, custom_labels=[' ']*42, main='Expressive language school effects', x_range=(-25, 25)) # In[100]: Matplot.summary_plot(M_expressive_vocab.alpha_school, custom_labels=[' ']*42, main='Expressive vocabulary school effects', x_range=(-25, 25)) # ## Receptive Vocabulary Model # In[101]: receptive_vocab_dataset = lsl_dr[(lsl_dr.domain=='Receptive Vocabulary') & (lsl_dr.non_english==False)] receptive_vocab_dataset.head() # In[102]: mean_score = receptive_vocab_dataset.groupby('study_id')['score'].mean() receptive_vocab_dataset = receptive_vocab_dataset.drop_duplicates('study_id').set_index('study_id') receptive_vocab_dataset['mean_score'] = mean_score # In[103]: receptive_vocab_dataset[covs].isnull().sum() # In[104]: receptive_vocab_dataset = receptive_vocab_dataset[receptive_vocab_dataset.age.notnull() & receptive_vocab_dataset.male.notnull() & receptive_vocab_dataset.int_outside_option.notnull()] # In[105]: receptive_vocab_dataset.age_int.hist(bins=40) # In[106]: receptive_vocab_dataset.shape # In[107]: receptive_vocab_dataset.score.hist(grid=False, bins=25, figsize=(10, 6)) plt.xlabel('Standard Scores'); plt.ylabel('Frequency'); # In[108]: M_receptive_vocab = MCMC(generate_model(receptive_vocab_dataset), db='sqlite', name='receptive_vocab', dbmode='w') # In[109]: M_receptive_vocab.sample(iterations, burn) # In[110]: #order = np.argsort(M.beta.stats()['mean'])[::-1] Matplot.summary_plot([M_receptive_vocab.beta], rhat=False, custom_labels=labels, main='Receptive Vocabulary', x_range=(-10,10)) # In[111]: M_receptive_vocab.beta.summary() # ## Database queries # In[97]: import sqlite3 # In[98]: db_files = get_ipython().getoutput('ls *sqlite') # In[99]: articulation = sqlite3.connect("articulation.sqlite") # In[100]: c_artic = articulation.cursor() # In[101]: c_artic.execute('SELECT v1 FROM sigma_school WHERE trace==0') # In[102]: sigma_school_artic = np.ravel(c_artic.fetchall()) # In[103]: sigma_school_artic.mean() # In[104]: for dbname in db_files: db = sqlite3.connect(dbname) cur = db.cursor() cur.execute('SELECT v1 FROM sigma_school WHERE trace==0') trace = np.ravel(cur.fetchall()) trace.sort() print('\n'+dbname) print(np.median(trace)) print(trace[[int(len(trace)*0.025), int(len(trace)*0.975)]])