#!/usr/bin/env python
# coding: utf-8

# # Technology Effects Analysis

# In[1]:

# Import modules and set options
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 100)

# Import data

# In[2]:

lsl_dr = pd.read_csv('lsl_dr.csv', index_col=0)

# In[3]:

lsl_dr.head()

# Indicator for non-profound hearing loss

# In[4]:

lsl_dr['deg_hl_below6'] = lsl_dr.degree_hl < 6
lsl_dr.loc[lsl_dr.degree_hl.isnull(), 'deg_hl_below6'] = np.nan

# Indicator for first intervention outside OPTION

# In[5]:

lsl_dr['int_outside_option'] = lsl_dr.age > lsl_dr.age_int
lsl_dr.loc[lsl_dr.age < lsl_dr.age_int, 'int_outside_option'] = np.nan

# Indicator for high school graduation of mother

# In[6]:

lsl_dr['mother_hs'] = lsl_dr.mother_ed > 1
lsl_dr.loc[lsl_dr.mother_ed.isnull(), 'mother_hs'] = np.nan

# Joint Committee on Infant Hearing (JCIH) benchmark: hearing loss identified by
# 3 months of age and enrollment in an intervention program by 6 months

# In[7]:

lsl_dr.onset_1.unique()

# In[8]:

lsl_dr['ident_by_3'] = ident_by_3 = lsl_dr.onset_1 <= 3
lsl_dr['program_by_6'] = program_by_6 = lsl_dr.age <= 6
lsl_dr['jcih'] = ident_by_3 & program_by_6
lsl_dr.loc[lsl_dr.onset_1.isnull() | lsl_dr.age.isnull(), 'jcih'] = np.nan

# Create age in years variable

# In[9]:

lsl_dr['age_years'] = lsl_dr.age / 12.

# Retain only 4-year-olds

# In[10]:

lsl_dr = lsl_dr[(lsl_dr.age_test >= 48) & (lsl_dr.age_test < 60)]

# In[11]:

lsl_dr.shape

# In[12]:

lsl_dr.study_id.unique().shape

# ## Summary Statistics for Paper

# In[13]:

unique_students = lsl_dr.drop_duplicates(subset='study_id')

# In[14]:

unique_students.shape

# In[15]:

unique_students.age.describe()

# In[16]:

unique_students.male.mean()

# In[17]:

unique_students.male.value_counts()

# In[18]:

unique_students.sib.describe()

# In[19]:

(unique_students.degree_hl == 6).describe()

# In[20]:

unique_students.deg_hl_below6.isnull().sum()

# In[21]:

unique_students.deg_hl_below6.value_counts()

# In[22]:

((unique_students.degree_hl < 6) & (unique_students.degree_hl > 1)).describe()

# In[23]:

unique_students.mother_hs.describe()

# In[24]:

unique_students.synd_or_disab.describe()

# In[25]:

unique_students.groupby('male').program_by_6.value_counts()

# In[26]:

unique_students.groupby('sib').program_by_6.value_counts()

# In[27]:

unique_students.groupby('deg_hl_below6').program_by_6.value_counts()

# In[28]:

unique_students.groupby('mother_hs').program_by_6.value_counts()

# In[29]:

unique_students.groupby('synd_or_disab').program_by_6.value_counts()

# In[30]:

unique_students.groupby('ident_by_3').program_by_6.value_counts()

# In[31]:

unique_students.groupby('jcih').program_by_6.value_counts()
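# The pairwise tabulations above can also be produced in one pass. This is a
# convenience sketch, not part of the original analysis; it uses only standard
# pandas (`pd.crosstab`), which drops missing values just as `value_counts` does.

# In[ ]:

for var in ['male', 'sib', 'deg_hl_below6', 'mother_hs', 'synd_or_disab',
            'ident_by_3', 'jcih']:
    # Rows: grouping variable; columns: met the 6-month enrollment benchmark
    print(pd.crosstab(unique_students[var], unique_students.program_by_6))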
# In[32]:

grouped_age = (unique_students.replace({'jcih': {1: 'JCIH', 0: 'non-JCIH'}})
                              .groupby('jcih')['age_years'])
fig, ax = plt.subplots()
for k, v in grouped_age:
    v.hist(label=k, alpha=.75, ax=ax, grid=False)
ax.legend()
plt.xlabel('Age (years)'); plt.ylabel('Frequency');

# In[33]:

students_data = lsl_dr.copy()

# In[34]:

students_data['school_id'] = students_data.school.replace(
    {int(s): i for i, s in enumerate(students_data.school.unique())})

# In[35]:

students_data = students_data[students_data.domain == 'Language']

# In[36]:

students_data.shape

# In[37]:

# `take_last` was removed from pandas; `keep='last'` is the current equivalent
option_scores = students_data.drop_duplicates('study_id', keep='last')[
    ['score', 'school_id', 'deg_hl_below6', 'mother_hs']].dropna()

# In[38]:

option_scores = option_scores.astype(int)

# In[39]:

option_scores.to_csv('option_scores.csv', index=False)

# Create technology indicators

# In[40]:

unimodal = ~lsl_dr[[u'bilateral_ci', u'bilateral_ha', u'bimodal']].sum(1).astype(bool)

# In[41]:

lsl_dr['unimodal_ci'] = unimodal & lsl_dr.cochlear
lsl_dr['unimodal_ha'] = unimodal & lsl_dr.hearing_aid

# In[42]:

lsl_dr[[u'bilateral_ci', u'bilateral_ha', u'bimodal', 'unimodal_ci', 'unimodal_ha']].sum(0)

# Since bilateral hearing aid is the most prevalent category, it is used as the
# (omitted) baseline category, so the technology coefficients in the models below
# are interpreted relative to bilateral hearing aid users.

# In[43]:

tech_index = lsl_dr[[u'bilateral_ci', u'bilateral_ha', u'bimodal', 'unimodal_ci',
                     'unimodal_ha']].sum(1).astype(bool)
technology_subset = lsl_dr[tech_index]

# In[44]:

technology_subset.score.max()

# In[45]:

((unique_students.cochlear > 0) & (unique_students.age_amp.notnull())).sum()

# ## Receptive Language Test Score Model

# In[46]:

# Filter for the Language domain, receptive tests, and English-only students
receptive_language_dataset = technology_subset[(technology_subset.domain == 'Language')
                                               & (technology_subset.test_type == 'receptive')
                                               & (technology_subset.non_english == False)]
receptive_language_dataset.head()

# This is the size of the resulting dataset to be used in this analysis

# In[47]:

receptive_language_dataset.shape

# In[48]:

mean_score = receptive_language_dataset.groupby('study_id')['score'].mean()

# In[49]:

receptive_language_dataset = receptive_language_dataset.drop_duplicates('study_id')

# In[50]:

receptive_language_dataset = receptive_language_dataset.set_index('study_id')

# In[51]:

receptive_language_dataset['mean_score'] = mean_score

# Specify criterion to use: `jcih`, `ident_by_3`, `program_by_6`

# In[52]:

criterion_labels = {'jcih': 'JCIH',
                    'ident_by_3': 'Diagnosed by 3 months',
                    'program_by_6': 'Intervention by 6 months'}
criterion = 'jcih'

# Proportion of missing values for each variable

# In[53]:

covs = ['age_years', 'school', 'score', 'male', 'time', 'family_inv', 'sib',
        'deg_hl_below6', 'mother_hs', 'synd_or_disab'] + [criterion]
(receptive_language_dataset[covs].isnull().sum() / receptive_language_dataset.shape[0]).round(2)

# Drop individuals with missing `age`, since they account for less than 1% of the sample

# In[54]:

receptive_language_dataset = receptive_language_dataset[receptive_language_dataset.age.notnull()]

# In[55]:

receptive_language_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');

# In[56]:

receptive_language_dataset.mother_ed.hist()

# Final analysis dataset size

# In[57]:

receptive_language_dataset.shape

# In[58]:

missing_pct = (receptive_language_dataset[covs].isnull().mean() * 100).round(1)
# Series.sort was removed from pandas; sort_values is the current equivalent
missing_pct = missing_pct.sort_values(ascending=False)
missing_pct

# In[59]:

receptive_language_dataset['age_amp_missing'] = receptive_language_dataset.age_amp.isnull()
by_age_amp_missing = receptive_language_dataset.groupby('age_amp_missing')

# In[60]:

by_age_amp_missing[['male', 'synd_or_disab', 'deg_hl_below6', 'mother_hs', 'non_english']].mean()

# In[61]:

receptive_language_dataset['jcih_missing'] = receptive_language_dataset.jcih.isnull()
by_jcih_missing = receptive_language_dataset.groupby('jcih_missing')

# In[62]:

by_jcih_missing[['male', 'synd_or_disab', 'deg_hl_below6', 'mother_hs', 'non_english']].mean()

# In[63]:

receptive_language_dataset.age_years.mean()
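# For reference, the model implemented below is a normal hierarchical linear model
# with school-level random intercepts (this summary is inferred from the code and
# stated here for readability):
#
# $$y_i \sim N(\theta_i, \sigma^2), \qquad
# \theta_i = \mu + \alpha_{s[i]} + X_i \beta, \qquad
# \alpha_j \sim N(0, \sigma_{school}^2)$$
#
# with weakly-informative priors $\mu \sim N(100, 1000)$, $\beta_k \sim N(0, 1000)$,
# and uniform priors on $\sigma$ and $\sigma_{school}$. Missing covariates
# (family involvement, maternal education, disability, siblings) are imputed with
# Bernoulli/categorical models inside the same graph.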
# In[66]:

from pymc import Bernoulli, Normal, Uniform, invlogit, logit, deterministic, MCMC, Matplot, MAP
from pymc import Lambda, Dirichlet, Categorical, Beta, NegativeBinomial, Exponential, T
from numpy.ma import masked_values, masked_invalid

n_covs = 11

def generate_model(dataset):

    # Extract data to local variables
    age_years = dataset.age_years.values
    school = dataset.school.values
    score = dataset.score.values
    non_english = dataset.non_english.astype(int).values
    male = dataset.male.values
    sib = dataset.sib.values
    synd_or_disab = dataset.synd_or_disab.values
    int_outside_option = dataset.int_outside_option.values
    study_id = dataset.index.values
    ident_by_3 = dataset.ident_by_3
    program_by_6 = dataset.program_by_6
    bilateral_ci = dataset.bilateral_ci
    bimodal = dataset.bimodal
    unimodal_ci = dataset.unimodal_ci
    unimodal_ha = dataset.unimodal_ha

    # Transform some data
    age_std = age_years - age_years.mean()

    # Imputation of family involvement: fill missing values with a sentinel,
    # then mask the sentinel so PyMC imputes those entries
    n_family_inv_cats = len(dataset.family_inv.unique())
    p_family_inv = Dirichlet("p_family_inv", np.ones(n_family_inv_cats),
                             value=[1. / n_family_inv_cats] * (n_family_inv_cats - 1))
    family_inv_masked = masked_values(dataset.family_inv.fillna(0.5), value=0.5)
    x_family_inv = Categorical('x_family_inv', p_family_inv, value=family_inv_masked,
                               observed=True)

    # Imputation of maternal education
    # (masked_values cannot match NaN, so use masked_invalid to mask missing entries)
    p_mother_hs = Beta("p_mother_hs", 1, 1, value=0.5)
    mother_hs_masked = masked_invalid(dataset.mother_hs.astype(float).values)
    x_mother_hs = Bernoulli('x_mother_hs', p_mother_hs, value=mother_hs_masked,
                            observed=True)

    # Imputation of previous disability
    p_synd_or_disab = Beta("p_synd_or_disab", 1, 1, value=0.5)
    synd_or_disab_masked = masked_invalid(dataset.synd_or_disab.astype(float).values)
    x_synd_or_disab = Bernoulli('x_synd_or_disab', p_synd_or_disab,
                                value=synd_or_disab_masked, observed=True)

    # Imputation of siblings
    n_sib_cats = len(dataset.sib.unique())
    p_sib = Dirichlet("p_sib", np.ones(n_sib_cats),
                      value=[1. / n_sib_cats] * (n_sib_cats - 1))
    sib_masked = masked_values(dataset.sib.fillna(0.5), value=0.5)
    x_sib = Categorical('x_sib', p_sib, value=sib_masked, observed=True)

    mean_score = Normal("mean_score", 100, 0.001, value=80)

    # Indices to school random effects
    unique_schools = np.unique(school)
    school_index = [list(unique_schools).index(s) for s in school]

    # School random effect
    sigma_school = Uniform("sigma_school", 0, 1000, value=10)
    tau_school = sigma_school ** -2
    alpha_school = Normal("alpha_school", mu=0, tau=tau_school,
                          value=np.zeros(len(unique_schools)))

    @deterministic
    def intercept(a0=mean_score, a1=alpha_school):
        """Random intercepts"""
        return a0 + a1[school_index]

    # Fixed effects
    beta = Normal("beta", 0, 0.001, value=[0] * n_covs)

    @deterministic
    def theta(b0=intercept, b=beta, x_family_inv=x_family_inv, x_sib=x_sib,
              x_mother_hs=x_mother_hs, x_synd_or_disab=x_synd_or_disab):

        # Complete predictors
        X = [male, ident_by_3, program_by_6, bilateral_ci, bimodal,
             unimodal_ci, unimodal_ha]
        # Partially-imputed predictors
        X += [x_family_inv, x_sib, x_mother_hs, x_synd_or_disab]

        return b0 + np.dot(b, X)  #+ a[child_index]

    sigma = Uniform("sigma", 0, 1000, value=10)
    tau = sigma ** -2

    # Data likelihood
    score_like = Normal("score_like", mu=theta, tau=tau, value=score, observed=True)
    #score_like_pred = Normal("score_like_pred", mu=theta, tau=tau, value=score)

    return locals()

# In[67]:

M_reclang = MCMC(generate_model(receptive_language_dataset), db='sqlite',
                 name='reclang', dbmode='w')

# In[68]:

iterations = 100000
burn = 90000

# In[69]:

M_reclang.sample(iterations, burn)

# In[70]:

# Labels for the 11 fixed-effect coefficients, in the order they enter `theta`
labels = ['Male', 'Identified by 3 mo.', 'In Program by 6 mo.',
          'Bilateral CI', 'Bimodal', 'Unimodal CI', 'Unimodal HA',
          'Reduced Family Involvement', 'Sibling Count',
          'Mother with HS Diploma', 'Additional Disability']

# In[71]:

Matplot.summary_plot([M_reclang.beta], rhat=False, custom_labels=labels,
                     x_range=(-15, 15), main='Receptive Language')
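# The same posterior summaries can be collected into a table for the paper.
# A convenience sketch, not part of the original workflow; it assumes PyMC2's
# `stats()` dict exposes 'mean' and 'standard deviation' entries for vector nodes.

# In[ ]:

beta_stats = M_reclang.beta.stats()
coef_table = pd.DataFrame({'mean': beta_stats['mean'],
                           'sd': beta_stats['standard deviation']},
                          index=labels)
coef_table.round(2)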
# In[73]:

M_reclang.beta.summary()

# ## Expressive Language Model

# In[74]:

expressive_language_dataset = technology_subset[(technology_subset.domain == 'Language')
                                                & (technology_subset.test_type == 'expressive')
                                                & (technology_subset.non_english == False)]
expressive_language_dataset.head()

# In[75]:

mean_score = expressive_language_dataset.groupby('study_id')['score'].mean()
expressive_language_dataset = expressive_language_dataset.drop_duplicates('study_id').set_index('study_id')
expressive_language_dataset['mean_score'] = mean_score

# In[76]:

(expressive_language_dataset[covs].isnull().sum() / expressive_language_dataset.shape[0]).round(2)

# In[77]:

expressive_language_dataset = expressive_language_dataset[
    expressive_language_dataset.age.notnull()
    & expressive_language_dataset.int_outside_option.notnull()]

# In[78]:

expressive_language_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');

# In[79]:

M_explang = MCMC(generate_model(expressive_language_dataset), db='sqlite',
                 name='explang', dbmode='w')

# In[80]:

M_explang.sample(iterations, burn)

# In[81]:

M_explang.beta.summary()

# In[82]:

Matplot.summary_plot([M_explang.beta], custom_labels=labels, rhat=False,
                     main='Expressive Language', x_range=(-15, 15))

# ## Articulation Model

# In[84]:

articulation_dataset = technology_subset[(technology_subset.domain == 'Articulation')
                                         & (technology_subset.non_english == False)]
articulation_dataset.head()

# In[85]:

mean_score = articulation_dataset.groupby('study_id')['score'].mean()
articulation_dataset = articulation_dataset.drop_duplicates('study_id').set_index('study_id')
articulation_dataset['mean_score'] = mean_score

# In[86]:

(articulation_dataset[covs].isnull().sum() / articulation_dataset.shape[0]).round(2)

# In[87]:

articulation_dataset = articulation_dataset[articulation_dataset.age.notnull()
                                            & articulation_dataset.int_outside_option.notnull()]

# In[88]:

articulation_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');

# In[89]:

M_articulation = MCMC(generate_model(articulation_dataset), db='sqlite',
                      name='articulation', dbmode='w')

# In[90]:

M_articulation.sample(iterations, burn)

# In[91]:

Matplot.summary_plot([M_articulation.beta], custom_labels=labels, rhat=False,
                     main='Articulation', x_range=(-15, 15))

# In[93]:

M_articulation.beta.summary()

# ## Expressive Vocabulary Model

# In[94]:

expressive_vocab_dataset = technology_subset[(technology_subset.domain == 'Expressive Vocabulary')
                                             & (technology_subset.non_english == False)]
expressive_vocab_dataset.head()

# In[95]:

mean_score = expressive_vocab_dataset.groupby('study_id')['score'].mean()
expressive_vocab_dataset = expressive_vocab_dataset.drop_duplicates('study_id').set_index('study_id')
expressive_vocab_dataset['mean_score'] = mean_score

# In[96]:

expressive_vocab_dataset[covs].isnull().sum()

# In[97]:

expressive_vocab_dataset = expressive_vocab_dataset[expressive_vocab_dataset.age.notnull()
                                                    & expressive_vocab_dataset.male.notnull()
                                                    & expressive_vocab_dataset.int_outside_option.notnull()]

# In[98]:

expressive_vocab_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');

# In[99]:

M_expressive_vocab = MCMC(generate_model(expressive_vocab_dataset), db='sqlite',
                          name='expressive_vocab', dbmode='w')

# In[100]:

M_expressive_vocab.sample(iterations, burn)
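# Before reading the coefficient plot, a quick visual convergence check is
# worthwhile. A minimal sketch using PyMC2's built-in trace/autocorrelation/
# histogram plots (optional; the same check applies to every model above):

# In[ ]:

Matplot.plot(M_expressive_vocab.beta)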
# In[101]:

Matplot.summary_plot([M_expressive_vocab.beta], custom_labels=labels, rhat=False,
                     main='Expressive Vocabulary', x_range=(-15, 15))

# In[102]:

Matplot.summary_plot([M_expressive_vocab.beta], custom_labels=labels, rhat=False,
                     main='Expressive Vocabulary', x_range=(-10, 10))

# In[103]:

M_expressive_vocab.beta.summary()

# In[104]:

Matplot.summary_plot(M_explang.alpha_school, custom_labels=[' '] * 42,
                     main='Expressive language school effects', x_range=(-25, 25))

# ## Receptive Vocabulary Model

# In[106]:

receptive_vocab_dataset = technology_subset[(technology_subset.domain == 'Receptive Vocabulary')
                                            & (technology_subset.non_english == False)]
receptive_vocab_dataset.head()

# In[107]:

mean_score = receptive_vocab_dataset.groupby('study_id')['score'].mean()
receptive_vocab_dataset = receptive_vocab_dataset.drop_duplicates('study_id').set_index('study_id')
receptive_vocab_dataset['mean_score'] = mean_score

# In[108]:

receptive_vocab_dataset[covs].isnull().sum()

# In[109]:

receptive_vocab_dataset = receptive_vocab_dataset[receptive_vocab_dataset.age.notnull()
                                                  & receptive_vocab_dataset.male.notnull()
                                                  & receptive_vocab_dataset.int_outside_option.notnull()]

# In[110]:

receptive_vocab_dataset.age_int.hist(bins=40)

# In[111]:

receptive_vocab_dataset.shape

# In[112]:

receptive_vocab_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');

# In[113]:

M_receptive_vocab = MCMC(generate_model(receptive_vocab_dataset), db='sqlite',
                         name='receptive_vocab', dbmode='w')

# In[114]:

M_receptive_vocab.sample(iterations, burn)

# In[115]:

#order = np.argsort(M.beta.stats()['mean'])[::-1]
Matplot.summary_plot([M_receptive_vocab.beta], rhat=False, custom_labels=labels,
                     main='Receptive Vocabulary', x_range=(-15, 15))

# In[117]:

M_receptive_vocab.beta.summary()

# ## Database queries

# Each model's sqlite trace database contains one table per traced variable;
# the `v1` column holds the sampled values and `trace` indexes the chain.

# In[118]:

import sqlite3

# In[119]:

db_files = get_ipython().getoutput('ls *sqlite')

# In[120]:

articulation = sqlite3.connect("articulation.sqlite")

# In[121]:

c_artic = articulation.cursor()

# In[122]:

c_artic.execute('SELECT v1 FROM sigma_school WHERE trace==0')

# In[123]:

sigma_school_artic = np.ravel(c_artic.fetchall())

# In[124]:

sigma_school_artic.mean()

# In[125]:

# Posterior median and central 95% interval of the between-school SD, per model
for dbname in db_files:
    db = sqlite3.connect(dbname)
    cur = db.cursor()
    cur.execute('SELECT v1 FROM sigma_school WHERE trace==0')
    trace = np.ravel(cur.fetchall())
    trace.sort()
    print('\n' + dbname)
    print(np.median(trace))
    print(trace[[int(len(trace) * 0.025), int(len(trace) * 0.975)]])
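# The loop above, condensed into a single summary table: one column per model,
# rows giving the posterior 2.5%, 50%, and 97.5% quantiles of the between-school
# SD. A sketch under the same backend assumptions (each database holds a
# `sigma_school` table with the scalar trace in `v1`).

# In[ ]:

school_sd = {}
for dbname in db_files:
    db = sqlite3.connect(dbname)
    values = np.ravel(db.cursor().execute(
        'SELECT v1 FROM sigma_school WHERE trace==0').fetchall())
    school_sd[dbname.replace('.sqlite', '')] = pd.Series(values).quantile([0.025, 0.5, 0.975])
    db.close()
pd.DataFrame(school_sd).round(2)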