#!/usr/bin/env python
# coding: utf-8

# # Pediatrics Paper Analysis

# > All studies with successful outcomes reported for early-identified children who are deaf or hard of hearing have intervention provided by specialists who are trained in parent-infant intervention services.[12,90,97] Early intervention programs should develop mechanisms to ensure that early intervention professionals have special skills necessary for providing families with the highest quality of service specific to children with hearing loss. Professionals with a background in deaf education, audiology, and speech-language pathology will typically have the skills needed for providing intervention services. Professionals should be highly qualified in their respective fields and should be skilled communicators who are knowledgeable and sensitive to the importance of enhancing families' strengths and supporting their priorities. When early intervention professionals have knowledge of the principles of adult learning, it increases their success with parents and other professionals.
# 
# TJC must be competent to provide services.
# 
# **References**:
# 1. Yoshinaga-Itano C, Sedey AL, Coulter DK, Mehl AL. Language of early- and later-identified children with hearing loss. Pediatrics.1998;102 :1161– 1171
# 2. Moeller MP. Early intervention and language development in children who are deaf and hard of hearing. Pediatrics.2000;106(3) :e43 . Available at: www.pediatrics.org/cgi/content/full/106/3/e43
# 3. Calderon R. Parental involvement in deaf children's education programs as a predictor of child's language, early reading, and social-emotional development. J Deaf Stud Deaf Educ.2000;5 :140– 155
# 
# 
# Purpose of the study is to examine the relationship between highly qualified providers, age of enrollment in intervention, and speech and language outcomes at 4 years of age in a group of children enrolled in school programs that specialize in the development of listening and spoken language.
# 
# - Does age of enrollment in intervention impact articulation, vocabulary, and language scores? (age_int)
# - Does place of enrollment in intervention impact articulation, vocabulary, and language scores? (age and age_int)
# - Is there an interaction effect between the place and age?
# - What other factors contribute to the variance?
# - Parental involvement
# - Degree of hearing loss
# - Mother’s education level (mother_ed)
# - Prematurity (premature_weeks)
# - Gender (gender)
# - Language (prim_lang)
# - Number of children in the home (sib)
# - Age when first amplified (age_amp)
# - Additional disability (secondary_diagnosis, known_synd)

# In[1]:


# Import modules and set options
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 100)


# Import data

# In[2]:


lsl_dr = pd.read_csv('lsl_dr.csv', index_col=0)


# In[3]:


lsl_dr.head()


# Indicator for non-profound hearing loss

# In[4]:


lsl_dr['deg_hl_below6'] = lsl_dr.degree_hl<6
lsl_dr.loc[lsl_dr.degree_hl.isnull(), 'deg_hl_below6'] = np.nan


# Indicator for first intervention outside OPTION

# In[5]:


lsl_dr['int_outside_option'] = lsl_dr.age > lsl_dr.age_int
lsl_dr.loc[lsl_dr.age < lsl_dr.age_int, 'int_outside_option'] = np.nan


# Indicator for high school graduation of mother

# In[6]:


lsl_dr['mother_hs'] = lsl_dr.mother_ed > 1
lsl_dr.loc[lsl_dr.mother_ed.isnull(), 'mother_hs'] = None


# Joint commission benchmark

# In[7]:


lsl_dr.onset_1.unique()


# In[8]:


lsl_dr['ident_by_3'] = ident_by_3 = lsl_dr.onset_1 <= 3
lsl_dr['program_by_6'] = program_by_6 = lsl_dr.age <= 6
lsl_dr['jcih'] = ident_by_3 & program_by_6
lsl_dr.loc[lsl_dr.onset_1.isnull() | lsl_dr.age.isnull(), 'jcih'] = None


# Create age in years variable

# In[9]:


lsl_dr['age_years'] = lsl_dr.age/12.


# Retain only 4-year-olds

# In[10]:


lsl_dr = lsl_dr[(lsl_dr.age_test>=48) & (lsl_dr.age_test<60)]


# In[11]:


lsl_dr.shape


# In[12]:


lsl_dr.study_id.unique().shape


# ## Summary Statistics for Paper

# In[13]:


unique_students = lsl_dr.drop_duplicates(subset='study_id')


# In[14]:


unique_students.shape


# In[15]:


unique_students.age.describe()


# In[16]:


unique_students.male.mean()


# In[17]:


unique_students.sib.describe()


# In[18]:


(unique_students.degree_hl == 6).describe()


# In[19]:


((unique_students.degree_hl < 6) & (unique_students.degree_hl > 1)).describe()


# In[20]:


unique_students.mother_hs.describe()


# In[21]:


unique_students.synd_or_disab.describe()


# In[22]:


grouped_age = unique_students.replace({'jcih':{1:'JCIH', 0:'non-JCIH'}}).groupby('jcih')['age_years']
fig, ax = plt.subplots()

for k, v in grouped_age:
    v.hist(label=k, alpha=.75, ax=ax, grid=False)

ax.legend()
plt.xlabel('Age (years)'); plt.ylabel('Frequency');


# ## Receptive Language Test Score Model

# In[23]:


# Filter domain and English-only, and appropriate bilateral subset
receptive_language_dataset = lsl_dr[(lsl_dr.domain=='Language') & (lsl_dr.test_type=='receptive') &
                                     (lsl_dr.non_english==False)]

receptive_language_dataset.head()


# This is the size of the resulting dataset to be used in this analysis

# In[24]:


receptive_language_dataset.shape


# In[25]:


mean_score = receptive_language_dataset.groupby('study_id')['score'].mean()


# In[26]:


receptive_language_dataset = receptive_language_dataset.drop_duplicates('study_id')


# In[27]:


receptive_language_dataset = receptive_language_dataset.set_index('study_id')


# In[28]:


receptive_language_dataset['mean_score'] = mean_score


# Specify criterion to use: `jcih`, `ident_by_3`, `program_by_6`

# In[29]:


criterion_labels = {'jcih': 'JCIH',
                   'ident_by_3': 'Diagnosed by 3 months',
                   'program_by_6': 'Intervention by 6 months'}

criterion = 'jcih'


# Proportion of missing values for each variable

# In[30]:


covs = ['age_years', 'school', 'score', 'male', 'time',
        'family_inv', 'sib', 'deg_hl_below6', 'mother_hs', 'synd_or_disab'] + [criterion]
(receptive_language_dataset[covs].isnull().sum() / receptive_language_dataset.shape[0]).round(2)


# Drop individuals with missing `age`, since there is <1% of them

# In[31]:


receptive_language_dataset = receptive_language_dataset[receptive_language_dataset.age.notnull()]


# In[32]:


receptive_language_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');


# In[33]:


receptive_language_dataset.mother_ed.hist()


# Final analysis dataset size

# In[34]:


receptive_language_dataset.shape


# In[35]:


missing_pct = (receptive_language_dataset[covs].isnull().mean()*100).round(1)
missing_pct.sort(ascending=False)
missing_pct


# In[36]:


receptive_language_dataset['age_amp_missing'] = receptive_language_dataset.age_amp.isnull()
by_age_amp_missing = receptive_language_dataset.groupby('age_amp_missing')


# In[37]:


by_age_amp_missing['male', 'synd_or_disab', 'deg_hl_below6', 'mother_hs',
                            'non_english'].mean()


# In[38]:


for col in ['age', 'score', 'degree_hl', 'family_inv', 'age_int']:
    try:
        _ = by_age_amp_missing.boxplot(column=col)
    except:
        pass


# In[39]:


receptive_language_dataset['jcih_missing'] = receptive_language_dataset.jcih.isnull()
by_jcih_missing = receptive_language_dataset.groupby('jcih_missing')


# In[40]:


by_jcih_missing['male', 'synd_or_disab', 'deg_hl_below6', 'mother_hs',
                            'non_english'].mean()


# In[41]:


for col in ['age', 'score', 'degree_hl', 'family_inv', 'age_int']:
    _ = by_jcih_missing.boxplot(column=col)


# In[43]:


receptive_language_dataset.age_years.mean()


# In[57]:


from pymc import Bernoulli, Normal, Uniform, invlogit, logit, deterministic, MCMC, Matplot, MAP
from pymc import Lambda, Dirichlet, Categorical, Beta, NegativeBinomial, Exponential, T
from numpy.ma import masked_values

n_covs = 9

def generate_model(dataset):
    
    # Extract data to local variables
    age_years = dataset.age_years.values
    school = dataset.school.values
    score = dataset.score.values
    non_english = dataset.non_english.astype(int).values
    male = dataset.male.values
    sib = dataset.sib.values
    synd_or_disab = dataset.synd_or_disab.values
    int_outside_option = dataset.int_outside_option.values
    synd_or_disab = dataset.synd_or_disab.values
    study_id = dataset.index.values
    ident_by_3 = dataset.ident_by_3
    program_by_6 = dataset.program_by_6
    
    # Transform some data
    age_std = age_years - age_years.mean()
    
    # Imputation of family involvement
    n_family_inv_cats = len(dataset.family_inv.unique())
    p_family_inv = Dirichlet("p_family_inv", np.ones(n_family_inv_cats), value=[1./n_family_inv_cats]*(n_family_inv_cats-1))
    family_inv_masked = masked_values(dataset.family_inv.fillna(0.5), value=0.5)
    x_family_inv = Categorical('x_family_inv', p_family_inv, value=family_inv_masked, observed=True)
    
    # Imputation of hearing loss
#     p_crit = Beta("p_%s" % criterion, 1, 1, value=0.5)
#     crit_masked = masked_values(dataset[criterion].values, value=np.nan)
#     x_crit = Bernoulli('x_%s' % criterion, p_crit, value=crit_masked, observed=True)
    
    # Imputation of hearing loss
    p_hl = Beta("p_hl", 1, 1, value=0.5)
    hl_masked = masked_values(dataset.deg_hl_below6.values, value=np.nan)
    x_hl = Bernoulli('x_hl', p_hl, value=hl_masked, observed=True)
    
    # Imputation of maternal education
    p_mother_hs = Beta("p_mother_hs", 1, 1, value=0.5)
    mother_hs_masked = masked_values(dataset.mother_hs.values, value=np.nan)
    x_mother_hs = Bernoulli('x_mother_hs', p_mother_hs, value=mother_hs_masked, observed=True)
    
    # Imputation of previous disability
    p_synd_or_disab = Beta("p_synd_or_disab", 1, 1, value=0.5)
    synd_or_disab_masked = masked_values(dataset.synd_or_disab.values, value=np.nan)
    x_synd_or_disab = Bernoulli('x_synd_or_disab', p_synd_or_disab, value=synd_or_disab_masked, observed=True)
    
    # Imputation of siblings
    n_sib_cats = len(dataset.sib.unique())
    p_sib = Dirichlet("p_sib", np.ones(n_sib_cats), value=[1./n_sib_cats]*(n_sib_cats-1))
    sib_masked = masked_values(dataset.sib.fillna(0.5), value=0.5)
    x_sib = Categorical('x_sib', p_sib, value=sib_masked, observed=True)
    
    # Imputation of time
#     n_time_cats = len(dataset.time.unique())
#     p_time = Dirichlet("p_time", np.ones(n_time_cats), value=[1./n_time_cats]*(n_time_cats-1))
#     time_masked = masked_values(dataset.time.fillna(0.5), value=0.5)
#     x_time = Categorical('x_time', p_time, value=time_masked, observed=True)
    
    mean_score = Normal("mean_score", 100, 0.001, value=80)
    
    # Indices to school random effects
    unique_schools = np.unique(school)
    school_index = [list(unique_schools).index(s) for s in school]

    # School random effect
    sigma_school = Uniform("sigma_school", 0, 1000, value=10)
    tau_school = sigma_school**-2
    alpha_school = Normal("alpha_school", mu=0, tau=tau_school, value=np.zeros(len(unique_schools)))
    
    # Indices to child random effects
    unique_child = np.unique(study_id)
    child_index = [list(unique_child).index(s) for s in study_id]
    
    # Robust individual random effect
    nu_child = Exponential("nu_child", 0.1, value=10)
    alpha_child = T("alpha_child", nu=nu_child, value=np.zeros(len(unique_child)))
    
    @deterministic
    def intercept(a0=mean_score, a1=alpha_school):
        """Random intercepts"""
        return a0 + a1[school_index]
    
    # Fixed effects
    beta = Normal("beta", 0, 0.001, value=[0]*n_covs)

    @deterministic
    def theta(b0=intercept, b=beta, x_family_inv=x_family_inv, x_sib=x_sib, 
              x_hl=x_hl, x_mother_hs=x_mother_hs, x_synd_or_disab=x_synd_or_disab, a=alpha_child):
        
        # Complete predictors
        X = [age_std, male, ident_by_3, program_by_6]
        # Partially-imputed predictors
        X += [x_family_inv, x_sib, x_hl, x_mother_hs, x_synd_or_disab]
        return b0 + np.dot(b, X) + a[child_index]
    
    sigma = Uniform("sigma", 0, 1000, value=10)
    tau = sigma**-2
    
    # Data likelihood
    score_like = Normal("score_like", mu=theta, tau=tau, value=score, observed=True)
    #score_like_pred = Normal("score_like_pred", mu=theta, tau=tau, value=score)
    
    # Predictions:
    # Girl, average age, ident by 3 prog by 6, normal family involvement, 
    # 2 siblings, non-profound hearing loss,
    # mother with HS, no previous disability, JCIH
    x1 = [0, 0, 1, 1, 0, 2, 1, 1, 0]
    
    # Boy, 6 months younger than average, ident by 3 prog by 6, normal family involvement, 
    # ident by 3 prog by 6, no siblings, profound hearing loss,
    # mother without HS diploma, previous disability
    x2 = [-0.5, 1, 1, 1, 0, 0, 0, 0, 1]
    
    # Boy, 1.5 years older than average, not ident by 3 or prog by 6, poor family involvement, 
    # 1 sibling, non-profound hearing loss,
    # mother without HS diploma, previous disability
    x3 = [1.5, 1, 0, 0, 2, 1, 1, 0, 1]
    
    # Girl, 6 months older than average, ident by 3 prog by 6, impaired family involvement, 
    # 3 siblings, profound hearing loss,
    # mother with HS diploma, no previous disability
    x4 = [0.5, 1, 1, 1, 1, 3, 0, 1, 0]
    
    X = np.c_[x1, x2, x3, x4]

    theta_pred = Lambda('theta_pred', lambda b=beta, mu=mean_score: mu + np.dot(X.T, b))
    predictions = Normal('predictions', mu=theta_pred, tau=tau)
    
    
    return locals()


# In[58]:


M_reclang = MCMC(generate_model(receptive_language_dataset), 
                 db='sqlite', name='reclang', dbmode='w')


# In[59]:


iterations = 100000
burn = 90000


# In[60]:


M_reclang.sample(iterations, burn)


# In[65]:


labels = ['Age of Enrollment', 'Male', 'Identified by 3 mo.', 
          'In Program by 6 mo.', 'Reduced Family Involvement', 
          'Sibling Count', 'Non-profound Hearing Loss',
                       'Mother with HS Diploma','Additional Disability', ]
#                       criterion_labels[criterion]]


# In[66]:


Matplot.summary_plot([M_reclang.beta], rhat=False,
        custom_labels=labels, x_range=(-10,10), main='Receptive Language')


# In[49]:


Matplot.summary_plot([M_reclang.beta], rhat=False,
        custom_labels=labels, x_range=(-10,10), main='Receptive Language')


# In[63]:


M_reclang.beta.summary()


# In[64]:


Matplot.summary_plot([M_reclang.predictions], rhat=False, custom_labels=['Student {}'.format(i) for i in range(1,5)])


# ## Expressive Language Model

# In[67]:


expressive_language_dataset = lsl_dr[(lsl_dr.domain=='Language') & (lsl_dr.test_type=='expressive') &
                                     (lsl_dr.non_english==False)]

expressive_language_dataset.head()


# In[68]:


mean_score = expressive_language_dataset.groupby('study_id')['score'].mean()
expressive_language_dataset = expressive_language_dataset.drop_duplicates('study_id').set_index('study_id')
expressive_language_dataset['mean_score'] = mean_score


# In[69]:


(expressive_language_dataset[covs].isnull().sum() / expressive_language_dataset.shape[0]).round(2)


# In[70]:


expressive_language_dataset = expressive_language_dataset[expressive_language_dataset.age.notnull() 
                                                        & expressive_language_dataset.int_outside_option.notnull()]


# In[71]:


expressive_language_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');


# In[72]:


M_explang = MCMC(generate_model(expressive_language_dataset), db='sqlite', name='explang', dbmode='w')


# In[73]:


M_explang.sample(iterations, burn)


# In[74]:


M_explang.beta.summary()


# In[75]:


Matplot.summary_plot([M_explang.beta], custom_labels=labels, rhat=False, main='Expressive Language', x_range=(-10,10))


# ### Making predictions
# 
# Though these models were constructed primarily for inference, that is, to provide insights on the relative effects of a suite of factors on expected test scores, there is the potential for using them to predict speech-language outcomes. To explore this, we contrived four predictive scenarios, whereby children with different values of the set of predictor variables were specified, and predictions for them were generated by our models. 
# 
# The Bayesian hierarchical modeling approach makes generating predictions straightforward, via the posterior predictive distribution, which generates predicted values of outcomes conditional on the outcomes observed in the dataset. The variation of these predictions includes both the residual uncertainty in the model parameters and the aleatory sampling variability of the test outcomes. Our test cases were specified as follows:
# 
# **Case 1** 👧
# 
# * female
# * average age
# * normal family involvement
# * 2 siblings
# * non-profound hearing loss
# * mother with HS
# * no previous disability
# * JCIH criterion met
# 
# **Case 2** 👦
# 
# * male 
# * 6 months younger than average
# * normal family involvement
# * no siblings
# * profound hearing loss
# * mother without HS diploma
# * previous disability
# * JCIH criterion met
# 
# **Case 3** 👶
# 
# * male 
# * 1.5 years older than average
# * poor family involvement
# * 1 sibling
# * non-profound hearing loss
# * mother without HS diploma
# * previous disability
# * no JCIH
# 
# **Case 4** 🙎
# 
# * female 
# * 6 months older than average
# * impaired family involvement
# * 3 siblings
# * profound hearing loss
# * mother with HS diploma
# * no previous disability
# * JCIH criterion met

# In[76]:


M_reclang.predictions.summary()


# In[77]:


Matplot.summary_plot([M_reclang.predictions], rhat=False, custom_labels=['Student {}'.format(i) for i in range(1,5)])


# In[78]:


M_explang.predictions.summary()


# ## Articulation Model

# In[80]:


articulation_dataset = lsl_dr[(lsl_dr.domain=='Articulation') & 
                                     (lsl_dr.non_english==False)]
articulation_dataset.head()


# In[81]:


mean_score = articulation_dataset.groupby('study_id')['score'].mean()
articulation_dataset = articulation_dataset.drop_duplicates('study_id').set_index('study_id')
articulation_dataset['mean_score'] = mean_score


# In[82]:


(articulation_dataset[covs].isnull().sum() / articulation_dataset.shape[0]).round(2)


# In[83]:


articulation_dataset = articulation_dataset[articulation_dataset.age.notnull()
                                            & articulation_dataset.int_outside_option.notnull()]


# In[84]:


articulation_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');


# In[85]:


M_articulation = MCMC(generate_model(articulation_dataset), db='sqlite', name='articulation', dbmode='w')


# In[86]:


M_articulation.sample(iterations, burn)


# In[87]:


Matplot.summary_plot([M_articulation.beta], 
        custom_labels=labels, rhat=False, main='Articulation', x_range=(-10,10))


# In[88]:


Matplot.summary_plot([M_articulation.predictions], rhat=False, custom_labels=['Student {}'.format(i) for i in range(1,5)])


# In[89]:


M_articulation.beta.summary()


# ## Expressive Vocabulary Model

# In[90]:


expressive_vocab_dataset = lsl_dr[(lsl_dr.domain=='Expressive Vocabulary') & 
                                     (lsl_dr.non_english==False)]
expressive_vocab_dataset.head()


# In[91]:


mean_score = expressive_vocab_dataset.groupby('study_id')['score'].mean()
expressive_vocab_dataset = expressive_vocab_dataset.drop_duplicates('study_id').set_index('study_id')
expressive_vocab_dataset['mean_score'] = mean_score


# In[92]:


expressive_vocab_dataset[covs].isnull().sum()


# In[93]:


expressive_vocab_dataset = expressive_vocab_dataset[expressive_vocab_dataset.age.notnull() 
                                                    & expressive_vocab_dataset.male.notnull()
                                                    & expressive_vocab_dataset.int_outside_option.notnull()]


# In[94]:


expressive_vocab_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');


# In[95]:


M_expressive_vocab = MCMC(generate_model(expressive_vocab_dataset), db='sqlite', name='expressive_vocab', dbmode='w')


# In[96]:


M_expressive_vocab.sample(iterations, burn)


# In[97]:


Matplot.summary_plot([M_expressive_vocab.beta], 
        custom_labels=labels, rhat=False, main='Expressive Vocabulary', x_range=(-10,10))


# In[98]:


M_expressive_vocab.beta.summary()


# In[99]:


Matplot.summary_plot(M_explang.alpha_school, custom_labels=[' ']*42, main='Expressive language school effects', 
                     x_range=(-25, 25))


# In[100]:


Matplot.summary_plot(M_expressive_vocab.alpha_school, custom_labels=[' ']*42, main='Expressive vocabulary school effects',
                     x_range=(-25, 25))


# ## Receptive Vocabulary Model

# In[101]:


receptive_vocab_dataset = lsl_dr[(lsl_dr.domain=='Receptive Vocabulary') & 
                                     (lsl_dr.non_english==False)]
receptive_vocab_dataset.head()


# In[102]:


mean_score = receptive_vocab_dataset.groupby('study_id')['score'].mean()
receptive_vocab_dataset = receptive_vocab_dataset.drop_duplicates('study_id').set_index('study_id')
receptive_vocab_dataset['mean_score'] = mean_score


# In[103]:


receptive_vocab_dataset[covs].isnull().sum()


# In[104]:


receptive_vocab_dataset = receptive_vocab_dataset[receptive_vocab_dataset.age.notnull() & 
                                                  receptive_vocab_dataset.male.notnull() &
                                                  receptive_vocab_dataset.int_outside_option.notnull()]


# In[105]:


receptive_vocab_dataset.age_int.hist(bins=40)


# In[106]:


receptive_vocab_dataset.shape


# In[107]:


receptive_vocab_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');


# In[108]:


M_receptive_vocab = MCMC(generate_model(receptive_vocab_dataset), db='sqlite', name='receptive_vocab', dbmode='w')


# In[109]:


M_receptive_vocab.sample(iterations, burn)


# In[110]:


#order = np.argsort(M.beta.stats()['mean'])[::-1]
Matplot.summary_plot([M_receptive_vocab.beta], rhat=False,
        custom_labels=labels, main='Receptive Vocabulary', x_range=(-10,10))


# In[111]:


M_receptive_vocab.beta.summary()


# ## Database queries

# In[97]:


import sqlite3


# In[98]:


db_files = get_ipython().getoutput('ls *sqlite')


# In[99]:


articulation = sqlite3.connect("articulation.sqlite")


# In[100]:


c_artic = articulation.cursor()


# In[101]:


c_artic.execute('SELECT v1 FROM sigma_school WHERE trace==0')


# In[102]:


sigma_school_artic = np.ravel(c_artic.fetchall())


# In[103]:


sigma_school_artic.mean()


# In[104]:


for dbname in db_files:
    db = sqlite3.connect(dbname)
    cur = db.cursor()
    cur.execute('SELECT v1 FROM sigma_school WHERE trace==0')
    trace = np.ravel(cur.fetchall())
    trace.sort()
    print('\n'+dbname)
    print(np.median(trace))
    print(trace[[int(len(trace)*0.025), int(len(trace)*0.975)]])