#!/usr/bin/env python
# coding: utf-8

# # Factors that Influence Outcomes of the Five Domains of Speech-Language in Children with Hearing Loss at age 4 years
#
# Paper 1

# In[1]:


# Import modules and set options
get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import arviz as az

sns.set(context='notebook', style='ticks')


# Import data

# In[2]:


lsl_dr = (pd.read_csv('../data/clean/lsl_dr_frozen_09272019.csv', index_col=0, low_memory=False)
          .rename({'onset_1': 'identify_mo'}, axis=1))


# In[3]:


lsl_dr.head()


# In[5]:


lsl_dr.drop_duplicates('study_id').shape[0]


# Indicator for non-profound hearing loss

# In[6]:


lsl_dr['deg_hl_below6'] = lsl_dr.degree_hl < 6
lsl_dr.loc[lsl_dr.degree_hl.isnull(), 'deg_hl_below6'] = np.nan


# Indicator for first intervention outside OPTION

# In[7]:


lsl_dr['int_outside_option'] = lsl_dr.age > lsl_dr.age_int
lsl_dr.loc[lsl_dr.age < lsl_dr.age_int, 'int_outside_option'] = np.nan


# Indicator for high school graduation of mother

# In[8]:


lsl_dr['mother_hs'] = lsl_dr.mother_ed > 1
lsl_dr.loc[lsl_dr.mother_ed.isnull(), 'mother_hs'] = np.nan


# Create age in years variable

# In[9]:


lsl_dr['age_years'] = lsl_dr.age / 12.


# Create school index

# In[10]:


schools_unique = np.sort(lsl_dr.school.unique())
school_lookup = dict(zip(schools_unique, range(len(schools_unique))))


# In[11]:


lsl_dr['school_idx'] = lsl_dr.school.replace(school_lookup)


# Create student index

# In[12]:


student_unique = np.sort(lsl_dr.study_id.unique())
student_lookup = dict(zip(student_unique, range(len(student_unique))))


# In[13]:


lsl_dr['student_idx'] = lsl_dr.study_id.replace(student_lookup)


# In[14]:


age_mask = (lsl_dr.age_test >= 48) & (lsl_dr.age_test < 60)
lsl_dr[age_mask].drop_duplicates('student_idx').shape


# ### Exclusions
#
# Drop non-English and other disabilities, filter for hearing loss

# In[15]:


other_etiology = (lsl_dr[['etiology_3___2', 'etiology_3___4', 'etiology_3___5',
                          'etiology_3___6', 'etiology_3___9',
                          'etiology_oth___1', 'etiology_oth___3', 'etiology_oth___4',
                          'etiology_oth___8', 'etiology_oth___9']]
                  .sum(1).astype(bool))


# Excluding kids with non-English as a first language, kids with no hearing loss, and kids with additional cognitive concerns. We define these groups as follows: non-English means the primary language variable is non-zero; kids with no hearing loss are those with a value of 4 for the hearing loss type variable in either ear; cognitive concerns are those with `etiology_2` not 0 or 4, or, if that value is missing, with any of the `other_etiology` flags set.

# In[16]:


inclusion_mask = (~lsl_dr.non_english.astype(bool)
                  & ((lsl_dr.type_hl_ad != 4) & (lsl_dr.type_hl_as != 4))
                  & ((lsl_dr.etiology_2.isin([0, 4]))
                     | (lsl_dr.etiology_2.isnull() & ~other_etiology)))


# In[17]:


covariates = ['score', 'student_idx', 'school_idx', 'male', 'sib', 'family_inv',
              'race', 'age_test', 'premature_weeks', 'age_amp', 'parent_hl',
              'domain', 'deg_hl_below6', 'mother_hs', 'mother_college',
              'age_years', 'test_type', 'time', 'bilateral_ci',
              'one_or_both_parent_hl', 'bilateral_ha', 'unilateral_ci',
              'unilateral_ha', 'bimodal', 'assymetrical', 'age_int', 'autism']


# In[18]:


hl_type_cols = lsl_dr.columns[lsl_dr.columns.str.contains('lateral')].values.tolist()


# In[19]:


covariates += list(set(hl_type_cols))
covariates = list(set(covariates))


# Number of children that do not meet each of the three inclusion criteria alone. Note that some students may fail more than one criterion.
# In[20]:


lsl_dr[lsl_dr.non_english.astype(bool)].drop_duplicates('student_idx').shape[0]


# In[21]:


lsl_dr[~((lsl_dr.type_hl_ad != 4) & (lsl_dr.type_hl_as != 4))].drop_duplicates('student_idx').shape[0]


# In[22]:


lsl_dr[~(((lsl_dr.etiology_2.isin([0, 4])) | (lsl_dr.etiology_2.isnull() & ~other_etiology)))].drop_duplicates('student_idx').shape[0]


# In[39]:


lsl_dr[~((lsl_dr.age_test >= 48) & (lsl_dr.age_test < 60))].drop_duplicates('student_idx').shape[0]


# Apply inclusion mask to show total number of records.

# In[26]:


analysis_subset = lsl_dr.loc[inclusion_mask, covariates].copy().dropna(subset=['time'])
analysis_subset.shape


# Total unique students meeting inclusion criteria.

# In[27]:


analysis_subset.drop_duplicates('student_idx').shape


# In[28]:


analysis_subset[['bilateral_ci', 'bilateral_ha', 'unilateral_ci',
                 'unilateral_ha', 'bimodal']].mean()


# In[29]:


analysis_subset['premature'] = (analysis_subset.premature_weeks > 0).astype(float)
analysis_subset.loc[analysis_subset.premature_weeks.isnull(), 'premature'] = np.nan


# Split the Language domain into receptive and expressive language, based on test type

# In[30]:


analysis_subset.loc[(analysis_subset.domain == 'Language')
                    & (analysis_subset.test_type == 'receptive'), 'domain'] = 'Receptive Language'
analysis_subset.loc[(analysis_subset.domain == 'Language')
                    & (analysis_subset.test_type == 'expressive'), 'domain'] = 'Expressive Language'


# In[31]:


analysis_subset[analysis_subset.age_test < 100].age_test.hist()


# Drop records with missing race, age at enrollment, age at test, or parental hearing loss, since these account for less than 1% of records.

# In[32]:


analysis_subset = analysis_subset.dropna(subset=['race', 'age_years', 'age_test', 'parent_hl'])


# In[33]:


analysis_subset.autism.sum()


# Number of unique 4-year-olds that match inclusion criteria.

# In[37]:


age_mask = (analysis_subset.age_test >= 48) & (analysis_subset.age_test < 60)
subset_4yo = analysis_subset[age_mask].drop_duplicates('student_idx')
subset_4yo.shape


# In[35]:


subset_4yo.isnull().sum().sort_values(ascending=False)


# In[84]:


analysis_subset.to_csv('../data/clean/analysis_subset.csv', index=False)


# In[2]:


analysis_subset = pd.read_csv('../data/clean/analysis_subset.csv')
analysis_subset.head()


# ## Covariate model specification
#
# Student test scores were modeled across five domains: receptive language, expressive language, articulation, receptive vocabulary, and expressive vocabulary. Each domain was modeled separately, using the same general model structure: a Bayesian hierarchical mixed-effects linear model.
#
# Consistent with the goals of the analysis, namely evaluating speech and language outcomes at 4 years of age, the subset of the dataset analyzed was restricted to children between the ages of 48 and 60 months at the time of testing. Due to this age restriction, most students in the dataset contributed only one test score to the analysis, but several individuals (on the order of 100-200, depending on the test) contributed two or more. Rather than address the repeated measures (and resulting lack of independence) statistically, a single score (the most recent within the age window) was used for each student with more than one reported score within each domain.
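# As a minimal sketch of the score-selection step described above (the same logic is applied inside `generate_model` below), the most recent score per student within the 48-60 month window can be extracted as follows. The receptive language domain is used purely for illustration:

# In[ ]:


# Illustration only: restrict to one domain and the 4-year age window,
# then keep the row with the latest test age for each student.
demo = analysis_subset[(analysis_subset.domain == 'Receptive Language')
                       & (analysis_subset.age_test >= 48)
                       & (analysis_subset.age_test < 60)]
demo.loc[demo.groupby('student_idx').age_test.idxmax()].shape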
# Eight potential predictor variables (covariates) were included in the model as fixed effects, based on a priori expert opinion that considered them to be possibly influential for predicting test scores, and on the presence of reasonable variation in the predictor variable across subjects; variables that had identical values over a large portion of subjects were excluded. The final subset of covariates included gender, family involvement index, number of siblings in the household, degree of hearing loss, mother's education, age at amplification, age of enrollment, and time in the program. The family involvement scale is a 5-category ordinal variable that ranges from ideal participation by the family (0) to limited participation (4) (Moeller, 2000). Mother's education was coded as a binary variable that identifies children whose mother has at least a high school diploma as the highest level of completed education. The degree of hearing loss was coded as a binary variable that took the value 1 if the degree of loss (based on PTA or ABR results) was less than 6, the level corresponding to profound loss, resulting in a variable that indicates non-profound hearing loss. Since degree of loss was considered important a priori, we wished to include it in the model, but the lack of variation among the non-profound categories (<6) necessitated recoding into fewer categories. These variables were included together in a multivariate mixed-effects model so that the estimate of each covariate effect is adjusted for the effects of the others. For example, the effect of decreased family involvement is the estimated effect after taking into account the other variables in the model.
#
# In addition to the fixed effects specified above, we included a random effect to account for the variation in test scores among schools, over and above the variation explained by our variables of interest. This random effect estimates the mean and variance of the population of schools comprising this multi-center study. Hence, the predicted score for any particular student is a combination of the fixed-effect predictors and the random effect from his or her school. The residual variation from the model is assumed to be normally distributed.
#
# The LSL-DR dataset includes several covariates of interest that have observations missing for some records. In order to avoid conducting a complete-case analysis, we imputed missing values using Bayesian statistical methods, which allows us to use all available data in our model (REF). We included covariates where fewer than one third of values were missing for the subset of data corresponding to a particular test. Missing values were imputed in the model by constructing distributions of values based on the non-missing values, and sampling from these distributions at every iteration of the Markov chain Monte Carlo (MCMC) sampling procedure used to estimate the model. This approach assumes that covariate values are missing completely at random (MCAR). As an ad hoc measure for checking for obvious violations of this assumption, we calculated mean values for other variables according to whether each covariate was missing; if means differed strongly according to missingness, this would suggest that missingness may not be completely at random.
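# The ad hoc missingness check described above is not shown elsewhere in this notebook; a minimal sketch of it, using family involvement as an example, might look like this:

# In[ ]:


# Sketch of the MCAR check: compare means of other covariates according to
# whether family involvement is missing. Strongly different means would
# suggest values are not missing completely at random.
(analysis_subset
 .assign(family_inv_missing=analysis_subset.family_inv.isnull())
 .groupby('family_inv_missing')[['score', 'sib', 'age_amp', 'time']]
 .mean())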
# All models were estimated using Markov chain Monte Carlo (Brooks et al. 2011) methods. We specified vague priors for all unknown model parameters. Specifically, standard deviations of the random effects and the sampling distribution were given weakly informative exponential and half-normal priors, covariate coefficients were given zero-mean Gaussian priors with standard deviation 100, and the school-level mean score was also given a Gaussian prior, centered at 90 with standard deviation 10. Each model was run for 10,000 iterations using the PyMC 3.5 software package (Salvatier et al. 2016), with the first 9,000 iterations conservatively discarded as the tuning (burn-in) interval, leaving 1,000 for inference. Model convergence was checked by running a second MCMC chain and calculating the Gelman-Rubin statistic (Gelman and Rubin 1992) using both chains. Model goodness-of-fit was evaluated using posterior predictive checks, by simulating scores from the fitted model and comparing their distribution to the observed scores.

# Utility function for filling NA values

# In[3]:


def fillna(x, value):
    # Mask invalid (NaN) entries and attach a fill value for later use
    x_masked = np.ma.masked_invalid(x)
    np.ma.set_fill_value(x_masked, value)
    return x_masked

fillna(np.array([0.4, np.nan, 5]), 0.5)


# Age of amplification appears to be a mixture, so I will impute it as such

# In[4]:


np.log(analysis_subset.age_amp + 0.1).hist(bins=20)


# In[5]:


(analysis_subset.age_amp / 12).hist()


# In[6]:


from pymc3 import Bernoulli, Normal, Uniform, Dirichlet, Categorical, Beta, HalfCauchy
from pymc3 import Gamma, Exponential, Multinomial, HalfNormal, NormalMixture, Lognormal
from pymc3 import Model, Deterministic, Metropolis
from numpy.ma import masked_values, set_fill_value, masked_invalid
import theano.tensor as tt
from theano import shared


def generate_model(dataset, cohort_age, intervention=False):

    if cohort_age == 2:
        mask = (dataset.age_test >= 24) & (dataset.age_test < 36)
    elif cohort_age == 3:
        mask = (dataset.age_test >= 36) & (dataset.age_test < 48)
    elif cohort_age == 4:
        mask = (dataset.age_test >= 48) & (dataset.age_test < 60)
    elif cohort_age == 5:
        mask = (dataset.age_test >= 60) & (dataset.age_test < 72)
    elif cohort_age == 6:
        mask = (dataset.age_test >= 72) & (dataset.age_test < 84)
    else:
        raise ValueError('Invalid age!')

    # Generate mean scores
    # mean_scores = dataset[mask].groupby('student_idx').score.mean()

    # Take the latest score for each student within the age window
    cohort_dataset = dataset[mask]
    dataset_unique = cohort_dataset.loc[cohort_dataset.groupby('student_idx').age_test.idxmax()]

    assert not dataset_unique.score.isnull().sum()

    (male, sib, family_inv, race, school,
     premature, time, non_severe, mother_college,
     age_amp, age_int, score) = dataset_unique[['male', 'sib', 'family_inv', 'race',
                                                'school_idx', 'premature', 'time',
                                                'deg_hl_below6', 'mother_college',
                                                'age_amp', 'age_int',
                                                'score']].astype(float).T.values

    with Model() as model:

        # Imputation of age of amplification (log-scale two-component mixture)
        if np.isnan(age_amp).sum():
            m_age_amp = Normal("m_age_amp", 0, sd=5, shape=2)
            s_age_amp = Exponential("s_age_amp", 1)
            p_age_amp = Beta('p_age_amp', 1, 1)
            _x_age_amp = NormalMixture('x_age_amp', [p_age_amp, 1 - p_age_amp],
                                       m_age_amp, sd=s_age_amp,
                                       observed=masked_invalid(np.log(age_amp + 0.1)))
            x_age_amp = (tt.exp(_x_age_amp) - 0.1) / 12
        else:
            x_age_amp = age_amp / 12

        # Imputation of family involvement
        if np.isnan(family_inv).sum():
            p_family_inv = Dirichlet("p_family_inv", np.ones(5))
            x_family_inv = Categorical('x_family_inv', p_family_inv,
                                       observed=masked_invalid(family_inv))
        else:
            x_family_inv = family_inv

        # Imputation of age of intervention
        if intervention and np.isnan(age_int).sum():
            m_int = Normal('m_int', 2, sd=5)
            s_int = HalfNormal('s_int', sd=5)
            x_age_int = Lognormal('x_age_int', m_int, sd=s_int,
                                  observed=masked_invalid(age_int))
        else:
            x_age_int = age_int

        # Imputation of siblings
        if np.isnan(sib).sum():
            n_sib_cats = len(dataset.sib.unique())
            p_sib = Dirichlet("p_sib", np.ones(n_sib_cats))
            x_sib = Categorical('x_sib', p_sib, observed=masked_invalid(sib))
        else:
            x_sib = sib

        # Imputation of mother's college education
        if np.isnan(mother_college).sum():
            p_mother_college = Beta("p_mother_college", 1, 1)
            x_mother_college = Bernoulli('x_mother_college', p_mother_college,
                                         observed=masked_invalid(mother_college))
        else:
            x_mother_college = mother_college

        # Imputation of non-severe hearing loss
        if np.isnan(non_severe).sum():
            p_non_severe = Beta("p_non_severe", 1, 1)
            x_non_severe = Bernoulli('x_non_severe', p_non_severe,
                                     observed=masked_invalid(non_severe))
        else:
            x_non_severe = non_severe

        # Indices to school random effects
        unique_schools = np.unique(school)
        school_index = [list(unique_schools).index(s) for s in school]

        # School random effect (non-centered parameterization)
        μ_school = Normal('μ_school', 90, sd=10)
        σ_school = Exponential("σ_school", 1)
        z_school = Normal('z_school', mu=0, sd=1, shape=len(unique_schools))
        α_school = Deterministic("α_school", μ_school + z_school * σ_school)

        # Random intercepts
        intercept = α_school[school_index]

        # Race effect, with the first (reference) category fixed at zero
        β_race = Normal("β_race", 0, sd=10, shape=4)
        race_effect = tt.concatenate([[0], β_race])[race.astype(int)]

        # Covariates
        X = [x_age_amp, x_family_inv, x_sib, x_mother_college, x_non_severe, time]
        if intervention:
            X += [x_age_int]

        # Fixed effects
        β = Normal("β", 0, sd=100, shape=len(X))

        θ = intercept + race_effect + β.dot(tt.stack(X))

        σ = HalfNormal("σ", sd=25, testval=100)

        score_like = Normal("score_like", mu=θ, sd=σ, observed=score)

    return model
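# For reference, the model constructed by `generate_model` can be summarized as follows (a restatement of the code above, with $X_i$ collecting the fixed-effect covariates for student $i$, $j[i]$ his or her school, and $r_i$ his or her race category; the reference race effect is fixed at zero, and the school effect uses a non-centered parameterization):
#
# $$
# \begin{aligned}
# z_j &\sim \text{Normal}(0, 1), \qquad \alpha_j = \mu_{\text{school}} + \sigma_{\text{school}}\, z_j \\
# \theta_i &= \alpha_{j[i]} + \beta^{\text{race}}_{r_i} + X_i \beta \\
# \text{score}_i &\sim \text{Normal}(\theta_i, \sigma)
# \end{aligned}
# $$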
# In[56]:


receptive_language_dataset = analysis_subset[(analysis_subset.domain == 'Receptive Language')]


# In[57]:


receptive_language_4 = generate_model(receptive_language_dataset, 4)


# In[58]:


iterations = 1000
tuning = 9000


# In[59]:


from pymc3 import sample

with receptive_language_4:
    rec_lang_4_trace = sample(iterations, tune=tuning, chains=2, cores=2)


# In[60]:


labels = ['Age at amplification', 'Family Involvement Score',
          'Sibling Count', 'Mother with College Ed',
          'Non-severe hearing loss', 'Years in program']


# In[61]:


from pymc3 import traceplot, forestplot


# In[62]:


x_range = -10, 15


# In[63]:


_, axes = az.plot_forest(rec_lang_4_trace, var_names=['β'], combined=True)
axes[0].set_title('Receptive Language')
axes[0].vlines(0, *axes[0].get_ylim(), linestyles='dotted')
axes[0].set_yticklabels(labels[::-1]);


# In[64]:


az.plot_forest(rec_lang_4_trace, var_names=['α_school'])


# In[65]:


from pymc3 import energyplot

energyplot(rec_lang_4_trace)


# In[66]:


_, axes = az.plot_forest(rec_lang_4_trace, var_names=['β_race'], combined=True)
axes[0].set_yticklabels(['Black', 'Hispanic', 'Asian', 'Other'][::-1]);
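# As described in the model specification above, convergence is checked with the Gelman-Rubin statistic across the two chains. `az.summary` below reports this as `r_hat`, but it can also be computed directly; a minimal sketch, assuming ArviZ converts the PyMC3 trace here as it does for the plots above:

# In[ ]:


# Gelman-Rubin / r-hat for the fixed effects; values close to 1.0 suggest
# the two chains have converged to the same distribution.
az.rhat(rec_lang_4_trace, var_names=['β'])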
# The school random effect standard deviation is a measure of how variable scores are among schools. The estimated standard deviation is about 4 points for this domain.

# In[67]:


from pymc3 import traceplot

traceplot(rec_lang_4_trace, var_names=['σ_school']);


# In[68]:


traceplot(rec_lang_4_trace, var_names=['p_family_inv', 'p_sib',
                                       'p_non_severe', 'p_mother_college']);


# In[69]:


az.summary(rec_lang_4_trace, var_names=['β']).set_index(pd.Index(labels))


# Posterior predictive check: percentiles of observed scores relative to scores simulated from the model. An approximately uniform histogram suggests the model is well calibrated.

# In[70]:


from pymc3 import sample_posterior_predictive

with receptive_language_4:
    rec_lang_4_pred = sample_posterior_predictive(rec_lang_4_trace,
                                                  vars=[receptive_language_4.score_like])


# In[71]:


from scipy.stats import percentileofscore

plt.hist([np.round(percentileofscore(x, y) / 100, 2)
          for x, y in zip(rec_lang_4_pred['score_like'], receptive_language_dataset.score)])


# ## Expressive Language Model

# In[72]:


expressive_language_dataset = analysis_subset[(analysis_subset.domain == 'Expressive Language')]


# In[73]:


expressive_language_4 = generate_model(expressive_language_dataset, 4)


# In[74]:


with expressive_language_4:
    exp_lang_4_trace = sample(iterations, tune=tuning, cores=2)


# In[75]:


_, axes = az.plot_forest(exp_lang_4_trace, var_names=['β'], combined=True)
axes[0].set_title('Expressive Language')
axes[0].vlines(0, *axes[0].get_ylim(), linestyles='dotted')
axes[0].set_yticklabels(labels[::-1]);


# In[76]:


energyplot(exp_lang_4_trace)


# In[77]:


az.summary(exp_lang_4_trace, var_names=['β']).set_index(pd.Index(labels))


# ## Articulation Model

# In[78]:


articulation_dataset = analysis_subset[(analysis_subset.domain == 'Articulation')]


# In[79]:


articulation_4 = generate_model(articulation_dataset, 4)


# In[80]:


with articulation_4:
    artic_4_trace = sample(iterations, tune=tuning, cores=2)


# In[81]:


energyplot(artic_4_trace)


# In[82]:


_, axes = az.plot_forest(artic_4_trace, var_names=['β'], combined=True)
axes[0].set_title('Articulation')
axes[0].vlines(0, *axes[0].get_ylim(), linestyles='dotted')
axes[0].set_yticklabels(labels[::-1]);


# In[83]:


az.summary(artic_4_trace, var_names=['β']).set_index(pd.Index(labels))


# ## Expressive Vocabulary Model

# In[84]:


expressive_vocab_dataset = analysis_subset[(analysis_subset.domain == 'Expressive Vocabulary')]


# In[85]:


expressive_vocab_4 = generate_model(expressive_vocab_dataset, 4)


# In[86]:


with expressive_vocab_4:
    expressive_vocab_4_trace = sample(iterations, tune=tuning, cores=2)


# In[87]:


_, axes = az.plot_forest(expressive_vocab_4_trace, var_names=['β'], combined=True)
axes[0].set_title('Expressive Vocabulary')
axes[0].vlines(0, *axes[0].get_ylim(), linestyles='dotted')
axes[0].set_yticklabels(labels[::-1]);


# In[88]:


energyplot(expressive_vocab_4_trace)


# In[89]:


az.summary(expressive_vocab_4_trace, var_names=['β']).set_index(pd.Index(labels))


# ## Receptive Vocabulary Model

# In[90]:


receptive_vocab_dataset = analysis_subset[(analysis_subset.domain == 'Receptive Vocabulary')]


# In[91]:


receptive_vocab_4 = generate_model(receptive_vocab_dataset, 4)


# In[92]:


with receptive_vocab_4:
    receptive_vocab_4_trace = sample(iterations, tune=tuning, cores=2)


# In[93]:


_, axes = az.plot_forest(receptive_vocab_4_trace, var_names=['β'], combined=True)
axes[0].set_title('Receptive Vocabulary')
axes[0].vlines(0, *axes[0].get_ylim(), linestyles='dotted')
axes[0].set_yticklabels(labels[::-1]);


# In[94]:


energyplot(receptive_vocab_4_trace)


# In[95]:


az.summary(receptive_vocab_4_trace, var_names=['β']).set_index(pd.Index(labels))


# Posterior predictive check for the articulation model, as above.

# In[96]:


with articulation_4:
    artic_4_pred = sample_posterior_predictive(artic_4_trace,
                                               vars=[articulation_4.score_like])


# In[97]:


from scipy.stats import percentileofscore

plt.hist([np.round(percentileofscore(x, y) / 100, 2)
          for x, y in zip(artic_4_pred['score_like'], articulation_dataset.score)])
# ### Publication graphics

# In[98]:


import arviz as az

az.style.use('arviz-whitegrid')

model_names = ['Receptive Vocabulary', 'Expressive Vocabulary', 'Receptive Language',
               'Expressive Language', 'Articulation']

_, axes = az.plot_forest([receptive_vocab_4_trace, expressive_vocab_4_trace,
                          rec_lang_4_trace, exp_lang_4_trace, artic_4_trace],
                         model_names=model_names,
                         combined=True,
                         var_names=['β'])

# axes[0].set_title('Estimated covariates for test domains')
# axes[0].set_xlim(-15, 15)
# axes[0].axvline(linestyle=":", linewidth=0.5, color='k')
# ticklabels = [labels[int((i-2)/5)] if not (i % 5 - 2) else ''
#               for i, label in enumerate(axes[0].get_yticklabels())][::-1]
# axes[0].set_yticklabels(ticklabels)
# axes[0].tick_params(axis='both', which='major', labelsize=10)
# lines = axes[0].get_lines()[:6:]
# axes[0].legend(lines, model_names[::-1], loc='center right',
#                bbox_to_anchor=(1.9, 0.5),
#                fancybox=True, fontsize=10);


# In[ ]: