#!/usr/bin/env python
# coding: utf-8

# # Technology Effects Analysis

# In[1]:

# Import modules and set options
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 100)

# Import data

# In[2]:

lsl_dr = pd.read_csv('lsl_dr.csv', index_col=0)

# In[3]:

lsl_dr.head()

# Indicator for non-profound hearing loss

# In[4]:

lsl_dr['deg_hl_below6'] = lsl_dr.degree_hl < 6
lsl_dr.loc[lsl_dr.degree_hl.isnull(), 'deg_hl_below6'] = np.nan

# Indicator for first intervention outside OPTION

# In[5]:

lsl_dr['int_outside_option'] = lsl_dr.age > lsl_dr.age_int
lsl_dr.loc[lsl_dr.age < lsl_dr.age_int, 'int_outside_option'] = np.nan

# Indicator for high school graduation of mother

# In[6]:

lsl_dr['mother_hs'] = lsl_dr.mother_ed > 1
lsl_dr.loc[lsl_dr.mother_ed.isnull(), 'mother_hs'] = np.nan

# Joint Committee on Infant Hearing (JCIH) benchmark: hearing loss identified by
# 3 months of age and enrollment in an intervention program by 6 months

# In[7]:

lsl_dr.onset_1.unique()

# In[8]:

lsl_dr['ident_by_3'] = ident_by_3 = lsl_dr.onset_1 <= 3
lsl_dr['program_by_6'] = program_by_6 = lsl_dr.age <= 6
lsl_dr['jcih'] = ident_by_3 & program_by_6
lsl_dr.loc[lsl_dr.onset_1.isnull() | lsl_dr.age.isnull(), 'jcih'] = np.nan

# Create age in years variable

# In[9]:

lsl_dr['age_years'] = lsl_dr.age / 12.

# Retain only 4-year-olds

# In[10]:

lsl_dr = lsl_dr[(lsl_dr.age_test >= 48) & (lsl_dr.age_test < 60)]

# In[11]:

lsl_dr.shape

# In[12]:

lsl_dr.study_id.unique().shape

# ## Summary Statistics for Paper

# In[13]:

unique_students = lsl_dr.drop_duplicates(subset='study_id')

# In[14]:

unique_students.shape

# In[15]:

unique_students.age.describe()

# In[16]:

unique_students.male.mean()

# In[17]:

unique_students.male.value_counts()

# In[18]:

unique_students.sib.describe()

# In[19]:

(unique_students.degree_hl == 6).describe()

# In[20]:

unique_students.deg_hl_below6.isnull().sum()

# In[21]:

unique_students.deg_hl_below6.value_counts()

# In[22]:

((unique_students.degree_hl < 6) & (unique_students.degree_hl > 1)).describe()

# In[23]:

unique_students.mother_hs.describe()

# In[24]:

unique_students.synd_or_disab.describe()

# In[25]:

unique_students.groupby('male').program_by_6.value_counts()

# In[26]:

unique_students.groupby('sib').program_by_6.value_counts()

# In[27]:

unique_students.groupby('deg_hl_below6').program_by_6.value_counts()

# In[28]:

unique_students.groupby('mother_hs').program_by_6.value_counts()

# In[29]:

unique_students.groupby('synd_or_disab').program_by_6.value_counts()

# In[30]:

unique_students.groupby('ident_by_3').program_by_6.value_counts()

# In[31]:

unique_students.groupby('jcih').program_by_6.value_counts()
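# The pairwise tabulations above can also be produced in one pass. This is a
# convenience sketch, not part of the original analysis; it uses only standard
# pandas (`pd.crosstab`), which drops missing values just as `value_counts` does.

# In[ ]:

for var in ['male', 'sib', 'deg_hl_below6', 'mother_hs', 'synd_or_disab',
            'ident_by_3', 'jcih']:
    # Rows: grouping variable; columns: met the 6-month enrollment benchmark
    print(pd.crosstab(unique_students[var], unique_students.program_by_6))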
# In[32]:

grouped_age = (unique_students.replace({'jcih': {1: 'JCIH', 0: 'non-JCIH'}})
                              .groupby('jcih')['age_years'])
fig, ax = plt.subplots()
for k, v in grouped_age:
    v.hist(label=k, alpha=.75, ax=ax, grid=False)
ax.legend()
plt.xlabel('Age (years)'); plt.ylabel('Frequency');

# In[33]:

students_data = lsl_dr.copy()

# In[34]:

students_data['school_id'] = students_data.school.replace(
    {int(s): i for i, s in enumerate(students_data.school.unique())})

# In[35]:

students_data = students_data[students_data.domain == 'Language']

# In[36]:

students_data.shape

# In[37]:

# `take_last` was removed from pandas; `keep='last'` is the current equivalent
option_scores = students_data.drop_duplicates('study_id', keep='last')[
    ['score', 'school_id', 'deg_hl_below6', 'mother_hs']].dropna()

# In[38]:

option_scores = option_scores.astype(int)

# In[39]:

option_scores.to_csv('option_scores.csv', index=False)

# Create technology indicators

# In[40]:

unimodal = ~lsl_dr[[u'bilateral_ci', u'bilateral_ha', u'bimodal']].sum(1).astype(bool)

# In[41]:

lsl_dr['unimodal_ci'] = unimodal & lsl_dr.cochlear
lsl_dr['unimodal_ha'] = unimodal & lsl_dr.hearing_aid

# In[42]:

lsl_dr[[u'bilateral_ci', u'bilateral_ha', u'bimodal', 'unimodal_ci', 'unimodal_ha']].sum(0)

# Since bilateral hearing aid is the most prevalent category, it is used as the
# (omitted) baseline category, so the technology coefficients in the models below
# are interpreted relative to bilateral hearing aid users.

# In[43]:

tech_index = lsl_dr[[u'bilateral_ci', u'bilateral_ha', u'bimodal', 'unimodal_ci',
                     'unimodal_ha']].sum(1).astype(bool)
technology_subset = lsl_dr[tech_index]

# In[44]:

technology_subset.score.max()

# In[45]:

((unique_students.cochlear > 0) & (unique_students.age_amp.notnull())).sum()

# ## Receptive Language Test Score Model

# In[46]:

# Filter for the Language domain, receptive tests, and English-only students
receptive_language_dataset = technology_subset[(technology_subset.domain == 'Language')
                                               & (technology_subset.test_type == 'receptive')
                                               & (technology_subset.non_english == False)]
receptive_language_dataset.head()

# This is the size of the resulting dataset to be used in this analysis

# In[47]:

receptive_language_dataset.shape

# In[48]:

mean_score = receptive_language_dataset.groupby('study_id')['score'].mean()

# In[49]:

receptive_language_dataset = receptive_language_dataset.drop_duplicates('study_id')

# In[50]:

receptive_language_dataset = receptive_language_dataset.set_index('study_id')

# In[51]:

receptive_language_dataset['mean_score'] = mean_score

# Specify criterion to use: `jcih`, `ident_by_3`, `program_by_6`

# In[52]:

criterion_labels = {'jcih': 'JCIH',
                    'ident_by_3': 'Diagnosed by 3 months',
                    'program_by_6': 'Intervention by 6 months'}
criterion = 'jcih'

# Proportion of missing values for each variable

# In[53]:

covs = ['age_years', 'school', 'score', 'male', 'time', 'family_inv', 'sib',
        'deg_hl_below6', 'mother_hs', 'synd_or_disab'] + [criterion]
(receptive_language_dataset[covs].isnull().sum() / receptive_language_dataset.shape[0]).round(2)

# Drop individuals with missing `age`, since they account for less than 1% of the sample

# In[54]:

receptive_language_dataset = receptive_language_dataset[receptive_language_dataset.age.notnull()]

# In[55]:

receptive_language_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');

# In[56]:

receptive_language_dataset.mother_ed.hist()

# Final analysis dataset size

# In[57]:

receptive_language_dataset.shape

# In[58]:

missing_pct = (receptive_language_dataset[covs].isnull().mean() * 100).round(1)
# Series.sort was removed from pandas; sort_values is the current equivalent
missing_pct = missing_pct.sort_values(ascending=False)
missing_pct

# In[59]:

receptive_language_dataset['age_amp_missing'] = receptive_language_dataset.age_amp.isnull()
by_age_amp_missing = receptive_language_dataset.groupby('age_amp_missing')

# In[60]:

by_age_amp_missing[['male', 'synd_or_disab', 'deg_hl_below6', 'mother_hs', 'non_english']].mean()

# In[61]:

receptive_language_dataset['jcih_missing'] = receptive_language_dataset.jcih.isnull()
by_jcih_missing = receptive_language_dataset.groupby('jcih_missing')

# In[62]:

by_jcih_missing[['male', 'synd_or_disab', 'deg_hl_below6', 'mother_hs', 'non_english']].mean()

# In[63]:

receptive_language_dataset.age_years.mean()
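# For reference, the model implemented below is a normal hierarchical linear model
# with school-level random intercepts (this summary is inferred from the code and
# stated here for readability):
#
# $$y_i \sim N(\theta_i, \sigma^2), \qquad
# \theta_i = \mu + \alpha_{s[i]} + X_i \beta, \qquad
# \alpha_j \sim N(0, \sigma_{school}^2)$$
#
# with weakly-informative priors $\mu \sim N(100, 1000)$, $\beta_k \sim N(0, 1000)$,
# and uniform priors on $\sigma$ and $\sigma_{school}$. Missing covariates
# (family involvement, maternal education, disability, siblings) are imputed with
# Bernoulli/categorical models inside the same graph.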
# In[66]:

from pymc import Bernoulli, Normal, Uniform, invlogit, logit, deterministic, MCMC, Matplot, MAP
from pymc import Lambda, Dirichlet, Categorical, Beta, NegativeBinomial, Exponential, T
from numpy.ma import masked_values, masked_invalid

n_covs = 11

def generate_model(dataset):

    # Extract data to local variables
    age_years = dataset.age_years.values
    school = dataset.school.values
    score = dataset.score.values
    non_english = dataset.non_english.astype(int).values
    male = dataset.male.values
    sib = dataset.sib.values
    synd_or_disab = dataset.synd_or_disab.values
    int_outside_option = dataset.int_outside_option.values
    study_id = dataset.index.values
    ident_by_3 = dataset.ident_by_3
    program_by_6 = dataset.program_by_6
    bilateral_ci = dataset.bilateral_ci
    bimodal = dataset.bimodal
    unimodal_ci = dataset.unimodal_ci
    unimodal_ha = dataset.unimodal_ha

    # Transform some data
    age_std = age_years - age_years.mean()

    # Imputation of family involvement: fill missing values with a sentinel,
    # then mask the sentinel so PyMC imputes those entries
    n_family_inv_cats = len(dataset.family_inv.unique())
    p_family_inv = Dirichlet("p_family_inv", np.ones(n_family_inv_cats),
                             value=[1. / n_family_inv_cats] * (n_family_inv_cats - 1))
    family_inv_masked = masked_values(dataset.family_inv.fillna(0.5), value=0.5)
    x_family_inv = Categorical('x_family_inv', p_family_inv, value=family_inv_masked,
                               observed=True)

    # Imputation of maternal education
    # (masked_values cannot match NaN, so use masked_invalid to mask missing entries)
    p_mother_hs = Beta("p_mother_hs", 1, 1, value=0.5)
    mother_hs_masked = masked_invalid(dataset.mother_hs.astype(float).values)
    x_mother_hs = Bernoulli('x_mother_hs', p_mother_hs, value=mother_hs_masked,
                            observed=True)

    # Imputation of previous disability
    p_synd_or_disab = Beta("p_synd_or_disab", 1, 1, value=0.5)
    synd_or_disab_masked = masked_invalid(dataset.synd_or_disab.astype(float).values)
    x_synd_or_disab = Bernoulli('x_synd_or_disab', p_synd_or_disab,
                                value=synd_or_disab_masked, observed=True)

    # Imputation of siblings
    n_sib_cats = len(dataset.sib.unique())
    p_sib = Dirichlet("p_sib", np.ones(n_sib_cats),
                      value=[1. / n_sib_cats] * (n_sib_cats - 1))
    sib_masked = masked_values(dataset.sib.fillna(0.5), value=0.5)
    x_sib = Categorical('x_sib', p_sib, value=sib_masked, observed=True)

    mean_score = Normal("mean_score", 100, 0.001, value=80)

    # Indices to school random effects
    unique_schools = np.unique(school)
    school_index = [list(unique_schools).index(s) for s in school]

    # School random effect
    sigma_school = Uniform("sigma_school", 0, 1000, value=10)
    tau_school = sigma_school ** -2
    alpha_school = Normal("alpha_school", mu=0, tau=tau_school,
                          value=np.zeros(len(unique_schools)))

    @deterministic
    def intercept(a0=mean_score, a1=alpha_school):
        """Random intercepts"""
        return a0 + a1[school_index]

    # Fixed effects
    beta = Normal("beta", 0, 0.001, value=[0] * n_covs)

    @deterministic
    def theta(b0=intercept, b=beta, x_family_inv=x_family_inv, x_sib=x_sib,
              x_mother_hs=x_mother_hs, x_synd_or_disab=x_synd_or_disab):

        # Complete predictors
        X = [male, ident_by_3, program_by_6, bilateral_ci, bimodal,
             unimodal_ci, unimodal_ha]
        # Partially-imputed predictors
        X += [x_family_inv, x_sib, x_mother_hs, x_synd_or_disab]

        return b0 + np.dot(b, X)  #+ a[child_index]

    sigma = Uniform("sigma", 0, 1000, value=10)
    tau = sigma ** -2

    # Data likelihood
    score_like = Normal("score_like", mu=theta, tau=tau, value=score, observed=True)
    #score_like_pred = Normal("score_like_pred", mu=theta, tau=tau, value=score)

    return locals()

# In[67]:

M_reclang = MCMC(generate_model(receptive_language_dataset), db='sqlite',
                 name='reclang', dbmode='w')

# In[68]:

iterations = 100000
burn = 90000

# In[69]:

M_reclang.sample(iterations, burn)

# In[70]:

# Labels for the 11 fixed-effect coefficients, in the order they enter `theta`
labels = ['Male', 'Identified by 3 mo.', 'In Program by 6 mo.',
          'Bilateral CI', 'Bimodal', 'Unimodal CI', 'Unimodal HA',
          'Reduced Family Involvement', 'Sibling Count',
          'Mother with HS Diploma', 'Additional Disability']

# In[71]:

Matplot.summary_plot([M_reclang.beta], rhat=False, custom_labels=labels,
                     x_range=(-15, 15), main='Receptive Language')
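# The same posterior summaries can be collected into a table for the paper.
# A convenience sketch, not part of the original workflow; it assumes PyMC2's
# `stats()` dict exposes 'mean' and 'standard deviation' entries for vector nodes.

# In[ ]:

beta_stats = M_reclang.beta.stats()
coef_table = pd.DataFrame({'mean': beta_stats['mean'],
                           'sd': beta_stats['standard deviation']},
                          index=labels)
coef_table.round(2)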
# In[73]:

M_reclang.beta.summary()

# ## Expressive Language Model

# In[74]:

expressive_language_dataset = technology_subset[(technology_subset.domain == 'Language')
                                                & (technology_subset.test_type == 'expressive')
                                                & (technology_subset.non_english == False)]
expressive_language_dataset.head()

# In[75]:

mean_score = expressive_language_dataset.groupby('study_id')['score'].mean()
expressive_language_dataset = expressive_language_dataset.drop_duplicates('study_id').set_index('study_id')
expressive_language_dataset['mean_score'] = mean_score

# In[76]:

(expressive_language_dataset[covs].isnull().sum() / expressive_language_dataset.shape[0]).round(2)

# In[77]:

expressive_language_dataset = expressive_language_dataset[
    expressive_language_dataset.age.notnull()
    & expressive_language_dataset.int_outside_option.notnull()]

# In[78]:

expressive_language_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');

# In[79]:

M_explang = MCMC(generate_model(expressive_language_dataset), db='sqlite',
                 name='explang', dbmode='w')

# In[80]:

M_explang.sample(iterations, burn)

# In[81]:

M_explang.beta.summary()

# In[82]:

Matplot.summary_plot([M_explang.beta], custom_labels=labels, rhat=False,
                     main='Expressive Language', x_range=(-15, 15))

# ## Articulation Model

# In[84]:

articulation_dataset = technology_subset[(technology_subset.domain == 'Articulation')
                                         & (technology_subset.non_english == False)]
articulation_dataset.head()

# In[85]:

mean_score = articulation_dataset.groupby('study_id')['score'].mean()
articulation_dataset = articulation_dataset.drop_duplicates('study_id').set_index('study_id')
articulation_dataset['mean_score'] = mean_score

# In[86]:

(articulation_dataset[covs].isnull().sum() / articulation_dataset.shape[0]).round(2)

# In[87]:

articulation_dataset = articulation_dataset[articulation_dataset.age.notnull()
                                            & articulation_dataset.int_outside_option.notnull()]

# In[88]:

articulation_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');

# In[89]:

M_articulation = MCMC(generate_model(articulation_dataset), db='sqlite',
                      name='articulation', dbmode='w')

# In[90]:

M_articulation.sample(iterations, burn)

# In[91]:

Matplot.summary_plot([M_articulation.beta], custom_labels=labels, rhat=False,
                     main='Articulation', x_range=(-15, 15))

# In[93]:

M_articulation.beta.summary()

# ## Expressive Vocabulary Model

# In[94]:

expressive_vocab_dataset = technology_subset[(technology_subset.domain == 'Expressive Vocabulary')
                                             & (technology_subset.non_english == False)]
expressive_vocab_dataset.head()

# In[95]:

mean_score = expressive_vocab_dataset.groupby('study_id')['score'].mean()
expressive_vocab_dataset = expressive_vocab_dataset.drop_duplicates('study_id').set_index('study_id')
expressive_vocab_dataset['mean_score'] = mean_score

# In[96]:

expressive_vocab_dataset[covs].isnull().sum()

# In[97]:

expressive_vocab_dataset = expressive_vocab_dataset[expressive_vocab_dataset.age.notnull()
                                                    & expressive_vocab_dataset.male.notnull()
                                                    & expressive_vocab_dataset.int_outside_option.notnull()]

# In[98]:

expressive_vocab_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');

# In[99]:

M_expressive_vocab = MCMC(generate_model(expressive_vocab_dataset), db='sqlite',
                          name='expressive_vocab', dbmode='w')

# In[100]:

M_expressive_vocab.sample(iterations, burn)
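# Before reading the coefficient plot, a quick visual convergence check is
# worthwhile. A minimal sketch using PyMC2's built-in trace/autocorrelation/
# histogram plots (optional; the same check applies to every model above):

# In[ ]:

Matplot.plot(M_expressive_vocab.beta)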
# In[101]:

Matplot.summary_plot([M_expressive_vocab.beta], custom_labels=labels, rhat=False,
                     main='Expressive Vocabulary', x_range=(-15, 15))

# In[102]:

Matplot.summary_plot([M_expressive_vocab.beta], custom_labels=labels, rhat=False,
                     main='Expressive Vocabulary', x_range=(-10, 10))

# In[103]:

M_expressive_vocab.beta.summary()

# In[104]:

Matplot.summary_plot(M_explang.alpha_school, custom_labels=[' '] * 42,
                     main='Expressive language school effects', x_range=(-25, 25))

# ## Receptive Vocabulary Model

# In[106]:

receptive_vocab_dataset = technology_subset[(technology_subset.domain == 'Receptive Vocabulary')
                                            & (technology_subset.non_english == False)]
receptive_vocab_dataset.head()

# In[107]:

mean_score = receptive_vocab_dataset.groupby('study_id')['score'].mean()
receptive_vocab_dataset = receptive_vocab_dataset.drop_duplicates('study_id').set_index('study_id')
receptive_vocab_dataset['mean_score'] = mean_score

# In[108]:

receptive_vocab_dataset[covs].isnull().sum()

# In[109]:

receptive_vocab_dataset = receptive_vocab_dataset[receptive_vocab_dataset.age.notnull()
                                                  & receptive_vocab_dataset.male.notnull()
                                                  & receptive_vocab_dataset.int_outside_option.notnull()]

# In[110]:

receptive_vocab_dataset.age_int.hist(bins=40)

# In[111]:

receptive_vocab_dataset.shape

# In[112]:

receptive_vocab_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');

# In[113]:

M_receptive_vocab = MCMC(generate_model(receptive_vocab_dataset), db='sqlite',
                         name='receptive_vocab', dbmode='w')

# In[114]:

M_receptive_vocab.sample(iterations, burn)

# In[115]:

#order = np.argsort(M.beta.stats()['mean'])[::-1]
Matplot.summary_plot([M_receptive_vocab.beta], rhat=False, custom_labels=labels,
                     main='Receptive Vocabulary', x_range=(-15, 15))

# In[117]:

M_receptive_vocab.beta.summary()

# ## Database queries

# Each model's sqlite trace database contains one table per traced variable;
# the `v1` column holds the sampled values and `trace` indexes the chain.

# In[118]:

import sqlite3

# In[119]:

db_files = get_ipython().getoutput('ls *sqlite')

# In[120]:

articulation = sqlite3.connect("articulation.sqlite")

# In[121]:

c_artic = articulation.cursor()

# In[122]:

c_artic.execute('SELECT v1 FROM sigma_school WHERE trace==0')

# In[123]:

sigma_school_artic = np.ravel(c_artic.fetchall())

# In[124]:

sigma_school_artic.mean()

# In[125]:

# Posterior median and central 95% interval of the between-school SD, per model
for dbname in db_files:
    db = sqlite3.connect(dbname)
    cur = db.cursor()
    cur.execute('SELECT v1 FROM sigma_school WHERE trace==0')
    trace = np.ravel(cur.fetchall())
    trace.sort()
    print('\n' + dbname)
    print(np.median(trace))
    print(trace[[int(len(trace) * 0.025), int(len(trace) * 0.975)]])
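# The loop above, condensed into a single summary table: one column per model,
# rows giving the posterior 2.5%, 50%, and 97.5% quantiles of the between-school
# SD. A sketch under the same backend assumptions (each database holds a
# `sigma_school` table with the scalar trace in `v1`).

# In[ ]:

school_sd = {}
for dbname in db_files:
    db = sqlite3.connect(dbname)
    values = np.ravel(db.cursor().execute(
        'SELECT v1 FROM sigma_school WHERE trace==0').fetchall())
    school_sd[dbname.replace('.sqlite', '')] = pd.Series(values).quantile([0.025, 0.5, 0.975])
    db.close()
pd.DataFrame(school_sd).round(2)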