# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 100)
Import data
lsl_dr = pd.read_csv('lsl_dr.csv', index_col=0)
lsl_dr.head()
Indicator for non-profound hearing loss
lsl_dr['deg_hl_below6'] = lsl_dr.degree_hl<6
lsl_dr.loc[lsl_dr.degree_hl.isnull(), 'deg_hl_below6'] = np.nan
Indicator for first intervention outside OPTION
lsl_dr['int_outside_option'] = lsl_dr.age > lsl_dr.age_int
lsl_dr.loc[lsl_dr.age < lsl_dr.age_int, 'int_outside_option'] = np.nan
Indicator for high school graduation of mother
lsl_dr['mother_hs'] = lsl_dr.mother_ed > 1
lsl_dr.loc[lsl_dr.mother_ed.isnull(), 'mother_hs'] = np.nan
Joint Committee on Infant Hearing (JCIH) benchmark
lsl_dr.onset_1.unique()
lsl_dr['ident_by_3'] = ident_by_3 = lsl_dr.onset_1 <= 3
lsl_dr['program_by_6'] = program_by_6 = lsl_dr.age <= 6
lsl_dr['jcih'] = ident_by_3 & program_by_6
lsl_dr.loc[lsl_dr.onset_1.isnull() | lsl_dr.age.isnull(), 'jcih'] = np.nan
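The compute-then-null pattern used for these indicators can also be written in a single expression with Series.mask, which replaces values with NaN wherever the condition holds; a sketch of an equivalent form for the first indicator (illustrative only, not a change to the analysis):
# mask() nulls the indicator wherever degree_hl itself is missing
lsl_dr['deg_hl_below6'] = (lsl_dr.degree_hl < 6).mask(lsl_dr.degree_hl.isnull())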
Create age in years variable
lsl_dr['age_years'] = lsl_dr.age/12.
Retain only 4-year-olds (age at test between 48 and 59 months)
lsl_dr = lsl_dr[(lsl_dr.age_test>=48) & (lsl_dr.age_test<60)]
lsl_dr.shape
lsl_dr.study_id.unique().shape
unique_students = lsl_dr.drop_duplicates(subset='study_id')
unique_students.shape
unique_students.age.describe()
unique_students.male.mean()
unique_students.male.value_counts()
unique_students.sib.describe()
(unique_students.degree_hl == 6).describe()
unique_students.deg_hl_below6.isnull().sum()
unique_students.deg_hl_below6.value_counts()
((unique_students.degree_hl < 6) & (unique_students.degree_hl > 1)).describe()
unique_students.mother_hs.describe()
unique_students.synd_or_disab.describe()
unique_students.groupby('male').program_by_6.value_counts()
unique_students.groupby('sib').program_by_6.value_counts()
unique_students.groupby('deg_hl_below6').program_by_6.value_counts()
unique_students.groupby('mother_hs').program_by_6.value_counts()
unique_students.groupby('synd_or_disab').program_by_6.value_counts()
unique_students.groupby('ident_by_3').program_by_6.value_counts()
unique_students.groupby('jcih').program_by_6.value_counts()
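Each of the grouped value counts above is a two-way contingency table; pd.crosstab produces the same counts in a single call. A sketch of the equivalent for sex:
# Contingency table of sex against program enrollment by 6 months
pd.crosstab(unique_students.male, unique_students.program_by_6)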
grouped_age = unique_students.replace({'jcih':{1:'JCIH', 0:'non-JCIH'}}).groupby('jcih')['age_years']
fig, ax = plt.subplots()
for k, v in grouped_age:
    v.hist(label=k, alpha=.75, ax=ax, grid=False)
ax.legend()
plt.xlabel('Age (years)'); plt.ylabel('Frequency');
students_data = lsl_dr.copy()
students_data['school_id'] = students_data.school.replace({int(s):i for i,s in enumerate(students_data.school.unique())})
students_data = students_data[students_data.domain=='Language']
students_data.shape
option_scores = students_data.drop_duplicates('study_id', keep='last')[['score', 'school_id',
                                                                        'deg_hl_below6', 'mother_hs']].dropna()
option_scores = option_scores.astype(int)
option_scores.to_csv('option_scores.csv', index=False)
Create technology indicators
# True when a child has none of the bilateral/bimodal technology combinations
unimodal = ~lsl_dr[['bilateral_ci',
                    'bilateral_ha',
                    'bimodal']].sum(1).astype(bool)
lsl_dr['unimodal_ci'] = unimodal & lsl_dr.cochlear
lsl_dr['unimodal_ha'] = unimodal & lsl_dr.hearing_aid
lsl_dr[['bilateral_ci',
        'bilateral_ha',
        'bimodal', 'unimodal_ci', 'unimodal_ha']].sum(0)
Since bilateral hearing aid is the most prevalent category, it will be used as the baseline category.
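Concretely, the models below include indicator columns for every technology category except bilateral_ha, whose effect is absorbed into the intercept; each technology coefficient is then a shift relative to bilateral hearing aid users. A minimal sketch of the corresponding design-matrix columns (illustrative only; generate_model assembles these inside the model):
# bilateral_ha is deliberately omitted as the baseline category
tech_predictors = ['bilateral_ci', 'bimodal', 'unimodal_ci', 'unimodal_ha']
X_tech = lsl_dr[tech_predictors].fillna(False).astype(int)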
tech_index = lsl_dr[['bilateral_ci',
                     'bilateral_ha',
                     'bimodal', 'unimodal_ci', 'unimodal_ha']].sum(1).astype(bool)
technology_subset = lsl_dr[tech_index]
technology_subset.score.max()
((unique_students.cochlear>0) & (unique_students.age_amp.notnull())).sum()
# Filter for the Language domain, receptive tests, and English-only students
receptive_language_dataset = technology_subset[(technology_subset.domain=='Language')
                                               & (technology_subset.test_type=='receptive')
                                               & (technology_subset.non_english==False)]
receptive_language_dataset.head()
This is the size of the resulting dataset to be used in this analysis
receptive_language_dataset.shape
# Compute each student's mean score across all test administrations, then
# collapse to one row per student; the means align on the new study_id index
mean_score = receptive_language_dataset.groupby('study_id')['score'].mean()
receptive_language_dataset = receptive_language_dataset.drop_duplicates('study_id')
receptive_language_dataset = receptive_language_dataset.set_index('study_id')
receptive_language_dataset['mean_score'] = mean_score
Specify criterion to use: jcih, ident_by_3, or program_by_6
criterion_labels = {'jcih': 'JCIH',
                    'ident_by_3': 'Diagnosed by 3 months',
                    'program_by_6': 'Intervention by 6 months'}
criterion = 'jcih'
Proportion of missing values for each variable
covs = ['age_years', 'school', 'score', 'male', 'time',
        'family_inv', 'sib', 'deg_hl_below6', 'mother_hs', 'synd_or_disab'] + [criterion]
(receptive_language_dataset[covs].isnull().sum() / receptive_language_dataset.shape[0]).round(2)
Drop individuals with missing age, since they comprise less than 1% of the sample
receptive_language_dataset = receptive_language_dataset[receptive_language_dataset.age.notnull()]
receptive_language_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
receptive_language_dataset.mother_ed.hist()
Final analysis dataset size
receptive_language_dataset.shape
missing_pct = (receptive_language_dataset[covs].isnull().mean()*100).round(1)
missing_pct = missing_pct.sort_values(ascending=False)
missing_pct
receptive_language_dataset['age_amp_missing'] = receptive_language_dataset.age_amp.isnull()
by_age_amp_missing = receptive_language_dataset.groupby('age_amp_missing')
by_age_amp_missing[['male', 'synd_or_disab', 'deg_hl_below6', 'mother_hs',
                    'non_english']].mean()
receptive_language_dataset['jcih_missing'] = receptive_language_dataset.jcih.isnull()
by_jcih_missing = receptive_language_dataset.groupby('jcih_missing')
by_jcih_missing[['male', 'synd_or_disab', 'deg_hl_below6', 'mother_hs',
                 'non_english']].mean()
receptive_language_dataset.age_years.mean()
from pymc import (Bernoulli, Beta, Categorical, Dirichlet, Normal, Uniform,
                  deterministic, MCMC, Matplot)
from numpy.ma import masked_values, masked_invalid
n_covs = 11
def generate_model(dataset):
    """Build the PyMC2 mixed-effects model (with imputation) for a score dataset."""
    # Extract data to local variables
    age_years = dataset.age_years.values
    school = dataset.school.values
    score = dataset.score.values
    non_english = dataset.non_english.astype(int).values
    male = dataset.male.values
    sib = dataset.sib.values
    synd_or_disab = dataset.synd_or_disab.values
    int_outside_option = dataset.int_outside_option.values
    study_id = dataset.index.values
    ident_by_3 = dataset.ident_by_3
    program_by_6 = dataset.program_by_6
    bilateral_ci = dataset.bilateral_ci
    bimodal = dataset.bimodal
    unimodal_ci = dataset.unimodal_ci
    unimodal_ha = dataset.unimodal_ha
    # Transform some data
    age_std = age_years - age_years.mean()
    # Imputation of family involvement: fill missing values with a sentinel,
    # then mask the sentinel so PyMC treats those entries as latent
    n_family_inv_cats = len(dataset.family_inv.unique())
    p_family_inv = Dirichlet("p_family_inv", np.ones(n_family_inv_cats),
                             value=[1./n_family_inv_cats]*(n_family_inv_cats-1))
    family_inv_masked = masked_values(dataset.family_inv.fillna(0.5), value=0.5)
    x_family_inv = Categorical('x_family_inv', p_family_inv, value=family_inv_masked,
                               observed=True)
    # Imputation of maternal education (masked_invalid masks the NaN entries;
    # masked_values with value=np.nan would not mask them, since NaN != NaN)
    p_mother_hs = Beta("p_mother_hs", 1, 1, value=0.5)
    mother_hs_masked = masked_invalid(dataset.mother_hs.values.astype(float))
    x_mother_hs = Bernoulli('x_mother_hs', p_mother_hs, value=mother_hs_masked, observed=True)
    # Imputation of previous disability
    p_synd_or_disab = Beta("p_synd_or_disab", 1, 1, value=0.5)
    synd_or_disab_masked = masked_invalid(dataset.synd_or_disab.values.astype(float))
    x_synd_or_disab = Bernoulli('x_synd_or_disab', p_synd_or_disab, value=synd_or_disab_masked,
                                observed=True)
    # Imputation of siblings
    n_sib_cats = len(dataset.sib.unique())
    p_sib = Dirichlet("p_sib", np.ones(n_sib_cats), value=[1./n_sib_cats]*(n_sib_cats-1))
    sib_masked = masked_values(dataset.sib.fillna(0.5), value=0.5)
    x_sib = Categorical('x_sib', p_sib, value=sib_masked, observed=True)
    mean_score = Normal("mean_score", 100, 0.001, value=80)
    # Indices to school random effects
    unique_schools = np.unique(school)
    school_index = [list(unique_schools).index(s) for s in school]
    # School random effect
    sigma_school = Uniform("sigma_school", 0, 1000, value=10)
    tau_school = sigma_school**-2
    alpha_school = Normal("alpha_school", mu=0, tau=tau_school,
                          value=np.zeros(len(unique_schools)))

    @deterministic
    def intercept(a0=mean_score, a1=alpha_school):
        """Random intercepts"""
        return a0 + a1[school_index]

    # Fixed effects
    beta = Normal("beta", 0, 0.001, value=[0]*n_covs)

    @deterministic
    def theta(b0=intercept, b=beta, x_family_inv=x_family_inv, x_sib=x_sib,
              x_mother_hs=x_mother_hs, x_synd_or_disab=x_synd_or_disab):
        # Complete predictors
        X = [male, ident_by_3, program_by_6, bilateral_ci, bimodal, unimodal_ci, unimodal_ha]
        # Partially-imputed predictors
        X += [x_family_inv, x_sib, x_mother_hs, x_synd_or_disab]
        return b0 + np.dot(b, X)

    sigma = Uniform("sigma", 0, 1000, value=10)
    tau = sigma**-2
    # Data likelihood
    score_like = Normal("score_like", mu=theta, tau=tau, value=score, observed=True)
    #score_like_pred = Normal("score_like_pred", mu=theta, tau=tau, value=score)
    return locals()
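The imputation steps above all rely on PyMC2's masked-array mechanism: in an observed node, any masked entries are treated as latent variables and sampled along with the model parameters. A minimal self-contained sketch of the pattern on made-up data:
import numpy as np
from numpy.ma import masked_values
from pymc import Beta, Bernoulli

raw = np.array([1., 0., 1., 0.5, 1.])           # 0.5 is a sentinel for 'missing'
raw_masked = masked_values(raw, value=0.5)      # mask the sentinel entries
p = Beta('p_example', 1, 1, value=0.5)          # prior on the Bernoulli probability
x = Bernoulli('x_example', p, value=raw_masked, observed=True)  # masked entries are imputed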
M_reclang = MCMC(generate_model(receptive_language_dataset),
                 db='sqlite', name='reclang', dbmode='w')
iterations = 100000
burn = 90000
M_reclang.sample(iterations, burn)
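Before interpreting the coefficients, it is worth an informal check of mixing; PyMC's Matplot module draws trace, autocorrelation, and histogram panels for each element of a node (a quick diagnostic sketch, not part of the original workflow):
Matplot.plot(M_reclang.beta)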
labels = ['Male', 'Identified by 3 mo.',
          'In Program by 6 mo.',
          'Bilateral CI', 'Bimodal', 'Unimodal CI', 'Unimodal HA',
          'Reduced Family Involvement',
          'Sibling Count',
          'Mother with HS Diploma', 'Additional Disability']
Matplot.summary_plot([M_reclang.beta], rhat=False,
                     custom_labels=labels, x_range=(-15,15), main='Receptive Language')
M_reclang.beta.summary()
expressive_language_dataset = technology_subset[(technology_subset.domain=='Language')
                                                & (technology_subset.test_type=='expressive')
                                                & (technology_subset.non_english==False)]
expressive_language_dataset.head()
mean_score = expressive_language_dataset.groupby('study_id')['score'].mean()
expressive_language_dataset = expressive_language_dataset.drop_duplicates('study_id').set_index('study_id')
expressive_language_dataset['mean_score'] = mean_score
(expressive_language_dataset[covs].isnull().sum() / expressive_language_dataset.shape[0]).round(2)
expressive_language_dataset = expressive_language_dataset[expressive_language_dataset.age.notnull()
                                                          & expressive_language_dataset.int_outside_option.notnull()]
expressive_language_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
M_explang = MCMC(generate_model(expressive_language_dataset), db='sqlite', name='explang', dbmode='w')
M_explang.sample(iterations, burn)
M_explang.beta.summary()
Matplot.summary_plot([M_explang.beta], custom_labels=labels, rhat=False, main='Expressive Language', x_range=(-15,15))
articulation_dataset = technology_subset[(technology_subset.domain=='Articulation') &
                                         (technology_subset.non_english==False)]
articulation_dataset.head()
mean_score = articulation_dataset.groupby('study_id')['score'].mean()
articulation_dataset = articulation_dataset.drop_duplicates('study_id').set_index('study_id')
articulation_dataset['mean_score'] = mean_score
(articulation_dataset[covs].isnull().sum() / articulation_dataset.shape[0]).round(2)
articulation_dataset = articulation_dataset[articulation_dataset.age.notnull()
                                            & articulation_dataset.int_outside_option.notnull()]
articulation_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
M_articulation = MCMC(generate_model(articulation_dataset), db='sqlite', name='articulation', dbmode='w')
M_articulation.sample(iterations, burn)
Matplot.summary_plot([M_articulation.beta],
                     custom_labels=labels, rhat=False, main='Articulation', x_range=(-15,15))
M_articulation.beta.summary()
expressive_vocab_dataset = technology_subset[(technology_subset.domain=='Expressive Vocabulary') &
                                             (technology_subset.non_english==False)]
expressive_vocab_dataset.head()
mean_score = expressive_vocab_dataset.groupby('study_id')['score'].mean()
expressive_vocab_dataset = expressive_vocab_dataset.drop_duplicates('study_id').set_index('study_id')
expressive_vocab_dataset['mean_score'] = mean_score
expressive_vocab_dataset[covs].isnull().sum()
expressive_vocab_dataset = expressive_vocab_dataset[expressive_vocab_dataset.age.notnull()
                                                    & expressive_vocab_dataset.male.notnull()
                                                    & expressive_vocab_dataset.int_outside_option.notnull()]
expressive_vocab_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
M_expressive_vocab = MCMC(generate_model(expressive_vocab_dataset), db='sqlite', name='expressive_vocab', dbmode='w')
M_expressive_vocab.sample(iterations, burn)
Matplot.summary_plot([M_expressive_vocab.beta],
                     custom_labels=labels, rhat=False, main='Expressive Vocabulary', x_range=(-15,15))
Matplot.summary_plot([M_expressive_vocab.beta],
                     custom_labels=labels, rhat=False, main='Expressive Vocabulary', x_range=(-10,10))
M_expressive_vocab.beta.summary()
Matplot.summary_plot(M_explang.alpha_school, custom_labels=[' ']*42, main='Expressive language school effects',
                     x_range=(-25, 25))
receptive_vocab_dataset = technology_subset[(technology_subset.domain=='Receptive Vocabulary') &
                                            (technology_subset.non_english==False)]
receptive_vocab_dataset.head()
mean_score = receptive_vocab_dataset.groupby('study_id')['score'].mean()
receptive_vocab_dataset = receptive_vocab_dataset.drop_duplicates('study_id').set_index('study_id')
receptive_vocab_dataset['mean_score'] = mean_score
receptive_vocab_dataset[covs].isnull().sum()
receptive_vocab_dataset = receptive_vocab_dataset[receptive_vocab_dataset.age.notnull() &
                                                  receptive_vocab_dataset.male.notnull() &
                                                  receptive_vocab_dataset.int_outside_option.notnull()]
receptive_vocab_dataset.age_int.hist(bins=40)
receptive_vocab_dataset.shape
receptive_vocab_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
M_receptive_vocab = MCMC(generate_model(receptive_vocab_dataset), db='sqlite', name='receptive_vocab', dbmode='w')
M_receptive_vocab.sample(iterations, burn)
#order = np.argsort(M.beta.stats()['mean'])[::-1]
Matplot.summary_plot([M_receptive_vocab.beta], rhat=False,
                     custom_labels=labels, main='Receptive Vocabulary', x_range=(-15,15))
M_receptive_vocab.beta.summary()
import sqlite3
db_files = !ls *sqlite
articulation = sqlite3.connect("articulation.sqlite")
c_artic = articulation.cursor()
c_artic.execute('SELECT v1 FROM sigma_school WHERE trace==0')
sigma_school_artic = np.ravel(c_artic.fetchall())
sigma_school_artic.mean()
for dbname in db_files:
    db = sqlite3.connect(dbname)
    cur = db.cursor()
    cur.execute('SELECT v1 FROM sigma_school WHERE trace==0')
    trace = np.ravel(cur.fetchall())
    trace.sort()
    print('\n' + dbname)
    print(np.median(trace))
    print(trace[[int(len(trace)*0.025), int(len(trace)*0.975)]])
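The sort-and-index lookup above is just an empirical quantile; np.percentile yields the same 95% interval (plus the median) directly. A sketch assuming the trace array from the loop:
print(np.percentile(trace, [2.5, 50, 97.5]))  # lower bound, median, upper bound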