Technology Effects Analysis

In [3]:
# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 100)

Import data

In [4]:
# low_memory=False reads the file in a single pass so each column gets one
# consistent dtype, silencing the DtypeWarning (mixed types in columns
# 31, 33, 41, 74) that this cell previously emitted.
lsl_dr = pd.read_csv('lsl_dr.csv', index_col=0, low_memory=False)
/usr/local/lib/python3.4/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (31,33,41,74) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)
In [5]:
lsl_dr.head()
Out[5]:
          redcap_event_name academic_year  hl  male  _race  prim_lang  sib  \
0  initial_assessment_arm_1     2005-2006   0     1      0          0    3   
1  initial_assessment_arm_1     2005-2006   0     1      0          0    3   
2  initial_assessment_arm_1     2005-2006   0     1      0          0    3   
3  initial_assessment_arm_1     2005-2006   0     1      0          0    3   
4  initial_assessment_arm_1     2010-2011   0     0      7          0    2   

   _mother_ed  father_ed  premature_age         ...           known_synd  \
0           2          2              8         ...                    0   
1           2          2              8         ...                    0   
2           2          2              8         ...                    0   
3           2          2              8         ...                    0   
4           6          6              7         ...                    0   

   synd_or_disab  race  age_test                 domain  school  score  \
0            NaN     0        48           Articulation     521     41   
1            NaN     0        48  Expressive Vocabulary     521     71   
2            NaN     0        48               Language     521     56   
3            NaN     0        48               Language     521     55   
4              0   NaN       NaN                    NaN     NaN    NaN   

   test_name   test_type  academic_year_start  
0        NaN     Goldman                 2005  
1        NaN         EVT                 2005  
2        PLS   receptive                 2005  
3        PLS  expressive                 2005  
4        NaN         NaN                 2010  

[5 rows x 74 columns]

Indicator for non-profound hearing loss

In [6]:
# Non-profound hearing loss indicator: True when degree_hl < 6, NaN when
# degree_hl is unrecorded. .where() keeps the comparison only on rows with
# a recorded degree_hl, yielding the same True/False/NaN column as the
# two-step mask-then-assign form.
lsl_dr['deg_hl_below6'] = (lsl_dr.degree_hl < 6).where(lsl_dr.degree_hl.notnull())

Indicator for first intervention outside OPTION

In [7]:
# Indicator that the first intervention happened outside OPTION:
# True when current age exceeds the age at first intervention.
lsl_dr['int_outside_option'] = lsl_dr.age > lsl_dr.age_int
# Rows where age < age_int (already False above) are blanked to NaN.
# NOTE(review): this NaN rule is asymmetric with the other indicators --
# rows where age or age_int is missing compare False in both statements
# (NaN comparisons are False) and so remain False rather than NaN, and
# age == age_int stays False. Confirm this treatment of missing/tied ages
# is intended; the sibling indicators blank on .isnull() instead.
lsl_dr.loc[lsl_dr.age < lsl_dr.age_int, 'int_outside_option'] = np.nan

Indicator for high school graduation of mother

In [8]:
# Indicator that the mother completed high school (mother_ed coded > 1).
lsl_dr['mother_hs'] = lsl_dr.mother_ed > 1
# Blank the indicator where mother_ed is unrecorded. Uses np.nan for the
# missing marker, consistent with the deg_hl_below6 / int_outside_option
# indicators above (this cell previously assigned None).
lsl_dr.loc[lsl_dr.mother_ed.isnull(), 'mother_hs'] = np.nan

Joint Committee on Infant Hearing (JCIH) benchmark

In [9]:
lsl_dr.onset_1.unique()
Out[9]:
array([  22. ,   18. ,   17. ,    3. ,    5. ,    nan,    2. ,    1. ,
         13. ,   10. ,   36. ,   28. ,   26. ,   21. ,   41. ,    0. ,
          6. ,   25. ,    7. ,   35. ,   34. ,   27. ,   15. ,   24. ,
          4. ,   14. ,   40. ,   42. ,   12. ,    9. ,   50. ,   33. ,
          8. ,   23. ,   32. ,   60. ,   31. ,   11. ,   30. ,   20. ,
         67. ,   49. ,   48. ,    1.5,   19. ,    2.5,   39. ,   52. ,
         16. ,   38. ,   29. ,   51. ,   46. ,   45. ,   54. ,   44. ,
         88. ,   65. ,   81. ,  116. ,   72. ,   57. ,   62. ,   43. ,
         78. ,   83. ,   61. ,  107. ,   64. ,   74. ,   37. ,   77. ,
         96. ,   97. ,   79. ,   47. ,   53. ,   59. ,   84. ,   95. ,
         80. ,    0.5,   58. ,   56. ,   86. ,   98. ,   85. ,   75. ,
        119. ,   66. ,   70. ,   63. ,  140. ,  126. ,  133. ,  103. ,
         87. ,   76. ,   55. ,   68. ,   92. ,   71. ,  154. ,   89. ,
        152. ])
In [10]:
# JCIH benchmark: hearing loss identified by 3 months AND enrolled in a
# program by 6 months. The bare-name copies (ident_by_3, program_by_6) are
# kept at notebook scope, as before, for interactive reuse.
ident_by_3 = lsl_dr.onset_1 <= 3
program_by_6 = lsl_dr.age <= 6
lsl_dr['ident_by_3'] = ident_by_3
lsl_dr['program_by_6'] = program_by_6
lsl_dr['jcih'] = ident_by_3 & program_by_6
# The benchmark is only assessable when both onset and age are recorded.
lsl_dr.loc[lsl_dr.onset_1.isnull() | lsl_dr.age.isnull(), 'jcih'] = None

Create age in years variable

In [11]:
lsl_dr['age_years'] = lsl_dr.age/12.

Retain only 4-year-olds

In [12]:
# Retain only test records from 4-year-olds (48 <= age_test < 60 months).
# Rows with missing age_test are dropped, as with the boolean-mask form.
lsl_dr = lsl_dr.query('48 <= age_test < 60')
In [13]:
lsl_dr.shape
Out[13]:
(6873, 81)
In [14]:
lsl_dr.study_id.unique().shape
Out[14]:
(1613,)

Summary Statistics for Paper

In [15]:
unique_students = lsl_dr.drop_duplicates(subset='study_id')
In [16]:
unique_students.shape
Out[16]:
(1613, 81)
In [17]:
unique_students.age.describe()
Out[17]:
count    1593.000000
mean       29.700565
std        15.676539
min         1.000000
25%        18.000000
50%        32.000000
75%        41.000000
max        68.000000
Name: age, dtype: float64
In [18]:
unique_students.male.mean()
Out[18]:
0.5236612702366127
In [19]:
unique_students.male.value_counts()
Out[19]:
1    841
0    765
dtype: int64
In [20]:
unique_students.sib.describe()
Out[20]:
count    1501.000000
mean        1.161226
std         0.928434
min         0.000000
25%         0.000000
50%         1.000000
75%         2.000000
max         3.000000
Name: sib, dtype: float64
In [21]:
(unique_students.degree_hl == 6).describe()
Out[21]:
count         1613
mean     0.4575325
std      0.4983478
min          False
25%              0
50%              0
75%              1
max           True
Name: degree_hl, dtype: object
In [22]:
unique_students.deg_hl_below6.isnull().sum()
Out[22]:
220
In [23]:
unique_students.deg_hl_below6.value_counts()
Out[23]:
0    738
1    655
dtype: int64
In [24]:
((unique_students.degree_hl < 6) & (unique_students.degree_hl > 1)).describe()
Out[24]:
count         1613
mean     0.3787973
std       0.485238
min          False
25%              0
50%              0
75%              1
max           True
Name: degree_hl, dtype: object
In [25]:
unique_students.mother_hs.describe()
Out[25]:
count    1006.000000
mean        0.580517
std         0.493720
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: mother_hs, dtype: float64
In [26]:
unique_students.synd_or_disab.describe()
Out[26]:
count    1413.000000
mean        0.213022
std         0.409588
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: synd_or_disab, dtype: float64
In [27]:
unique_students.groupby('male').program_by_6.value_counts()
Out[27]:
male       
0     False    670
      True      95
1     False    750
      True      91
dtype: int64
In [28]:
unique_students.groupby('sib').program_by_6.value_counts()
Out[28]:
sib       
0    False    346
     True      38
1    False    562
     True      90
2    False    267
     True      37
3    False    145
     True      16
dtype: int64
In [29]:
unique_students.groupby('deg_hl_below6').program_by_6.value_counts()
Out[29]:
deg_hl_below6       
0              False    638
               True     100
1              False    574
               True      81
dtype: int64
In [30]:
unique_students.groupby('mother_hs').program_by_6.value_counts()
Out[30]:
mother_hs       
0          False    367
           True      55
1          False    489
           True      95
dtype: int64
In [31]:
unique_students.groupby('synd_or_disab').program_by_6.value_counts()
Out[31]:
synd_or_disab       
0              False    976
               True     136
1              False    273
               True      28
dtype: int64
In [32]:
unique_students.groupby('ident_by_3').program_by_6.value_counts()
Out[32]:
ident_by_3       
False       False    1013
            True       34
True        False     414
            True      152
dtype: int64
In [33]:
unique_students.groupby('jcih').program_by_6.value_counts()
Out[33]:
jcih       
0     False    928
      True      17
1     True     152
dtype: int64
In [34]:
# Overlaid age histograms for JCIH vs. non-JCIH students.
jcih_age_groups = (unique_students
                   .replace({'jcih': {1: 'JCIH', 0: 'non-JCIH'}})
                   .groupby('jcih')['age_years'])
fig, ax = plt.subplots()

for group_label, ages in jcih_age_groups:
    ages.hist(label=group_label, alpha=0.75, ax=ax, grid=False)

ax.legend()
ax.set_xlabel('Age (years)')
ax.set_ylabel('Frequency');
In [35]:
students_data = lsl_dr.copy()
In [36]:
# Map each school code to a small consecutive integer id (0..n_schools-1).
# NOTE(review): int(s) raises ValueError if school contains NaN at this
# point -- presumably the age_test filter above removed the rows with no
# school recorded; confirm before re-running on different subsets.
students_data['school_id'] = students_data.school.replace({int(s):i for i,s in enumerate(students_data.school.unique())})
In [37]:
students_data = students_data[students_data.domain=='Language']
In [38]:
students_data.shape
Out[38]:
(2594, 82)
In [39]:
# Last Language-domain observation per student, restricted to the model
# covariates, with incomplete rows dropped. keep='last' replaces
# take_last=True, which was deprecated in pandas 0.17 and later removed.
option_scores = students_data.drop_duplicates('study_id', keep='last')[['score', 'school_id',
                                                                        'deg_hl_below6', 'mother_hs']].dropna()
In [40]:
option_scores = option_scores.astype(int)
In [41]:
option_scores.to_csv('option_scores.csv', index=False)

Create technology indicators

In [42]:
# A student is "unimodal" when none of the bilateral/bimodal technology
# flags is set, i.e. their row-wise sum over the three flags is zero.
multi_tech_flags = lsl_dr[['bilateral_ci', 'bilateral_ha', 'bimodal']]
unimodal = multi_tech_flags.sum(axis=1) == 0
In [43]:
# Unimodal device indicators: exactly one cochlear implant, or exactly one
# hearing aid (bitwise & is symmetric, so operand order is immaterial).
lsl_dr['unimodal_ci'] = lsl_dr.cochlear & unimodal
lsl_dr['unimodal_ha'] = lsl_dr.hearing_aid & unimodal
In [44]:
lsl_dr[[u'bilateral_ci',
        u'bilateral_ha', 
        u'bimodal', 'unimodal_ci', 'unimodal_ha']].sum(0)
Out[44]:
bilateral_ci    2136
bilateral_ha    2261
bimodal          791
unimodal_ci      311
unimodal_ha      320
dtype: int64

Since bilateral hearing aid is the most prevalent category, it will be used as the baseline category.

In [66]:
# Restrict to students with at least one technology indicator set (row-wise
# sum of the five flags is positive). Bilateral hearing aid, the most
# prevalent category, serves as the baseline category downstream.
any_tech = lsl_dr[['bilateral_ci', 'bilateral_ha', 'bimodal',
                   'unimodal_ci', 'unimodal_ha']].sum(axis=1) > 0

technology_subset = lsl_dr[any_tech]

Receptive Language Test Score Model

In [67]:
# Filter to English-only receptive-language test records within the
# appropriate bilateral/technology subset.
is_receptive = ((technology_subset.domain == 'Language')
                & (technology_subset.test_type == 'receptive')
                & (technology_subset.non_english == False))
receptive_language_dataset = technology_subset[is_receptive]

receptive_language_dataset.head()
Out[67]:
            redcap_event_name academic_year  hl  male  _race  prim_lang  sib  \
2    initial_assessment_arm_1     2005-2006   0     1      0          0    3   
187  initial_assessment_arm_1     2014-2015   0     0      2          0    2   
210  initial_assessment_arm_1     2014-2015   0     0      0          0    1   
220  initial_assessment_arm_1     2012-2013   0     1      0          0    2   
253  initial_assessment_arm_1     2011-2012   0     1      0          0    2   

     _mother_ed  father_ed  premature_age     ...       academic_year_start  \
2             2          2              8     ...                      2005   
187           6          6              8     ...                      2014   
210           5          6              7     ...                      2014   
220           4          6              7     ...                      2012   
253           6          6              8     ...                      2011   

     deg_hl_below6  int_outside_option  mother_hs  ident_by_3  program_by_6  \
2                0                   1          0       False         False   
187              1                   1        NaN       False         False   
210              1                   1          1       False         False   
220              1                   1          1       False         False   
253              0                   1        NaN       False         False   

     jcih  age_years  unimodal_ci  unimodal_ha  
2       0   4.500000        False        False  
187     0   3.583333        False        False  
210     0   4.083333        False        False  
220     0   4.666667         True        False  
253     0   4.416667        False        False  

[5 rows x 83 columns]

This is the size of the resulting dataset to be used in this analysis

In [68]:
receptive_language_dataset.shape
Out[68]:
(941, 83)
In [69]:
mean_score = receptive_language_dataset.groupby('study_id')['score'].mean()
In [70]:
receptive_language_dataset = receptive_language_dataset.drop_duplicates('study_id')
In [71]:
receptive_language_dataset = receptive_language_dataset.set_index('study_id')
In [72]:
receptive_language_dataset['mean_score'] = mean_score

Specify criterion to use: jcih, ident_by_3, program_by_6

In [73]:
criterion_labels = {'jcih': 'JCIH',
                   'ident_by_3': 'Diagnosed by 3 months',
                   'program_by_6': 'Intervention by 6 months'}

criterion = 'jcih'

Proportion of missing values for each variable

In [74]:
# Covariates used by the models, plus the chosen early-intervention criterion.
covs = ['age_years', 'school', 'score', 'male', 'time',
        'family_inv', 'sib', 'deg_hl_below6', 'mother_hs', 'synd_or_disab'] + [criterion]
# Proportion missing per covariate (isnull().mean() == isnull().sum()/nrows).
receptive_language_dataset[covs].isnull().mean().round(2)
Out[74]:
age_years        0.00
school           0.00
score            0.00
male             0.00
time             0.04
family_inv       0.16
sib              0.06
deg_hl_below6    0.01
mother_hs        0.33
synd_or_disab    0.14
jcih             0.20
dtype: float64

Drop individuals with missing age, since they make up <1% of the sample

In [75]:
receptive_language_dataset = receptive_language_dataset[receptive_language_dataset.age.notnull()]
In [76]:
receptive_language_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
In [77]:
receptive_language_dataset.mother_ed.hist()
Out[77]:
<matplotlib.axes._subplots.AxesSubplot at 0x10f2cfe80>

Final analysis dataset size

In [78]:
receptive_language_dataset.shape
Out[78]:
(871, 83)
In [79]:
# Percentage of missing values per covariate, largest first. The in-place
# Series.sort() used previously was deprecated in pandas 0.17 and later
# removed; sort_values() with reassignment is the supported equivalent.
missing_pct = (receptive_language_dataset[covs].isnull().mean()*100).round(1)
missing_pct = missing_pct.sort_values(ascending=False)
missing_pct
Out[79]:
mother_hs        33.3
jcih             19.3
family_inv       15.8
synd_or_disab    14.0
sib               5.5
time              3.6
deg_hl_below6     1.3
male              0.0
score             0.0
school            0.0
age_years         0.0
dtype: float64
In [80]:
receptive_language_dataset['age_amp_missing'] = receptive_language_dataset.age_amp.isnull()
by_age_amp_missing = receptive_language_dataset.groupby('age_amp_missing')
In [81]:
# Compare covariate means between rows with and without age_amp recorded.
# Columns are selected from the GroupBy with a list; the old tuple-style
# multi-column __getitem__ is deprecated/removed in modern pandas.
by_age_amp_missing[['male', 'synd_or_disab', 'deg_hl_below6', 'mother_hs',
                    'non_english']].mean()
Out[81]:
                     male  synd_or_disab  deg_hl_below6  mother_hs
age_amp_missing                                                   
False            0.510885       0.221106       0.469298   0.616034
True             0.478022       0.250000       0.386364   0.654206
In [82]:
receptive_language_dataset['jcih_missing'] = receptive_language_dataset.jcih.isnull()
by_jcih_missing = receptive_language_dataset.groupby('jcih_missing')
In [83]:
# Compare covariate means between rows with and without jcih recorded.
# Columns are selected from the GroupBy with a list; the old tuple-style
# multi-column __getitem__ is deprecated/removed in modern pandas.
by_jcih_missing[['male', 'synd_or_disab', 'deg_hl_below6', 'mother_hs',
                 'non_english']].mean()
Out[83]:
                  male  synd_or_disab  deg_hl_below6  mother_hs
jcih_missing                                                   
False         0.513514       0.219110       0.467811   0.626033
True          0.464286       0.260563       0.385093   0.608247
In [84]:
receptive_language_dataset.age_years.mean()
Out[84]:
2.2484691924990403
In [85]:
from pymc import Bernoulli, Normal, Uniform, invlogit, logit, deterministic, MCMC, Matplot, MAP
from pymc import Lambda, Dirichlet, Categorical, Beta, NegativeBinomial, Exponential, T
from numpy.ma import masked_values

# Number of fixed-effect coefficients: 7 complete predictors + 5
# partially-imputed predictors (see theta below).
n_covs = 12

def generate_model(dataset):
    """Build a PyMC2 hierarchical normal model of standardized test scores.

    The model has a random intercept per school, fixed effects for the
    demographic/technology covariates, and Bayesian imputation of the five
    partially-observed predictors (family involvement, hearing loss,
    maternal education, prior disability, sibling count).

    Parameters
    ----------
    dataset : pandas.DataFrame indexed by study_id, one row per student.

    Returns
    -------
    dict : locals(), for consumption by pymc.MCMC.
    """

    # Extract data to local variables
    age_years = dataset.age_years.values
    school = dataset.school.values
    score = dataset.score.values
    non_english = dataset.non_english.astype(int).values
    male = dataset.male.values
    sib = dataset.sib.values
    # (a second, duplicated assignment of synd_or_disab was removed here)
    synd_or_disab = dataset.synd_or_disab.values
    int_outside_option = dataset.int_outside_option.values
    study_id = dataset.index.values
    ident_by_3 = dataset.ident_by_3
    program_by_6 = dataset.program_by_6
    bilateral_ci = dataset.bilateral_ci
    bimodal = dataset.bimodal
    unimodal_ci = dataset.unimodal_ci
    unimodal_ha = dataset.unimodal_ha
    
    # Center age so the intercept is interpretable at the mean age.
    age_std = age_years - age_years.mean()
    
    # Imputation of family involvement (categorical): missing entries are
    # flagged with the sentinel 0.5 (not a valid category) and masked.
    n_family_inv_cats = len(dataset.family_inv.unique())
    p_family_inv = Dirichlet("p_family_inv", np.ones(n_family_inv_cats), 
                             value=[1./n_family_inv_cats]*(n_family_inv_cats-1))
    family_inv_masked = masked_values(dataset.family_inv.fillna(0.5), value=0.5)
    x_family_inv = Categorical('x_family_inv', p_family_inv, value=family_inv_masked, 
                               observed=True)
    
    # Imputation of hearing loss (binary).
    # NOTE(review): masked_values(..., value=np.nan) tests equality against
    # NaN, which is always False, so NaN entries may not actually be masked
    # here (np.ma.masked_invalid is the direct way to mask NaNs). Confirm
    # the imputation of this and the next two predictors behaves as intended.
    p_hl = Beta("p_hl", 1, 1, value=0.5)
    hl_masked = masked_values(dataset.deg_hl_below6.values, value=np.nan)
    x_hl = Bernoulli('x_hl', p_hl, value=hl_masked, observed=True)
    
    # Imputation of maternal education (binary; same NaN-masking caveat).
    p_mother_hs = Beta("p_mother_hs", 1, 1, value=0.5)
    mother_hs_masked = masked_values(dataset.mother_hs.values, value=np.nan)
    x_mother_hs = Bernoulli('x_mother_hs', p_mother_hs, value=mother_hs_masked, observed=True)
    
    # Imputation of previous disability (binary; same NaN-masking caveat).
    p_synd_or_disab = Beta("p_synd_or_disab", 1, 1, value=0.5)
    synd_or_disab_masked = masked_values(dataset.synd_or_disab.values, value=np.nan)
    x_synd_or_disab = Bernoulli('x_synd_or_disab', p_synd_or_disab, value=synd_or_disab_masked, 
                                observed=True)
    
    # Imputation of siblings (categorical; 0.5 sentinel as above).
    n_sib_cats = len(dataset.sib.unique())
    p_sib = Dirichlet("p_sib", np.ones(n_sib_cats), value=[1./n_sib_cats]*(n_sib_cats-1))
    sib_masked = masked_values(dataset.sib.fillna(0.5), value=0.5)
    x_sib = Categorical('x_sib', p_sib, value=sib_masked, observed=True)
    
    # Grand mean of scores (diffuse prior).
    mean_score = Normal("mean_score", 100, 0.001, value=80)
    
    # Indices to school random effects
    unique_schools = np.unique(school)
    school_index = [list(unique_schools).index(s) for s in school]

    # School random effect
    sigma_school = Uniform("sigma_school", 0, 1000, value=10)
    tau_school = sigma_school**-2
    alpha_school = Normal("alpha_school", mu=0, tau=tau_school, 
                          value=np.zeros(len(unique_schools)))
    
    @deterministic
    def intercept(a0=mean_score, a1=alpha_school):
        """Random intercepts"""
        return a0 + a1[school_index]
    
    # Fixed effects (one coefficient per predictor in theta's X)
    beta = Normal("beta", 0, 0.001, value=[0]*n_covs)

    @deterministic
    def theta(b0=intercept, b=beta, x_family_inv=x_family_inv, x_sib=x_sib, 
              x_hl=x_hl, x_mother_hs=x_mother_hs, x_synd_or_disab=x_synd_or_disab):
        """Linear predictor: random intercept + fixed effects."""
        # Complete predictors
        X = [male, ident_by_3, program_by_6, bilateral_ci, bimodal, unimodal_ci, unimodal_ha]
        # Partially-imputed predictors
        X += [x_family_inv, x_sib, x_hl, x_mother_hs, x_synd_or_disab]
        return b0 + np.dot(b, X) #+ a[child_index]
    
    # Residual standard deviation (uniform prior) -> precision.
    sigma = Uniform("sigma", 0, 1000, value=10)
    tau = sigma**-2
    
    # Data likelihood
    score_like = Normal("score_like", mu=theta, tau=tau, value=score, observed=True)
    #score_like_pred = Normal("score_like_pred", mu=theta, tau=tau, value=score)
    
    
    return locals()
In [86]:
M_reclang = MCMC(generate_model(receptive_language_dataset), 
                 db='sqlite', name='reclang', dbmode='w')
In [87]:
iterations = 100000
burn = 90000
In [88]:
M_reclang.sample(iterations, burn)
 [-----------------100%-----------------] 100000 of 100000 complete in 1140.6 sec
In [94]:
labels = ['Male', 'Identified by 3 mo.', 
          'In Program by 6 mo.', 
          'Bilateral CI', 'Bimodal', 'Unimodal CI', 'Unimodal HA',
          'Reduced Family Involvement', 
          'Sibling Count', 'Non-profound Hearing Loss',
            'Mother with HS Diploma','Additional Disability']
#                       criterion_labels[criterion]]
In [95]:
Matplot.summary_plot([M_reclang.beta], rhat=False,
        custom_labels=labels, x_range=(-15,15), main='Receptive Language')
In [61]:
Matplot.summary_plot([M_reclang.beta], rhat=False,
        custom_labels=labels, x_range=(-10,10), main='Receptive Language')
In [62]:
M_reclang.beta.summary()
beta:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	-0.68            1.176            0.08             [-2.799  1.596]
	4.252            1.453            0.114            [ 1.429  7.149]
	7.113            2.032            0.173          [  3.463  11.173]
	-5.486           0.646            0.044            [-6.73  -4.247]
	-0.373           0.648            0.035            [-1.568  0.994]
	6.699            1.233            0.086            [ 4.563  9.32 ]
	5.317            1.432            0.117              [ 2.44  7.92]
	-6.645           1.238            0.086            [-9.174 -4.309]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	-2.889           -1.506          -0.727         0.177         1.559
	1.461            3.244           4.191          5.207         7.248
	2.998            5.68            7.118          8.611         10.876
	-6.735           -5.94           -5.487         -5.046        -4.247
	-1.657           -0.813          -0.367         0.055         0.92
	4.527            5.905           6.608          7.481         9.303
	2.347            4.342           5.442          6.319         7.868
	-8.994           -7.506          -6.679         -5.81         -4.12
	

Expressive Language Model

In [98]:
# Filter to English-only expressive-language test records within the
# technology subset.
is_expressive = ((technology_subset.domain == 'Language')
                 & (technology_subset.test_type == 'expressive')
                 & (technology_subset.non_english == False))
expressive_language_dataset = technology_subset[is_expressive]

expressive_language_dataset.head()
Out[98]:
            redcap_event_name academic_year  hl  male  _race  prim_lang  sib  \
3    initial_assessment_arm_1     2005-2006   0     1      0          0    3   
188  initial_assessment_arm_1     2014-2015   0     0      2          0    2   
211  initial_assessment_arm_1     2014-2015   0     0      0          0    1   
221  initial_assessment_arm_1     2012-2013   0     1      0          0    2   
254  initial_assessment_arm_1     2011-2012   0     1      0          0    2   

     _mother_ed  father_ed  premature_age     ...       academic_year_start  \
3             2          2              8     ...                      2005   
188           6          6              8     ...                      2014   
211           5          6              7     ...                      2014   
221           4          6              7     ...                      2012   
254           6          6              8     ...                      2011   

     deg_hl_below6  int_outside_option  mother_hs  ident_by_3  program_by_6  \
3                0                   1          0       False         False   
188              1                   1        NaN       False         False   
211              1                   1          1       False         False   
221              1                   1          1       False         False   
254              0                   1        NaN       False         False   

     jcih  age_years  unimodal_ci  unimodal_ha  
3       0   4.500000        False        False  
188     0   3.583333        False        False  
211     0   4.083333        False        False  
221     0   4.666667         True        False  
254     0   4.416667        False        False  

[5 rows x 83 columns]
In [99]:
# Average each student's scores, then reduce to one row per student indexed
# by study_id; the per-student mean aligns on that index when assigned.
mean_score = expressive_language_dataset.groupby('study_id')['score'].mean()
expressive_language_dataset = (expressive_language_dataset
                               .drop_duplicates('study_id')
                               .set_index('study_id'))
expressive_language_dataset['mean_score'] = mean_score
In [100]:
(expressive_language_dataset[covs].isnull().sum() / expressive_language_dataset.shape[0]).round(2)
Out[100]:
age_years        0.00
school           0.00
score            0.00
male             0.00
time             0.04
family_inv       0.16
sib              0.06
deg_hl_below6    0.01
mother_hs        0.34
synd_or_disab    0.15
jcih             0.20
dtype: float64
In [101]:
expressive_language_dataset = expressive_language_dataset[expressive_language_dataset.age.notnull() 
                                                        & expressive_language_dataset.int_outside_option.notnull()]
In [102]:
expressive_language_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
In [103]:
M_explang = MCMC(generate_model(expressive_language_dataset), db='sqlite', name='explang', dbmode='w')
In [104]:
M_explang.sample(iterations, burn)
 [-----------------100%-----------------] 100000 of 100000 complete in 1111.8 sec
In [71]:
M_explang.beta.summary()
beta:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	-1.001           1.226            0.09             [-3.454  1.158]
	5.961            1.332            0.096            [ 3.41   8.534]
	6.543            2.242            0.198          [  2.959  11.565]
	-5.016           0.669            0.044            [-6.33  -3.655]
	-0.745           0.69             0.041            [-2.149  0.568]
	7.448            1.205            0.085            [ 5.245  9.872]
	7.015            1.297            0.101            [ 4.598  9.655]
	-6.837           1.442            0.109            [-9.326 -3.917]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	-3.349           -1.826          -0.976         -0.189        1.341
	3.423            5.013           5.938          6.874         8.551
	2.68             4.801           6.358          8.081         11.348
	-6.33            -5.485          -5.022         -4.579        -3.655
	-2.125           -1.212          -0.729         -0.273        0.609
	5.23             6.616           7.425          8.267         9.869
	4.459            6.145           7.04           7.88          9.585
	-9.64            -7.75           -6.82          -5.854        -4.071
	
In [134]:
Matplot.summary_plot([M_explang.beta], custom_labels=labels, rhat=False, main='Expressive Language', x_range=(-15,15))
In [72]:
Matplot.summary_plot([M_explang.beta], custom_labels=labels, rhat=False, main='Expressive Language', x_range=(-10,10))

Articulation Model

In [105]:
# Filter to English-only articulation test records within the technology subset.
is_articulation = ((technology_subset.domain == 'Articulation')
                   & (technology_subset.non_english == False))
articulation_dataset = technology_subset[is_articulation]
articulation_dataset.head()
Out[105]:
            redcap_event_name academic_year  hl  male  _race  prim_lang  sib  \
0    initial_assessment_arm_1     2005-2006   0     1      0          0    3   
184  initial_assessment_arm_1     2014-2015   0     0      2          0    2   
207  initial_assessment_arm_1     2014-2015   0     0      0          0    1   
286  initial_assessment_arm_1     2014-2015   0     0      0          0    1   
412  initial_assessment_arm_1           NaN   0     0      0          0    1   

     _mother_ed  father_ed  premature_age     ...       academic_year_start  \
0             2          2              8     ...                      2005   
184           6          6              8     ...                      2014   
207           5          6              7     ...                      2014   
286           6          6              8     ...                      2014   
412           6          6              9     ...                       NaN   

     deg_hl_below6  int_outside_option  mother_hs  ident_by_3  program_by_6  \
0                0                   1          0       False         False   
184              1                   1        NaN       False         False   
207              1                   1          1       False         False   
286              0                   1        NaN        True         False   
412              0                   0        NaN       False         False   

     jcih  age_years  unimodal_ci  unimodal_ha  
0       0   4.500000        False        False  
184     0   3.583333        False        False  
207     0   4.083333        False        False  
286     0   3.916667        False        False  
412   NaN   3.750000        False        False  

[5 rows x 83 columns]
In [106]:
# Average each student's scores, then reduce to one row per student indexed
# by study_id; the per-student mean aligns on that index when assigned.
mean_score = articulation_dataset.groupby('study_id')['score'].mean()
articulation_dataset = (articulation_dataset
                        .drop_duplicates('study_id')
                        .set_index('study_id'))
articulation_dataset['mean_score'] = mean_score
In [107]:
(articulation_dataset[covs].isnull().sum() / articulation_dataset.shape[0]).round(2)
Out[107]:
age_years        0.00
school           0.00
score            0.00
male             0.00
time             0.04
family_inv       0.17
sib              0.05
deg_hl_below6    0.01
mother_hs        0.32
synd_or_disab    0.14
jcih             0.21
dtype: float64
In [108]:
articulation_dataset = articulation_dataset[articulation_dataset.age.notnull()
                                            & articulation_dataset.int_outside_option.notnull()]
In [109]:
articulation_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
In [110]:
M_articulation = MCMC(generate_model(articulation_dataset), db='sqlite', name='articulation', dbmode='w')
In [111]:
M_articulation.sample(iterations, burn)
 [-----------------100%-----------------] 100000 of 100000 complete in 1003.0 sec
In [132]:
Matplot.summary_plot([M_articulation.beta], 
        custom_labels=labels, rhat=False, main='Articulation', x_range=(-15,15))
In [83]:
Matplot.summary_plot([M_articulation.beta], 
        custom_labels=labels, rhat=False, main='Articulation', x_range=(-10,10))
In [85]:
M_articulation.beta.summary()
beta:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	0.842            1.276            0.087            [-1.699  3.333]
	4.066            1.714            0.142            [ 0.955  7.342]
	3.812            2.447            0.214            [-0.984  8.727]
	-4.393           0.66             0.042            [-5.713 -3.201]
	0.048            0.77             0.047            [-1.486  1.531]
	5.537            1.507            0.116            [ 2.681  8.384]
	5.125            1.572            0.123            [ 2.074  8.148]
	-8.13            1.489            0.117          [-11.165  -5.307]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	-1.565           -0.043          0.826          1.679         3.548
	0.955            2.834           3.962          5.29          7.373
	-1.285           2.392           3.897          5.403         8.56
	-5.698           -4.85           -4.384         -3.921        -3.179
	-1.532           -0.442          0.065          0.556         1.486
	2.69             4.476           5.591          6.588         8.406
	2.171            3.999           5.132          6.195         8.345
	-11.128          -9.092          -8.05          -7.171        -5.227
	

Expressive Vocabulary Model

In [112]:
# Filter to English-only expressive-vocabulary test records within the
# technology subset.
is_expressive_vocab = ((technology_subset.domain == 'Expressive Vocabulary')
                       & (technology_subset.non_english == False))
expressive_vocab_dataset = technology_subset[is_expressive_vocab]
expressive_vocab_dataset.head()
Out[112]:
            redcap_event_name academic_year  hl  male  _race  prim_lang  sib  \
1    initial_assessment_arm_1     2005-2006   0     1      0          0    3   
10   initial_assessment_arm_1     2007-2008   0     0      0          0    3   
185  initial_assessment_arm_1     2014-2015   0     0      2          0    2   
208  initial_assessment_arm_1     2014-2015   0     0      0          0    1   
228  initial_assessment_arm_1     2012-2013   0     0      0          0    2   

     _mother_ed  father_ed  premature_age     ...       academic_year_start  \
1             2          2              8     ...                      2005   
10            4          4              8     ...                      2007   
185           6          6              8     ...                      2014   
208           5          6              7     ...                      2014   
228           6          6              8     ...                      2012   

     deg_hl_below6  int_outside_option  mother_hs  ident_by_3  program_by_6  \
1                0                   1          0       False         False   
10               0                   1          1       False         False   
185              1                   1        NaN       False         False   
208              1                   1          1       False         False   
228              1                   1        NaN       False         False   

     jcih  age_years  unimodal_ci  unimodal_ha  
1       0   4.500000        False        False  
10      0   4.583333        False        False  
185     0   3.583333        False        False  
208     0   4.083333        False        False  
228     0   4.000000        False        False  

[5 rows x 83 columns]
In [113]:
# Per-child mean score across repeated tests, attached after collapsing
# to one row per study_id (pandas index alignment performs the join)
score_means = expressive_vocab_dataset.groupby('study_id')['score'].mean()
expressive_vocab_dataset = (expressive_vocab_dataset
                            .drop_duplicates('study_id')
                            .set_index('study_id'))
expressive_vocab_dataset['mean_score'] = score_means
In [114]:
expressive_vocab_dataset[covs].isnull().sum()
Out[114]:
age_years          4
school             0
score              0
male               0
time              40
family_inv       153
sib               49
deg_hl_below6      8
mother_hs        312
synd_or_disab    124
jcih             200
dtype: int64
In [115]:
# Keep only records with all covariates required by the model present
required_fields = ['age', 'male', 'int_outside_option']
expressive_vocab_dataset = expressive_vocab_dataset.dropna(subset=required_fields)
In [116]:
# Distribution of expressive vocabulary standard scores
ax = expressive_vocab_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
ax.set_xlabel('Standard Scores')
ax.set_ylabel('Frequency');
In [117]:
M_expressive_vocab = MCMC(generate_model(expressive_vocab_dataset), db='sqlite', name='expressive_vocab', dbmode='w')
In [118]:
M_expressive_vocab.sample(iterations, burn)
 [-----------------100%-----------------] 100000 of 100000 complete in 1128.2 sec
In [131]:
# Forest plot of fixed-effect coefficients, wide x-range.
# NOTE(review): near-duplicate of the next cell, which only narrows
# x_range to (-10, 10) — consider keeping one and parameterizing.
Matplot.summary_plot([M_expressive_vocab.beta], 
        custom_labels=labels, rhat=False, main='Expressive Vocabulary', x_range=(-15,15))
In [93]:
# Same coefficient forest plot, zoomed to x in (-10, 10)
Matplot.summary_plot([M_expressive_vocab.beta], 
        custom_labels=labels, rhat=False, main='Expressive Vocabulary', x_range=(-10,10))
In [94]:
M_expressive_vocab.beta.summary()
beta:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	1.343            1.236            0.088            [-0.896  3.726]
	4.361            1.337            0.099            [ 1.641  6.673]
	6.004            2.078            0.182          [  2.042  10.622]
	-5.315           0.611            0.039            [-6.396 -3.969]
	-1.217           0.677            0.039            [-2.495  0.116]
	5.996            1.353            0.104            [ 3.464  8.742]
	5.991            1.717            0.15             [ 2.596  9.108]
	-5.925           1.275            0.093            [-8.425 -3.711]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	-1.027           0.481           1.334          2.205         3.651
	1.677            3.446           4.394          5.334         6.802
	1.964            4.643           5.97           7.341         10.556
	-6.473           -5.72           -5.344         -4.942        -4.004
	-2.54            -1.672          -1.213         -0.765        0.083
	3.464            5.052           5.995          6.892         8.742
	2.632            4.831           6.057          7.19          9.215
	-8.301           -6.813          -5.973         -4.975        -3.486
	
In [95]:
# School-level random effects for the expressive *language* model
# (M_explang is fit earlier in the notebook, not in this section)
Matplot.summary_plot(M_explang.alpha_school, custom_labels=[' ']*42, main='Expressive language school effects', 
                     x_range=(-25, 25))
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
In [96]:
# School-level random effects for the expressive vocabulary model
Matplot.summary_plot(M_expressive_vocab.alpha_school, custom_labels=[' ']*42, main='Expressive vocabulary school effects',
                     x_range=(-25, 25))
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.

Receptive Vocabulary Model

In [119]:
# Restrict to Receptive Vocabulary tests administered in English
rec_vocab_rows = technology_subset.domain == 'Receptive Vocabulary'
english_rows = technology_subset.non_english == False
receptive_vocab_dataset = technology_subset[rec_vocab_rows & english_rows]
receptive_vocab_dataset.head()
Out[119]:
            redcap_event_name academic_year  hl  male  _race  prim_lang  sib  \
11   initial_assessment_arm_1     2007-2008   0     0      0          0    3   
186  initial_assessment_arm_1     2014-2015   0     0      2          0    2   
209  initial_assessment_arm_1     2014-2015   0     0      0          0    1   
229  initial_assessment_arm_1     2012-2013   0     0      0          0    2   
288  initial_assessment_arm_1     2014-2015   0     0      0          0    1   

     _mother_ed  father_ed  premature_age     ...       academic_year_start  \
11            4          4              8     ...                      2007   
186           6          6              8     ...                      2014   
209           5          6              7     ...                      2014   
229           6          6              8     ...                      2012   
288           6          6              8     ...                      2014   

     deg_hl_below6  int_outside_option  mother_hs  ident_by_3  program_by_6  \
11               0                   1          1       False         False   
186              1                   1        NaN       False         False   
209              1                   1          1       False         False   
229              1                   1        NaN       False         False   
288              0                   1        NaN        True         False   

     jcih  age_years  unimodal_ci  unimodal_ha  
11      0   4.583333        False        False  
186     0   3.583333        False        False  
209     0   4.083333        False        False  
229     0   4.000000        False        False  
288     0   3.916667        False        False  

[5 rows x 83 columns]
In [120]:
# Per-child mean score across repeated tests, attached after collapsing
# to one row per study_id (pandas index alignment performs the join)
score_means = receptive_vocab_dataset.groupby('study_id')['score'].mean()
receptive_vocab_dataset = (receptive_vocab_dataset
                           .drop_duplicates('study_id')
                           .set_index('study_id'))
receptive_vocab_dataset['mean_score'] = score_means
In [121]:
receptive_vocab_dataset[covs].isnull().sum()
Out[121]:
age_years          4
school             0
score              0
male               0
time              37
family_inv       158
sib               49
deg_hl_below6     10
mother_hs        318
synd_or_disab    125
jcih             202
dtype: int64
In [122]:
# Keep only records with all covariates required by the model present
required_fields = ['age', 'male', 'int_outside_option']
receptive_vocab_dataset = receptive_vocab_dataset.dropna(subset=required_fields)
In [123]:
receptive_vocab_dataset.age_int.hist(bins=40)
Out[123]:
<matplotlib.axes._subplots.AxesSubplot at 0x10bc0dc18>
In [124]:
receptive_vocab_dataset.shape
Out[124]:
(922, 83)
In [125]:
# Distribution of receptive vocabulary standard scores
ax = receptive_vocab_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
ax.set_xlabel('Standard Scores')
ax.set_ylabel('Frequency');
In [126]:
M_receptive_vocab = MCMC(generate_model(receptive_vocab_dataset), db='sqlite', name='receptive_vocab', dbmode='w')
In [127]:
M_receptive_vocab.sample(iterations, burn)
 [-----------------100%-----------------] 100000 of 100000 complete in 1251.2 sec
In [129]:
#order = np.argsort(M.beta.stats()['mean'])[::-1]
# Forest plot of fixed-effect coefficients, wide x-range.
# NOTE(review): near-duplicate of the next cell, which only narrows
# x_range to (-10, 10) — consider keeping one and parameterizing.
Matplot.summary_plot([M_receptive_vocab.beta], rhat=False,
        custom_labels=labels, main='Receptive Vocabulary', x_range=(-15,15))
In [106]:
#order = np.argsort(M.beta.stats()['mean'])[::-1]
# Same coefficient forest plot, zoomed to x in (-10, 10)
Matplot.summary_plot([M_receptive_vocab.beta], rhat=False,
        custom_labels=labels, main='Receptive Vocabulary', x_range=(-10,10))
In [107]:
M_receptive_vocab.beta.summary()
beta:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	0.309            1.085            0.076            [-1.857  2.387]
	3.26             1.406            0.113            [ 0.746  6.212]
	6.076            2.06             0.181          [  2.363  10.008]
	-4.999           0.563            0.035            [-6.112 -3.92 ]
	-1.181           0.578            0.031            [-2.359 -0.128]
	5.245            1.139            0.083            [ 2.797  7.29 ]
	4.356            1.361            0.108            [ 1.676  7.031]
	-6.92            1.236            0.088            [-9.163 -4.642]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	-1.809           -0.41           0.28           1.029         2.451
	0.402            2.339           3.268          4.256         5.909
	2.378            4.543           5.985          7.562         10.045
	-6.064           -5.394          -5.019         -4.635        -3.857
	-2.344           -1.57           -1.165         -0.815        -0.078
	2.974            4.471           5.269          6.044         7.483
	1.475            3.504           4.359          5.265         6.956
	-9.03            -7.824          -7.002         -6.073        -4.409
	

Database queries

In [108]:
import sqlite3
In [109]:
db_files = !ls *sqlite
In [110]:
articulation = sqlite3.connect("articulation.sqlite")
In [111]:
c_artic = articulation.cursor()
In [112]:
c_artic.execute('SELECT v1 FROM sigma_school WHERE trace==0')
Out[112]:
<sqlite3.Cursor at 0x1151881f0>
In [113]:
sigma_school_artic = np.ravel(c_artic.fetchall())
In [114]:
sigma_school_artic.mean()
Out[114]:
5.3893053284999999
In [115]:
# Summarize sigma_school (chain 0) for every model trace database:
# posterior median plus an empirical central 95% interval.
for dbname in db_files:
    db = sqlite3.connect(dbname)
    try:
        cur = db.cursor()
        cur.execute('SELECT v1 FROM sigma_school WHERE trace==0')
        trace = np.ravel(cur.fetchall())
    finally:
        # fix: close each connection instead of leaking one per database
        db.close()
    trace.sort()
    print('\n'+dbname)
    print(np.median(trace))
    # empirical 2.5% and 97.5% quantiles by index into the sorted samples
    print(trace[[int(len(trace)*0.025), int(len(trace)*0.975)]])
articulation.sqlite
5.468853
[ 2.593277  8.103165]

explang.sqlite
5.3118985
[ 3.703665  7.366459]

expressive_vocab.sqlite
8.231873
[  6.286766  11.080385]

receptive_vocab.sqlite
6.559168
[ 4.745076  9.006935]

reclang.sqlite
5.470786
[ 3.816387  7.789314]