Technology Effects Analysis

In [1]:
# Import modules and set options
# Render matplotlib figures inline in the notebook
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Plain-text (non-HTML) DataFrame reprs, with generous column/row limits
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 100)

Import data

In [2]:
lsl_dr = pd.read_csv('lsl_dr.csv', index_col=0)
/usr/local/lib/python3.4/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (31,33,41) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)
In [3]:
lsl_dr.head()
Out[3]:
          redcap_event_name academic_year  hl  male  _race  prim_lang  sib  \
0  initial_assessment_arm_1     2009-2010   0     0      0          0    0   
1  initial_assessment_arm_1     2010-2011   0     0      7          0    2   
2  initial_assessment_arm_1     2010-2011   0     0      7          0    2   
3  initial_assessment_arm_1     2009-2010   0     1      7          0    1   
4  initial_assessment_arm_1     2005-2006   0     1      0          0    3   

   _mother_ed  father_ed  premature_age         ...           known_synd  \
0           5          4              9         ...                    0   
1           6          6              7         ...                    0   
2           6          6              7         ...                    1   
3           6          6              8         ...                    0   
4           2          2              8         ...                    0   

   synd_or_disab  race  age_test        domain  school  score  test_name  \
0              1     0       NaN           NaN     NaN    NaN        NaN   
1              0   NaN       NaN           NaN     NaN    NaN        NaN   
2              1   NaN       NaN           NaN     NaN    NaN        NaN   
3              0   NaN       NaN           NaN     NaN    NaN        NaN   
4            NaN     0        48  Articulation     521     41        NaN   

   test_type  academic_year_start  
0        NaN                 2009  
1        NaN                 2010  
2        NaN                 2010  
3        NaN                 2009  
4    Goldman                 2005  

[5 rows x 74 columns]

Indicator for non-profound hearing loss

In [4]:
# Indicator: degree of hearing loss below "profound" (coded 6)
lsl_dr['deg_hl_below6'] = lsl_dr.degree_hl<6
# Comparisons against NaN yield False, so explicitly mask missing degree_hl
lsl_dr.loc[lsl_dr.degree_hl.isnull(), 'deg_hl_below6'] = np.nan

Indicator for first intervention outside OPTION

In [5]:
# Indicator: first intervention occurred before entering an OPTION program
# (age at program entry exceeds age at first intervention).
lsl_dr['int_outside_option'] = lsl_dr.age > lsl_dr.age_int
# Mask rows where the indicator is not meaningful: inconsistent ordering
# (age < age_int) or missing inputs. Without the null mask, rows with
# missing age/age_int would be silently coded False, unlike the sibling
# indicator columns (deg_hl_below6, mother_hs, jcih).
lsl_dr.loc[(lsl_dr.age < lsl_dr.age_int)
           | lsl_dr.age.isnull()
           | lsl_dr.age_int.isnull(), 'int_outside_option'] = np.nan

Indicator for high school graduation of mother

In [6]:
# Indicator: mother completed at least high school (mother_ed coded > 1)
lsl_dr['mother_hs'] = lsl_dr.mother_ed > 1
# Mask missing education values. Use np.nan rather than None for consistency
# with the other indicator columns (None assigned via .loc is coerced to NaN
# anyway, so behavior is unchanged).
lsl_dr.loc[lsl_dr.mother_ed.isnull(), 'mother_hs'] = np.nan

Joint commission benchmark

In [7]:
lsl_dr.onset_1.unique()
Out[7]:
array([   1. ,   18. ,   17. ,   22. ,    nan,    3. ,    2. ,    5. ,
         13. ,   36. ,   28. ,   21. ,   41. ,   26. ,    0. ,   67. ,
         20. ,   24. ,    4. ,   40. ,   60. ,   10. ,    6. ,   25. ,
          7. ,   27. ,   15. ,   35. ,   14. ,   42. ,   34. ,   12. ,
          9. ,   32. ,   50. ,    8. ,   33. ,   23. ,   11. ,   31. ,
         30. ,   49. ,   48. ,    1.5,   19. ,    2.5,   39. ,   52. ,
         16. ,   38. ,   29. ,   51. ,   46. ,   45. ,   54. ,   88. ,
         65. ,   44. ,   81. ,  116. ,   72. ,   57. ,   62. ,   43. ,
         78. ,   83. ,   61. ,  107. ,   64. ,   74. ,   37. ,   77. ,
         96. ,   97. ,   79. ,   47. ,   53. ,   59. ,   84. ,   95. ,
         80. ,    0.5,   58. ,   56. ,   86. ,   98. ,   85. ,   75. ,
        119. ,   66. ,   70. ,   63. ,  140. ,  126. ,  133. ,  103. ,
         87. ,   76. ,   55. ,   68. ,   92. ,   71. ,  154. ,   89. ,
        152. ])
In [8]:
# JCIH benchmark: hearing loss identified by 3 months of age AND enrolled
# in an intervention program by 6 months.
ident_by_3 = lsl_dr.onset_1 <= 3
program_by_6 = lsl_dr.age <= 6
lsl_dr['ident_by_3'] = ident_by_3
lsl_dr['program_by_6'] = program_by_6
lsl_dr['jcih'] = ident_by_3 & program_by_6
# The benchmark is undefined when either input is missing
missing_inputs = lsl_dr.onset_1.isnull() | lsl_dr.age.isnull()
lsl_dr.loc[missing_inputs, 'jcih'] = None

Create age in years variable

In [9]:
lsl_dr['age_years'] = lsl_dr.age/12.

Retain only 4-year-olds

In [10]:
lsl_dr = lsl_dr[(lsl_dr.age_test>=48) & (lsl_dr.age_test<60)]
In [11]:
lsl_dr.shape
Out[11]:
(6873, 81)
In [12]:
lsl_dr.study_id.unique().shape
Out[12]:
(1613,)

Summary Statistics for Paper

In [13]:
unique_students = lsl_dr.drop_duplicates(subset='study_id')
In [14]:
unique_students.shape
Out[14]:
(1613, 81)
In [15]:
unique_students.age.describe()
Out[15]:
count    1593.000000
mean       29.700565
std        15.676539
min         1.000000
25%        18.000000
50%        32.000000
75%        41.000000
max        68.000000
Name: age, dtype: float64
In [16]:
unique_students.male.mean()
Out[16]:
0.5236612702366127
In [17]:
unique_students.male.value_counts()
Out[17]:
1    841
0    765
dtype: int64
In [18]:
unique_students.sib.describe()
Out[18]:
count    1501.000000
mean        1.161226
std         0.928434
min         0.000000
25%         0.000000
50%         1.000000
75%         2.000000
max         3.000000
Name: sib, dtype: float64
In [19]:
(unique_students.degree_hl == 6).describe()
Out[19]:
count         1613
mean     0.4575325
std      0.4983478
min          False
25%              0
50%              0
75%              1
max           True
Name: degree_hl, dtype: object
In [20]:
unique_students.deg_hl_below6.isnull().sum()
Out[20]:
220
In [21]:
unique_students.deg_hl_below6.value_counts()
Out[21]:
0    738
1    655
dtype: int64
In [22]:
((unique_students.degree_hl < 6) & (unique_students.degree_hl > 1)).describe()
Out[22]:
count         1613
mean     0.3787973
std       0.485238
min          False
25%              0
50%              0
75%              1
max           True
Name: degree_hl, dtype: object
In [23]:
unique_students.mother_hs.describe()
Out[23]:
count    1006.000000
mean        0.580517
std         0.493720
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: mother_hs, dtype: float64
In [24]:
unique_students.synd_or_disab.describe()
Out[24]:
count    1413.000000
mean        0.213022
std         0.409588
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: synd_or_disab, dtype: float64
In [25]:
unique_students.groupby('male').program_by_6.value_counts()
Out[25]:
male       
0     False    670
      True      95
1     False    750
      True      91
dtype: int64
In [26]:
unique_students.groupby('sib').program_by_6.value_counts()
Out[26]:
sib       
0    False    346
     True      38
1    False    562
     True      90
2    False    267
     True      37
3    False    145
     True      16
dtype: int64
In [27]:
unique_students.groupby('deg_hl_below6').program_by_6.value_counts()
Out[27]:
deg_hl_below6       
0              False    638
               True     100
1              False    574
               True      81
dtype: int64
In [28]:
unique_students.groupby('mother_hs').program_by_6.value_counts()
Out[28]:
mother_hs       
0          False    367
           True      55
1          False    489
           True      95
dtype: int64
In [29]:
unique_students.groupby('synd_or_disab').program_by_6.value_counts()
Out[29]:
synd_or_disab       
0              False    976
               True     136
1              False    273
               True      28
dtype: int64
In [30]:
unique_students.groupby('ident_by_3').program_by_6.value_counts()
Out[30]:
ident_by_3       
False       False    1013
            True       34
True        False     414
            True      152
dtype: int64
In [31]:
unique_students.groupby('jcih').program_by_6.value_counts()
Out[31]:
jcih       
0     False    928
      True      17
1     True     152
dtype: int64
In [32]:
# Distribution of enrollment age (years) for students meeting vs. not
# meeting the JCIH benchmark, overlaid on one axes.
relabeled = unique_students.replace({'jcih': {1: 'JCIH', 0: 'non-JCIH'}})
grouped_age = relabeled.groupby('jcih')['age_years']
fig, ax = plt.subplots()

for group_label, ages in grouped_age:
    ages.hist(label=group_label, alpha=.75, ax=ax, grid=False)

ax.legend()
ax.set_xlabel('Age (years)')
ax.set_ylabel('Frequency');
In [33]:
students_data = lsl_dr.copy()
In [34]:
students_data['school_id'] = students_data.school.replace({int(s):i for i,s in enumerate(students_data.school.unique())})
In [35]:
students_data = students_data[students_data.domain=='Language']
In [36]:
students_data.shape
Out[36]:
(2594, 82)
In [37]:
# Most recent language score per student, with school id and key covariates,
# complete cases only (exported below for the mixed-model script).
# drop_duplicates(take_last=True) was deprecated in pandas 0.17 and removed
# in 0.20; keep='last' is the direct, equivalent replacement.
option_scores = students_data.drop_duplicates('study_id', keep='last')[['score', 'school_id', 
                                                                        'deg_hl_below6', 'mother_hs']].dropna()
In [38]:
option_scores = option_scores.astype(int)
In [39]:
option_scores.to_csv('option_scores.csv', index=False)

Create technology indicators

In [40]:
# True where none of the bilateral/bimodal technology indicators is set
unimodal = ~lsl_dr[[u'bilateral_ci',
        u'bilateral_ha', 
        u'bimodal']].sum(1).astype(bool)
In [41]:
# Unimodal indicators: a single device type and nothing else
lsl_dr['unimodal_ci'] = unimodal & lsl_dr.cochlear
lsl_dr['unimodal_ha'] = unimodal & lsl_dr.hearing_aid
In [42]:
lsl_dr[[u'bilateral_ci',
        u'bilateral_ha', 
        u'bimodal', 'unimodal_ci', 'unimodal_ha']].sum(0)
Out[42]:
bilateral_ci    2136
bilateral_ha    2261
bimodal          791
unimodal_ci      311
unimodal_ha      320
dtype: int64

Since bilateral hearing aid is the most prevalent category, it will be used as the baseline category.

In [43]:
# Rows with at least one of the five technology indicators set;
# this defines the subset used for all models below
tech_index = lsl_dr[[u'bilateral_ci',
        u'bilateral_ha', 
        u'bimodal', 'unimodal_ci', 'unimodal_ha']].sum(1).astype(bool)

technology_subset = lsl_dr[tech_index]
In [44]:
technology_subset.score.max()
Out[44]:
146.0
In [45]:
((unique_students.cochlear>0) & (unique_students.age_amp.notnull())).sum()
Out[45]:
572

Receptive Language Test Score Model

In [46]:
# Filter domain and English-only, and appropriate bilateral subset:
# receptive Language-domain tests for English-speaking students
receptive_language_dataset = technology_subset[(technology_subset.domain=='Language') 
                                    & (technology_subset.test_type=='receptive') 
                                    & (technology_subset.non_english==False)]

receptive_language_dataset.head()
Out[46]:
            redcap_event_name academic_year  hl  male  _race  prim_lang  sib  \
6    initial_assessment_arm_1     2005-2006   0     1      0          0    3   
250  initial_assessment_arm_1     2012-2013   0     1      0          0    2   
272  initial_assessment_arm_1     2014-2015   0     0      2          0    2   
293  initial_assessment_arm_1     2014-2015   0     0      0          0    1   
302  initial_assessment_arm_1     2011-2012   0     1      0          0    2   

     _mother_ed  father_ed  premature_age     ...       academic_year_start  \
6             2          2              8     ...                      2005   
250           4          6              7     ...                      2012   
272           6          6              8     ...                      2014   
293           5          6              7     ...                      2014   
302           6          6              8     ...                      2011   

     deg_hl_below6  int_outside_option  mother_hs  ident_by_3  program_by_6  \
6                0                   1          0       False         False   
250              1                   1          1       False         False   
272              1                   1        NaN       False         False   
293              1                   1          1       False         False   
302              0                   1        NaN       False         False   

     jcih  age_years  unimodal_ci  unimodal_ha  
6       0   4.500000        False        False  
250     0   4.666667         True        False  
272     0   3.583333        False        False  
293     0   4.083333        False        False  
302     0   4.416667        False        False  

[5 rows x 83 columns]

This is the size of the resulting dataset to be used in this analysis

In [47]:
receptive_language_dataset.shape
Out[47]:
(941, 83)
In [48]:
# Average score across a student's (possibly multiple) receptive tests
mean_score = receptive_language_dataset.groupby('study_id')['score'].mean()
In [49]:
# Reduce to one row per student, then index by study_id so the
# per-student mean aligns on assignment below
receptive_language_dataset = receptive_language_dataset.drop_duplicates('study_id')
In [50]:
receptive_language_dataset = receptive_language_dataset.set_index('study_id')
In [51]:
receptive_language_dataset['mean_score'] = mean_score

Specify criterion to use: jcih, ident_by_3, program_by_6

In [52]:
criterion_labels = {'jcih': 'JCIH',
                   'ident_by_3': 'Diagnosed by 3 months',
                   'program_by_6': 'Intervention by 6 months'}

criterion = 'jcih'

Proportion of missing values for each variable

In [53]:
# Model covariates, plus the chosen early-intervention criterion
covs = ['age_years', 'school', 'score', 'male', 'time',
        'family_inv', 'sib', 'deg_hl_below6', 'mother_hs', 'synd_or_disab'] + [criterion]
# Proportion of missing values per covariate
(receptive_language_dataset[covs].isnull().sum() / receptive_language_dataset.shape[0]).round(2)
Out[53]:
age_years        0.00
school           0.00
score            0.00
male             0.00
time             0.04
family_inv       0.16
sib              0.06
deg_hl_below6    0.01
mother_hs        0.33
synd_or_disab    0.14
jcih             0.20
dtype: float64

Drop individuals with missing age, since they account for less than 1% of the sample

In [54]:
receptive_language_dataset = receptive_language_dataset[receptive_language_dataset.age.notnull()]
In [55]:
receptive_language_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
In [56]:
receptive_language_dataset.mother_ed.hist()
Out[56]:
<matplotlib.axes._subplots.AxesSubplot at 0x10c0d9ac8>

Final analysis dataset size

In [57]:
receptive_language_dataset.shape
Out[57]:
(871, 83)
In [58]:
# Percentage of missing values per covariate, largest first.
missing_pct = (receptive_language_dataset[covs].isnull().mean()*100).round(1)
# Series.sort() sorted in place, was deprecated in pandas 0.18 and removed
# in 0.20; sort_values() returns the sorted Series and is the replacement.
missing_pct = missing_pct.sort_values(ascending=False)
missing_pct
Out[58]:
mother_hs        33.3
jcih             19.3
family_inv       15.8
synd_or_disab    14.0
sib               5.5
time              3.6
deg_hl_below6     1.3
male              0.0
score             0.0
school            0.0
age_years         0.0
dtype: float64
In [59]:
# Missingness indicator for age at amplification, used to check whether
# missingness is associated with observed covariates (informative dropout)
receptive_language_dataset['age_amp_missing'] = receptive_language_dataset.age_amp.isnull()
by_age_amp_missing = receptive_language_dataset.groupby('age_amp_missing')
In [60]:
# Covariate means by age_amp missingness.
# Select columns with a list: the bare-tuple form of GroupBy.__getitem__
# was deprecated in pandas 0.21 and later removed; the result is identical.
by_age_amp_missing[['male', 'synd_or_disab', 'deg_hl_below6', 'mother_hs',
                    'non_english']].mean()
Out[60]:
                     male  synd_or_disab  deg_hl_below6  mother_hs
age_amp_missing                                                   
False            0.510885       0.221106       0.469298   0.616034
True             0.478022       0.250000       0.386364   0.654206
In [61]:
receptive_language_dataset['jcih_missing'] = receptive_language_dataset.jcih.isnull()
by_jcih_missing = receptive_language_dataset.groupby('jcih_missing')
In [62]:
# Covariate means by jcih missingness.
# Select columns with a list: the bare-tuple form of GroupBy.__getitem__
# was deprecated in pandas 0.21 and later removed; the result is identical.
by_jcih_missing[['male', 'synd_or_disab', 'deg_hl_below6', 'mother_hs',
                 'non_english']].mean()
Out[62]:
                  male  synd_or_disab  deg_hl_below6  mother_hs
jcih_missing                                                   
False         0.513514       0.219110       0.467811   0.626033
True          0.464286       0.260563       0.385093   0.608247
In [63]:
receptive_language_dataset.age_years.mean()
Out[63]:
2.2484691924990403
In [66]:
from pymc import Bernoulli, Normal, Uniform, invlogit, logit, deterministic, MCMC, Matplot, MAP
from pymc import Lambda, Dirichlet, Categorical, Beta, NegativeBinomial, Exponential, T
from numpy.ma import masked_values

n_covs = 11

def generate_model(dataset):
    """Construct a PyMC 2.x hierarchical model of standardized test scores.

    Fixed effects (11 coefficients, cf. module-level n_covs): sex,
    identification by 3 months, program entry by 6 months, four hearing
    technology indicators, and four partially-observed covariates imputed
    inside the model (family involvement, sibling count, maternal
    education, prior disability). School enters as a random intercept.

    Parameters
    ----------
    dataset : pandas.DataFrame
        One row per student (indexed by study_id) containing the columns
        referenced below; missing covariate values are imputed.

    Returns
    -------
    dict
        locals() — all data arrays and PyMC nodes, ready to pass to MCMC().
    """
    # Extract data to local variables
    age_years = dataset.age_years.values
    school = dataset.school.values
    score = dataset.score.values
    non_english = dataset.non_english.astype(int).values
    male = dataset.male.values
    sib = dataset.sib.values
    # NOTE(review): this assignment was duplicated in the original
    # (appeared twice, identically); the redundant copy has been removed.
    synd_or_disab = dataset.synd_or_disab.values
    int_outside_option = dataset.int_outside_option.values
    study_id = dataset.index.values
    ident_by_3 = dataset.ident_by_3
    program_by_6 = dataset.program_by_6
    bilateral_ci = dataset.bilateral_ci
    bimodal = dataset.bimodal
    unimodal_ci = dataset.unimodal_ci
    unimodal_ha = dataset.unimodal_ha
    
    # Transform some data: center age at the sample mean.
    # NOTE(review): age_std is never used in theta below — confirm whether
    # age was intended to enter the linear predictor.
    age_std = age_years - age_years.mean()
    
    # Imputation of family involvement: categorical prior over the observed
    # levels; masked (missing) entries are treated as latent and sampled.
    n_family_inv_cats = len(dataset.family_inv.unique())
    p_family_inv = Dirichlet("p_family_inv", np.ones(n_family_inv_cats), 
                             value=[1./n_family_inv_cats]*(n_family_inv_cats-1))
    # 0.5 is used as a fill sentinel — assumes family_inv levels are
    # integer-coded so 0.5 cannot collide with real data (confirm)
    family_inv_masked = masked_values(dataset.family_inv.fillna(0.5), value=0.5)
    x_family_inv = Categorical('x_family_inv', p_family_inv, value=family_inv_masked, 
                               observed=True)
    
    # Imputation of maternal education (binary)
    p_mother_hs = Beta("p_mother_hs", 1, 1, value=0.5)
    # NOTE(review): masked_values(..., value=np.nan) does not mask NaNs,
    # because NaN != NaN; np.ma.masked_invalid may be what is intended —
    # confirm before relying on the imputation of this covariate.
    mother_hs_masked = masked_values(dataset.mother_hs.values, value=np.nan)
    x_mother_hs = Bernoulli('x_mother_hs', p_mother_hs, value=mother_hs_masked, observed=True)
    
    # Imputation of previous disability (binary)
    p_synd_or_disab = Beta("p_synd_or_disab", 1, 1, value=0.5)
    # NOTE(review): same NaN-masking concern as mother_hs above.
    synd_or_disab_masked = masked_values(dataset.synd_or_disab.values, value=np.nan)
    x_synd_or_disab = Bernoulli('x_synd_or_disab', p_synd_or_disab, value=synd_or_disab_masked, 
                                observed=True)
    
    # Imputation of siblings (categorical; same 0.5 sentinel as family_inv)
    n_sib_cats = len(dataset.sib.unique())
    p_sib = Dirichlet("p_sib", np.ones(n_sib_cats), value=[1./n_sib_cats]*(n_sib_cats-1))
    sib_masked = masked_values(dataset.sib.fillna(0.5), value=0.5)
    x_sib = Categorical('x_sib', p_sib, value=sib_masked, observed=True)
    
    # Grand-mean score with a vague prior
    mean_score = Normal("mean_score", 100, 0.001, value=80)
    
    # Indices to school random effects
    unique_schools = np.unique(school)
    school_index = [list(unique_schools).index(s) for s in school]

    # School random effect
    sigma_school = Uniform("sigma_school", 0, 1000, value=10)
    tau_school = sigma_school**-2
    alpha_school = Normal("alpha_school", mu=0, tau=tau_school, 
                          value=np.zeros(len(unique_schools)))
    
    @deterministic
    def intercept(a0=mean_score, a1=alpha_school):
        """Random intercepts"""
        return a0 + a1[school_index]
    
    # Fixed effects
    beta = Normal("beta", 0, 0.001, value=[0]*n_covs)

    @deterministic
    def theta(b0=intercept, b=beta, x_family_inv=x_family_inv, x_sib=x_sib, 
              x_mother_hs=x_mother_hs, x_synd_or_disab=x_synd_or_disab):
        """Linear predictor: random intercepts plus the 11 fixed effects."""
        # Complete predictors
        X = [male, ident_by_3, program_by_6, bilateral_ci, bimodal, unimodal_ci, unimodal_ha]
        # Partially-imputed predictors
        X += [x_family_inv, x_sib, x_mother_hs, x_synd_or_disab]
        return b0 + np.dot(b, X) #+ a[child_index]
    
    # Residual scale
    sigma = Uniform("sigma", 0, 1000, value=10)
    tau = sigma**-2
    
    # Data likelihood
    score_like = Normal("score_like", mu=theta, tau=tau, value=score, observed=True)
    #score_like_pred = Normal("score_like_pred", mu=theta, tau=tau, value=score)
    
    
    return locals()
In [67]:
M_reclang = MCMC(generate_model(receptive_language_dataset), 
                 db='sqlite', name='reclang', dbmode='w')
In [68]:
iterations = 100000
burn = 90000
In [69]:
M_reclang.sample(iterations, burn)
 [-----------------100%-----------------] 100000 of 100000 complete in 1195.6 sec
In [70]:
labels = ['Male', 'Identified by 3 mo.', 
          'In Program by 6 mo.', 
          'Bilateral CI', 'Bimodal', 'Unimodal CI', 'Unimodal HA',
          'Reduced Family Involvement', 
          'Sibling Count', 'Non-profound Hearing Loss',
            'Mother with HS Diploma','Additional Disability']
#                       criterion_labels[criterion]]
In [71]:
Matplot.summary_plot([M_reclang.beta], rhat=False,
        custom_labels=labels, x_range=(-15,15), main='Receptive Language')
In [73]:
M_reclang.beta.summary()
beta:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	-0.014           1.153            0.077            [-2.202  2.113]
	4.213            1.211            0.087            [ 1.945  6.66 ]
	7.079            1.719            0.141          [  3.793  10.512]
	-8.003           1.359            0.109          [-10.81   -5.494]
	-6.393           2.042            0.179          [-10.309  -2.882]
	-14.416          3.097            0.286          [-19.959  -8.341]
	5.658            2.812            0.26           [  0.512  11.589]
	-5.67            0.538            0.03             [-6.697 -4.584]
	-0.513           0.613            0.033            [-1.719  0.659]
	5.077            1.368            0.109            [ 2.226  7.522]
	-6.634           1.231            0.083            [-9.046 -4.23 ]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	-2.172           -0.861          -0.023         0.836         2.18
	2.023            3.355           4.142          5.015         6.779
	3.7              5.866           7.092          8.216         10.464
	-10.652          -8.854          -7.942         -7.134        -5.244
	-9.999           -7.739          -6.349         -5.291        -2.133
	-20.392          -16.583         -14.364        -12.336       -8.494
	0.461            3.575           5.635          7.481         11.589
	-6.756           -6.031          -5.669         -5.296        -4.631
	-1.702           -0.914          -0.513         -0.102        0.726
	2.454            4.112           5.03           5.943         7.884
	-9.211           -7.44           -6.597         -5.8          -4.343
	

Expressive Language Model

In [74]:
expressive_language_dataset = technology_subset[(technology_subset.domain=='Language') & (technology_subset.test_type=='expressive') &
                                     (technology_subset.non_english==False)]

expressive_language_dataset.head()
Out[74]:
            redcap_event_name academic_year  hl  male  _race  prim_lang  sib  \
7    initial_assessment_arm_1     2005-2006   0     1      0          0    3   
251  initial_assessment_arm_1     2012-2013   0     1      0          0    2   
273  initial_assessment_arm_1     2014-2015   0     0      2          0    2   
294  initial_assessment_arm_1     2014-2015   0     0      0          0    1   
303  initial_assessment_arm_1     2011-2012   0     1      0          0    2   

     _mother_ed  father_ed  premature_age     ...       academic_year_start  \
7             2          2              8     ...                      2005   
251           4          6              7     ...                      2012   
273           6          6              8     ...                      2014   
294           5          6              7     ...                      2014   
303           6          6              8     ...                      2011   

     deg_hl_below6  int_outside_option  mother_hs  ident_by_3  program_by_6  \
7                0                   1          0       False         False   
251              1                   1          1       False         False   
273              1                   1        NaN       False         False   
294              1                   1          1       False         False   
303              0                   1        NaN       False         False   

     jcih  age_years  unimodal_ci  unimodal_ha  
7       0   4.500000        False        False  
251     0   4.666667         True        False  
273     0   3.583333        False        False  
294     0   4.083333        False        False  
303     0   4.416667        False        False  

[5 rows x 83 columns]
In [75]:
# Per-student mean expressive score; reduce to one row per student,
# index by study_id, and attach the aligned mean
mean_score = expressive_language_dataset.groupby('study_id')['score'].mean()
expressive_language_dataset = expressive_language_dataset.drop_duplicates('study_id').set_index('study_id')
expressive_language_dataset['mean_score'] = mean_score
In [76]:
(expressive_language_dataset[covs].isnull().sum() / expressive_language_dataset.shape[0]).round(2)
Out[76]:
age_years        0.00
school           0.00
score            0.00
male             0.00
time             0.04
family_inv       0.16
sib              0.06
deg_hl_below6    0.01
mother_hs        0.34
synd_or_disab    0.15
jcih             0.20
dtype: float64
In [77]:
expressive_language_dataset = expressive_language_dataset[expressive_language_dataset.age.notnull() 
                                                        & expressive_language_dataset.int_outside_option.notnull()]
In [78]:
expressive_language_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
In [79]:
M_explang = MCMC(generate_model(expressive_language_dataset), db='sqlite', name='explang', dbmode='w')
In [80]:
M_explang.sample(iterations, burn)
 [-----------------100%-----------------] 100000 of 100000 complete in 1161.6 sec
In [81]:
M_explang.beta.summary()
beta:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	-0.608           1.162            0.079            [-2.866  1.539]
	6.014            1.234            0.094            [ 3.551  8.321]
	7.239            1.917            0.161          [  2.9    10.575]
	-10.202          1.172            0.081          [-12.343  -7.766]
	-7.481           1.778            0.148          [-10.897  -4.018]
	-15.643          2.393            0.214          [-20.17  -11.248]
	4.27             2.725            0.252            [-1.022  9.669]
	-4.916           0.596            0.035            [-6.123 -3.857]
	-0.634           0.643            0.036            [-1.86   0.603]
	6.517            1.323            0.102            [ 4.018  9.17 ]
	-6.32            1.251            0.088            [-8.696 -3.707]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	-2.76            -1.45           -0.615         0.184         1.713
	3.51             5.197           6.054          6.853         8.295
	3.238            5.981           7.282          8.577         10.96
	-12.577          -11.039         -10.232        -9.396        -7.899
	-10.895          -8.753          -7.463         -6.288        -3.999
	-20.448          -17.373         -15.479        -13.926       -11.352
	-0.93            2.321           4.061          6.167         9.799
	-6.09            -5.328          -4.932         -4.505        -3.77
	-1.845           -1.078          -0.645         -0.184        0.652
	4.082            5.683           6.403          7.402         9.273
	-8.829           -7.166          -6.327         -5.518        -3.79
	
In [82]:
Matplot.summary_plot([M_explang.beta], custom_labels=labels, rhat=False, main='Expressive Language', x_range=(-15,15))

Articulation Model

In [84]:
articulation_dataset = technology_subset[(technology_subset.domain=='Articulation') & 
                                     (technology_subset.non_english==False)]
articulation_dataset.head()
Out[84]:
            redcap_event_name academic_year  hl  male  _race  prim_lang  sib  \
4    initial_assessment_arm_1     2005-2006   0     1      0          0    3   
269  initial_assessment_arm_1     2014-2015   0     0      2          0    2   
290  initial_assessment_arm_1     2014-2015   0     0      0          0    1   
371  initial_assessment_arm_1     2014-2015   0     0      0          0    1   
504  initial_assessment_arm_1     2014-2015   0     1      0          0    0   

     _mother_ed  father_ed  premature_age     ...       academic_year_start  \
4             2          2              8     ...                      2005   
269           6          6              8     ...                      2014   
290           5          6              7     ...                      2014   
371           6          6              8     ...                      2014   
504           4          4              8     ...                      2014   

     deg_hl_below6  int_outside_option  mother_hs  ident_by_3  program_by_6  \
4                0                   1          0       False         False   
269              1                   1        NaN       False         False   
290              1                   1          1       False         False   
371              0                   1        NaN        True         False   
504              1                   0          1       False         False   

     jcih  age_years  unimodal_ci  unimodal_ha  
4       0   4.500000        False        False  
269     0   3.583333        False        False  
290     0   4.083333        False        False  
371     0   3.916667        False        False  
504     0   4.500000        False        False  

[5 rows x 83 columns]
In [85]:
# Per-student mean articulation score; reduce to one row per student,
# index by study_id, and attach the aligned mean
mean_score = articulation_dataset.groupby('study_id')['score'].mean()
articulation_dataset = articulation_dataset.drop_duplicates('study_id').set_index('study_id')
articulation_dataset['mean_score'] = mean_score
In [86]:
(articulation_dataset[covs].isnull().sum() / articulation_dataset.shape[0]).round(2)
Out[86]:
age_years        0.00
school           0.00
score            0.00
male             0.00
time             0.04
family_inv       0.17
sib              0.05
deg_hl_below6    0.01
mother_hs        0.32
synd_or_disab    0.14
jcih             0.21
dtype: float64
In [87]:
articulation_dataset = articulation_dataset[articulation_dataset.age.notnull()
                                            & articulation_dataset.int_outside_option.notnull()]
In [88]:
articulation_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
In [89]:
M_articulation = MCMC(generate_model(articulation_dataset), db='sqlite', name='articulation', dbmode='w')
In [90]:
M_articulation.sample(iterations, burn)
 [-----------------100%-----------------] 100000 of 100000 complete in 1115.2 sec
In [91]:
Matplot.summary_plot([M_articulation.beta], 
        custom_labels=labels, rhat=False, main='Articulation', x_range=(-15,15))
In [93]:
M_articulation.beta.summary()
beta:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	1.025            1.393            0.099            [-1.822  3.482]
	6.268            1.601            0.125            [ 3.039  9.445]
	4.507            2.406            0.211            [-0.181  9.25 ]
	-5.6             1.654            0.132            [-8.819 -2.345]
	-3.846           2.327            0.205            [-8.176  0.582]
	-7.661           3.711            0.349          [-15.473  -1.608]
	3.64             2.489            0.217            [-1.082  9.486]
	-3.331           0.614            0.034            [-4.58  -2.136]
	0.007            0.714            0.039            [-1.352  1.393]
	5.354            1.584            0.124            [ 2.196  8.587]
	-7.866           1.433            0.103          [-10.788  -5.272]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	-1.798           0.111           1.118          1.982         3.537
	2.927            5.227           6.344          7.344         9.399
	-0.165           2.872           4.555          6.079         9.415
	-8.918           -6.669          -5.574         -4.553        -2.411
	-8.119           -5.561          -3.958         -2.179        0.696
	-15.325          -10.014         -7.205         -4.965        -1.325
	-1.852           2.085           3.707          5.221         8.908
	-4.549           -3.748          -3.339         -2.93         -2.1
	-1.391           -0.483          0.023          0.495         1.365
	2.12             4.372           5.339          6.402         8.558
	-10.712          -8.899          -7.858         -6.919        -5.118
	

Expressive Vocabulary Model

In [94]:
# Restrict to Expressive Vocabulary assessments of English-speaking children.
in_domain = technology_subset.domain == 'Expressive Vocabulary'
is_english = technology_subset.non_english == False
expressive_vocab_dataset = technology_subset[in_domain & is_english]
expressive_vocab_dataset.head()
Out[94]:
            redcap_event_name academic_year  hl  male  _race  prim_lang  sib  \
5    initial_assessment_arm_1     2005-2006   0     1      0          0    3   
19   initial_assessment_arm_1     2007-2008   0     0      0          0    3   
246  initial_assessment_arm_1     2012-2013   0     0      0          0    2   
270  initial_assessment_arm_1     2014-2015   0     0      2          0    2   
291  initial_assessment_arm_1     2014-2015   0     0      0          0    1   

     _mother_ed  father_ed  premature_age     ...       academic_year_start  \
5             2          2              8     ...                      2005   
19            4          4              8     ...                      2007   
246           6          6              8     ...                      2012   
270           6          6              8     ...                      2014   
291           5          6              7     ...                      2014   

     deg_hl_below6  int_outside_option  mother_hs  ident_by_3  program_by_6  \
5                0                   1          0       False         False   
19               0                   1          1       False         False   
246              1                   1        NaN       False         False   
270              1                   1        NaN       False         False   
291              1                   1          1       False         False   

     jcih  age_years  unimodal_ci  unimodal_ha  
5       0   4.500000        False        False  
19      0   4.583333        False        False  
246     0   4.000000        False        False  
270     0   3.583333        False        False  
291     0   4.083333        False        False  

[5 rows x 83 columns]
In [95]:
# Average each child's scores, then collapse to one row per child,
# indexed by study_id, carrying the per-child mean score.
mean_score = expressive_vocab_dataset.groupby('study_id')['score'].mean()
expressive_vocab_dataset = (expressive_vocab_dataset
                            .drop_duplicates('study_id')
                            .set_index('study_id'))
expressive_vocab_dataset['mean_score'] = mean_score
In [96]:
expressive_vocab_dataset[covs].isnull().sum()
Out[96]:
age_years          4
school             0
score              0
male               0
time              40
family_inv       153
sib               49
deg_hl_below6      8
mother_hs        312
synd_or_disab    124
jcih             200
dtype: int64
In [97]:
# Drop rows missing key covariates before modeling.
# NOTE(review): the null-count cell above inspected `age_years`, but this
# filters on `age` — confirm both columns exist and `age` is the intended one.
expressive_vocab_dataset = expressive_vocab_dataset[expressive_vocab_dataset.age.notnull() 
                                                    & expressive_vocab_dataset.male.notnull()
                                                    & expressive_vocab_dataset.int_outside_option.notnull()]
In [98]:
# Distribution of standard scores in the modeling subset.
expressive_vocab_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
In [99]:
M_expressive_vocab = MCMC(generate_model(expressive_vocab_dataset), db='sqlite', name='expressive_vocab', dbmode='w')
In [100]:
M_expressive_vocab.sample(iterations, burn)
 [-----------------100%-----------------] 100000 of 100000 complete in 1208.7 sec
In [101]:
# Fixed-effect forest plot (wide x-range for comparability with other models).
Matplot.summary_plot([M_expressive_vocab.beta], 
        custom_labels=labels, rhat=False, main='Expressive Vocabulary', x_range=(-15,15))
In [102]:
# Same forest plot, zoomed to (-10, 10) for readability.
Matplot.summary_plot([M_expressive_vocab.beta], 
        custom_labels=labels, rhat=False, main='Expressive Vocabulary', x_range=(-10,10))
In [103]:
M_expressive_vocab.beta.summary()
beta:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	0.975            1.16             0.08             [-1.414  3.056]
	6.812            1.28             0.098            [ 4.052  9.213]
	5.863            2.085            0.177          [  1.974  10.077]
	-5.692           1.275            0.094            [-8.147 -3.308]
	-5.099           1.806            0.148            [-8.481 -1.372]
	-15.035          3.036            0.282          [-21.84   -9.861]
	6.66             2.365            0.209          [  2.757  11.604]
	-5.144           0.532            0.029            [-6.177 -4.129]
	-1.395           0.684            0.045            [-2.696 -0.007]
	6.207            1.391            0.111            [ 3.567  8.881]
	-5.2             1.222            0.093            [-7.604 -2.894]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	-1.268           0.213           0.98           1.75          3.277
	4.152            6.004           6.826          7.654         9.359
	1.191            4.751           5.903          7.196         9.834
	-8.101           -6.568          -5.659         -4.84         -3.217
	-8.628           -6.282          -5.102         -3.921        -1.378
	-21.495          -16.902         -14.941        -12.865       -9.248
	2.254            5.042           6.594          8.32          11.314
	-6.209           -5.481          -5.156         -4.793        -4.141
	-2.719           -1.868          -1.406         -0.943        -0.013
	3.658            5.193           6.143          7.181         9.14
	-7.714           -6.039          -5.138         -4.33         -2.985
	
In [104]:
# NOTE(review): this cell plots M_explang (expressive *language* model) school
# effects inside the Expressive Vocabulary section — confirm this is not a
# copy-paste slip for M_expressive_vocab.alpha_school (the title does say
# 'Expressive language', so it may be intentional but misplaced).
Matplot.summary_plot(M_explang.alpha_school, custom_labels=[' ']*42, main='Expressive language school effects', 
                     x_range=(-25, 25))
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.

Receptive Vocabulary Model

In [106]:
# Restrict to Receptive Vocabulary assessments of English-speaking children.
in_domain = technology_subset.domain == 'Receptive Vocabulary'
is_english = technology_subset.non_english == False
receptive_vocab_dataset = technology_subset[in_domain & is_english]
receptive_vocab_dataset.head()
Out[106]:
            redcap_event_name academic_year  hl  male  _race  prim_lang  sib  \
20   initial_assessment_arm_1     2007-2008   0     0      0          0    3   
82   initial_assessment_arm_1     2009-2010   0     0      0          0    2   
247  initial_assessment_arm_1     2012-2013   0     0      0          0    2   
271  initial_assessment_arm_1     2014-2015   0     0      2          0    2   
292  initial_assessment_arm_1     2014-2015   0     0      0          0    1   

     _mother_ed  father_ed  premature_age     ...       academic_year_start  \
20            4          4              8     ...                      2007   
82            4          4              8     ...                      2009   
247           6          6              8     ...                      2012   
271           6          6              8     ...                      2014   
292           5          6              7     ...                      2014   

     deg_hl_below6  int_outside_option  mother_hs  ident_by_3  program_by_6  \
20               0                   1          1       False         False   
82               0                   1          1       False         False   
247              1                   1        NaN       False         False   
271              1                   1        NaN       False         False   
292              1                   1          1       False         False   

     jcih  age_years  unimodal_ci  unimodal_ha  
20      0   4.583333        False        False  
82      0   3.833333        False        False  
247     0   4.000000        False        False  
271     0   3.583333        False        False  
292     0   4.083333        False        False  

[5 rows x 83 columns]
In [107]:
# Average each child's scores, then collapse to one row per child,
# indexed by study_id, carrying the per-child mean score.
mean_score = receptive_vocab_dataset.groupby('study_id')['score'].mean()
receptive_vocab_dataset = (receptive_vocab_dataset
                           .drop_duplicates('study_id')
                           .set_index('study_id'))
receptive_vocab_dataset['mean_score'] = mean_score
In [108]:
receptive_vocab_dataset[covs].isnull().sum()
Out[108]:
age_years          4
school             0
score              0
male               0
time              37
family_inv       158
sib               49
deg_hl_below6     10
mother_hs        318
synd_or_disab    125
jcih             202
dtype: int64
In [109]:
# Drop rows missing key covariates before modeling.
# NOTE(review): as in the expressive-vocabulary section, the null counts above
# were for `age_years` while this filters on `age` — confirm intent.
receptive_vocab_dataset = receptive_vocab_dataset[receptive_vocab_dataset.age.notnull() & 
                                                  receptive_vocab_dataset.male.notnull() &
                                                  receptive_vocab_dataset.int_outside_option.notnull()]
In [110]:
receptive_vocab_dataset.age_int.hist(bins=40)
Out[110]:
<matplotlib.axes._subplots.AxesSubplot at 0x108826da0>
In [111]:
receptive_vocab_dataset.shape
Out[111]:
(922, 83)
In [112]:
# Distribution of standard scores in the modeling subset.
receptive_vocab_dataset.score.hist(grid=False, bins=25, figsize=(10, 6))
plt.xlabel('Standard Scores'); plt.ylabel('Frequency');
In [113]:
M_receptive_vocab = MCMC(generate_model(receptive_vocab_dataset), db='sqlite', name='receptive_vocab', dbmode='w')
In [114]:
M_receptive_vocab.sample(iterations, burn)
 [-----------------100%-----------------] 100000 of 100000 complete in 1169.1 sec
In [115]:
# Forest plot of the receptive vocabulary fixed effects.
# (Removed a stale commented-out line referencing an undefined model `M`.)
Matplot.summary_plot([M_receptive_vocab.beta], rhat=False,
        custom_labels=labels, main='Receptive Vocabulary', x_range=(-15,15))
In [117]:
M_receptive_vocab.beta.summary()
beta:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	0.385            1.001            0.06             [-1.625  2.257]
	4.984            1.173            0.087              [ 2.65  7.07]
	5.367            1.732            0.145            [ 1.841  8.586]
	-5.109           1.221            0.095            [-7.366 -2.551]
	-6.193           1.888            0.166            [-9.57  -2.332]
	-14.416          2.514            0.232          [-18.62   -9.371]
	6.853            2.164            0.194          [  2.9    11.565]
	-4.353           0.525            0.031            [-5.261 -3.238]
	-1.132           0.564            0.031            [-2.225 -0.051]
	5.11             1.187            0.088            [ 2.992  7.523]
	-5.47            1.163            0.086            [-7.699 -2.975]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	-1.611           -0.278          0.38           1.033         2.308
	2.806            4.194           4.921          5.804         7.326
	1.725            4.307           5.374          6.584         8.488
	-7.774           -5.886          -5.104         -4.253        -2.817
	-9.926           -7.507          -6.054         -4.984        -2.559
	-18.693          -16.396         -14.435        -12.781       -9.379
	2.521            5.466           6.854          8.135         11.346
	-5.348           -4.725          -4.348         -4.007        -3.295
	-2.252           -1.522          -1.125         -0.751        -0.052
	2.763            4.226           5.077          5.989         7.354
	-7.777           -6.235          -5.483         -4.715        -3.03
	

Database queries

In [118]:
import sqlite3
In [119]:
db_files = !ls *sqlite
In [120]:
articulation = sqlite3.connect("articulation.sqlite")
In [121]:
c_artic = articulation.cursor()
In [122]:
c_artic.execute('SELECT v1 FROM sigma_school WHERE trace==0')
Out[122]:
<sqlite3.Cursor at 0x11a8de6c0>
In [123]:
sigma_school_artic = np.ravel(c_artic.fetchall())
In [124]:
sigma_school_artic.mean()
Out[124]:
5.7923510709000006
In [125]:
# For each model's trace database, report the posterior median and an
# empirical 95% credible interval for sigma_school (chain 0).
for dbname in db_files:
    db = sqlite3.connect(dbname)
    try:
        cur = db.cursor()
        cur.execute('SELECT v1 FROM sigma_school WHERE trace==0')
        # rows are 1-tuples; flatten to a 1-D array of samples
        trace = np.ravel(cur.fetchall())
    finally:
        # Original leaked one open connection per database; close explicitly.
        db.close()
    trace.sort()
    print('\n'+dbname)
    print(np.median(trace))
    # 2.5% / 97.5% quantiles taken as order statistics of the sorted trace
    print(trace[[int(len(trace)*0.025), int(len(trace)*0.975)]])
articulation.sqlite
5.739025
[ 3.090618  8.920649]

explang.sqlite
4.945318
[ 3.390639  7.225201]

expressive_vocab.sqlite
8.008659
[  6.058134  10.607475]

receptive_vocab.sqlite
6.532987
[ 4.738706  9.103903]

reclang.sqlite
5.1217285
[ 3.251533  7.165217]