In [17]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pandas as pd
import numpy as np
from pylab import *
In [18]:
# kernel density estimation
from scipy.stats import gaussian_kde

Apply grayscale first, seaborn whitegrid second to get greyscale plots.

In [19]:
grayscale = True

if grayscale:
    plt.style.use('grayscale')
In [20]:
print(style.available)
['seaborn-darkgrid', 'seaborn-whitegrid', 'ggplot', 'seaborn-pastel', 'seaborn-poster', 'seaborn-deep', 'seaborn-notebook', 'seaborn-white', 'seaborn-dark-palette', 'seaborn-paper', 'bmh', 'seaborn-dark', 'seaborn-colorblind', 'dark_background', 'grayscale', 'classic', 'seaborn-ticks', 'seaborn-talk', 'fivethirtyeight', 'seaborn-bright', 'seaborn-muted']
In [21]:
%matplotlib inline
import seaborn as sns
sns.set_style("whitegrid")
In [22]:
CHRIS = True
In [23]:
from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
if not CHRIS: 
    api_key = open("/Users/alicetoll/Documents/OPTION/token.txt").read()
else:
    api_key = open("/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt").read()

lsl_dr_project = Project(api_url, api_key)
In [24]:
articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss']
articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', 
                                           df_kwargs={'index_col':None,
                                                      'na_values':[999, 9999]})
In [25]:
expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss']
expressive = lsl_dr_project.export_records(fields=expressive_fields, format='df', 
                                           df_kwargs={'index_col':None,
                                                      'na_values':[999, 9999]})
In [26]:
receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss']
receptive = lsl_dr_project.export_records(fields=receptive_fields, format='df', 
                                          df_kwargs={'index_col':None,
                                                     'na_values':[999, 9999]})
In [27]:
language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls',
                   'owls_lc_ss','owls_oe_ss','age_test_owls',
                   'celfp_rl_ss','celfp_el_ss','age_test_celp',
                   'celf_elss','celf_rlss','age_test_celf']
language_raw = lsl_dr_project.export_records(fields=language_fields, format='df', 
                                             df_kwargs={'index_col':None, 
                                                        'na_values':[999, 9999]})
In [28]:
demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year',
'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race',
'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as',
'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae',
'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes',
'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age',
'att_days_hr', 'att_days_sch', 'att_days_st2_417']
demographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df', 
                                            df_kwargs={'index_col':None, 
                                                       'na_values':[888, 999, 9999]})
/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2821: DtypeWarning: Columns (25,31,41) have mixed types. Specify dtype option on import or set low_memory=False.
  if self.run_code(code, result):

Language

5 language measures:

  • 3 versions of CELF
  • PLS
    • pls_ac_rs: PLS: Auditory Comprehension Raw Score
    • pls_ac_ss: PLS: Auditory Comprehension Standard Score
    • pls_ec_rs: PLS: Expressive Communication Raw Score
    • pls_ec_ss: PLS: Expressive Communication Standard Score
    • pls_tl_rs: PLS: Total Language Score Standard Score Total
    • pls_tl_ss: PLS: Total Language Score Standard Score
  • OWLS
    • age_test_owls: Age at time of testing (OWLS)
    • owls_lc_rs: OWLS: Listening Comprehension Raw Score
    • owls_lc_ss: OWLS: Listening Comprehension Standard Score
    • owls_oe_rs: OWLS: Oral Expression Raw Score
    • owls_oe_ss: OWLS: Oral Expression Standard Score
    • owls_oc_sss: OWLS: Oral Composite Sum of Listening Comprehension and Oral Expression Standard Scores
    • owls_oc_ss: OWLS: Oral Composite Standard Score
    • owls_wes_trs: OWLS: Written Expression Scale Total Raw Score
    • owls_wes_as: OWLS: Written Expression Scale Ability Score
    • owls_wes_ss: OWLS: Written Expression Scale Standard Score
    • owsl_lc: OWLS: Written Expression Scale Language Composite (Sum of written expression age-based standard score, listening comprehension standard score and oral expression standard score)
    • owls_lcss: OWLS: Language Composite Standard Score
In [29]:
# Test type
language_raw["test_name"] = None
language_raw["test_type"] = None
language_raw["score"] = None
CELP = language_raw.age_test_celp.notnull()
CELF = language_raw.age_test_celf.notnull()
PLS = language_raw.age_test_pls.notnull()
OWLS = language_raw.age_test_owls.notnull()

language_raw['age_test'] = None
language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp
language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf
language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls
language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls

language1 = language_raw[CELP | CELF | PLS | OWLS].copy()
language2 = language1.copy()

language1["test_type"] = "receptive"

language1.loc[CELP, "test_name"] = "CELF-P2"
language1.loc[CELF, "test_name"] = "CELF-4"
language1.loc[PLS, "test_name"] = "PLS"
language1.loc[OWLS, "test_name"] = "OWLS"

language1.loc[CELP, "score"] = language1.celfp_rl_ss
language1.loc[CELF, "score"] = language1.celf_rlss
language1.loc[PLS, "score"] = language1.pls_ac_ss
language1.loc[OWLS, "score"] = language1.owls_lc_ss


language2["test_type"] = "expressive"

language2.loc[CELP, "test_name"] = "CELF-P2"
language2.loc[CELF, "test_name"] = "CELF-4"
language2.loc[PLS, "test_name"] = "PLS"
language2.loc[OWLS, "test_name"] = "OWLS"

language2.loc[CELP, "score"] = language1.celfp_el_ss
language2.loc[CELF, "score"] = language1.celf_elss
language2.loc[PLS, "score"] = language1.pls_ec_ss
language2.loc[OWLS, "score"] = language1.owls_oe_ss

language = pd.concat([language1, language2])
language = language[language.score.notnull()]
print(pd.crosstab(language.test_name, language.test_type))
print("There are {0} null values for score".format(sum(language["score"].isnull())))
test_type  expressive  receptive
test_name                       
CELF-4            659        565
CELF-P2          1722       1728
OWLS             1259       1267
PLS              4072       4083
There are 0 null values for score
In [30]:
language["school"] = language.study_id.str.slice(0,4)
language = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test"]]
language["domain"] = "Language"
language.head()
Out[30]:
study_id redcap_event_name score test_type test_name school age_test domain
0 0101-2002-0101 initial_assessment_arm_1 51 receptive PLS 0101 54 Language
5 0101-2002-0101 year_5_complete_71_arm_1 61 receptive OWLS 0101 113 Language
9 0101-2003-0102 initial_assessment_arm_1 55 receptive PLS 0101 44 Language
10 0101-2003-0102 year_1_complete_71_arm_1 77 receptive PLS 0101 54 Language
11 0101-2003-0102 year_2_complete_71_arm_1 93 receptive CELF-P2 0101 68 Language
In [31]:
language['ageGroup'] = None # initial variable to none
language.loc[(language.age_test >= 36) & (language.age_test < 48), 'ageGroup'] = 3 
language.loc[(language.age_test >= 48) & (language.age_test < 60), 'ageGroup'] = 4 
language.loc[(language.age_test >= 60) & (language.age_test < 72), 'ageGroup'] = 5 
language.head()
Out[31]:
study_id redcap_event_name score test_type test_name school age_test domain ageGroup
0 0101-2002-0101 initial_assessment_arm_1 51 receptive PLS 0101 54 Language 4
5 0101-2002-0101 year_5_complete_71_arm_1 61 receptive OWLS 0101 113 Language None
9 0101-2003-0102 initial_assessment_arm_1 55 receptive PLS 0101 44 Language 3
10 0101-2003-0102 year_1_complete_71_arm_1 77 receptive PLS 0101 54 Language 4
11 0101-2003-0102 year_2_complete_71_arm_1 93 receptive CELF-P2 0101 68 Language 5

Expressive Language

In [32]:
if CHRIS:
    !mkdir DescriptiveFigures
mkdir: DescriptiveFigures: File exists
In [33]:
expressive_lang = language.loc[language.test_type=='expressive'].copy()
expressive_lang.head()
Out[33]:
study_id redcap_event_name score test_type test_name school age_test domain ageGroup
0 0101-2002-0101 initial_assessment_arm_1 60 expressive PLS 0101 54 Language 4
5 0101-2002-0101 year_5_complete_71_arm_1 82 expressive OWLS 0101 113 Language None
9 0101-2003-0102 initial_assessment_arm_1 68 expressive PLS 0101 44 Language 3
10 0101-2003-0102 year_1_complete_71_arm_1 77 expressive PLS 0101 54 Language 4
11 0101-2003-0102 year_2_complete_71_arm_1 85 expressive CELF-P2 0101 68 Language 5

Create models for converting scores

In [34]:
expressive_scores = (expressive_lang.groupby(['study_id','ageGroup'])
                     .apply(lambda x: x.pivot(columns='test_name', values='score')))
In [35]:
expressive_scores = (expressive_scores.set_index(expressive_scores.index.droplevel(2))
                         .reset_index()
                        .groupby(['study_id','ageGroup'])).apply(max)
In [36]:
expressive_owls_pls = expressive_scores[['OWLS', 'PLS']].dropna()
expressive_owls_pls.plot.scatter('OWLS', 'PLS')
Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0x111e49908>
In [37]:
from sklearn import linear_model
reg_owls = linear_model.LinearRegression()
reg_owls.fit(expressive_owls_pls.OWLS.values.reshape(-1,1), expressive_owls_pls.PLS.values)
Out[37]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [38]:
expressive_lang.test_name.value_counts()
Out[38]:
PLS        4072
CELF-P2    1722
OWLS       1259
CELF-4      659
Name: test_name, dtype: int64
In [39]:
expressive_lang['old_score'] = expressive_lang.score.copy()
pred_vals = reg_owls.predict(expressive_lang[expressive_lang.test_name=='OWLS'].score.values.reshape(-1,1))
expressive_lang.loc[expressive_lang.test_name=='OWLS', 'score'] = pred_vals
In [40]:
expressive_celf_pls = expressive_scores[['CELF-P2', 'PLS']].dropna()
expressive_celf_pls.plot.scatter('CELF-P2', 'PLS')
Out[40]:
<matplotlib.axes._subplots.AxesSubplot at 0x1134b3fd0>
In [41]:
reg_celf = linear_model.LinearRegression()
reg_celf.fit(expressive_celf_pls['CELF-P2'].values.reshape(-1,1), expressive_celf_pls.PLS.values)
Out[41]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [42]:
pred_vals = reg_celf.predict(expressive_lang[expressive_lang.test_name=='CELF-P2'].score.values.reshape(-1,1))
expressive_lang.loc[expressive_lang.test_name=='CELF-P2', 'score'] = pred_vals
In [43]:
expressive_scores[['CELF-4', 'PLS']].dropna().plot.scatter('CELF-4', 'PLS')
Out[43]:
<matplotlib.axes._subplots.AxesSubplot at 0x1122b26a0>

There arent enough points to fit a model for CELF-2, so we will combine these directly.

In [44]:
# Test type
expressive_lang["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
articulation = articulation[ARIZ | GF]
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"

print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))

# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]
Goldman                 5728
Arizonia                 525
Arizonia and Goldman      91
Name: test_type, dtype: int64
There are 0 null values for test_type
In [45]:
plt.style.use('grayscale')
sns.set_style("whitegrid")
bp = expressive_lang.boxplot(column='score', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Expressive Language')
for i in [1,2,3]:
    y = expressive_lang.score[expressive_lang.ageGroup==i+2].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(y))
    plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/expLang.png', dpi=300)
    
expressive_lang.ageGroup.value_counts()
# expressive_lang.groupby('ageGroup')['score'].agg([np.mean, np.median, np.std, len])
Out[45]:
3    1590
4    1572
5    1118
Name: ageGroup, dtype: int64
In [46]:
bp = expressive_lang.boxplot(column='score', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Expressive Language')
for i in [1,2,3]:
    y = expressive_lang.score[expressive_lang.ageGroup==i+2].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(y))
    plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/expLang.png', dpi=300)
In [47]:
# need to do this for some reason...
expressive_lang['scoreInt'] = np.array(expressive_lang.score, dtype = 'int')
expressive_lang.groupby('ageGroup')['scoreInt'].agg([np.mean, np.median, np.std, len])
Out[47]:
mean median std len
ageGroup
3 82.952201 82 15.582549 1590
4 81.650127 81 16.563240 1572
5 80.762970 80 18.071186 1118

Receptive Language

In [48]:
receptive_lang = language.loc[language.test_type=='receptive'].copy()
receptive_lang.head()
Out[48]:
study_id redcap_event_name score test_type test_name school age_test domain ageGroup
0 0101-2002-0101 initial_assessment_arm_1 51 receptive PLS 0101 54 Language 4
5 0101-2002-0101 year_5_complete_71_arm_1 61 receptive OWLS 0101 113 Language None
9 0101-2003-0102 initial_assessment_arm_1 55 receptive PLS 0101 44 Language 3
10 0101-2003-0102 year_1_complete_71_arm_1 77 receptive PLS 0101 54 Language 4
11 0101-2003-0102 year_2_complete_71_arm_1 93 receptive CELF-P2 0101 68 Language 5
In [49]:
receptive_lang.test_name.value_counts()
Out[49]:
PLS        4083
CELF-P2    1728
OWLS       1267
CELF-4      565
Name: test_name, dtype: int64
In [50]:
receptive_scores = (receptive_lang.groupby(['study_id','ageGroup'])
                     .apply(lambda x: x.pivot(columns='test_name', values='score')))
In [51]:
receptive_scores = (receptive_scores.set_index(receptive_scores.index.droplevel(2))
                         .reset_index()
                        .groupby(['study_id','ageGroup'])).apply(max)
In [52]:
receptive_owls_pls = receptive_scores[['OWLS', 'PLS']].dropna()
receptive_owls_pls.plot.scatter('OWLS', 'PLS')
Out[52]:
<matplotlib.axes._subplots.AxesSubplot at 0x1157340b8>
In [53]:
receptive_celfp2_pls = receptive_scores[['CELF-P2', 'PLS']].dropna()
receptive_celfp2_pls.plot.scatter('CELF-P2', 'PLS')
Out[53]:
<matplotlib.axes._subplots.AxesSubplot at 0x113525668>
In [54]:
receptive_celf4_pls = receptive_scores[['CELF-4', 'PLS']].dropna()
receptive_celf4_pls.plot.scatter('CELF-4', 'PLS')
Out[54]:
<matplotlib.axes._subplots.AxesSubplot at 0x115449828>
In [55]:
reg_owls = linear_model.LinearRegression()
reg_owls.fit(receptive_owls_pls.OWLS.values.reshape(-1,1), receptive_owls_pls.PLS.values)
Out[55]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [56]:
receptive_lang['old_score'] = receptive_lang.score.copy()
pred_vals = reg_owls.predict(receptive_lang[receptive_lang.test_name=='OWLS'].score.values.reshape(-1,1))
receptive_lang.loc[receptive_lang.test_name=='OWLS', 'score'] = pred_vals
In [57]:
reg_celf = linear_model.LinearRegression()
reg_celf.fit(receptive_celfp2_pls['CELF-P2'].values.reshape(-1,1), receptive_celfp2_pls.PLS.values)
Out[57]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [58]:
pred_vals = reg_celf.predict(receptive_lang[receptive_lang.test_name=='CELF-P2'].score.values.reshape(-1,1))
receptive_lang.loc[receptive_lang.test_name=='CELF-P2', 'score'] = pred_vals
In [59]:
receptive_lang['scoreInt'] = np.array(receptive_lang.score, dtype = 'int')
In [60]:
bp = receptive_lang.boxplot(column='scoreInt', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Receptive Language')
for i in [1,2,3]:
    y = receptive_lang.score[receptive_lang.ageGroup==i+2].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(y))
    plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/recLang.png', dpi=300)

receptive_lang.groupby('ageGroup')['scoreInt'].agg([np.mean, np.median, np.std, len])
Out[60]:
mean median std len
ageGroup
3 83.541509 84 18.573121 1590
4 83.525649 84 17.976144 1579
5 82.148858 81 17.876704 1095

Articulation

In [61]:
# Test type
articulation["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
articulation = articulation[ARIZ | GF]
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"

print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))

# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]
Goldman                 5728
Arizonia                 525
Arizonia and Goldman      91
Name: test_type, dtype: int64
There are 0 null values for test_type

Map Arizonia onto Goldman

In [62]:
# create indicator variable if a student took either test
articulation.loc[articulation.aaps_ss.notnull() | articulation.gf2_ss.notnull(), 'test'] = 1
# drop observations when neither test was taken
temp = articulation.dropna(subset = ['test'])
# Can drop test variable
temp = temp.drop('test', 1)
# Create new variable if student took both tests in one observation
temp['both'] = 0
temp.loc[temp.aaps_ss.notnull() & temp.gf2_ss.notnull(), 'both'] = 1
print(temp.both.value_counts()) # 73 students took both tests, 5716 took only one
# temp.head()
0    6253
1      91
Name: both, dtype: int64
In [63]:
AAPS = temp.aaps_ss.notnull()
GF2 = temp.gf2_ss.notnull()

temp.loc[AAPS, "test_name"] = "AAPS"
temp.loc[GF2, "test_name"] = "GF2"
In [64]:
# One test
single = temp.loc[temp.both==0,]
a = single.shape[0]
single = single.groupby('study_id').last()
b = single.shape[0]
print('We have', a, 'observations where a student took one test in a single year, but only', b, 'unique students')

# Both tests
both = temp.loc[temp.both==1,]
a = both.shape[0]
both = both.groupby('study_id').last()
b = both.shape[0]
print('We have', a, 'observations where a student took both test in a single year, but only', b, 'unique students')
We have 6253 observations where a student took one test in a single year, but only 2932 unique students
We have 91 observations where a student took both test in a single year, but only 62 unique students
In [65]:
reg = linear_model.LinearRegression()
reg.fit(both.aaps_ss.values.reshape(-1,1), both.gf2_ss.values)
Out[65]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [66]:
articulation['old_score'] = articulation.score.copy()
pred_vals = reg.predict(articulation[articulation.test_type=='Arizonia'].score.values.reshape(-1,1))
articulation.loc[articulation.test_type=='Arizonia', 'score'] = pred_vals
In [67]:
articulation["school"] = articulation.study_id.str.slice(0,4)
articulation["age_test"] = articulation.age_test_aaps
articulation.loc[articulation.age_test.isnull(), 'age_test'] = articulation.age_test_gf2[articulation.age_test.isnull()]
print(articulation.age_test.describe())
count    6340.000000
mean       68.697950
std        30.557327
min        23.000000
25%        47.000000
50%        60.000000
75%        81.000000
max       243.000000
Name: age_test, dtype: float64
In [68]:
articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1)
articulation["domain"] = "Articulation"
articulation.head()
Out[68]:
study_id redcap_event_name test_type score test old_score school age_test domain
1 0101-2002-0101 year_1_complete_71_arm_1 Goldman 78.0 1.0 78.0 0101 80.0 Articulation
9 0101-2003-0102 initial_assessment_arm_1 Goldman 72.0 1.0 72.0 0101 44.0 Articulation
10 0101-2003-0102 year_1_complete_71_arm_1 Goldman 97.0 1.0 97.0 0101 54.0 Articulation
14 0101-2004-0101 year_2_complete_71_arm_1 Goldman 75.0 1.0 75.0 0101 53.0 Articulation
15 0101-2004-0101 year_3_complete_71_arm_1 Goldman 80.0 1.0 80.0 0101 66.0 Articulation
In [69]:
articulation['ageGroup'] = None # initial variable to none
articulation.loc[(articulation.age_test >= 36) & (articulation.age_test < 48), 'ageGroup'] = 3 
articulation.loc[(articulation.age_test >= 48) & (articulation.age_test < 60), 'ageGroup'] = 4 
articulation.loc[(articulation.age_test >= 60) & (articulation.age_test < 72), 'ageGroup'] = 5 

bp = articulation.boxplot(column='score', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Articulation')
for i in [1,2,3]:
    y = articulation.score[articulation.ageGroup==i+2].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(y))
    plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/artic.png', dpi=300)

articulation.groupby('ageGroup')['score'].agg([np.mean, np.median, np.std, len])
Out[69]:
mean median std len
ageGroup
3 83.422264 84.0 18.680816 1281.0
4 82.868699 85.0 21.087332 1489.0
5 82.219630 86.0 21.043017 1131.0

Expressive Vocabulary

In [70]:
# Test type
expressive["test_type"] = None
EOWPVT = expressive.eowpvt_ss.notnull()
EVT = expressive.evt_ss.notnull()
expressive = expressive[EOWPVT | EVT]
expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT"
expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT"
expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT"
print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull())))

expressive["score"] = expressive.eowpvt_ss
expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT]
There are 0 null values for test_type
In [71]:
expressive.test_type.value_counts()
Out[71]:
EVT               4271
EOWPVT            3087
EOWPVT and EVT     176
Name: test_type, dtype: int64

Map EVT to EOWPVT

In [72]:
# create indicator variable if a student took either test
expressive.loc[expressive.evt_ss.notnull() | expressive.eowpvt_ss.notnull(), 'test'] = 1
# drop observations when neither test was taken
temp = expressive.dropna(subset = ['test'])
# Can drop test variable
temp = temp.drop('test', 1)
# Create new variable if student took both tests in one observation
temp['both'] = 0
temp.loc[temp.evt_ss.notnull() & temp.eowpvt_ss.notnull(), 'both'] = 1
print(temp.both.value_counts()) # 73 students took both tests, 5716 took only one
# temp.head()
0    7358
1     176
Name: both, dtype: int64
In [73]:
EVT = temp.evt_ss.notnull()
EOWPVT = temp.eowpvt_ss.notnull()

temp.loc[EVT, "test_name"] = "EVT"
temp.loc[EOWPVT, "test_name"] = "EOWPVT"
In [74]:
# One test
single = temp.loc[temp.both==0,]
a = single.shape[0]
single = single.groupby('study_id').last()
b = single.shape[0]
print('We have', a, 'observations where a student took one test in a single year, but only', b, 'unique students')

# Both tests
both = temp.loc[temp.both==1,]
a = both.shape[0]
both = both.groupby('study_id').last()
b = both.shape[0]
print('We have', a, 'observations where a student took both test in a single year, but only', b, 'unique students')
We have 7358 observations where a student took one test in a single year, but only 3333 unique students
We have 176 observations where a student took both test in a single year, but only 127 unique students
In [75]:
del temp
In [76]:
reg = linear_model.LinearRegression()
reg.fit(both.evt_ss.values.reshape(-1,1), both.eowpvt_ss.values)
Out[76]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [77]:
expressive['old_score'] = expressive.score.copy()
pred_vals = reg.predict(expressive[expressive.test_type=='EVT'].score.values.reshape(-1,1))
expressive.loc[expressive.test_type=='EVT', 'score'] = pred_vals
In [78]:
expressive["school"] = expressive.study_id.str.slice(0,4)
In [79]:
expressive["age_test"] = expressive.age_test_eowpvt
expressive.loc[expressive.age_test.isnull(), 'age_test'] = expressive.age_test_evt[expressive.age_test.isnull()]
In [80]:
expressive = expressive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
expressive["domain"] = "Expressive Vocabulary"
expressive.head()
Out[80]:
study_id redcap_event_name score test_type school age_test domain
0 0101-2002-0101 initial_assessment_arm_1 58.0 EOWPVT 0101 54.0 Expressive Vocabulary
2 0101-2002-0101 year_2_complete_71_arm_1 84.0 EOWPVT 0101 80.0 Expressive Vocabulary
5 0101-2002-0101 year_5_complete_71_arm_1 90.0 EOWPVT 0101 113.0 Expressive Vocabulary
14 0101-2004-0101 year_2_complete_71_arm_1 90.0 EOWPVT 0101 53.0 Expressive Vocabulary
15 0101-2004-0101 year_3_complete_71_arm_1 87.0 EOWPVT 0101 66.0 Expressive Vocabulary
In [81]:
expressive['ageGroup'] = None # initial variable to none
expressive.loc[(expressive.age_test >= 36) & (expressive.age_test < 48), 'ageGroup'] = 3 
expressive.loc[(expressive.age_test >= 48) & (expressive.age_test < 60), 'ageGroup'] = 4 
expressive.loc[(expressive.age_test >= 60) & (expressive.age_test < 72), 'ageGroup'] = 5 

bp = expressive.boxplot(column='score', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Expressive Vocabulary')
for i in [1,2,3]:
    y = expressive.score[expressive.ageGroup==i+2].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(y))
    plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/expVocab.png', dpi=300)

expressive.groupby('ageGroup')['score'].agg([np.mean, np.median, np.std, len])
Out[81]:
mean median std len
ageGroup
3 90.898049 92.770977 18.628653 1542.0
4 90.023118 91.000000 18.967224 1726.0
5 89.124085 90.000000 17.289723 1283.0

Receptive Vocabulary

In [82]:
receptive.columns
Out[82]:
Index(['study_id', 'redcap_event_name', 'age_test_ppvt', 'ppvt_ss',
       'age_test_rowpvt', 'rowpvt_ss'],
      dtype='object')
In [83]:
# Test type
receptive["test_type"] = None
PPVT = receptive.ppvt_ss.notnull()
ROWPVT = receptive.rowpvt_ss.notnull()
receptive = receptive[PPVT | ROWPVT]
receptive.loc[PPVT & ROWPVT, "test_type"] = "PPVT and ROWPVT"
receptive.loc[PPVT & ~ROWPVT, "test_type"] = "PPVT"
receptive.loc[~PPVT & ROWPVT, "test_type"] = "ROWPVT"
print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull())))

receptive["score"] = receptive.ppvt_ss
receptive.loc[~PPVT & ROWPVT, "score"] = receptive.rowpvt_ss[~PPVT & ROWPVT]
There are 0 null values for test_type

Map PPVT onto ROWPVT

In [84]:
# create indicator variable if a student took either test
receptive.loc[receptive.ppvt_ss.notnull() | receptive.rowpvt_ss.notnull(), 'test'] = 1
# drop observations when neither test was taken
temp = receptive.dropna(subset = ['test'])
# Can drop test variable
temp = temp.drop('test', 1)
# Create new variable if student took both tests in one observation
temp['both'] = 0
temp.loc[temp.ppvt_ss.notnull() & temp.rowpvt_ss.notnull(), 'both'] = 1
print(temp.both.value_counts()) # 73 students took both tests, 5716 took only one
# temp.head()
0    7497
1     228
Name: both, dtype: int64
In [85]:
PPVT = temp.ppvt_ss.notnull()
ROWPVT = temp.rowpvt_ss.notnull()

temp.loc[PPVT, "test_name"] = "PPVT"
temp.loc[ROWPVT, "test_name"] = "ROWPVT"
In [86]:
# One test
single = temp.loc[temp.both==0,]
a = single.shape[0]
single = single.groupby('study_id').last()
b = single.shape[0]
print('We have', a, 'observations where a student took one test in a single year, but only', b, 'unique students')

# Both tests
both = temp.loc[temp.both==1,]
a = both.shape[0]
both = both.groupby('study_id').last()
b = both.shape[0]
print('We have', a, 'observations where a student took both test in a single year, but only', b, 'unique students')
We have 7497 observations where a student took one test in a single year, but only 3364 unique students
We have 228 observations where a student took both test in a single year, but only 157 unique students
In [87]:
del temp
In [88]:
rv_reg = linear_model.LinearRegression()
rv_reg.fit(both.ppvt_ss.values.reshape(-1,1), both.rowpvt_ss.values)
Out[88]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [89]:
receptive['old_score'] = receptive.score.copy()
pred_vals = reg.predict(receptive[receptive.test_type=='PPVT'].score.values.reshape(-1,1))
receptive.loc[receptive.test_type=='PPVT', 'score'] = pred_vals
In [90]:
receptive["school"] = receptive.study_id.str.slice(0,4)
In [91]:
receptive["age_test"] = receptive.age_test_ppvt
receptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()]
In [92]:
print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull())))
There are 23 null values for age_test
In [93]:
receptive = receptive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
receptive["domain"] = "Receptive Vocabulary"
receptive.head()
Out[93]:
study_id redcap_event_name score test_type school age_test domain
2 0101-2002-0101 year_2_complete_71_arm_1 86.408510 PPVT 0101 80.0 Receptive Vocabulary
5 0101-2002-0101 year_5_complete_71_arm_1 101.000000 ROWPVT 0101 113.0 Receptive Vocabulary
9 0101-2003-0102 initial_assessment_arm_1 58.572717 PPVT 0101 44.0 Receptive Vocabulary
10 0101-2003-0102 year_1_complete_71_arm_1 78.455426 PPVT 0101 54.0 Receptive Vocabulary
11 0101-2003-0102 year_2_complete_71_arm_1 95.156903 PPVT 0101 68.0 Receptive Vocabulary
In [94]:
receptive.study_id.unique().shape
Out[94]:
(3404,)
In [95]:
receptive['ageGroup'] = None # initial variable to none
receptive.loc[(receptive.age_test >= 36) & (receptive.age_test < 48), 'ageGroup'] = 3 
receptive.loc[(receptive.age_test >= 48) & (receptive.age_test < 60), 'ageGroup'] = 4 
receptive.loc[(receptive.age_test >= 60) & (receptive.age_test < 72), 'ageGroup'] = 5 
In [96]:
bp = receptive.boxplot(column='score', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Receptive Vocabulary')
for i in [1,2,3]:
    y = receptive.score[receptive.ageGroup==i+2].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(y))
    plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/recVocab.png', dpi=300)

receptive.groupby('ageGroup')['score'].agg([np.mean, np.median, np.std, len])
Out[96]:
mean median std len
ageGroup
3 89.766914 90.385052 16.611238 1603.0
4 88.455310 89.589744 17.323336 1756.0
5 87.784735 88.794435 15.319930 1312.0
In [97]:
width = 3
sns.kdeplot(np.array(receptive[receptive.ageGroup ==3].score), bw=width, label = "3-year-olds")
sns.kdeplot(np.array(receptive[receptive.ageGroup ==4].score), bw=width, label = "4-year-olds")
sns.kdeplot(np.array(receptive[receptive.ageGroup ==5].score), bw=width, label = "5-year-olds")
/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[97]:
<matplotlib.axes._subplots.AxesSubplot at 0x115c18cc0>
In [98]:
receptive[receptive.ageGroup ==3].score.hist(alpha=0.3, label='3-year-olds')
receptive[receptive.ageGroup ==4].score.hist(alpha=0.3, label='4-year-olds')
receptive[receptive.ageGroup ==5].score.hist(alpha=0.3, label='5-year-olds')
plt.legend(loc=2)
# plt.legend(bbox_to_anchor=(1.05, 1),loc=2, borderaxespad=0.)
# plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.2),
#           ncol=3, fancybox=True, shadow=True)
Out[98]:
<matplotlib.legend.Legend at 0x1190c5320>

Merge Datasets

In [99]:
test_scores = pd.concat([articulation, expressive, receptive, language])
test_scores.head()
Out[99]:
ageGroup age_test domain old_score redcap_event_name school score study_id test test_name test_type
1 None 80 Articulation 78.0 year_1_complete_71_arm_1 0101 78 0101-2002-0101 1.0 NaN Goldman
9 3 44 Articulation 72.0 initial_assessment_arm_1 0101 72 0101-2003-0102 1.0 NaN Goldman
10 4 54 Articulation 97.0 year_1_complete_71_arm_1 0101 97 0101-2003-0102 1.0 NaN Goldman
14 4 53 Articulation 75.0 year_2_complete_71_arm_1 0101 75 0101-2004-0101 1.0 NaN Goldman
15 5 66 Articulation 80.0 year_3_complete_71_arm_1 0101 80 0101-2004-0101 1.0 NaN Goldman
In [100]:
print(test_scores.test_type.value_counts())
print(test_scores.domain.value_counts())
expressive              7712
receptive               7643
Goldman                 5728
PPVT                    4893
EVT                     4271
EOWPVT                  3087
ROWPVT                  2604
Arizonia                 525
PPVT and ROWPVT          228
EOWPVT and EVT           176
Arizonia and Goldman      91
Name: test_type, dtype: int64
Language                 15355
Receptive Vocabulary      7725
Expressive Vocabulary     7534
Articulation              6344
Name: domain, dtype: int64
In [101]:
# make a new domain variable that contains only 5 categories
test_scores['domain2'] = test_scores.domain.copy()
test_scores.loc[test_scores.test_type == 'expressive', 'domain2'] = 'Expressive Language'
test_scores.loc[test_scores.test_type == 'receptive', 'domain2'] = 'Receptive Language'
test_scores.domain2.value_counts()
Out[101]:
Receptive Vocabulary     7725
Expressive Language      7712
Receptive Language       7643
Expressive Vocabulary    7534
Articulation             6344
Name: domain2, dtype: int64
In [102]:
# make a dataframe that contains only 3, 4, and 5-year-olds
test_scores_345 = test_scores.loc[(test_scores.age_test >= 36) & (test_scores.age_test < 72), ]
test_scores_345.domain2.value_counts()
Out[102]:
Receptive Vocabulary     4671
Expressive Vocabulary    4551
Expressive Language      4280
Receptive Language       4264
Articulation             3901
Name: domain2, dtype: int64
In [124]:
# Attempt to make black and white
bp = test_scores_345.boxplot(column='score', by='domain2', grid=False, sym='')
plt.title('Test scores for 3, 4, and 5-year-olds by functional outcome')
plt.suptitle("")
plt.xlabel(''); plt.ylabel('Standard score');
plt.xticks([1, 2, 3, 4, 5], ['Articulation', 'Expressive\nLanguage', 'Expressive\nVocabulary',
                            'Receptive\nLanguage', 'Receptive\nVocabulary'])
# Articulation
y = test_scores_345.score[test_scores_345.domain2=='Articulation'].dropna()
x = np.random.normal(1, 0.08, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.02)

# Expressive Language
y = test_scores_345.score[test_scores_345.domain2=='Expressive Language'].dropna()
x = np.random.normal(2, 0.08, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.02) 

# Expressive Vocabulary
y = test_scores_345.score[test_scores_345.domain2=='Expressive Vocabulary'].dropna()
x = np.random.normal(3, 0.08, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.02)  

# Receptive Language
y = test_scores_345.score[test_scores_345.domain2=='Receptive Language'].dropna()
x = np.random.normal(4, 0.08, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.02) 

# Receptive Vocabulary
y = test_scores_345.score[test_scores_345.domain2=='Receptive Vocabulary'].dropna()
x = np.random.normal(5, 0.08, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.02) 

plt.axhline(y=85)

# for components in bp.keys():
#    for line in bp[components]:
#        line.set_color('black')     # black lines
Out[124]:
<matplotlib.lines.Line2D at 0x114e37e10>

Histograms

  • Blue = three year olds
  • Green = four year olds
  • Red = five year olds

Need to make in black and white.
Need to make legend

Expressive Language

In [104]:
sns.distplot(np.array(receptive[receptive.ageGroup ==3].score), kde=False, hist=True, norm_hist=False);
In [105]:
fig = sns.distplot(np.array(receptive[receptive.ageGroup ==3].score), kde=True, hist=False)
plt.yticks(fig.get_yticks(), fig.get_yticks() * 6735)
plt.ylabel('Counts', fontsize=16)
/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[105]:
<matplotlib.text.Text at 0x11901ce10>
In [106]:
sns.kdeplot(np.array(receptive[receptive.ageGroup ==3].score), bw=width, label = "3-year-olds")
plt.ylabel('Density')
plt.yticks([0, 0.01, 0.02]);
# plt.yticks(fig.get_yticks(), fig.get_yticks() * 100)
/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
In [107]:
def kdeplot(domain, width, score='score'):
    sns.kdeplot(np.array(domain.loc[domain.ageGroup ==3, score]), bw=width, label = "3-year-olds")
    sns.kdeplot(np.array(domain.loc[domain.ageGroup ==4, score]), bw=width, label = "4-year-olds")
    sns.kdeplot(np.array(domain.loc[domain.ageGroup ==5, score]), bw=width, label = "5-year-olds")
    plt.ylabel('Density')
In [108]:
kdeplot(expressive_lang, 4)
/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
In [109]:
expressive_lang[expressive_lang.ageGroup ==3].score.hist(alpha=0.3, label='3-year-olds')
expressive_lang[expressive_lang.ageGroup ==4].score.hist(alpha=0.3, label='4-year-olds')
expressive_lang[expressive_lang.ageGroup ==5].score.hist(alpha=0.3, label='5-year-olds')
plt.legend(loc=1)
Out[109]:
<matplotlib.legend.Legend at 0x113fa9d30>

Receptive Language

In [110]:
kdeplot(receptive_lang, 4)
/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j