In [1]:

# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

Connect to database to import data for the three test domains and demographic information:

In [2]:

from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
api_key = open("/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt").read()

lsl_dr_project = Project(api_url, api_key)

In [3]:

metadata = lsl_dr_project.export_metadata()

In [4]:

# for i,j in zip(lsl_dr_project.field_names, 
#                lsl_dr_project.field_labels):
#     print('{0}: \t{1}'.format(i,j))

Import each database from REDCap:

In [5]:

articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss']
articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', df_kwargs={'index_col':None})

In [6]:

records = lsl_dr_project.export_records(fields=articulation_fields)

In [7]:

print(records[0]['study_id'])

0101-2003-0101

In [8]:

expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss']
expressive = lsl_dr_project.export_records(fields=expressive_fields, format='df', 
                                           df_kwargs={'index_col':None,
                                                      'na_values':[999, 9999]})

In [9]:

receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss']
receptive = lsl_dr_project.export_records(fields=receptive_fields, format='df', 
                                          df_kwargs={'index_col':None,
                                                     'na_values':[999, 9999]})

In [10]:

language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls',
                   'owls_lc_ss','owls_oe_ss','age_test_owls',
                   'celfp_rl_ss','celfp_el_ss','age_test_celp',
                   'celf_elss','celf_rlss','age_test_celf']
language_raw = lsl_dr_project.export_records(fields=language_fields, format='df', 
                                             df_kwargs={'index_col':None, 
                                                        'na_values':[999, 9999]})

In [11]:

demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year',
'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race',
'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as',
'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae',
'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes',
'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age',
'att_days_hr', 'att_days_sch', 'att_days_st2_417']
demographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df', 
                                            df_kwargs={'index_col':None, 
                                                       'na_values':[888, 999, 9999]})

In [12]:

demographic_raw[demographic_raw.study_id=='1147-2010-0064']

Out[12]:

	study_id	redcap_event_name	academic_year	gender	race	prim_lang	sib	mother_ed	father_ed	...	sle_fo	a_fo	fam_age	family_inv	att_days_sch	att_days_st2_417	att_days_hr	demo_ses	school_lunch	medicaid
11679	1147-2010-0064	initial_assessment_arm_1	2010-2011	0	0	0	1	3	3	...	3	6	65	0	NaN	NaN	NaN	NaN	NaN	NaN
11680	1147-2010-0064	year_1_complete_71_arm_1	2011-2012	NaN	NaN	NaN	NaN	NaN	NaN	...	3	5	77	2	NaN	NaN	NaN	NaN	NaN	NaN
11681	1147-2010-0064	year_2_complete_71_arm_1	2012-2013	NaN	NaN	NaN	NaN	NaN	NaN	...	3	5	89	2	NaN	NaN	NaN	NaN	NaN	NaN
11682	1147-2010-0064	year_3_complete_71_arm_1	2013-2014	NaN	NaN	NaN	NaN	NaN	NaN	...	4	5	101	2	NaN	NaN	NaN	NaN	NaN	NaN

4 rows × 46 columns

Attendance information¶

Several fields in the demographic data have missing values.

In [13]:

demographic_raw.head()

Out[13]:

	study_id	redcap_event_name	academic_year	gender	race	prim_lang	sib	mother_ed	father_ed	...	sle_fo	a_fo	fam_age	family_inv	att_days_sch	att_days_st2_417	att_days_hr	demo_ses	school_lunch	medicaid
0	0101-2003-0101	initial_assessment_arm_1	2002-2003	0	0	0	1	6	6	...	2	2	54	2	NaN	NaN	NaN	NaN	NaN	NaN
1	0101-2003-0101	year_1_complete_71_arm_1	2003-2004	NaN	NaN	NaN	NaN	NaN	NaN	...	4	4	80	1	NaN	NaN	NaN	NaN	NaN	NaN
2	0101-2003-0101	year_2_complete_71_arm_1	2004-2005	NaN	NaN	NaN	NaN	NaN	NaN	...	4	4	80	2	NaN	NaN	NaN	NaN	NaN	NaN
3	0101-2003-0101	year_3_complete_71_arm_1	2005-2006	NaN	NaN	NaN	NaN	NaN	NaN	...	5	5	96	3	NaN	NaN	NaN	NaN	NaN	NaN
4	0101-2003-0101	year_4_complete_71_arm_1	2006-2007	NaN	NaN	NaN	NaN	NaN	NaN	...	5	5	109	2	NaN	NaN	NaN	NaN	NaN	NaN

5 rows × 46 columns

We can fill missing values forward from previous observation (by study_id)

In [14]:

demographic = demographic_raw.sort(columns='redcap_event_name').groupby('study_id').transform(
                                    lambda recs: recs.fillna(method='ffill'))#.reset_index()
demographic["study_id"] = demographic_raw.sort(columns='redcap_event_name').study_id

Random check to make sure this worked

In [15]:

demographic[demographic.study_id=='1147-2010-0064']

Out[15]:

	redcap_event_name	academic_year	sib	mother_ed	father_ed	premature_age	...	a_fo	fam_age	family_inv	att_days_sch	att_days_st2_417	att_days_hr	demo_ses	school_lunch	medicaid	study_id
11679	initial_assessment_arm_1	2010-2011	1	3	3	8	...	6	65	0	NaN	NaN	NaN	NaN	NaN	NaN	1147-2010-0064
11680	year_1_complete_71_arm_1	2011-2012	1	3	3	8	...	5	77	2	NaN	NaN	NaN	NaN	NaN	NaN	1147-2010-0064
11681	year_2_complete_71_arm_1	2012-2013	1	3	3	8	...	5	89	2	NaN	NaN	NaN	NaN	NaN	NaN	1147-2010-0064
11682	year_3_complete_71_arm_1	2013-2014	1	3	3	8	...	5	101	2	NaN	NaN	NaN	NaN	NaN	NaN	1147-2010-0064

4 rows × 46 columns

Demographic data without missing values:

In [16]:

demographic.head()

Out[16]:

	redcap_event_name	academic_year	gender	race	sib	mother_ed	father_ed	premature_age	...	a_fo	fam_age	family_inv	att_days_sch	att_days_st2_417	att_days_hr	demo_ses	school_lunch	medicaid	study_id
8319	initial_assessment_arm_1	2012-2013	0	6	0	6	6	3	...	1	9	1	NaN	NaN	NaN	NaN	NaN	NaN	0735-2012-0008
5035	initial_assessment_arm_1	2007-2008	0	0	2	6	6	2	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0522-2008-0011
8314	initial_assessment_arm_1	2011-2012	1	0	1	4	6	2	...	1	7	0	NaN	NaN	NaN	NaN	NaN	NaN	0735-2012-0006
8310	initial_assessment_arm_1	2011-2012	1	8	2	6	6	8	...	1	2	1	NaN	NaN	NaN	NaN	NaN	NaN	0735-2012-0005
5038	initial_assessment_arm_1	2008-2009	0	0	1	2	4	9	...	3	62	4	NaN	NaN	NaN	NaN	NaN	NaN	0522-2008-0012

5 rows × 46 columns

Cleaning languge dataset¶

5 language measures:

3 versions of CELF
PLS
- pls_ac_rs: PLS: Auditory Comprehension Raw Score
- pls_ac_ss: PLS: Auditory Comprehension Standard Score
- pls_ec_rs: PLS: Expressive Communication Raw Score
- pls_ec_ss: PLS: Expressive Communication Standard Score
- pls_tl_rs: PLS: Total Language Score Standard Score Total
- pls_tl_ss: PLS: Total Language Score Standard Score
OWLS
- age_test_owls: Age at time of testing (OWLS)
- owls_lc_rs: OWLS: Listening Comprehension Raw Score
- owls_lc_ss: OWLS: Listening Comprehension Standard Score
- owls_oe_rs: OWLS: Oral Expression Raw Score
- owls_oe_ss: OWLS: Oral Expression Standard Score
- owls_oc_sss: OWLS: Oral Composite Sum of Listening Comprehension and Oral Expression Standard Scores
- owls_oc_ss: OWLS: Oral Composite Standard Score
- owls_wes_trs: OWLS: Written Expression Scale Total Raw Score
- owls_wes_as: OWLS: Written Expression Scale Ability Score
- owls_wes_ss: OWLS: Written Expression Scale Standard Score
- owsl_lc: OWLS: Written Expression Scale Language Composite (Sum of written expression age-based standard score, listening comprehension standard score and oral expression standard score)
- owls_lcss: OWLS: Language Composite Standard Score

In [17]:

# Test type
language_raw["test_name"] = None
language_raw["test_type"] = None
language_raw["score"] = None
CELP = language_raw.age_test_celp.notnull()
CELF = language_raw.age_test_celf.notnull()
PLS = language_raw.age_test_pls.notnull()
OWLS = language_raw.age_test_owls.notnull()

language_raw['age_test'] = None
language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp
language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf
language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls
language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls

language1 = language_raw[CELP | CELF | PLS | OWLS].copy()
language2 = language1.copy()

language1["test_type"] = "receptive"

language1.loc[CELP, "test_name"] = "CELF-P2"
language1.loc[CELF, "test_name"] = "CELF-4"
language1.loc[PLS, "test_name"] = "PLS"
language1.loc[OWLS, "test_name"] = "OWLS"

language1.loc[CELP, "score"] = language1.celfp_rl_ss
language1.loc[CELF, "score"] = language1.celf_rlss
language1.loc[PLS, "score"] = language1.pls_ac_ss
language1.loc[OWLS, "score"] = language1.owls_lc_ss


language2["test_type"] = "expressive"

language2.loc[CELP, "test_name"] = "CELF-P2"
language2.loc[CELF, "test_name"] = "CELF-4"
language2.loc[PLS, "test_name"] = "PLS"
language2.loc[OWLS, "test_name"] = "OWLS"

language2.loc[CELP, "score"] = language1.celfp_el_ss
language2.loc[CELF, "score"] = language1.celf_elss
language2.loc[PLS, "score"] = language1.pls_ec_ss
language2.loc[OWLS, "score"] = language1.owls_oe_ss

language = pd.concat([language1, language2])
language = language[language.score.notnull()]
print(pd.crosstab(language.test_name, language.test_type))
print("There are {0} null values for score".format(sum(language["score"].isnull())))

test_type  expressive  receptive
test_name                       
CELF-4            539        489
CELF-P2          1170       1176
OWLS              871        877
PLS              2887       2896
There are 0 null values for score

A school variable was added, which is the first four columns of the study_id:

In [18]:

language["school"] = language.study_id.str.slice(0,4)

In [19]:

language = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test"]]
language["domain"] = "Language"
language.head()

Out[19]:

	study_id	redcap_event_name	score	test_type	test_name	school	age_test	domain
0	0101-2003-0101	initial_assessment_arm_1	51	receptive	PLS	0101	54	Language
5	0101-2003-0101	year_5_complete_71_arm_1	61	receptive	OWLS	0101	113	Language
9	0101-2003-0102	initial_assessment_arm_1	55	receptive	PLS	0101	44	Language
10	0101-2003-0102	year_1_complete_71_arm_1	77	receptive	PLS	0101	54	Language
11	0101-2003-0102	year_2_complete_71_arm_1	93	receptive	CELF-P2	0101	68	Language

Cleaning articulation dataset¶

We converted the articulation dataset into a "long" format:

In [20]:

# Test type
articulation["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
articulation = articulation[ARIZ | GF]
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"

print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))

# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]

Goldman                 4254
Arizonia                 490
Arizonia and Goldman      49
dtype: int64
There are 0 null values for test_type

A school variable was added, which is the first four columns of the study_id:

In [21]:

articulation["school"] = articulation.study_id.str.slice(0,4)

The age was taken to be the Arizonia age if there are both test types:

In [22]:

articulation["age_test"] = articulation.age_test_aaps
articulation.loc[articulation.age_test.isnull(), 'age_test'] = articulation.age_test_gf2[articulation.age_test.isnull()]
print(articulation.age_test.describe())

count    4790.000000
mean       69.175365
std        31.206700
min        23.000000
25%        47.000000
50%        60.000000
75%        81.000000
max       243.000000
Name: age_test, dtype: float64

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [23]:

articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1)
articulation["domain"] = "Articulation"
articulation.head()

Out[23]:

	study_id	redcap_event_name	test_type	score	school	age_test	domain
1	0101-2003-0101	year_1_complete_71_arm_1	Goldman	78	0101	80	Articulation
9	0101-2003-0102	initial_assessment_arm_1	Goldman	72	0101	44	Articulation
10	0101-2003-0102	year_1_complete_71_arm_1	Goldman	97	0101	54	Articulation
14	0101-2004-0101	year_2_complete_71_arm_1	Goldman	75	0101	53	Articulation
15	0101-2004-0101	year_3_complete_71_arm_1	Goldman	80	0101	66	Articulation

Cleaning demographic dataset¶

We excluded unwanted columns and rows for which age, gender or race were missing:

In [24]:

# Retain only subset of columns
#demographic = demographic[demographic.gender.notnull()]
demographic = demographic.rename(columns={'gender':'male'})

Due to sample size considerations, we reduced the non-English primary language variable to English (0) and non-English (1):

In [25]:

demographic["non_english"] = None
demographic.loc[demographic.prim_lang.notnull(), 'non_english'] = demographic.prim_lang[demographic.prim_lang.notnull()]>0
print(demographic.non_english.value_counts())
print("There are {0} null values for non_english".format(sum(demographic.non_english.isnull())))

False    9677
True     2089
dtype: int64
There are 714 null values for non_english

Mother's education (mother_ed) and father's education (father_ed) were both recoded to:

0=no high school diploma
1=high school
2=undergraduate
3=graduate

Category 6 (unknown) was recoded as missing.

In [26]:

demographic = demographic.rename(columns={"mother_ed":"_mother_ed"})
demographic["mother_ed"] = demographic._mother_ed.copy()
demographic.loc[demographic._mother_ed==1, 'mother_ed'] = 0
demographic.loc[(demographic._mother_ed==2) | (demographic.mother_ed==3), 'mother_ed'] = 1
demographic.loc[demographic._mother_ed==4, 'mother_ed'] = 2
demographic.loc[demographic._mother_ed==5, 'mother_ed'] = 3
demographic.loc[demographic._mother_ed==6, 'mother_ed'] = None
print("_mother_ed:")
print(demographic._mother_ed.value_counts())
print("mother_ed:")
print(demographic.mother_ed.value_counts())
print("\nThere are {0} null values for mother_ed".format(sum(demographic.mother_ed.isnull())))

_mother_ed:
6    4293
4    2485
3    1693
5    1282
2    1115
1     421
0     156
dtype: int64
mother_ed:
1    2808
2    2485
3    1282
0     577
dtype: int64

There are 5328 null values for mother_ed

Secondary diagnosis

In [27]:

demographic['secondary_diagnosis'] = demographic.etiology==0
# Suspected or unknown treated as missing
demographic.loc[demographic.etiology > 1, 'secondary_diagnosis'] = None

In [28]:

demographic.secondary_diagnosis.value_counts()

Out[28]:

0    9132
1    2121
dtype: int64

In [29]:

demographic.secondary_diagnosis.mean()

Out[29]:

0.1884830711810184

Premature status was recoded to True (premature) and False (full-term). Here, premature indicates <36 weeks.

In [30]:

demographic['premature_weeks'] = demographic.premature_age.copy()
demographic.loc[demographic.premature_age==9, 'premature_weeks'] = None
demographic.premature_weeks = abs(demographic.premature_weeks-8)*2
print("There are {0} null values for premature_weeks".format(sum(demographic.premature_weeks.isnull())))

There are 3173 null values for premature_weeks

In [31]:

demographic.premature_weeks.value_counts()

Out[31]:

0     7889
2      486
4      324
12     180
6      159
10     125
8      104
14      38
16       2
dtype: int64

Recode impant technology variables for each ear to one of four categories (None, Baha, Hearing aid, Cochlear implant):

In [32]:

tech_cats = ["None", "Baha", "Hearing aid", "Cochlear"]

demographic["tech_right"] = demographic.tech_ad.copy()
demographic.loc[demographic.tech_right==6, 'tech_right'] = 0
demographic.loc[demographic.tech_right==4, 'tech_right'] = 1
demographic.loc[demographic.tech_right==5, 'tech_right'] = 1
demographic.loc[demographic.tech_right==3, 'tech_right'] = 2
demographic.loc[demographic.tech_right==7, 'tech_right'] = 3
demographic.tech_right = np.abs(demographic.tech_right - 3)

demographic["tech_left"] = demographic.tech_as.copy()
demographic.loc[demographic.tech_left==6, 'tech_left'] = 0
demographic.loc[demographic.tech_left==4, 'tech_left'] = 1
demographic.loc[demographic.tech_left==5, 'tech_left'] = 1
demographic.loc[demographic.tech_left==3, 'tech_left'] = 2
demographic.loc[demographic.tech_left==7, 'tech_left'] = 3
demographic.tech_left = np.abs(demographic.tech_left - 3)

Substitute valid missing values for hearing loss:

In [33]:

demographic.loc[demographic.type_hl_ad==5, 'type_hl_ad'] = None
demographic.loc[demographic.type_hl_as==5, 'type_hl_ad'] = None

Create degree_hl, which is the maximum level of hearing loss in either ear:

In [34]:

demographic["degree_hl"] = np.maximum(demographic.degree_hl_ad, demographic.degree_hl_as)

Create compound indicator variable for each technology (Baha, Hearing aid, Chochlear implant):

0=none
1=one ear
2=both ears.

In [35]:

demographic["baha"] = 0
demographic.baha = demographic.baha.astype(object)
demographic.loc[(demographic.tech_right==1) | (demographic.tech_left==1), 'baha'] = 1
demographic.loc[(demographic.tech_right==1) & (demographic.tech_left==1), 'baha'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'baha'] = None
print("baha:")
print(demographic.drop_duplicates(subset='study_id').baha.value_counts())
print("There are {0} null values for baha".format(sum(demographic.baha.isnull())))

demographic["hearing_aid"] = 0
demographic.hearing_aid = demographic.hearing_aid.astype(object)
demographic.loc[(demographic.tech_right==2) | (demographic.tech_left==2), 'hearing_aid'] = 1
demographic.loc[(demographic.tech_right==2) & (demographic.tech_left==2), 'hearing_aid'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_right.isnull()), 'hearing_aid'] = None
print("\nhearing_aid:")
print(demographic.drop_duplicates(subset='study_id').hearing_aid.value_counts())
print("There are {0} null values for hearing_aid".format(sum(demographic.hearing_aid.isnull())))

demographic["cochlear"] = 0
demographic.cochlear = demographic.cochlear.astype(object)
demographic.loc[(demographic.tech_right==3) | (demographic.tech_left==3), 'cochlear'] = 1
demographic.loc[(demographic.tech_right==3) & (demographic.tech_left==3), 'cochlear'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'cochlear'] = None
print("\ncochlear:")
print(demographic.drop_duplicates(subset='study_id').cochlear.value_counts())
print("There are {0} null values for cochlear".format(sum(demographic.cochlear.isnull())))
print(len(demographic))

baha:
0    3683
1     132
2      57
dtype: int64
There are 1476 null values for baha

hearing_aid:
2    1706
0    1615
1     529
dtype: int64
There are 1516 null values for hearing_aid

cochlear:
0    2493
2     805
1     574
dtype: int64
There are 1476 null values for cochlear
12480

Identify bilateral and bimodal individuals:

In [36]:

demographic["bilateral_ci"] = demographic.cochlear==2
demographic["bilateral_ha"] = demographic.hearing_aid==2
demographic["bimodal"] = (demographic.cochlear==1) & (demographic.hearing_aid==1)

In [37]:

demographic.bilateral_ci.sum(), demographic.bilateral_ha.sum(), demographic.bimodal.sum()

Out[37]:

(2940, 4339, 1219)

In [175]:

demographic.drop_duplicates(subset='study_id')[['bilateral_ci', 
                                               'bilateral_ha',
                                               'bimodal']].sum()

Out[175]:

bilateral_ci     805
bilateral_ha    1706
bimodal          334
dtype: int64

Create variable that identifies bilateral (0), bilateral HA left (1), bilateral HA right (2)

In [38]:

demographic['tech'] = 0
demographic.loc[(demographic.bimodal) & (demographic.tech_left==2), 'tech'] = 1
demographic.loc[(demographic.bimodal) & (demographic.tech_right==2), 'tech'] = 2
print("There are {0} null values for tech".format(sum(demographic.tech.isnull())))

There are 0 null values for tech

In [39]:

demographic["implant_category"] = None
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.baha==0), 
                'implant_category'] = 0
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.baha==0), 
                'implant_category'] = 1
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.baha==1), 
                'implant_category'] = 2
demographic.loc[(demographic.cochlear==2) & (demographic.hearing_aid==0) & (demographic.baha==0), 
                'implant_category'] = 3
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==1) & (demographic.baha==0), 
                'implant_category'] = 4
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.baha==1), 
                'implant_category'] = 5
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==2) & (demographic.baha==0), 
                'implant_category'] = 6
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.baha==1), 
                'implant_category'] = 7
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.baha==2), 
                'implant_category'] = 8
demographic.implant_category.value_counts()

Out[39]:

6    4339
3    2940
4    1219
0     680
1     470
2     294
8     168
7      19
5       8
dtype: int64

Age when hearing loss diagnosed Data are entered inconsistently here, so we have to go in and replace non-numeric values.

In [40]:

demographic.onset_1.unique()

Out[40]:

array([   6. ,    0. ,    3. ,    1. ,   25. ,    9. ,   13. ,   26. ,
          nan,   15. ,    2. ,   23. ,    7. ,   11. ,   24. ,   17. ,
         36. ,   28. ,   14. ,   48. ,   12. ,   29. ,   20. ,   27. ,
         22. ,    5. ,    4. ,   60. ,   32. ,   19. ,   18. ,   52. ,
         42. ,   21. ,   16. ,   30. ,    8. ,   10. ,  140. ,   61. ,
         66. ,   44. ,   41. ,   40. ,   49. ,   86. ,   33. ,  126. ,
          1.5,   85. ,   51. ,    2.5,   67. ,   39. ,   62. ,  133. ,
         38. ,  103. ,   54. ,   35. ,   43. ,   87. ,   83. ,   76. ,
         50. ,   37. ,  116. ,   68. ,   72. ,   92. ,   34. ,   57. ,
         97. ,   71. ,   55. ,   46. ,   65. ,   78. ,   45. ,   31. ,
        107. ,   64. ,   74. ,   77. ,   88. ,   81. ,   84. ,   80. ,
         53. ,   59. ,    0.5,   56. ,   98. ,   47. ,   58. ,   75. ,
         70. ,  119. ,   63. ,  154. ,   89. ,  152. ])

In [41]:

# Don't need this anymore
# demographic['age_diag'] = demographic.onset_1.replace({'birth': 0, 'R- Birth L-16mo': 0, 'birth - 3': 0, 'at birth': 0, 'NBHS': 0, 
#                              'at Birth': 0, '1-2': 1.5, '2-3': 2.5, '0-3': 1.5}).astype(float)
demographic['age_diag'] = demographic.onset_1

Number of null values for age_diag

In [42]:

demographic.age_diag.isnull().sum()

Out[42]:

In [43]:

demographic['sex'] = demographic.male.replace({0:'Female', 1:'Male'})

In [44]:

import seaborn as sb

unique_students = demographic.dropna(subset=['sex']).groupby('study_id').first()

# ag = sb.factorplot("sex", data=unique_students, 
#               palette="PuBuGn_d", kind='count')
# ag.set_xticklabels(['Female ({})'.format((unique_students.male==0).sum()), 
#                     'Male ({})'.format((unique_students.male==1).sum())])
# ag.set_xlabels('')

Child has another diagnosed disability

In [45]:

demographic['known_synd'] = (demographic.synd_cause == 0)
# Unknown or suspected
demographic.loc[demographic.synd_cause > 1, 'known_synd'] = None

In [46]:

# If either known syndrome or secondary diagnosis
demographic['synd_or_disab'] = demographic.apply(lambda x: x['secondary_diagnosis'] or x['known_synd'], axis=1)

Missing sibling counts were properly encoded as None (missing).

In [47]:

demographic.loc[demographic.sib==4, 'sib'] = None

We reduced the number of race categories, pooling those that were neither caucasian, black, hispanic or asian to "other", due to small sample sizes for these categories. Category 7 (unknown) was recoded as missing.

In [48]:

races = ["Caucasian", "Black or African American", "Hispanic or Latino", "Asian", "Other"]
demographic = demographic.rename(columns={"race":"_race"})
demographic["race"] = demographic._race.copy()
demographic.loc[demographic.race==7, 'race'] = None
demographic.loc[demographic.race>3, 'race'] = 4
print("_race:")
print(demographic._race.value_counts())
print("race:")
print(demographic.race.value_counts())
print("There are {0} null values for race".format(sum(demographic.race.isnull())))
# Replace with recoded column

_race:
0    6523
2    2011
1    1156
3     861
6     587
8     463
7     219
4      58
5      25
dtype: int64
race:
0    6523
2    2011
1    1156
4    1133
3     861
dtype: int64
There are 796 null values for race

Recode implant technology variables

In [49]:

tech_cats = ["None", "Baha", "Hearing aid", "Cochlear", "Other"]

demographic["tech_right"] = demographic.tech_ad.copy()
demographic.loc[demographic.tech_right==6, 'tech_right'] = 0
demographic.loc[demographic.tech_right==4, 'tech_right'] = 1
demographic.loc[demographic.tech_right==5, 'tech_right'] = 1
demographic.loc[demographic.tech_right==3, 'tech_right'] = 2
demographic.loc[demographic.tech_right==7, 'tech_right'] = 3
demographic.loc[demographic.tech_right==8, 'tech_right'] = 3
demographic.loc[demographic.tech_right==9, 'tech_right'] = 4
demographic.tech_right = np.abs(demographic.tech_right - 3)

demographic["tech_left"] = demographic.tech_as.copy()
demographic.loc[demographic.tech_left==6, 'tech_left'] = 0
demographic.loc[demographic.tech_left==4, 'tech_left'] = 1
demographic.loc[demographic.tech_left==5, 'tech_left'] = 1
demographic.loc[demographic.tech_left==3, 'tech_left'] = 2
demographic.loc[demographic.tech_left==7, 'tech_left'] = 3
demographic.loc[demographic.tech_left==8, 'tech_left'] = 3
demographic.loc[demographic.tech_left==9, 'tech_left'] = 4
demographic.tech_left = np.abs(demographic.tech_left - 3)

In [50]:

# Don't need this anymore
# demographic['age_amp'] = demographic.age_amp.replace({'22 mo': 22, 'unknown': np.nan, 'none': np.nan, 
#                              'n/a unilateral loss': np.nan, 'not amplified yet': np.nan,
#                              'not amplified': np.nan, 'n/a': np.nan, '4 months-6 months': 5,
#                              '6 weeks': 0.5, '13-18': 15.5, '0-3': 1.5, '24-36': 30}).astype(float)

In [51]:

demographic['academic_year'] = demographic.academic_year.replace(
                         {'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011', 
                          '2020-2011': '2010-2011', '2012-20013': '2012-2013',
                                   '0000-0000': np.nan})

In [53]:

demographic.age_amp.hist()

Out[53]:

<matplotlib.axes._subplots.AxesSubplot at 0x1199b3438>

Cleaning expressive vocabulary dataset¶

We converted the expressive vocabulary dataset to "long" format:

In [54]:

# Test type
expressive["test_type"] = None
EOWPVT = expressive.eowpvt_ss.notnull()
EVT = expressive.evt_ss.notnull()
expressive = expressive[EOWPVT | EVT]
expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT"
expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT"
expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT"
print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull())))

expressive["score"] = expressive.eowpvt_ss
expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT]

There are 0 null values for test_type

In [55]:

expressive.test_type.value_counts()

Out[55]:

EVT               3113
EOWPVT            2305
EOWPVT and EVT     120
dtype: int64

A school variable was added, which is the first four columns of the study_id:

In [56]:

expressive["school"] = expressive.study_id.str.slice(0,4)

The age was taken to be the EOWPVT age if there are both test types:

In [57]:

expressive["age_test"] = expressive.age_test_eowpvt
expressive.loc[expressive.age_test.isnull(), 'age_test'] = expressive.age_test_evt[expressive.age_test.isnull()]

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [58]:

expressive = expressive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
expressive["domain"] = "Expressive Vocabulary"
expressive.head()

Out[58]:

	study_id	redcap_event_name	score	test_type	school	age_test	domain
0	0101-2003-0101	initial_assessment_arm_1	58	EOWPVT	0101	54	Expressive Vocabulary
2	0101-2003-0101	year_2_complete_71_arm_1	84	EOWPVT	0101	80	Expressive Vocabulary
5	0101-2003-0101	year_5_complete_71_arm_1	90	EOWPVT	0101	113	Expressive Vocabulary
14	0101-2004-0101	year_2_complete_71_arm_1	90	EOWPVT	0101	53	Expressive Vocabulary
15	0101-2004-0101	year_3_complete_71_arm_1	87	EOWPVT	0101	66	Expressive Vocabulary

Cleaning receptive vocabulary dataset¶

We converted the receptive vocabulary data table to "long" format:

In [59]:

receptive.columns

Out[59]:

Index(['study_id', 'redcap_event_name', 'age_test_ppvt', 'ppvt_ss', 'age_test_rowpvt', 'rowpvt_ss'], dtype='object')

In [60]:

# Test type
receptive["test_type"] = None
PPVT = receptive.ppvt_ss.notnull()
ROWPVT = receptive.rowpvt_ss.notnull()
receptive = receptive[PPVT | ROWPVT]
receptive.loc[PPVT & ROWPVT, "test_type"] = "PPVT and ROWPVT"
receptive.loc[PPVT & ~ROWPVT, "test_type"] = "PPVT"
receptive.loc[~PPVT & ROWPVT, "test_type"] = "ROWPVT"
print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull())))

receptive["score"] = receptive.ppvt_ss
receptive.loc[~PPVT & ROWPVT, "score"] = receptive.rowpvt_ss[~PPVT & ROWPVT]

There are 0 null values for test_type

A school variable was added, which is the first four columns of the study_id:

In [61]:

receptive["school"] = receptive.study_id.str.slice(0,4)

The age was taken to be the PPVT age if there are both test types:

In [62]:

receptive["age_test"] = receptive.age_test_ppvt
receptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()]

In [63]:

print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull())))

There are 28 null values for age_test

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [64]:

receptive = receptive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
receptive["domain"] = "Receptive Vocabulary"
receptive.head()

Out[64]:

	study_id	redcap_event_name	score	test_type	school	age_test	domain
2	0101-2003-0101	year_2_complete_71_arm_1	90	PPVT	0101	80	Receptive Vocabulary
5	0101-2003-0101	year_5_complete_71_arm_1	101	ROWPVT	0101	113	Receptive Vocabulary
9	0101-2003-0102	initial_assessment_arm_1	55	PPVT	0101	44	Receptive Vocabulary
10	0101-2003-0102	year_1_complete_71_arm_1	80	PPVT	0101	54	Receptive Vocabulary
11	0101-2003-0102	year_2_complete_71_arm_1	101	PPVT	0101	68	Receptive Vocabulary

In [65]:

receptive.study_id.unique().shape

Out[65]:

(2619,)

Merge datasets¶

The four datasets were mereged into a single table. First, we concatenate the test scores data:

In [66]:

test_scores = pd.concat([articulation, expressive, receptive, language])

Then we perform a merge between the demographic data and the test scores data:

In [184]:

lsl_dr = pd.merge(demographic, test_scores, on=["study_id", "redcap_event_name"], how='left')

In [185]:

lsl_dr.tail()

Out[185]:

	redcap_event_name	academic_year	male	sib	_mother_ed	father_ed	premature_age	...	synd_or_disab	academic_year_start	age_test	domain	school	score	test_name	test_type
31730	year_9_complete_71_arm_1	2012-2013	1	3	4	4	8	...	1	NaN	104	Articulation	0521	100	NaN	Goldman
31731	year_9_complete_71_arm_1	2012-2013	0	NaN	6	6	8	...	0	NaN	138	Articulation	0310	92	NaN	Goldman
31732	year_9_complete_71_arm_1	2012-2013	0	NaN	6	6	8	...	0	NaN	137	Receptive Vocabulary	0310	65	NaN	PPVT and ROWPVT
31733	year_9_complete_71_arm_1	2011-2012	1	3	6	6	8	...	0	NaN	160	Expressive Vocabulary	0102	92	NaN	EOWPVT
31734	year_9_complete_71_arm_1	2011-2012	1	3	6	6	8	...	0	NaN	162	Receptive Vocabulary	0102	84	NaN	ROWPVT

5 rows × 73 columns

In [186]:

lsl_dr['academic_year_start'] = lsl_dr.academic_year.apply(lambda x: str(x).strip()[:4])
lsl_dr.academic_year_start.value_counts()

Out[186]:

2013    6742
2012    6577
2011    5159
2010    4418
nan     3133
2009    2356
2014     984
2008     821
2007     531
2006     344
2005     276
2004     172
2003      90
2002      47
2001      35
1998      16
1999      15
2000      12
1997       6
1995       1
dtype: int64

In [190]:

current_year_only = True

if current_year_only:
    lsl_dr = lsl_dr[lsl_dr.academic_year_start=='2013']

In [192]:

expressive_scores = lsl_dr[(lsl_dr.domain=='Expressive Vocabulary') & (lsl_dr.score>=20)].score
expressive_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');

In [193]:

expressive_lang_scores = lsl_dr[(lsl_dr.domain=='Language') 
                               & (lsl_dr.test_type=='expressive')].score
expressive_lang_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');

Export dataset

In [194]:

if current_year_only:

    lsl_dr.to_csv('lsl_dr_current_year.csv')

else:
    lsl_dr.to_csv('lsl_dr.csv')

In [195]:

lsl_dr.shape

Out[195]:

(6742, 73)

In [196]:

lsl_dr.study_id.unique().shape

Out[196]:

(2222,)

In [197]:

demographic.study_id.unique().shape

Out[197]:

(4898,)

Convert score to floating-point number

In [198]:

lsl_dr.score = lsl_dr.score.astype(float)

Plots of Demographic Data¶

In [199]:

plot_color = "#64AAE8"

In [200]:

def plot_demo_data(series, labels=None, color=plot_color, rot=0, label_offset=20, xlim=None, 
                   ylim=None, title=None, **kwargs):
    ax = kwargs.get('ax')
    if ax is None:
        f, ax = plt.subplots()
    counts = series.value_counts().sort_index(1)
    counts.plot(kind='bar', grid=False, rot=rot, color=color, **kwargs)
    if xlim is None:
        ax.set_xlim(-0.5, len(counts)-0.5)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_ylabel('Count')
    if labels is not None:
        ax.set_xticklabels(labels)
    if title:
        ax.set_title(title)
    for i,x in enumerate(counts):
        ax.annotate('%i' % x, (i, x + label_offset))
        
#     plt.gca().tight_layout()

In [201]:

unique_students = demographic.drop_duplicates('study_id')

In [202]:

unique_students.shape

Out[202]:

(4898, 67)

In [203]:

unique_students.age.describe()

Out[203]:

count    4387.000000
mean       30.953271
std        28.380353
min         0.000000
25%         9.000000
50%        25.000000
75%        42.000000
max       298.000000
Name: age, dtype: float64

In [204]:

plot_demo_data(unique_students.male, ('Female', 'Male'), label_offset=20, ylim=(0, 2600), color=plot_color)

In [205]:

plot_demo_data(unique_students.prim_lang, 
               ('English', 'Spanish', 'Chinese', 'French', 'German', 'Tagalong', 'Other'), 
               rot=70, color=plot_color)

In [206]:

unique_students.prim_lang.count()

Out[206]:

In [207]:

plot_demo_data(unique_students.sib, ('1', '2', '3', '4+'), 
               color=plot_color)

In [208]:

unique_students.sib.count()

Out[208]:

In [209]:

amp_ages = ["Birth-3 months", "4 months - 6 months", "7 months - 9 months", "10 months- 12 months", 
"13 months - 18 months", "19 months - 24 months", "2 years 1 day - 3 years", 
"3 years 1 day - 4 years", "4 years 1 day - 5 years", "5 years 1 day - 6 years", "6 years"]

demographic.loc[demographic.age_amp==11, 'age_amp'] = None
print("There are {0} null values for age_amp".format(sum(demographic.age_amp.isnull())))

There are 4563 null values for age_amp

In [210]:

age_classes = pd.Series(pd.cut(unique_students.sort('age_amp').age_amp.dropna(), [-1,3,6,9,12,18,24,36,48,60,72,1000],
                               labels=amp_ages))

In [211]:

age_amp_counts = age_classes.value_counts()[amp_ages]
age_amp_counts.plot(kind='bar', grid=False, rot=90, color=plot_color)
plt.xlim(-0.5, len(age_amp_counts)-0.5)
plt.ylabel('Count')
for i,x in enumerate(age_amp_counts):
    plt.annotate('%i' % x, (i, x + 10))

In [212]:

age_amp_counts.sum()

Out[212]:

In [213]:

unique_students.age_amp.max()

Out[213]:

666.0

In [214]:

(unique_students.age_amp/12.).hist(bins=16, grid=False, color=plot_color)
plt.ylabel('Count')
plt.xlabel('Age at amplification')

Out[214]:

<matplotlib.text.Text at 0x119135c50>

In [215]:

plot_demo_data(unique_students.tech_left, tech_cats, rot=90, color=plot_color)

In [216]:

plot_demo_data(unique_students.tech_right, tech_cats, rot=90, color=plot_color, ylim=(0, 2500))

In [217]:

f, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.tech_right, [""]*len(tech_cats), rot=90, 
               ax=axes[0], title='Right ear', color=plot_color, ylim=(0, 2500))
plot_demo_data(unique_students.tech_left, tech_cats, rot=90, 
               ax=axes[1], title='Left ear', color=plot_color)

In [218]:

unique_students.tech_right.count()

Out[218]:

In [219]:

unique_students.tech_left.count()

Out[219]:

In [220]:

degree_hl_cats = 'Normal (0-14)', 'Slight (15-25)', 'Mild (26-40)', \
'Moderate (41-55)', 'Moderately Severe (56-70)', 'Severe (71-90)', 'Profound (90+)'
plot_demo_data(unique_students.degree_hl_ad, degree_hl_cats, rot=90, color=plot_color)

In [221]:

_, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.degree_hl_ad, [""]*7, rot=90, 
               color=plot_color, ax=axes[0], title='Right ear')
plot_demo_data(unique_students.degree_hl_as, degree_hl_cats, rot=90, 
               color=plot_color, ylim=(0,2000), ax=axes[1], title='Left ear');

In [222]:

unique_students.degree_hl_as.count()

Out[222]:

In [223]:

type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
plot_demo_data(unique_students.type_hl_ad, type_hl_cats, rot=90, color=plot_color)

In [224]:

plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, color=plot_color)

In [225]:

unique_students.type_hl_ad.count()

Out[225]:

In [226]:

unique_students.type_hl_as.count()

Out[226]:

In [227]:

type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
f, axes = plt.subplots(2,1)
plot_demo_data(unique_students.type_hl_ad, [""]*len(type_hl_cats), rot=90, 
               title='Right ear', ax=axes[0], color=plot_color)
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, 
               title='Left ear', ax=axes[1], color=plot_color)

In [228]:

demographic[demographic.study_id=='1147-2010-0064']

Out[228]:

	redcap_event_name	academic_year	sib	_mother_ed	father_ed	premature_age	...	bilateral_ha	bimodal	implant_category	age_diag	sex	academic_year_start
11679	initial_assessment_arm_1	2010-2011	1	3	3	8	...	True	False	6	51	Female	NaN
11680	year_1_complete_71_arm_1	2011-2012	1	3	3	8	...	True	False	6	51	Female	NaN
11681	year_2_complete_71_arm_1	2012-2013	1	3	3	8	...	True	False	6	51	Female	NaN
11682	year_3_complete_71_arm_1	2013-2014	1	3	3	8	...	True	False	6	51	Female	NaN

4 rows × 67 columns

In [229]:

receptive[receptive.study_id=='1147-2010-0064']

Out[229]:

	study_id	redcap_event_name	score	test_type	school	age_test	domain
11679	1147-2010-0064	initial_assessment_arm_1	96	PPVT	1147	63	Receptive Vocabulary
11680	1147-2010-0064	year_1_complete_71_arm_1	91	PPVT	1147	73	Receptive Vocabulary
11681	1147-2010-0064	year_2_complete_71_arm_1	93	PPVT	1147	85	Receptive Vocabulary

In [230]:

lsl_dr[lsl_dr.study_id=='1147-2010-0064']

Out[230]:

	redcap_event_name	academic_year	hl	male	_race	prim_lang	sib	_mother_ed	father_ed	premature_age	...	known_synd	synd_or_disab	race	academic_year_start	age_test	domain	school	score	test_name	test_type
23329	year_3_complete_71_arm_1	2013-2014	0	0	0	0	1	3	3	8	...	0	0	0	2013	NaN	NaN	NaN	NaN	NaN	NaN

1 rows × 73 columns

In [231]:

unique_students.type_hl_ad.count()

Out[231]:

In [232]:

receptive[receptive.domain=="Receptive Vocabulary"].study_id.unique().shape

Out[232]:

(2619,)

In [233]:

demographic.study_id.unique().shape

Out[233]:

(4898,)

In [234]:

receptive.study_id.unique().shape

Out[234]:

(2619,)

In [235]:

lsl_dr[lsl_dr.domain=="Receptive Vocabulary"].study_id.unique().shape

Out[235]:

(1178,)

In [236]:

receptive_ids = receptive.study_id.unique()

In [237]:

demographic_ids = demographic.study_id.unique()

In [238]:

[s for s in receptive_ids if s not in demographic_ids]

Out[238]:

[]

In [239]:

def score_summary(domain, test_type=None):
    subset = lsl_dr[lsl_dr.domain==domain].copy()
    if test_type is not None:
        subset = subset[subset.test_type==test_type]
    subset['age_test'] = (subset.age_test/12).dropna().astype(int)
    subset.loc[subset.age_test > 11, 'age_test'] = 11
    subset = subset[subset.age_test>1]
    byage = subset.groupby('age_test')
    n = byage.study_id.count()
    mean = byage.score.mean()
    sd = byage.score.std()
    min = byage.score.min()
    max = byage.score.max()
    summary = pd.DataFrame({'Sample Size':n, 'Mean':mean, 
    'SD':sd, 'Min':min, 'Max':max})
    return summary[['Sample Size','Mean','SD','Min','Max']]

In [240]:

receptive_summary = score_summary("Receptive Vocabulary")
receptive_summary

Out[240]:

	Sample Size	Mean	SD	Min	Max
age_test
2	67	97.641791	18.427336	44	144
3	247	96.113360	17.888865	47	139
4	290	92.762069	20.776166	0	140
5	201	89.457711	19.192172	0	130
6	119	89.058824	16.842387	51	137
7	70	84.885714	18.698634	43	124
8	51	82.078431	15.620299	46	114
9	40	83.550000	17.485452	53	120
10	46	76.847826	18.447484	20	109
11	138	81.920290	18.512202	29	132

In [241]:

receptive_summary.describe()

Out[241]:

	Sample Size	Mean	SD	Min	Max
count	10.000000	10.000000	10.000000	10.000000	10.000000
mean	126.900000	87.431602	18.189100	33.300000	128.900000
std	90.298825	6.727914	1.381796	20.199285	11.789355
min	40.000000	76.847826	15.620299	0.000000	109.000000
25%	55.000000	82.446324	17.586305	22.250000	121.000000
50%	94.500000	86.972269	18.437410	43.500000	131.000000
75%	185.250000	91.935980	18.652026	46.750000	138.500000
max	290.000000	97.641791	20.776166	53.000000	144.000000

In [242]:

receptive_summary['Sample Size'].sum()

Out[242]:

In [243]:

receptive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.xlim(-0.5, 9.5)

Out[243]:

(-0.5, 9.5)

In [244]:

expressive_summary = score_summary("Expressive Vocabulary")
expressive_summary

Out[244]:

	Sample Size	Mean	SD	Min	Max
age_test
2	61	99.918033	18.484494	55	134
3	244	96.434426	21.047463	42	145
4	282	95.191489	21.178084	0	139
5	203	91.073892	20.455686	0	133
6	117	90.034188	19.400467	35	129
7	75	85.346667	15.480147	52	117
8	50	85.780000	14.044085	46	115
9	40	87.575000	14.101987	55	110
10	45	84.088889	15.545859	44	110
11	135	88.422222	15.986686	52	137

In [245]:

expressive_summary['Sample Size'].sum()

Out[245]:

In [272]:

expressive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.xlim(-0.5, 9.5)
if current_year_only:
    plt.ylim(0, 400)
else:
    plt.ylim(0, 1400)

In [247]:

articulation_summary = score_summary("Articulation")
articulation_summary

Out[247]:

	Sample Size	Mean	SD	Min	Max
age_test
2	42	88.547619	17.935117	50	122
3	217	85.244240	19.399319	40	125
4	281	84.217082	22.830568	0	121
5	178	83.174157	21.460462	40	116
6	121	80.685950	22.692155	40	110
7	67	79.716418	22.760150	3	108
8	44	77.500000	19.553742	40	107
9	27	85.185185	18.193484	40	108
10	31	82.354839	19.142533	40	105
11	65	83.969231	20.182735	39	105

In [248]:

articulation_summary['Sample Size'].sum()

Out[248]:

In [249]:

sample_size = articulation_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(articulation_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);

Language scores

In [250]:

lsl_dr.domain.unique()

Out[250]:

array([nan, 'Language', 'Articulation', 'Receptive Vocabulary',
       'Expressive Vocabulary'], dtype=object)

In [251]:

lsl_dr.test_type.unique()

Out[251]:

array([nan, 'receptive', 'expressive', 'Goldman', 'ROWPVT', 'EOWPVT',
       'EVT', 'PPVT', 'Arizonia', 'PPVT and ROWPVT',
       'Arizonia and Goldman', 'EOWPVT and EVT'], dtype=object)

In [252]:

receptive_language_summary = score_summary("Language", "receptive")
receptive_language_summary

Out[252]:

	Sample Size	Mean	SD	Min	Max
age_test
2	169	89.597633	22.966673	50	136
3	234	88.807692	18.895051	50	139
4	265	86.524528	19.317139	50	145
5	170	85.494118	20.003623	47	140
6	92	80.108696	20.074836	40	121
7	48	77.479167	18.109583	47	120
8	35	74.371429	19.395399	40	115
9	10	67.600000	18.337575	40	104
10	10	77.800000	16.771669	41	99
11	27	77.740741	18.289536	40	107

In [253]:

receptive_language_summary['Sample Size'].sum()

Out[253]:

In [254]:

sample_size = receptive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_language_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);

In [255]:

expressive_language_summary = score_summary("Language", "expressive")
expressive_language_summary

Out[255]:

	Sample Size	Mean	SD	Min	Max
age_test
2	168	91.125000	18.812401	50	150
3	234	84.739316	16.472776	53	139
4	265	81.535849	18.828966	48	136
5	173	80.508671	19.248261	48	137
6	103	76.689320	21.347741	40	140
7	58	71.120690	20.664469	40	114
8	37	66.783784	19.894353	40	116
9	10	62.700000	22.410563	40	106
10	10	80.800000	20.595577	40	107
11	26	73.769231	20.465205	40	112

In [256]:

expressive_language_summary['Sample Size'].sum()

Out[256]:

In [257]:

sample_size = expressive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_language_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);

In [258]:

(unique_students.age/12.).hist(grid=False, bins=np.sqrt(unique_students.shape[0]))
plt.ylabel('Count')
plt.xlabel('Age at enrollment')

Out[258]:

<matplotlib.text.Text at 0x11bac7518>

In [259]:

(unique_students.age/12.).describe()

Out[259]:

count    4387.000000
mean        2.579439
std         2.365029
min         0.000000
25%         0.750000
50%         2.083333
75%         3.500000
max        24.833333
Name: age, dtype: float64

In [260]:

def calc_difference(x, col='a_fo', jitter=True):
    if (len(x)<2 or x[col].isnull().sum() or x['funct_out_age'].isnull().sum()):
        return None
    diff = x[col][x.funct_out_age.argmax()] - x[col][x.funct_out_age.argmin()]
    if jitter:
        diff += np.random.normal(scale=0.05)
    if (x.funct_out_age.max() - x.funct_out_age.min()) > 1000:
        print(x['funct_out_age'])
    return({'difference':diff, 'months': x.funct_out_age.max() - x.funct_out_age.min()})

In [261]:

audition = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference).dropna().values.tolist())

In [262]:

plt.figure(figsize=(10,6))
plt.scatter(audition.months, audition.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Audition')

Out[262]:

<matplotlib.text.Text at 0x11bb42128>

In [263]:

slc = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='slc_fo').dropna().values.tolist())

In [264]:

plt.figure(figsize=(10,6))
plt.scatter(slc.months, slc.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (comprehension)')

Out[264]:

<matplotlib.text.Text at 0x11e02e5f8>

In [265]:

sle = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='sle_fo').dropna().values.tolist())

In [266]:

plt.figure(figsize=(10,6))
plt.scatter(sle.months, sle.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (expression)')

Out[266]:

<matplotlib.text.Text at 0x11917fef0>

In [267]:

lsl_dr.degree_hl.dropna().value_counts()

Out[267]:

6    2949
3     814
4     812
5     708
2     378
0     176
1      53
dtype: int64

In [268]:

ax = lsl_dr.degree_hl.hist(bins=7)

In [269]:

diff = (lsl_dr['age'] - lsl_dr['age_int'])
diff[diff>0].hist(bins=50)

Out[269]:

<matplotlib.axes._subplots.AxesSubplot at 0x11bf02f28>

In [270]:

(lsl_dr.age_int<6).mean()

Out[270]:

0.22619400771284484

In [271]:

(lsl_dr.age<6).mean()

Out[271]:

0.14224265796499555