# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
Connect to database to import data for the three test domains and demographic information:
# Connect to the LSL-DR REDCap project via the PyCap API.
from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
# NOTE(review): the API token is read from a user-specific local path; this
# cell fails for anyone else running the notebook — consider an env var.
api_key = open("/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt").read()
lsl_dr_project = Project(api_url, api_key)
# Project field metadata (names, labels, types) for reference.
metadata = lsl_dr_project.export_metadata()
# for i,j in zip(lsl_dr_project.field_names,
# lsl_dr_project.field_labels):
# print('{0}: \t{1}'.format(i,j))
Import each database from REDCap:
# Articulation scores: Arizonia (aaps) and Goldman-Fristoe (gf2) standard
# scores plus age at testing. 999/9999 are sentinel codes for missing.
articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss']
articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', df_kwargs={'index_col':None,
'na_values':[999, 9999]})
# Raw (list-of-dicts) export, used here only to peek at the first record.
records = lsl_dr_project.export_records(fields=articulation_fields)
print(records[0]['study_id'])
0101-2002-0101
# Expressive vocabulary: EOWPVT and EVT standard scores + ages at testing.
expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss']
expressive = lsl_dr_project.export_records(fields=expressive_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[999, 9999]})
# Receptive vocabulary: PPVT and ROWPVT standard scores + ages at testing.
receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss']
receptive = lsl_dr_project.export_records(fields=receptive_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[999, 9999]})
# Language scores across four instruments (PLS, OWLS, CELF-P2, CELF-4),
# including the CELF-P2 subtest standard scores used later for the
# language_subtest table.
language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls',
'owls_lc_ss','owls_oe_ss','age_test_owls',
'celfp_rl_ss','celfp_el_ss','age_test_celp',
'celf_elss','celf_rlss','age_test_celf',
'celfp_ss_ss', 'celfp_ws_ss', 'celfp_ev_ss', 'celfp_fd_ss',
'celfp_rs_ss', 'celfp_bc_ss', 'celfp_wcr_ss', 'celfp_wce_ss',
'celfp_wct_ss']
language_raw = lsl_dr_project.export_records(fields=language_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[999, 9999]})
# Demographics and enrollment/outcome covariates. Note the extra 888
# sentinel treated as missing here (in addition to 999/9999).
demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year', 'academic_year_rv',
'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race',
'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as',
'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae',
'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes',
'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age',
'att_days_hr', 'att_days_sch', 'att_days_st2_417']
demographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[888, 999, 9999]})
# Quick sanity check on the academic-year distribution.
demographic_raw.academic_year_rv.value_counts()
2013.0 2501 2012.0 2429 2014.0 2170 2011.0 1901 2010.0 1609 2009.0 1021 2015.0 931 2008.0 436 2007.0 277 2006.0 189 2005.0 138 2004.0 89 2003.0 65 2002.0 36 2001.0 24 2000.0 12 1999.0 12 1998.0 9 15.0 3 1997.0 2 1995.0 1 Name: academic_year_rv, dtype: int64
demographic_raw[demographic_raw.study_id=='1147-2010-0064']
study_id | redcap_event_name | academic_year | academic_year_rv | hl | gender | race | prim_lang | sib | mother_ed | ... | sle_fo | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
14665 | 1147-2010-0064 | initial_assessment_arm_1 | 2010-2011 | 2010.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | ... | 3.0 | 6.0 | 65.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
14666 | 1147-2010-0064 | year_1_complete_71_arm_1 | 2011-2012 | 2011.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 3.0 | 5.0 | 77.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN |
14667 | 1147-2010-0064 | year_2_complete_71_arm_1 | 2012-2013 | 2012.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 3.0 | 5.0 | 89.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN |
14668 | 1147-2010-0064 | year_3_complete_71_arm_1 | 2013-2014 | 2013.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 4.0 | 5.0 | 101.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN |
4 rows × 47 columns
Several fields in the demographic data have missing values.
demographic_raw.head()
study_id | redcap_event_name | academic_year | academic_year_rv | hl | gender | race | prim_lang | sib | mother_ed | ... | sle_fo | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0101-2002-0101 | initial_assessment_arm_1 | 2002-2003 | 2002.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 6.0 | ... | 2.0 | 2.0 | 54.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 0101-2002-0101 | year_1_complete_71_arm_1 | 2003-2004 | 2003.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 4.0 | 4.0 | 80.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 0101-2002-0101 | year_2_complete_71_arm_1 | 2004-2005 | 2004.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 4.0 | 4.0 | 80.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 0101-2002-0101 | year_3_complete_71_arm_1 | 2005-2006 | 2005.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 5.0 | 5.0 | 96.0 | 3.0 | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 0101-2002-0101 | year_4_complete_71_arm_1 | 2006-2007 | 2006.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 5.0 | 5.0 | 109.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 47 columns
We can fill missing values forward from previous observation (by study_id
)
# Forward-fill missing demographic values within each student (study_id),
# ordered by the longitudinal event name so later visits inherit values
# recorded at earlier visits.
# FIX: DataFrame.sort(columns=...) was deprecated (see the FutureWarning
# captured below) and later removed; sort_values(by=...) is the supported
# equivalent with identical behavior.
demographic = demographic_raw.sort_values(by='redcap_event_name').groupby('study_id').transform(
    lambda recs: recs.fillna(method='ffill'))#.reset_index()
# transform() drops the grouping column, so re-attach study_id in the same
# (sorted) row order.
demographic["study_id"] = demographic_raw.sort_values(by='redcap_event_name').study_id
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....) if __name__ == '__main__': /Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:3: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....) app.launch_new_instance()
Random check to make sure this worked
demographic[demographic.study_id=='1147-2010-0064']
redcap_event_name | academic_year | academic_year_rv | hl | gender | race | prim_lang | sib | mother_ed | father_ed | ... | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | study_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
14665 | initial_assessment_arm_1 | 2010-2011 | 2010.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 6.0 | 65.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
14666 | year_1_complete_71_arm_1 | 2011-2012 | 2011.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 5.0 | 77.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
14667 | year_2_complete_71_arm_1 | 2012-2013 | 2012.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 5.0 | 89.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
14668 | year_3_complete_71_arm_1 | 2013-2014 | 2013.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 5.0 | 101.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
4 rows × 47 columns
Demographic data without missing values:
demographic.head()
redcap_event_name | academic_year | academic_year_rv | hl | gender | race | prim_lang | sib | mother_ed | father_ed | ... | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | study_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | initial_assessment_arm_1 | 2002-2003 | 2002.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 6.0 | 6.0 | ... | 2.0 | 54.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0101-2002-0101 |
8001 | initial_assessment_arm_1 | 2009-2010 | 2009.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 5.0 | 3.0 | ... | 5.0 | 138.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0628-2005-1814 |
7995 | initial_assessment_arm_1 | 2009-2010 | 2009.0 | 0.0 | 0.0 | 6.0 | 0.0 | 0.0 | 4.0 | 3.0 | ... | 4.0 | 78.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0628-2005-1756 |
7990 | initial_assessment_arm_1 | 2009-2010 | 2009.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 3.0 | 4.0 | ... | 4.0 | 77.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0628-2005-1744 |
7987 | initial_assessment_arm_1 | 2009-2010 | 2009.0 | 0.0 | 1.0 | 1.0 | 0.0 | 2.0 | 6.0 | 6.0 | ... | 4.0 | 118.0 | 4.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0628-2005-1741 |
5 rows × 47 columns
5 language measures:
# Test type
# Each row may carry scores from up to four language instruments; build
# boolean masks from the non-null age-at-test columns.
language_raw["test_name"] = None
language_raw["test_type"] = None
language_raw["score"] = None
CELP = language_raw.age_test_celp.notnull()  # CELF-Preschool-2 given
CELF = language_raw.age_test_celf.notnull()  # CELF-4 given
PLS = language_raw.age_test_pls.notnull()    # PLS given
OWLS = language_raw.age_test_owls.notnull()  # OWLS given
language_raw['age_test'] = None
# Unified age-at-test, taken from whichever instrument was administered.
language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp
language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf
language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls
language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls
# Split each testing record into a receptive row (language1) and an
# expressive row (language2), then stack them into long format.
language1 = language_raw[CELP | CELF | PLS | OWLS].copy()
language2 = language1.copy()

language1["test_type"] = "receptive"
language1.loc[CELP, "test_name"] = "CELF-P2"
language1.loc[CELF, "test_name"] = "CELF-4"
language1.loc[PLS, "test_name"] = "PLS"
language1.loc[OWLS, "test_name"] = "OWLS"
# Receptive standard score, per instrument.
language1.loc[CELP, "score"] = language1.celfp_rl_ss
language1.loc[CELF, "score"] = language1.celf_rlss
language1.loc[PLS, "score"] = language1.pls_ac_ss
language1.loc[OWLS, "score"] = language1.owls_lc_ss

language2["test_type"] = "expressive"
language2.loc[CELP, "test_name"] = "CELF-P2"
language2.loc[CELF, "test_name"] = "CELF-4"
language2.loc[PLS, "test_name"] = "PLS"
language2.loc[OWLS, "test_name"] = "OWLS"
# Expressive standard score, per instrument.
# FIX: read these from language2 itself; the original pulled them from
# language1. The values are identical (language2 is a copy and these raw
# columns are never modified in between), but referencing the frame being
# assigned removes a latent cross-frame aliasing hazard.
language2.loc[CELP, "score"] = language2.celfp_el_ss
language2.loc[CELF, "score"] = language2.celf_elss
language2.loc[PLS, "score"] = language2.pls_ec_ss
language2.loc[OWLS, "score"] = language2.owls_oe_ss

# Stack receptive + expressive rows and drop rows with no usable score.
language = pd.concat([language1, language2])
language = language[language.score.notnull()]
print(pd.crosstab(language.test_name, language.test_type))
print("There are {0} null values for score".format(sum(language["score"].isnull())))
test_type expressive receptive test_name CELF-4 627 545 CELF-P2 1511 1516 OWLS 1093 1099 PLS 3572 3584 There are 0 null values for score
A school
variable was added, which is the first four columns of the study_id
:
language["school"] = language.study_id.str.slice(0,4)
language_subtest = language[["study_id", "redcap_event_name", "score", "test_type",
"test_name", "school", "age_test",
'celfp_ss_ss', 'celfp_ws_ss',
'celfp_ev_ss', 'celfp_fd_ss',
'celfp_rs_ss', 'celfp_bc_ss',
'celfp_wcr_ss', 'celfp_wce_ss',
'celfp_wct_ss']]
language = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test"]]
language["domain"] = "Language"
language.head()
study_id | redcap_event_name | score | test_type | test_name | school | age_test | domain | |
---|---|---|---|---|---|---|---|---|
0 | 0101-2002-0101 | initial_assessment_arm_1 | 51 | receptive | PLS | 0101 | 54 | Language |
5 | 0101-2002-0101 | year_5_complete_71_arm_1 | 61 | receptive | OWLS | 0101 | 113 | Language |
9 | 0101-2003-0102 | initial_assessment_arm_1 | 55 | receptive | PLS | 0101 | 44 | Language |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | 77 | receptive | PLS | 0101 | 54 | Language |
11 | 0101-2003-0102 | year_2_complete_71_arm_1 | 93 | receptive | CELF-P2 | 0101 | 68 | Language |
We converted the articulation dataset into a "long" format:
# Test type
# Masks are built on the full frame BEFORE filtering; the .loc calls below
# rely on pandas aligning these boolean Series by index.
articulation["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()  # Arizonia score present
GF = articulation.gf2_ss.notnull()     # Goldman-Fristoe score present
articulation = articulation[ARIZ | GF]
# NOTE(review): "Arizonia" (sic) is kept as-is — these strings are data
# labels consumed downstream; renaming would change the dataset.
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"
print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))
# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]
Goldman 5437 Arizonia 503 Arizonia and Goldman 73 Name: test_type, dtype: int64 There are 0 null values for test_type
A school
variable was added, which is the first four columns of the study_id
:
articulation["school"] = articulation.study_id.str.slice(0,4)
The age was taken to be the Arizonia age if there are both test types:
articulation["age_test"] = articulation.age_test_aaps
articulation.loc[articulation.age_test.isnull(), 'age_test'] = articulation.age_test_gf2[articulation.age_test.isnull()]
print(articulation.age_test.describe())
count 6011.000000 mean 68.857095 std 30.613506 min 23.000000 25% 47.000000 50% 60.000000 75% 81.000000 max 243.000000 Name: age_test, dtype: float64
Finally, we dropped unwanted columns and added a domain identification column for merging:
# Drop the instrument-specific columns and tag rows for the merged dataset.
articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1)
articulation["domain"] = "Articulation"
articulation.head()
study_id | redcap_event_name | test_type | score | school | age_test | domain | |
---|---|---|---|---|---|---|---|
1 | 0101-2002-0101 | year_1_complete_71_arm_1 | Goldman | 78.0 | 0101 | 80.0 | Articulation |
9 | 0101-2003-0102 | initial_assessment_arm_1 | Goldman | 72.0 | 0101 | 44.0 | Articulation |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | Goldman | 97.0 | 0101 | 54.0 | Articulation |
14 | 0101-2004-0101 | year_2_complete_71_arm_1 | Goldman | 75.0 | 0101 | 53.0 | Articulation |
15 | 0101-2004-0101 | year_3_complete_71_arm_1 | Goldman | 80.0 | 0101 | 66.0 | Articulation |
We excluded unwanted columns and rows for which age, gender or race were missing:
# Retain only subset of columns
#demographic = demographic[demographic.gender.notnull()]
demographic = demographic.rename(columns={'gender':'male'})
Due to sample size considerations, we reduced the non-English primary language variable to English (0) and non-English (1):
demographic["non_english"] = None
demographic.loc[demographic.prim_lang.notnull(), 'non_english'] = demographic.prim_lang[demographic.prim_lang.notnull()]>0
print(demographic.non_english.value_counts())
print("There are {0} null values for non_english".format(sum(demographic.non_english.isnull())))
False 11986 True 2688 Name: non_english, dtype: int64 There are 622 null values for non_english
Mother's education (mother_ed
) and father's education (father_ed
) were both recoded to:
Category 6 (unknown) was recoded as missing.
# Recode mother's education onto a coarser scale, preserving the raw
# values in `_mother_ed`:
#   raw 0 -> 0 (kept by the initial copy)    raw 1 -> 0
#   raw 2, 3 -> 1    raw 4 -> 2    raw 5 -> 3
#   raw 6 (unknown) -> missing
demographic = demographic.rename(columns={"mother_ed":"_mother_ed"})
demographic["mother_ed"] = demographic._mother_ed.copy()
demographic.loc[demographic._mother_ed==1, 'mother_ed'] = 0
# FIX: compare against the raw `_mother_ed` column on BOTH sides of the
# disjunction; the original tested the partially-recoded `mother_ed` for
# the ==3 case, which only produced the right answer because 3 had not
# been remapped yet at that point in the sequence.
demographic.loc[(demographic._mother_ed==2) | (demographic._mother_ed==3), 'mother_ed'] = 1
demographic.loc[demographic._mother_ed==4, 'mother_ed'] = 2
demographic.loc[demographic._mother_ed==5, 'mother_ed'] = 3
demographic.loc[demographic._mother_ed==6, 'mother_ed'] = None
print("_mother_ed:")
print(demographic._mother_ed.value_counts())
print("mother_ed:")
print(demographic.mother_ed.value_counts())
print("\nThere are {0} null values for mother_ed".format(sum(demographic.mother_ed.isnull())))
_mother_ed: 6.0 5340 4.0 3140 3.0 2127 5.0 1696 2.0 1489 1.0 498 0.0 222 Name: _mother_ed, dtype: int64 mother_ed: 1.0 3616 2.0 3140 3.0 1696 0.0 720 Name: mother_ed, dtype: int64 There are 6124 null values for mother_ed
Secondary diagnosis
demographic.shape
(15296, 49)
# Secondary-diagnosis flag: boolean etiology==0; etiology codes above 1
# are treated as missing.
demographic['secondary_diagnosis'] = demographic.etiology==0
# Suspected or unknown treated as missing
demographic.loc[demographic.etiology > 1, 'secondary_diagnosis'] = None
demographic.secondary_diagnosis.value_counts()
0.0 11224 1.0 2526 Name: secondary_diagnosis, dtype: int64
demographic.secondary_diagnosis.mean()
0.18370909090909091
Premature status was recoded to True (premature) and False (full-term). Here, premature indicates <36 weeks.
# Convert the premature_age code to weeks premature via abs(code-8)*2,
# so code 8 -> 0 weeks and each step away from 8 adds two weeks; code 9
# is treated as missing.
# NOTE(review): mapping inferred from the arithmetic only — confirm the
# code meanings against the REDCap codebook.
demographic['premature_weeks'] = demographic.premature_age.copy()
demographic.loc[demographic.premature_age==9, 'premature_weeks'] = None
demographic.premature_weeks = abs(demographic.premature_weeks-8)*2
print("There are {0} null values for premature_weeks".format(sum(demographic.premature_weeks.isnull())))
There are 3394 null values for premature_weeks
demographic.premature_weeks.value_counts()
0.0 10190 2.0 609 4.0 386 12.0 202 6.0 186 10.0 160 8.0 124 14.0 42 16.0 3 Name: premature_weeks, dtype: int64
Recode implant technology variables for each ear to one of four categories (None, Baha, Hearing aid, Cochlear implant):
demographic.tech_ad.value_counts()
1.0 5221 0.0 4497 7.0 1588 5.0 1056 2.0 529 6.0 433 8.0 76 9.0 70 4.0 31 3.0 26 10.0 4 Name: tech_ad, dtype: int64
# Recode right/left device technology (tech_ad / tech_as) into the
# categories indexed by tech_cats: 0=None, 1=OAD, 2=Hearing aid,
# 3=Cochlear, 4=Other (the default for unmapped codes).
# NOTE(review): tech_right/tech_left are re-derived with a different
# scheme later in the notebook, which overwrites these columns.
tech_cats = ["None", "OAD", "Hearing aid", "Cochlear", "Other"]
demographic["tech_right"] = 4
demographic.loc[demographic.tech_ad==7, 'tech_right'] = 0
demographic.loc[demographic.tech_ad==3, 'tech_right'] = 1
demographic.loc[demographic.tech_ad.isin([1,2,4,5,10]), 'tech_right'] = 2
demographic.loc[demographic.tech_ad.isin([0,8,6]), 'tech_right'] = 3
demographic.loc[demographic.tech_ad.isnull(), 'tech_right'] = None
demographic["tech_left"] = 4
demographic.loc[demographic.tech_as==7, 'tech_left'] = 0
demographic.loc[demographic.tech_as==3, 'tech_left'] = 1
demographic.loc[demographic.tech_as.isin([1,2,4,5,10]), 'tech_left'] = 2
demographic.loc[demographic.tech_as.isin([0,8,6]), 'tech_left'] = 3
demographic.loc[demographic.tech_as.isnull(), 'tech_left'] = None
demographic.tech_left.value_counts()
2.0 6919 3.0 4579 0.0 1925 4.0 61 1.0 18 Name: tech_left, dtype: int64
demographic.tech_right.value_counts()
2.0 6841 3.0 5006 0.0 1588 4.0 70 1.0 26 Name: tech_right, dtype: int64
Substitute valid missing values for hearing loss:
# Treat code 5 as a true missing value for hearing-loss type in each ear.
demographic.loc[demographic.type_hl_ad==5, 'type_hl_ad'] = None
# FIX: the original assigned to 'type_hl_ad' here as well, clobbering
# right-ear values wherever the LEFT-ear code was 5 and never cleaning
# the left-ear column; the target must be 'type_hl_as'.
demographic.loc[demographic.type_hl_as==5, 'type_hl_as'] = None
Create degree_hl
, which is the maximum level of hearing loss in either ear:
demographic["degree_hl"] = np.maximum(demographic.degree_hl_ad, demographic.degree_hl_as)
Create compound indicator variable for each technology (Baha, Hearing aid, Cochlear implant):
demographic.columns
Index(['redcap_event_name', 'academic_year', 'academic_year_rv', 'hl', 'male', 'race', 'prim_lang', 'sib', '_mother_ed', 'father_ed', 'premature_age', 'onset_1', 'age_amp', 'age_int', 'age', 'synd_cause', 'etiology', 'etiology_2', 'hearing_changes', 'ae', 'ad_250', 'ad_500', 'degree_hl_ad', 'type_hl_ad', 'tech_ad', 'age_ci', 'as_250', 'as_500', 'degree_hl_as', 'type_hl_as', 'tech_as', 'age_ci_2', 'time', 'age_disenrolled', 'funct_out_age', 'slc_fo', 'sle_fo', 'a_fo', 'fam_age', 'family_inv', 'att_days_sch', 'att_days_st2_417', 'att_days_hr', 'demo_ses', 'school_lunch', 'medicaid', 'study_id', 'non_english', 'mother_ed', 'secondary_diagnosis', 'premature_weeks', 'tech_right', 'tech_left', 'degree_hl'], dtype='object')
demographic["oad"] = 0
demographic.oad = demographic.oad.astype(object)
demographic.loc[(demographic.tech_right==1) | (demographic.tech_left==1), 'oad'] = 1
demographic.loc[(demographic.tech_right==1) & (demographic.tech_left==1), 'oad'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'oad'] = None
print("oad:")
print(demographic.drop_duplicates(subset='study_id').oad.value_counts())
print("There are {0} null values for OAD".format(sum(demographic.oad.isnull())))
demographic["hearing_aid"] = 0
demographic.hearing_aid = demographic.hearing_aid.astype(object)
demographic.loc[(demographic.tech_right==2) | (demographic.tech_left==2), 'hearing_aid'] = 1
demographic.loc[(demographic.tech_right==2) & (demographic.tech_left==2), 'hearing_aid'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_right.isnull()), 'hearing_aid'] = None
print("\nhearing_aid:")
print(demographic.drop_duplicates(subset='study_id').hearing_aid.value_counts())
print("There are {0} null values for hearing_aid".format(sum(demographic.hearing_aid.isnull())))
demographic["cochlear"] = 0
demographic.cochlear = demographic.cochlear.astype(object)
demographic.loc[(demographic.tech_right==3) | (demographic.tech_left==3), 'cochlear'] = 1
demographic.loc[(demographic.tech_right==3) & (demographic.tech_left==3), 'cochlear'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'cochlear'] = None
print("\ncochlear:")
print(demographic.drop_duplicates(subset='study_id').cochlear.value_counts())
print("There are {0} null values for cochlear".format(sum(demographic.cochlear.isnull())))
print(len(demographic))
oad: 0 4770 1 4 Name: oad, dtype: int64 There are 1711 null values for OAD hearing_aid: 2 2249 0 1669 1 824 Name: hearing_aid, dtype: int64 There are 1765 null values for hearing_aid cochlear: 0 3203 2 935 1 636 Name: cochlear, dtype: int64 There are 1711 null values for cochlear 15296
Identify bilateral and bimodal individuals:
demographic["unilateral_ci"] = demographic.cochlear==1
demographic["bilateral_ci"] = demographic.cochlear==2
demographic["bilateral_ha"] = demographic.hearing_aid==2
demographic["bimodal"] = (demographic.cochlear==1) & (demographic.hearing_aid==1)
demographic.bilateral_ci.sum(), demographic.bilateral_ha.sum(), demographic.bimodal.sum(), demographic.unilateral_ci.sum()
(3718, 5632, 1437, 2149)
demographic.drop_duplicates(subset='study_id')[['unilateral_ci','bilateral_ci',
'bilateral_ha',
'bimodal']].sum()
unilateral_ci 636 bilateral_ci 935 bilateral_ha 2249 bimodal 384 dtype: int64
Create a variable that identifies the default configuration, including bilateral CI (0), bimodal with the hearing aid on the left (1), and bimodal with the hearing aid on the right (2)
# tech: 0 = default (everyone else, including bilateral CI),
#       1 = bimodal with the hearing aid on the left,
#       2 = bimodal with the hearing aid on the right.
demographic['tech'] = 0
demographic.loc[(demographic.bimodal) & (demographic.tech_left==2), 'tech'] = 1
demographic.loc[(demographic.bimodal) & (demographic.tech_right==2), 'tech'] = 2
print("There are {0} null values for tech".format(sum(demographic.tech.isnull())))
There are 0 null values for tech
demographic["implant_category"] = None
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==0),
'implant_category'] = 0
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==0),
'implant_category'] = 1
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==1),
'implant_category'] = 2
demographic.loc[(demographic.cochlear==2) & (demographic.hearing_aid==0) & (demographic.oad==0),
'implant_category'] = 3
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==1) & (demographic.oad==0),
'implant_category'] = 4
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==1),
'implant_category'] = 5
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==2) & (demographic.oad==0),
'implant_category'] = 6
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==1),
'implant_category'] = 7
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==2),
'implant_category'] = 8
demographic.implant_category.value_counts()
6 5632 3 3718 4 1437 1 1034 0 692 8 13 2 12 7 5 5 1 Name: implant_category, dtype: int64
Age when hearing loss was diagnosed. Data are entered inconsistently here, so we have to go in and replace non-numeric values.
demographic.onset_1.unique()
array([ 15. , 80. , 14. , 62. , 2. , 49. , 19. , 9. , 18. , 4. , 0. , 10. , 12. , 1. , 31. , 16. , 26. , 61. , 46. , 24. , 36. , 21. , 52. , 30. , 7. , 51. , 8. , 3. , 6. , 17. , 50. , 23. , 42. , 37. , 33. , 60. , 13. , nan, 22. , 28. , 82. , 34. , 35. , 38. , 95. , 5. , 59. , 25. , 48. , 1.5, 41. , 53. , 88. , 29. , 27. , 39. , 65. , 64. , 47. , 79. , 97. , 96. , 107. , 77. , 74. , 11. , 84. , 20. , 45. , 32. , 81. , 55. , 58. , 70. , 154. , 54. , 57. , 72. , 43. , 83. , 78. , 116. , 40. , 44. , 119. , 63. , 66. , 140. , 56. , 87. , 76. , 68. , 92. , 86. , 126. , 85. , 133. , 103. , 67. , 71. , 2.5, 98. , 75. , 0.5, 89. , 152. ])
# Don't need this anymore
# demographic['age_diag'] = demographic.onset_1.replace({'birth': 0, 'R- Birth L-16mo': 0, 'birth - 3': 0, 'at birth': 0, 'NBHS': 0,
# 'at Birth': 0, '1-2': 1.5, '2-3': 2.5, '0-3': 1.5}).astype(float)
# onset_1 is already numeric in this export (see the unique() check above),
# so it is used directly as the age at diagnosis.
demographic['age_diag'] = demographic.onset_1
Number of null values for age_diag
demographic.age_diag.isnull().sum()
3864
# Human-readable sex label derived from the male indicator (0/1).
demographic['sex'] = demographic.male.replace({0:'Female', 1:'Male'})
import seaborn as sb
# One row per student: first visit with a non-missing sex.
unique_students = demographic.dropna(subset=['sex']).groupby('study_id').first()
# ag = sb.factorplot("sex", data=unique_students,
# palette="PuBuGn_d", kind='count')
# ag.set_xticklabels(['Female ({})'.format((unique_students.male==0).sum()),
# 'Male ({})'.format((unique_students.male==1).sum())])
# ag.set_xlabels('')
unique_students.shape
(5522, 64)
Child has another diagnosed disability
# known_synd: True when synd_cause == 0; codes above 1 (unknown/suspected)
# are treated as missing.
demographic['known_synd'] = (demographic.synd_cause == 0)
# Unknown or suspected
demographic.loc[demographic.synd_cause > 1, 'known_synd'] = None
# If either known syndrome or secondary diagnosis
# NOTE(review): Python's `or` returns the first operand if it is truthy,
# otherwise the second — so a missing (None) first operand falls through
# to the second rather than propagating missingness symmetrically.
demographic['synd_or_disab'] = demographic.apply(lambda x: x['secondary_diagnosis'] or x['known_synd'], axis=1)
Missing sibling counts were properly encoded as None
(missing).
# Sibling-count code 4 denotes missing; recode to None.
demographic.loc[demographic.sib==4, 'sib'] = None
We reduced the number of race categories, pooling those that were neither caucasian, black, hispanic or asian to "other", due to small sample sizes for these categories. Category 7 (unknown) was recoded as missing.
# Pool race codes: keep 0-3 (Caucasian, Black, Hispanic, Asian), map all
# remaining codes above 3 to 4 ("Other"), and recode 7 (unknown) as
# missing. Raw values are preserved in _race. Order matters: the ==7
# assignment must precede the >3 pooling, or 7 would become "Other".
races = ["Caucasian", "Black or African American", "Hispanic or Latino", "Asian", "Other"]
demographic = demographic.rename(columns={"race":"_race"})
demographic["race"] = demographic._race.copy()
demographic.loc[demographic.race==7, 'race'] = None
demographic.loc[demographic.race>3, 'race'] = 4
print("_race:")
print(demographic._race.value_counts())
print("race:")
print(demographic.race.value_counts())
print("There are {0} null values for race".format(sum(demographic.race.isnull())))
# Replace with recoded column
_race: 0.0 7955 2.0 2649 1.0 1407 3.0 1074 6.0 751 8.0 542 7.0 241 4.0 66 5.0 37 Name: _race, dtype: int64 race: 0.0 7955 2.0 2649 1.0 1407 4.0 1396 3.0 1074 Name: race, dtype: int64 There are 815 null values for race
Recode implant technology variables
# Second recode of the technology columns — this OVERWRITES the earlier
# tech_right/tech_left derivation. Raw codes are first collapsed, then
# mirrored with abs(x - 3), mapping 0->3, 1->2, 2->1, 3->0, 4->1.
# NOTE(review): raw code 10 is not remapped here, so abs(10-3)=7 falls
# outside the 0-4 range of tech_cats (the earlier recode mapped 10 to
# "Hearing aid"); and the intermediate value 4 ends up at 1 ("Baha").
# Confirm the intended mapping against the REDCap codebook.
tech_cats = ["None", "Baha", "Hearing aid", "Cochlear", "Other"]
demographic["tech_right"] = demographic.tech_ad.copy()
demographic.loc[demographic.tech_right==6, 'tech_right'] = 0
demographic.loc[demographic.tech_right==4, 'tech_right'] = 1
demographic.loc[demographic.tech_right==5, 'tech_right'] = 1
demographic.loc[demographic.tech_right==3, 'tech_right'] = 2
demographic.loc[demographic.tech_right==7, 'tech_right'] = 3
demographic.loc[demographic.tech_right==8, 'tech_right'] = 3
demographic.loc[demographic.tech_right==9, 'tech_right'] = 4
demographic.tech_right = np.abs(demographic.tech_right - 3)
demographic["tech_left"] = demographic.tech_as.copy()
demographic.loc[demographic.tech_left==6, 'tech_left'] = 0
demographic.loc[demographic.tech_left==4, 'tech_left'] = 1
demographic.loc[demographic.tech_left==5, 'tech_left'] = 1
demographic.loc[demographic.tech_left==3, 'tech_left'] = 2
demographic.loc[demographic.tech_left==7, 'tech_left'] = 3
demographic.loc[demographic.tech_left==8, 'tech_left'] = 3
demographic.loc[demographic.tech_left==9, 'tech_left'] = 4
demographic.tech_left = np.abs(demographic.tech_left - 3)
# Don't need this anymore
# demographic['age_amp'] = demographic.age_amp.replace({'22 mo': 22, 'unknown': np.nan, 'none': np.nan,
# 'n/a unilateral loss': np.nan, 'not amplified yet': np.nan,
# 'not amplified': np.nan, 'n/a': np.nan, '4 months-6 months': 5,
# '6 weeks': 0.5, '13-18': 15.5, '0-3': 1.5, '24-36': 30}).astype(float)
# Preview of the academic_year cleanup: fix known typos via replace() and
# normalize '*' separators to '-'.
# NOTE(review): Series.str.replace() has historically treated the pattern
# as a regex, and a bare '*' is not a valid regex — this relies on
# version-specific behavior; verify (or pass regex=False / escape the
# pattern) when upgrading pandas.
demographic.academic_year.replace(
{'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011',
'2020-2011': '2010-2011', '2012-20013': '2012-2013',
'642014-2015': '2014-2015', '20114-2015': '2014-2015',
'2011-012': '2011-2012',
'0000-0000': np.nan}).str.replace('*', '-').unique()
array(['2002-2003', '2009-2010', '2011-2012', '2009-2011', '2006-2007', '2007-2008', '2008-2009', '2014-2015', '2013-2014', '2012-2013', nan, '2015-2016', '2010-2011', '2014', '2005-2006', '2004-2005', '2003-2004', '2010-2011 2010-2011', '2011', '2010', '2009', '2012', '2013', '1995-1996', '1999-2000', '2000-2001', '1998-1999', '1997-1998', '2001-2002', '2014-15', '2015-2015', '2015', '2041-2015', '2015-2106', '22014-2015', '2014-1015'], dtype=object)
# Apply the academic_year cleanup for real, with additional typo fixes
# discovered in the preview above ('2014-2105', '2015-2015', ...).
# NOTE(review): same str.replace('*', '-') regex caveat as the preview cell.
demographic['academic_year'] = demographic.academic_year.replace(
{'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011',
'2020-2011': '2010-2011', '2012-20013': '2012-2013',
'642014-2015': '2014-2015', '20114-2015': '2014-2015',
'2011-012': '2011-2012', '2014-2105': '2014-2015', '2005-2004': '2004-2005',
'2014-205': '2014-2015', '2017-2015': '2014-2015', '2014-1015': '2014-2015',
'2015-2015': '2014-2015', '2009-2011': '2009-2010',
'0000-0000': np.nan}).str.replace('*', '-')
Removed entries that don't contain dashes
# Blank out values without a dash (e.g. bare years like '2014'), then
# strip all internal whitespace from the remaining year ranges.
demographic.loc[~(demographic.academic_year.notnull() & demographic.academic_year.str.contains('-')),
'academic_year'] = np.nan
demographic.loc[demographic.academic_year.notnull(), 'academic_year'] = demographic.academic_year[demographic.academic_year.notnull()].apply(lambda x: ''.join(x.split()))
demographic.age_amp.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x11a63eeb8>
We converted the expressive vocabulary dataset to "long" format:
# Test type
# Long-format conversion for expressive vocabulary (EOWPVT / EVT).
# Masks are built before filtering; .loc aligns them by index.
expressive["test_type"] = None
EOWPVT = expressive.eowpvt_ss.notnull()
EVT = expressive.evt_ss.notnull()
expressive = expressive[EOWPVT | EVT]
expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT"
expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT"
expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT"
print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull())))
# Score: EOWPVT preferred when both tests are present.
expressive["score"] = expressive.eowpvt_ss
expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT]
There are 0 null values for test_type
expressive.test_type.value_counts()
EVT 3881 EOWPVT 2784 EOWPVT and EVT 149 Name: test_type, dtype: int64
A school
variable was added, which is the first four columns of the study_id
:
expressive["school"] = expressive.study_id.str.slice(0,4)
The age was taken to be the EOWPVT age if there are both test types:
expressive["age_test"] = expressive.age_test_eowpvt
expressive.loc[expressive.age_test.isnull(), 'age_test'] = expressive.age_test_evt[expressive.age_test.isnull()]
Finally, we dropped unwanted columns and added a domain identification column for merging:
# Keep only the merge columns and tag rows for the merged dataset.
expressive = expressive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
expressive["domain"] = "Expressive Vocabulary"
expressive.head()
study_id | redcap_event_name | score | test_type | school | age_test | domain | |
---|---|---|---|---|---|---|---|
0 | 0101-2002-0101 | initial_assessment_arm_1 | 58.0 | EOWPVT | 0101 | 54.0 | Expressive Vocabulary |
2 | 0101-2002-0101 | year_2_complete_71_arm_1 | 84.0 | EOWPVT | 0101 | 80.0 | Expressive Vocabulary |
5 | 0101-2002-0101 | year_5_complete_71_arm_1 | 90.0 | EOWPVT | 0101 | 113.0 | Expressive Vocabulary |
14 | 0101-2004-0101 | year_2_complete_71_arm_1 | 90.0 | EOWPVT | 0101 | 53.0 | Expressive Vocabulary |
15 | 0101-2004-0101 | year_3_complete_71_arm_1 | 87.0 | EOWPVT | 0101 | 66.0 | Expressive Vocabulary |
We converted the receptive vocabulary data table to "long" format:
receptive.columns
Index(['study_id', 'redcap_event_name', 'age_test_ppvt', 'ppvt_ss', 'age_test_rowpvt', 'rowpvt_ss'], dtype='object')
# Test type
# Long-format conversion for receptive vocabulary (PPVT / ROWPVT).
# Masks are built before filtering; .loc aligns them by index.
receptive["test_type"] = None
PPVT = receptive.ppvt_ss.notnull()
ROWPVT = receptive.rowpvt_ss.notnull()
receptive = receptive[PPVT | ROWPVT]
receptive.loc[PPVT & ROWPVT, "test_type"] = "PPVT and ROWPVT"
receptive.loc[PPVT & ~ROWPVT, "test_type"] = "PPVT"
receptive.loc[~PPVT & ROWPVT, "test_type"] = "ROWPVT"
print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull())))
# Score: PPVT preferred when both tests are present.
receptive["score"] = receptive.ppvt_ss
receptive.loc[~PPVT & ROWPVT, "score"] = receptive.rowpvt_ss[~PPVT & ROWPVT]
There are 0 null values for test_type
A `school` variable was added, which is the first four characters of the `study_id`:
receptive["school"] = receptive.study_id.str.slice(0,4)
The age was taken to be the PPVT age if there are both test types:
receptive["age_test"] = receptive.age_test_ppvt
receptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()]
print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull())))
There are 23 null values for age_test
Finally, we dropped unwanted columns and added a domain identification column for merging:
receptive = receptive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
receptive["domain"] = "Receptive Vocabulary"
receptive.head()
study_id | redcap_event_name | score | test_type | school | age_test | domain | |
---|---|---|---|---|---|---|---|
2 | 0101-2002-0101 | year_2_complete_71_arm_1 | 90.0 | PPVT | 0101 | 80.0 | Receptive Vocabulary |
5 | 0101-2002-0101 | year_5_complete_71_arm_1 | 101.0 | ROWPVT | 0101 | 113.0 | Receptive Vocabulary |
9 | 0101-2003-0102 | initial_assessment_arm_1 | 55.0 | PPVT | 0101 | 44.0 | Receptive Vocabulary |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | 80.0 | PPVT | 0101 | 54.0 | Receptive Vocabulary |
11 | 0101-2003-0102 | year_2_complete_71_arm_1 | 101.0 | PPVT | 0101 | 68.0 | Receptive Vocabulary |
receptive.study_id.unique().shape
(3108,)
The four datasets were merged into a single table. First, we concatenate the test scores data:
test_scores = pd.concat([articulation, expressive, receptive, language])
Then we perform a merge between the demographic data and the test scores data:
lsl_dr = pd.merge(demographic, test_scores, on=["study_id", "redcap_event_name"], how='left')
lsl_dr.tail()
redcap_event_name | academic_year | academic_year_rv | hl | male | _race | prim_lang | sib | _mother_ed | father_ed | ... | sex | known_synd | synd_or_disab | race | age_test | domain | school | score | test_name | test_type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
39154 | year_9_complete_71_arm_1 | 2011-2012 | 2011.0 | 0.0 | 1.0 | 0.0 | 0.0 | 3.0 | 6.0 | 6.0 | ... | Male | 0.0 | 0.0 | 0.0 | 162 | Receptive Vocabulary | 0102 | 84 | NaN | ROWPVT |
39155 | year_9_complete_71_arm_1 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | 0.0 | 0.0 | NaN | 203 | Expressive Vocabulary | 1147 | 95 | NaN | EVT |
39156 | year_9_complete_71_arm_1 | NaN | NaN | 0.0 | 0.0 | 2.0 | 0.0 | 1.0 | 6.0 | 6.0 | ... | Female | 0.0 | 0.0 | 2.0 | 119 | Articulation | 0624 | 102 | NaN | Goldman |
39157 | year_9_complete_71_arm_1 | NaN | NaN | 0.0 | 0.0 | 2.0 | 0.0 | 1.0 | 6.0 | 6.0 | ... | Female | 0.0 | 0.0 | 2.0 | 119 | Expressive Vocabulary | 0624 | 96 | NaN | EVT |
39158 | year_9_complete_71_arm_1 | NaN | NaN | 0.0 | 0.0 | 2.0 | 0.0 | 1.0 | 6.0 | 6.0 | ... | Female | 0.0 | 0.0 | 2.0 | 119 | Receptive Vocabulary | 0624 | 82 | NaN | PPVT |
5 rows × 74 columns
lsl_dr['academic_year_start'] = lsl_dr.academic_year.apply(lambda x: str(x).strip()[:4])
lsl_dr.academic_year_start.value_counts()
2013 6952 2012 6641 2014 6144 2011 5256 2010 4457 nan 3164 2009 2502 2015 1646 2008 827 2007 536 2006 345 2005 286 2004 172 2003 90 2002 47 2001 37 1998 16 1999 16 2000 12 1997 6 2201 5 2041 1 1995 1 Name: academic_year_start, dtype: int64
current_year_only = False
if current_year_only:
lsl_dr = lsl_dr[lsl_dr.academic_year_start=='2013']
expressive_scores = lsl_dr[(lsl_dr.domain=='Expressive Vocabulary') & (lsl_dr.score>=20)].score
expressive_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');
expressive_lang_scores = lsl_dr[(lsl_dr.domain=='Language')
& (lsl_dr.test_type=='expressive')].score
expressive_lang_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');
Export dataset
if current_year_only:
lsl_dr.to_csv('lsl_dr_current_year.csv')
else:
lsl_dr.to_csv('lsl_dr.csv')
lsl_dr.shape
(39159, 79)
lsl_dr.study_id.unique().shape
(5898,)
demographic.study_id.unique().shape
(5898,)
Convert score to floating-point number
lsl_dr.score = lsl_dr.score.astype(float)
lsl_dr['tech_class'] = 'Bimodal'
lsl_dr.loc[lsl_dr.bilateral_ci==True, 'tech_class'] = 'Bilateral CI'
lsl_dr.loc[lsl_dr.bilateral_ha==True, 'tech_class'] = 'Bilateral HA'
lsl_dr['age_year'] = np.floor(lsl_dr.age/12.)
lsl_dr.domain.dropna().unique()
array(['Expressive Vocabulary', 'Language', 'Articulation', 'Receptive Vocabulary'], dtype=object)
lsl_dr.groupby('tech_class').prim_lang.mean().round(2)
tech_class Bilateral CI 0.45 Bilateral HA 0.58 Bimodal 0.50 Name: prim_lang, dtype: float64
lsl_dr['non_profound'] = lsl_dr.degree_hl<6
lsl_dr.groupby('tech_class').non_profound.mean().round(2)
tech_class Bilateral CI 0.08 Bilateral HA 0.87 Bimodal 0.31 Name: non_profound, dtype: float64
lsl_dr['age_test_year'] = -999
lsl_dr.loc[lsl_dr.age_test.notnull(), 'age_test_year'] = (lsl_dr.age_test/12).dropna().astype(int)
lsl_dr.loc[lsl_dr.age_test_year==-999, 'age_test_year'] = np.nan
f, axes = plt.subplots(2, 2, figsize=(14,10))
for ax, dom in zip(np.ravel(axes), lsl_dr.domain.dropna().unique()):
plot_data = lsl_dr[lsl_dr.domain==dom].pivot_table(index='age_year', columns='tech_class', values='score', aggfunc='mean')
plot_data[(plot_data.index>1) & (plot_data.index<7)].plot(ax=ax)
ax.set_ylim(40, 120)
ax.set_xticks(range(2,7))
ax.set_title(dom)
ppvt_only = lsl_dr[lsl_dr.test_type=='PPVT']
ppvt_only.age_year.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x11ad41be0>
ppvt_345 = ppvt_only[ppvt_only.age_test_year.isin([3,4,5])]
ppvt_345.score.describe()
count 2576.000000 mean 92.463509 std 20.127618 min 20.000000 25% 79.000000 50% 94.000000 75% 107.000000 max 153.000000 Name: score, dtype: float64
ppvt_345.groupby('age_test_year').agg({'score':[min, max, np.median, np.count_nonzero]})
score | ||||
---|---|---|---|---|
min | max | median | count_nonzero | |
age_test_year | ||||
3.0 | 36.0 | 153.0 | 95.0 | 873.0 |
4.0 | 20.0 | 149.0 | 94.0 | 936.0 |
5.0 | 20.0 | 142.0 | 91.0 | 767.0 |
lsl_dr.test_type.value_counts()
expressive 6803 receptive 6744 Goldman 5437 PPVT 4445 EVT 3881 EOWPVT 2784 ROWPVT 2346 Arizonia 503 PPVT and ROWPVT 199 EOWPVT and EVT 149 Arizonia and Goldman 73 Name: test_type, dtype: int64
evt_only = lsl_dr[lsl_dr.test_type=='EVT']
evt_only.age_test_year.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x11ad78470>
evt_345 = evt_only[evt_only.age_test_year.isin([3,4,5])]
evt_345.groupby('age_test_year').agg({'score':[min, max, np.median, np.count_nonzero]})
score | ||||
---|---|---|---|---|
min | max | median | count_nonzero | |
age_test_year | ||||
3.0 | 19.0 | 147.0 | 100.0 | 767.0 |
4.0 | 20.0 | 146.0 | 99.0 | 813.0 |
5.0 | 20.0 | 150.0 | 97.0 | 644.0 |
pls_only = (language[(language.test_name=='PLS')]
.convert_objects(convert_numeric=True))
pls_only['age_year'] = np.floor(pls_only.age_test/12).astype(int)
pls_345 = pls_only[pls_only.age_year.isin([3,4,5])]
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: convert_objects is deprecated. Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric. from ipykernel import kernelapp as app
(pls_345.assign(normal_limits=pls_345.score>=85).groupby(['age_year', 'test_type'])
.agg({'score':[min, max, np.median, len],
'normal_limits': np.mean}))
score | normal_limits | |||||
---|---|---|---|---|---|---|
min | max | median | len | mean | ||
age_year | test_type | |||||
3 | expressive | 50.0 | 145.0 | 78.0 | 813.0 | 0.355474 |
receptive | 50.0 | 140.0 | 80.0 | 813.0 | 0.404674 | |
4 | expressive | 50.0 | 141.0 | 73.0 | 602.0 | 0.284053 |
receptive | 50.0 | 136.0 | 77.0 | 606.0 | 0.381188 | |
5 | expressive | 50.0 | 138.0 | 68.0 | 304.0 | 0.259868 |
receptive | 50.0 | 129.0 | 73.0 | 306.0 | 0.290850 |
celf_only = (language_subtest[(language_subtest.test_name=='CELF-P2')]
.convert_objects(convert_numeric=True))
celf_only['age_year'] = np.floor(celf_only.age_test/12).astype(int)
celf_46 = celf_only[celf_only.age_year.isin([4,6])]
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: convert_objects is deprecated. Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric. from ipykernel import kernelapp as app
subtests = ['celfp_ss_ss', 'celfp_ws_ss',
'celfp_ev_ss', 'celfp_fd_ss',
'celfp_rs_ss', 'celfp_bc_ss',
'celfp_wcr_ss', 'celfp_wce_ss',
'celfp_wct_ss']
(celf_46.groupby('age_year')
.agg({st:np.median for st in subtests})).T
age_year | 4 | 6 |
---|---|---|
celfp_wct_ss | 10.0 | 8.0 |
celfp_ev_ss | 8.0 | 5.0 |
celfp_wcr_ss | 10.0 | 10.0 |
celfp_wce_ss | 9.0 | 7.0 |
celfp_ss_ss | 8.0 | 5.0 |
celfp_ws_ss | 6.0 | 4.0 |
celfp_fd_ss | 8.0 | 4.0 |
celfp_rs_ss | 7.0 | 4.0 |
celfp_bc_ss | 9.0 | 4.5 |
def calc_norm_range(dataset):
    """Fraction of students whose mean score is within normal limits (>= 85)."""
    student_means = dataset.groupby('study_id').score.mean()
    return (student_means >= 85).mean()
Mean score of each domain
calc_norm_range(lsl_dr[(lsl_dr.domain=='Language')
& (lsl_dr.test_type=='expressive')
& (lsl_dr.age_test_year.isin([3,4,5]))])
0.40083217753120665
for year in range(2010, 2014):
value = calc_norm_range(lsl_dr[(lsl_dr.domain=='Language')
& (lsl_dr.test_type=='receptive') & (lsl_dr.academic_year_rv==year)
& (lsl_dr.age_test_year.isin([3,4,5]))]).round(2)
print('{}: {}'.format(year, value))
2010: 0.53 2011: 0.48 2012: 0.5 2013: 0.55
calc_norm_range(lsl_dr[(lsl_dr.domain=='Receptive Vocabulary')
& (lsl_dr.age_test_year.isin([3,4,5]))])
0.63506493506493511
calc_norm_range(lsl_dr[(lsl_dr.domain=='Expressive Vocabulary')
& (lsl_dr.age_test_year.isin([3,4,5]))])
0.64257555847568992
calc_norm_range(lsl_dr[(lsl_dr.domain=='Articulation')
& (lsl_dr.age_test_year.isin([3,4,5]))])
0.49158249158249157
Summary statistics
(lsl_dr.groupby('study_id').male.first().dropna()==0).mean()
0.46830858384643242
(lsl_dr.groupby('study_id').race.first().dropna()==0).mean()
0.54349040789718761
(lsl_dr.groupby('study_id').non_english.first().dropna()==False).sum()
4404
lsl_dr.groupby('study_id').sib.first().dropna().count()
5040
lsl_dr.groupby('study_id').onset_1.first().dropna().count()
4161
lsl_dr.groupby('study_id').age_amp.first().dropna().median()
8.0
lsl_dr.groupby('study_id').age_int.first().dropna().median()
9.0
lsl_dr.groupby('study_id').age.first().dropna().count()
5404
_unique = lsl_dr.dropna(subset=['age_disenrolled', 'age']).groupby('study_id').first()
(_unique.age_disenrolled - _unique.age).count()
1868
synd_cause = lsl_dr.groupby('study_id').synd_cause.first().dropna()
synd_cause = synd_cause[synd_cause<3]
synd_cause.value_counts()/synd_cause.value_counts().sum()
1.0 0.885766 0.0 0.091387 2.0 0.022847 Name: synd_cause, dtype: float64
etiology = lsl_dr.groupby('study_id').etiology.first().dropna()
etiology = etiology[etiology<3]
etiology.value_counts()/etiology.value_counts().sum()
1.0 0.791393 0.0 0.163977 2.0 0.044630 Name: etiology, dtype: float64
lsl_dr['concerns'] = lsl_dr.etiology_2.replace({0:'none', 4:'none', 1:'mild', 2:'moderate', 3:'severe'})
lsl_dr.groupby('study_id').concerns.last().dropna().value_counts()
none 3328 moderate 546 mild 436 severe 344 Name: concerns, dtype: int64
plot_color = "#64AAE8"

def plot_demo_data(series, labels=None, color=plot_color, rot=0, label_offset=20, xlim=None,
                   ylim=None, title=None, **kwargs):
    """Draw a labelled bar chart of the value counts of *series*.

    A new figure/axes pair is created unless an ``ax`` keyword is passed
    (it is forwarded to pandas' plotting via **kwargs). Each bar is
    annotated with its count, placed *label_offset* units above the bar.
    """
    ax = kwargs.get('ax')
    if ax is None:
        _, ax = plt.subplots()
    counts = series.value_counts().sort_index()
    counts.plot(kind='bar', grid=False, rot=rot, color=color, **kwargs)
    if xlim is None:
        # Tighten the x-axis around the plotted bars
        ax.set_xlim(-0.5, len(counts) - 0.5)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_ylabel('Count')
    if labels is not None:
        ax.set_xticklabels(labels)
    if title:
        ax.set_title(title)
    # Annotate each bar with its count
    for bar_index, bar_count in enumerate(counts):
        ax.annotate('%i' % bar_count, (bar_index, bar_count + label_offset))
unique_students = demographic.drop_duplicates('study_id')
unique_students.shape
(5898, 68)
unique_students.age.describe()
count 5381.000000 mean 29.302360 std 27.507899 min 0.000000 25% 8.000000 50% 24.000000 75% 40.000000 max 298.000000 Name: age, dtype: float64
plot_demo_data(unique_students.male,
('Female', 'Male'), label_offset=20, color=plot_color)
plot_demo_data(unique_students.prim_lang,
('English', 'Spanish', 'Chinese', 'French', 'German', 'Tagalong', 'Other'),
rot=70, color=plot_color)
unique_students.prim_lang.count()
5419
plot_demo_data(unique_students.sib, ('1', '2', '3', '4+'),
color=plot_color)
unique_students.sib.count()
5013
amp_ages = ["Birth-3 months", "4 months - 6 months", "7 months - 9 months", "10 months- 12 months",
"13 months - 18 months", "19 months - 24 months", "2 years 1 day - 3 years",
"3 years 1 day - 4 years", "4 years 1 day - 5 years", "5 years 1 day - 6 years", "6 years"]
demographic.loc[demographic.age_amp==11, 'age_amp'] = None
print("There are {0} null values for age_amp".format(sum(demographic.age_amp.isnull())))
There are 4806 null values for age_amp
age_classes = pd.Series(pd.cut(unique_students.sort('age_amp').age_amp.dropna(), [-1,3,6,9,12,18,24,36,48,60,72,1000],
labels=amp_ages))
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....) if __name__ == '__main__':
age_amp_counts = age_classes.value_counts()[amp_ages]
age_amp_counts.plot(kind='bar', grid=False, rot=90, color=plot_color)
plt.xlim(-0.5, len(age_amp_counts)-0.5)
plt.ylabel('Count')
plt.ylim(0,1000)
for i,x in enumerate(age_amp_counts):
plt.annotate('%i' % x, (i, x + 10))
age_amp_counts.sum()
3806
unique_students.age_amp.max()
173.0
(unique_students.age_amp/12.).hist(bins=16, grid=False, color=plot_color)
plt.ylabel('Count')
plt.xlabel('Age at amplification')
<matplotlib.text.Text at 0x11d496198>
plot_demo_data(unique_students.tech_left, tech_cats, rot=90, color=plot_color, ylim=(0, 3000))
plot_demo_data(unique_students.tech_right, tech_cats, rot=90, color=plot_color, ylim=(0, 2500))
f, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.tech_right, [""]*len(tech_cats), rot=90,
ax=axes[0], title='Right ear', color=plot_color)
plot_demo_data(unique_students.tech_left, tech_cats, rot=90,
ax=axes[1], title='Left ear', color=plot_color)
unique_students.tech_right.count()
4742
unique_students.tech_left.count()
4734
degree_hl_cats = 'Normal (0-14)', 'Slight (15-25)', 'Mild (26-40)', \
'Moderate (41-55)', 'Moderately Severe (56-70)', 'Severe (71-90)', 'Profound (90+)'
plot_demo_data(unique_students.degree_hl_ad, degree_hl_cats, rot=90, color=plot_color)
_, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.degree_hl_ad, [""]*7, rot=90,
color=plot_color, ax=axes[0], title='Right ear')
plot_demo_data(unique_students.degree_hl_as, degree_hl_cats, rot=90,
color=plot_color, ylim=(0,2000), ax=axes[1], title='Left ear');
unique_students.degree_hl_as.count()
4642
type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
plot_demo_data(unique_students.type_hl_ad, type_hl_cats, rot=90, color=plot_color)
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, color=plot_color)
unique_students.type_hl_ad.count()
4563
unique_students.type_hl_as.count()
4660
type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
f, axes = plt.subplots(2,1)
plot_demo_data(unique_students.type_hl_ad, [""]*len(type_hl_cats), rot=90,
title='Right ear', ax=axes[0], color=plot_color)
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90,
title='Left ear', ax=axes[1], color=plot_color)
demographic[demographic.study_id=='1147-2010-0064']
redcap_event_name | academic_year | academic_year_rv | hl | male | _race | prim_lang | sib | _mother_ed | father_ed | ... | bilateral_ci | bilateral_ha | bimodal | tech | implant_category | age_diag | sex | known_synd | synd_or_disab | race | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
14665 | initial_assessment_arm_1 | 2010-2011 | 2010.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | False | True | False | 0 | 6 | 51.0 | Female | 0.0 | 0.0 | 0.0 |
14666 | year_1_complete_71_arm_1 | 2011-2012 | 2011.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | False | True | False | 0 | 6 | 51.0 | Female | 0.0 | 0.0 | 0.0 |
14667 | year_2_complete_71_arm_1 | 2012-2013 | 2012.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | False | True | False | 0 | 6 | 51.0 | Female | 0.0 | 0.0 | 0.0 |
14668 | year_3_complete_71_arm_1 | 2013-2014 | 2013.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | False | True | False | 0 | 6 | 51.0 | Female | 0.0 | 0.0 | 0.0 |
4 rows × 68 columns
receptive[receptive.study_id=='1147-2010-0064']
study_id | redcap_event_name | score | test_type | school | age_test | domain | |
---|---|---|---|---|---|---|---|
14665 | 1147-2010-0064 | initial_assessment_arm_1 | 96.0 | PPVT | 1147 | 63.0 | Receptive Vocabulary |
14666 | 1147-2010-0064 | year_1_complete_71_arm_1 | 91.0 | PPVT | 1147 | 73.0 | Receptive Vocabulary |
14667 | 1147-2010-0064 | year_2_complete_71_arm_1 | 93.0 | PPVT | 1147 | 85.0 | Receptive Vocabulary |
lsl_dr[lsl_dr.study_id=='1147-2010-0064']
redcap_event_name | academic_year | academic_year_rv | hl | male | _race | prim_lang | sib | _mother_ed | father_ed | ... | school | score | test_name | test_type | academic_year_start | tech_class | age_year | non_profound | age_test_year | concerns | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5947 | initial_assessment_arm_1 | 2010-2011 | 2010.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 1147 | 91.0 | NaN | EVT | 2010 | Bilateral HA | 4.0 | True | 5.0 | NaN |
5948 | initial_assessment_arm_1 | 2010-2011 | 2010.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 1147 | 96.0 | NaN | PPVT | 2010 | Bilateral HA | 4.0 | True | 5.0 | NaN |
5949 | initial_assessment_arm_1 | 2010-2011 | 2010.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 1147 | 101.0 | PLS | receptive | 2010 | Bilateral HA | 4.0 | True | 4.0 | NaN |
5950 | initial_assessment_arm_1 | 2010-2011 | 2010.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 1147 | 87.0 | PLS | expressive | 2010 | Bilateral HA | 4.0 | True | 4.0 | NaN |
15880 | year_1_complete_71_arm_1 | 2011-2012 | 2011.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 1147 | 86.0 | NaN | EVT | 2011 | Bilateral HA | 4.0 | True | 6.0 | NaN |
15881 | year_1_complete_71_arm_1 | 2011-2012 | 2011.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 1147 | 91.0 | NaN | PPVT | 2011 | Bilateral HA | 4.0 | True | 6.0 | NaN |
23735 | year_2_complete_71_arm_1 | 2012-2013 | 2012.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 1147 | 95.0 | NaN | EVT | 2012 | Bilateral HA | 4.0 | True | 7.0 | NaN |
23736 | year_2_complete_71_arm_1 | 2012-2013 | 2012.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 1147 | 93.0 | NaN | PPVT | 2012 | Bilateral HA | 4.0 | True | 7.0 | NaN |
32791 | year_3_complete_71_arm_1 | 2013-2014 | 2013.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | NaN | NaN | NaN | NaN | 2013 | Bilateral HA | 4.0 | True | NaN | NaN |
9 rows × 80 columns
unique_students.type_hl_ad.count()
4563
receptive[receptive.domain=="Receptive Vocabulary"].study_id.unique().shape
(3108,)
demographic.study_id.unique().shape
(5898,)
receptive.study_id.unique().shape
(3108,)
lsl_dr[lsl_dr.domain=="Receptive Vocabulary"].study_id.unique().shape
(3108,)
receptive_ids = receptive.study_id.unique()
demographic_ids = demographic.study_id.unique()
[s for s in receptive_ids if s not in demographic_ids]
[]
def score_summary(domain, test_type=None, data=None):
    """Summarize test scores by age in years for one assessment domain.

    Parameters
    ----------
    domain : str
        Value of the ``domain`` column to summarize (e.g. "Articulation").
    test_type : str, optional
        If given, further restrict rows to this ``test_type``.
    data : DataFrame, optional
        Source table; defaults to the module-level ``lsl_dr`` dataset
        (backward-compatible generalization that also makes the function
        testable on arbitrary tables).

    Returns
    -------
    DataFrame
        Indexed by whole-year age (2-11, ages above 11 pooled into 11)
        with columns Sample Size, Mean, SD, Min, Max.
    """
    if data is None:
        data = lsl_dr
    subset = data[data.domain == domain].copy()
    if test_type is not None:
        subset = subset[subset.test_type == test_type]
    # Convert age at test from months to whole years; rows with missing
    # age_test stay NaN after the aligned assignment and are removed by
    # the > 1 filter below.
    subset['age_test'] = (subset.age_test / 12).dropna().astype(int)
    # Pool everyone older than 11 into the 11-year bin
    subset.loc[subset.age_test > 11, 'age_test'] = 11
    subset = subset[subset.age_test > 1]
    byage = subset.groupby('age_test')
    # Build the per-age summary directly; the original bound the builtins
    # `min`/`max` as locals, which shadowed them.
    summary = pd.DataFrame({
        'Sample Size': byage.study_id.count(),
        'Mean': byage.score.mean(),
        'SD': byage.score.std(),
        'Min': byage.score.min(),
        'Max': byage.score.max(),
    })
    summary.index = summary.index.values.astype(int)
    return summary[['Sample Size', 'Mean', 'SD', 'Min', 'Max']]
receptive_summary = score_summary("Receptive Vocabulary")
receptive_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 424 | 93.759434 | 17.998914 | 40.0 | 144.0 |
3 | 1444 | 92.173823 | 19.124304 | 0.0 | 153.0 |
4 | 1582 | 90.716814 | 20.243070 | 0.0 | 149.0 |
5 | 1189 | 89.994113 | 18.050597 | 0.0 | 142.0 |
6 | 678 | 85.961652 | 16.160065 | 40.0 | 154.0 |
7 | 442 | 83.244344 | 16.113797 | 40.0 | 130.0 |
8 | 313 | 80.651757 | 17.500828 | 20.0 | 132.0 |
9 | 235 | 78.629787 | 17.568035 | 25.0 | 160.0 |
10 | 194 | 76.479381 | 17.488178 | 20.0 | 123.0 |
11 | 463 | 78.539957 | 18.944497 | 20.0 | 134.0 |
receptive_summary.describe()
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
count | 10.000000 | 10.000000 | 10.000000 | 10.0000 | 10.000000 |
mean | 696.400000 | 85.015106 | 17.919228 | 20.5000 | 142.100000 |
std | 515.522863 | 6.356443 | 1.280871 | 16.4063 | 12.068784 |
min | 194.000000 | 76.479381 | 16.113797 | 0.0000 | 123.000000 |
25% | 340.750000 | 79.135280 | 17.491340 | 5.0000 | 132.500000 |
50% | 452.500000 | 84.602998 | 17.783475 | 20.0000 | 143.000000 |
75% | 1061.250000 | 90.536139 | 18.721022 | 36.2500 | 152.000000 |
max | 1582.000000 | 93.759434 | 20.243070 | 40.0000 | 160.000000 |
receptive_summary['Sample Size'].sum()
6964
receptive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
<matplotlib.text.Text at 0x129f16518>
expressive_summary = score_summary("Expressive Vocabulary")
expressive_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 403 | 92.885856 | 21.971304 | 23.0 | 145.0 |
3 | 1389 | 93.531317 | 21.386317 | 19.0 | 147.0 |
4 | 1557 | 92.419396 | 21.762937 | 0.0 | 146.0 |
5 | 1160 | 91.680172 | 19.999878 | 0.0 | 150.0 |
6 | 676 | 87.002959 | 18.252711 | 20.0 | 146.0 |
7 | 441 | 84.133787 | 15.653573 | 38.0 | 131.0 |
8 | 304 | 83.976974 | 16.415685 | 34.0 | 122.0 |
9 | 221 | 82.036199 | 16.163330 | 36.0 | 145.0 |
10 | 188 | 82.085106 | 15.380841 | 40.0 | 122.0 |
11 | 464 | 84.771552 | 17.333085 | 18.0 | 146.0 |
expressive_summary['Sample Size'].sum()
6803
expressive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.xlim(-0.5, 9.5)
if current_year_only:
plt.ylim(0, 800)
else:
plt.ylim(0, 1800)
articulation_summary = score_summary("Articulation")
articulation_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 306 | 85.254902 | 14.944281 | 50.0 | 122.0 |
3 | 1215 | 83.656790 | 18.416468 | 40.0 | 126.0 |
4 | 1407 | 83.461265 | 20.866057 | 0.0 | 123.0 |
5 | 1089 | 82.844812 | 20.790949 | 39.0 | 120.0 |
6 | 638 | 79.460815 | 21.809311 | 39.0 | 115.0 |
7 | 415 | 78.101205 | 22.341971 | 3.0 | 112.0 |
8 | 268 | 79.313433 | 21.212468 | 40.0 | 107.0 |
9 | 195 | 81.497436 | 20.757901 | 39.0 | 109.0 |
10 | 149 | 81.516779 | 20.128507 | 40.0 | 107.0 |
11 | 326 | 81.733129 | 19.477465 | 39.0 | 105.0 |
articulation_summary['Sample Size'].sum()
6008
sample_size = articulation_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(articulation_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);
Language scores
lsl_dr.domain.unique()
array(['Expressive Vocabulary', 'Language', 'Articulation', nan, 'Receptive Vocabulary'], dtype=object)
lsl_dr.test_type.unique()
array(['EOWPVT', 'receptive', 'expressive', 'Goldman', nan, 'ROWPVT', 'Arizonia', 'EVT', 'PPVT', 'Arizonia and Goldman', 'EOWPVT and EVT', 'PPVT and ROWPVT'], dtype=object)
receptive_language_summary = score_summary("Language", "receptive")
receptive_language_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 988 | 86.411943 | 22.293414 | 50.0 | 150.0 |
3 | 1408 | 84.969460 | 19.728716 | 50.0 | 144.0 |
4 | 1391 | 85.321352 | 19.453493 | 43.0 | 145.0 |
5 | 985 | 83.943147 | 18.823820 | 47.0 | 140.0 |
6 | 515 | 78.081553 | 17.745640 | 11.0 | 127.0 |
7 | 331 | 76.129909 | 18.941810 | 40.0 | 123.0 |
8 | 201 | 74.880597 | 19.700652 | 40.0 | 127.0 |
9 | 55 | 70.363636 | 21.026759 | 40.0 | 120.0 |
10 | 47 | 79.617021 | 20.802961 | 40.0 | 120.0 |
11 | 69 | 77.101449 | 21.432620 | 40.0 | 139.0 |
receptive_language_summary['Sample Size'].sum()
5990
sample_size = receptive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_language_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
expressive_language_summary = score_summary("Language", "expressive")
expressive_language_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 981 | 88.450561 | 18.587983 | 50.0 | 150.0 |
3 | 1410 | 82.344681 | 17.569380 | 20.0 | 147.0 |
4 | 1382 | 80.683792 | 19.533977 | 45.0 | 141.0 |
5 | 1006 | 78.666998 | 20.106123 | 45.0 | 144.0 |
6 | 536 | 71.820896 | 19.421195 | 6.0 | 140.0 |
7 | 354 | 67.426554 | 21.096070 | 40.0 | 124.0 |
8 | 211 | 68.312796 | 21.588506 | 40.0 | 119.0 |
9 | 55 | 65.163636 | 21.369556 | 40.0 | 108.0 |
10 | 47 | 77.574468 | 23.968952 | 40.0 | 119.0 |
11 | 68 | 73.882353 | 22.531258 | 40.0 | 132.0 |
expressive_language_summary['Sample Size'].sum()
6050
sample_size = expressive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_language_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
(unique_students.age/12.).describe()
count 5381.000000 mean 2.441863 std 2.292325 min 0.000000 25% 0.666667 50% 2.000000 75% 3.333333 max 24.833333 Name: age, dtype: float64
def calc_difference(x, col='a_fo', jitter=True):
    """Change in a functional-outcome rating between a student's earliest
    and latest assessments.

    Parameters
    ----------
    x : DataFrame
        All rows for one student (as produced by groupby on study_id).
    col : str
        Rating column to difference (default 'a_fo', the audition rating).
    jitter : bool
        Add small Gaussian noise to the difference (for scatter plotting).

    Returns
    -------
    dict with keys 'difference' and 'months', or None when the student
    has fewer than two rows or any missing ratings/ages.
    """
    if len(x) < 2 or x[col].isnull().sum() or x['funct_out_age'].isnull().sum():
        return None
    # BUG FIX: the original used Series.argmax()/argmin() as index labels.
    # In modern pandas these return *positions*, which mis-index x[col]
    # (its index keeps the parent DataFrame's labels inside groupby-apply).
    # idxmax()/idxmin() return the labels the original code relied on.
    diff = x[col][x.funct_out_age.idxmax()] - x[col][x.funct_out_age.idxmin()]
    if jitter:
        diff += np.random.normal(scale=0.05)
    span = x.funct_out_age.max() - x.funct_out_age.min()
    if span > 1000:
        # Flag implausible age spans for manual inspection
        print(x['funct_out_age'])
    return {'difference': diff, 'months': span}
audition = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference).dropna().values.tolist())
plt.figure(figsize=(10,6))
plt.scatter(audition.months, audition.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Audition')
<matplotlib.text.Text at 0x11bb3a198>
slc = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='slc_fo').dropna().values.tolist())
plt.figure(figsize=(10,6))
plt.scatter(slc.months, slc.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (comprehension)')
<matplotlib.text.Text at 0x11a9d55f8>
sle = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='sle_fo').dropna().values.tolist())
plt.figure(figsize=(10,6))
plt.scatter(sle.months, sle.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (expression)')
<matplotlib.text.Text at 0x11a95cc50>
lsl_dr.degree_hl.dropna().value_counts()
6.0 17779 4.0 4722 3.0 4595 5.0 4336 2.0 1788 0.0 1273 1.0 307 Name: degree_hl, dtype: int64
ax = lsl_dr.degree_hl.hist(bins=7)
diff = (lsl_dr['age'] - lsl_dr['age_int'])
diff[diff>0].hist(bins=50)
<matplotlib.axes._subplots.AxesSubplot at 0x12056a5f8>
(lsl_dr.age_int<6).mean()
0.20646594652570291
(lsl_dr.age<6).mean()
0.13450292397660818
Counts by year
unique_students = lsl_dr.groupby('study_id').first()
unique_students.academic_year_start.value_counts().sort_index()[:-1].plot(kind='bar')
plt.ylabel('Frequency'); plt.xlabel('Academic year');
disab_by_year = unique_students.groupby('academic_year_start')['synd_or_disab'].value_counts().unstack().fillna(0)
disab_by_year.columns = ['No', 'Yes']
disab_by_year[disab_by_year.index!='nan'].plot(kind='bar', stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x120162dd8>
The following counts of ages allow for multiple tests per year:
test_age = (lsl_dr.assign(age_test_year=(lsl_dr.age_test/12))
.dropna(subset=['age_test'])[['study_id','age_test','age_test_year']])
test_age.assign(age_year=test_age.age_test_year.astype(int)).age_year.value_counts().sort_index()
0 507 1 1007 2 3102 3 6866 4 7319 5 5429 6 3043 7 1983 8 1297 9 761 10 625 11 480 12 314 13 201 14 152 15 99 16 84 17 33 18 12 19 3 20 8 21 2 60 2 Name: age_year, dtype: int64
This summary counts children only once per year:
from itertools import chain
unique_age_vals = (test_age.assign(age_year=test_age.age_test_year.astype(int))
.groupby('study_id')
.age_year.unique().tolist())
pd.Series(np.concatenate(unique_age_vals)).value_counts().sort_index()
0 242 1 476 2 986 3 1631 4 1721 5 1302 6 766 7 510 8 371 9 263 10 215 11 172 12 114 13 81 14 63 15 47 16 39 17 18 18 7 19 1 20 3 21 1 60 1 dtype: int64
test_age.assign(age_year=test_age.age_test_year.astype(int)).groupby('study_id').age_year.last().value_counts().sort_index()
0 113 1 201 2 419 3 505 4 803 5 707 6 392 7 233 8 161 9 109 10 85 11 81 12 58 13 32 14 29 15 22 16 23 17 14 18 6 19 1 20 2 21 1 Name: age_year, dtype: int64