In [1]:

# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

Connect to database to import data for the three test domains and demographic information:

In [2]:

from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
api_key = open("/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt").read()

lsl_dr_project = Project(api_url, api_key)

In [3]:

metadata = lsl_dr_project.export_metadata()

In [4]:

# for i,j in zip(lsl_dr_project.field_names, 
#                lsl_dr_project.field_labels):
#     print('{0}: \t{1}'.format(i,j))

Import each database from REDCap:

In [5]:

articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss']
articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', df_kwargs={'index_col':None})

In [6]:

records = lsl_dr_project.export_records(fields=articulation_fields)

In [7]:

print(records[0]['study_id'])

0101-2003-0101

In [8]:

expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss']
expressive = lsl_dr_project.export_records(fields=expressive_fields, format='df', 
                                           df_kwargs={'index_col':None,
                                                      'na_values':[999, 9999]})

In [9]:

receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss']
receptive = lsl_dr_project.export_records(fields=receptive_fields, format='df', 
                                          df_kwargs={'index_col':None,
                                                     'na_values':[999, 9999]})

In [10]:

language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls',
                   'owls_lc_ss','owls_oe_ss','age_test_owls',
                   'celfp_rl_ss','celfp_el_ss','age_test_celp',
                   'celf_elss','celf_rlss','age_test_celf']
language_raw = lsl_dr_project.export_records(fields=language_fields, format='df', 
                                             df_kwargs={'index_col':None, 
                                                        'na_values':[999, 9999]})

In [11]:

demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year',
'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race',
'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as',
'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae',
'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes',
'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age',
'att_days_hr', 'att_days_sch', 'att_days_st2_417']
demographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df', 
                                            df_kwargs={'index_col':None, 
                                                       'na_values':[888, 999, 9999]})

In [12]:

demographic_raw[demographic_raw.study_id=='1147-2010-0064']

Out[12]:

	study_id	redcap_event_name	academic_year	gender	race	prim_lang	sib	mother_ed	father_ed	...	sle_fo	a_fo	fam_age	family_inv	att_days_sch	att_days_st2_417	att_days_hr	demo_ses	school_lunch	medicaid
13328	1147-2010-0064	initial_assessment_arm_1	2010-2011	0	0	0	1	3	3	...	3	6	65	0	NaN	NaN	NaN	NaN	NaN	NaN
13329	1147-2010-0064	year_1_complete_71_arm_1	2011-2012	NaN	NaN	NaN	NaN	NaN	NaN	...	3	5	77	2	NaN	NaN	NaN	NaN	NaN	NaN
13330	1147-2010-0064	year_2_complete_71_arm_1	2012-2013	NaN	NaN	NaN	NaN	NaN	NaN	...	3	5	89	2	NaN	NaN	NaN	NaN	NaN	NaN
13331	1147-2010-0064	year_3_complete_71_arm_1	2013-2014	NaN	NaN	NaN	NaN	NaN	NaN	...	4	5	101	2	NaN	NaN	NaN	NaN	NaN	NaN

4 rows × 46 columns

Attendance information¶

Several fields in the demographic data have missing values.

In [13]:

demographic_raw.head()

Out[13]:

	study_id	redcap_event_name	academic_year	gender	race	prim_lang	sib	mother_ed	father_ed	...	sle_fo	a_fo	fam_age	family_inv	att_days_sch	att_days_st2_417	att_days_hr	demo_ses	school_lunch	medicaid
0	0101-2003-0101	initial_assessment_arm_1	2002-2003	0	0	0	1	6	6	...	2	2	54	2	NaN	NaN	NaN	NaN	NaN	NaN
1	0101-2003-0101	year_1_complete_71_arm_1	2003-2004	NaN	NaN	NaN	NaN	NaN	NaN	...	4	4	80	1	NaN	NaN	NaN	NaN	NaN	NaN
2	0101-2003-0101	year_2_complete_71_arm_1	2004-2005	NaN	NaN	NaN	NaN	NaN	NaN	...	4	4	80	2	NaN	NaN	NaN	NaN	NaN	NaN
3	0101-2003-0101	year_3_complete_71_arm_1	2005-2006	NaN	NaN	NaN	NaN	NaN	NaN	...	5	5	96	3	NaN	NaN	NaN	NaN	NaN	NaN
4	0101-2003-0101	year_4_complete_71_arm_1	2006-2007	NaN	NaN	NaN	NaN	NaN	NaN	...	5	5	109	2	NaN	NaN	NaN	NaN	NaN	NaN

5 rows × 46 columns

We can fill missing values forward from previous observation (by study_id)

In [14]:

demographic = demographic_raw.sort(columns='redcap_event_name').groupby('study_id').transform(
                                    lambda recs: recs.fillna(method='ffill'))#.reset_index()
demographic["study_id"] = demographic_raw.sort(columns='redcap_event_name').study_id

Random check to make sure this worked

In [15]:

demographic[demographic.study_id=='1147-2010-0064']

Out[15]:

	redcap_event_name	academic_year	sib	mother_ed	father_ed	premature_age	...	a_fo	fam_age	family_inv	att_days_sch	att_days_st2_417	att_days_hr	demo_ses	school_lunch	medicaid	study_id
13328	initial_assessment_arm_1	2010-2011	1	3	3	8	...	6	65	0	NaN	NaN	NaN	NaN	NaN	NaN	1147-2010-0064
13329	year_1_complete_71_arm_1	2011-2012	1	3	3	8	...	5	77	2	NaN	NaN	NaN	NaN	NaN	NaN	1147-2010-0064
13330	year_2_complete_71_arm_1	2012-2013	1	3	3	8	...	5	89	2	NaN	NaN	NaN	NaN	NaN	NaN	1147-2010-0064
13331	year_3_complete_71_arm_1	2013-2014	1	3	3	8	...	5	101	2	NaN	NaN	NaN	NaN	NaN	NaN	1147-2010-0064

4 rows × 46 columns

Demographic data without missing values:

In [16]:

demographic.head()

Out[16]:

	redcap_event_name	academic_year	gender	race	sib	mother_ed	father_ed	premature_age	...	a_fo	fam_age	family_inv	att_days_sch	att_days_st2_417	att_days_hr	demo_ses	school_lunch	medicaid	study_id
9459	initial_assessment_arm_1	2009-2010	0	0	0	5	4	9	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0735-2010-0017
9451	initial_assessment_arm_1	2010-2011	0	7	2	6	6	7	...	2	17	1	NaN	NaN	NaN	NaN	NaN	NaN	0735-2010-0015
9447	initial_assessment_arm_1	2010-2011	0	7	2	6	6	7	...	2	17	1	NaN	NaN	NaN	NaN	NaN	NaN	0735-2010-0014
9443	initial_assessment_arm_1	2009-2010	1	7	1	6	6	8	...	2	14	0	NaN	NaN	NaN	NaN	NaN	NaN	0735-2010-0013
5594	initial_assessment_arm_1	2005-2006	1	0	3	2	2	8	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0521-2006-0025

5 rows × 46 columns

Cleaning languge dataset¶

5 language measures:

3 versions of CELF
PLS
- pls_ac_rs: PLS: Auditory Comprehension Raw Score
- pls_ac_ss: PLS: Auditory Comprehension Standard Score
- pls_ec_rs: PLS: Expressive Communication Raw Score
- pls_ec_ss: PLS: Expressive Communication Standard Score
- pls_tl_rs: PLS: Total Language Score Standard Score Total
- pls_tl_ss: PLS: Total Language Score Standard Score
OWLS
- age_test_owls: Age at time of testing (OWLS)
- owls_lc_rs: OWLS: Listening Comprehension Raw Score
- owls_lc_ss: OWLS: Listening Comprehension Standard Score
- owls_oe_rs: OWLS: Oral Expression Raw Score
- owls_oe_ss: OWLS: Oral Expression Standard Score
- owls_oc_sss: OWLS: Oral Composite Sum of Listening Comprehension and Oral Expression Standard Scores
- owls_oc_ss: OWLS: Oral Composite Standard Score
- owls_wes_trs: OWLS: Written Expression Scale Total Raw Score
- owls_wes_as: OWLS: Written Expression Scale Ability Score
- owls_wes_ss: OWLS: Written Expression Scale Standard Score
- owsl_lc: OWLS: Written Expression Scale Language Composite (Sum of written expression age-based standard score, listening comprehension standard score and oral expression standard score)
- owls_lcss: OWLS: Language Composite Standard Score

In [17]:

# Test type
language_raw["test_name"] = None
language_raw["test_type"] = None
language_raw["score"] = None
CELP = language_raw.age_test_celp.notnull()
CELF = language_raw.age_test_celf.notnull()
PLS = language_raw.age_test_pls.notnull()
OWLS = language_raw.age_test_owls.notnull()

language_raw['age_test'] = None
language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp
language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf
language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls
language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls

language1 = language_raw[CELP | CELF | PLS | OWLS].copy()
language2 = language1.copy()

language1["test_type"] = "receptive"

language1.loc[CELP, "test_name"] = "CELF-P2"
language1.loc[CELF, "test_name"] = "CELF-4"
language1.loc[PLS, "test_name"] = "PLS"
language1.loc[OWLS, "test_name"] = "OWLS"

language1.loc[CELP, "score"] = language1.celfp_rl_ss
language1.loc[CELF, "score"] = language1.celf_rlss
language1.loc[PLS, "score"] = language1.pls_ac_ss
language1.loc[OWLS, "score"] = language1.owls_lc_ss


language2["test_type"] = "expressive"

language2.loc[CELP, "test_name"] = "CELF-P2"
language2.loc[CELF, "test_name"] = "CELF-4"
language2.loc[PLS, "test_name"] = "PLS"
language2.loc[OWLS, "test_name"] = "OWLS"

language2.loc[CELP, "score"] = language1.celfp_el_ss
language2.loc[CELF, "score"] = language1.celf_elss
language2.loc[PLS, "score"] = language1.pls_ec_ss
language2.loc[OWLS, "score"] = language1.owls_oe_ss

language = pd.concat([language1, language2])
language = language[language.score.notnull()]
print(pd.crosstab(language.test_name, language.test_type))
print("There are {0} null values for score".format(sum(language["score"].isnull())))

test_type  expressive  receptive
test_name                       
CELF-4            591        523
CELF-P2          1357       1363
OWLS             1058       1066
PLS              3349       3359
There are 0 null values for score

A school variable was added, which is the first four columns of the study_id:

In [18]:

language["school"] = language.study_id.str.slice(0,4)

In [19]:

language = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test"]]
language["domain"] = "Language"
language.head()

Out[19]:

	study_id	redcap_event_name	score	test_type	test_name	school	age_test	domain
0	0101-2003-0101	initial_assessment_arm_1	51	receptive	PLS	0101	54	Language
5	0101-2003-0101	year_5_complete_71_arm_1	61	receptive	OWLS	0101	113	Language
9	0101-2003-0102	initial_assessment_arm_1	55	receptive	PLS	0101	44	Language
10	0101-2003-0102	year_1_complete_71_arm_1	77	receptive	PLS	0101	54	Language
11	0101-2003-0102	year_2_complete_71_arm_1	93	receptive	CELF-P2	0101	68	Language

Cleaning articulation dataset¶

We converted the articulation dataset into a "long" format:

In [20]:

# Test type
articulation["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
articulation = articulation[ARIZ | GF]
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"

print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))

# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]

Goldman                 5008
Arizonia                 493
Arizonia and Goldman      73
dtype: int64
There are 0 null values for test_type

A school variable was added, which is the first four columns of the study_id:

In [21]:

articulation["school"] = articulation.study_id.str.slice(0,4)

The age was taken to be the Arizonia age if there are both test types:

In [22]:

articulation["age_test"] = articulation.age_test_aaps
articulation.loc[articulation.age_test.isnull(), 'age_test'] = articulation.age_test_gf2[articulation.age_test.isnull()]
print(articulation.age_test.describe())

count    5571.000000
mean       68.695028
std        30.661547
min        23.000000
25%        47.000000
50%        60.000000
75%        80.000000
max       243.000000
Name: age_test, dtype: float64

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [23]:

articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1)
articulation["domain"] = "Articulation"
articulation.head()

Out[23]:

	study_id	redcap_event_name	test_type	score	school	age_test	domain
1	0101-2003-0101	year_1_complete_71_arm_1	Goldman	78	0101	80	Articulation
9	0101-2003-0102	initial_assessment_arm_1	Goldman	72	0101	44	Articulation
10	0101-2003-0102	year_1_complete_71_arm_1	Goldman	97	0101	54	Articulation
14	0101-2004-0101	year_2_complete_71_arm_1	Goldman	75	0101	53	Articulation
15	0101-2004-0101	year_3_complete_71_arm_1	Goldman	80	0101	66	Articulation

Cleaning demographic dataset¶

We excluded unwanted columns and rows for which age, gender or race were missing:

In [24]:

# Retain only subset of columns
#demographic = demographic[demographic.gender.notnull()]
demographic = demographic.rename(columns={'gender':'male'})

Due to sample size considerations, we reduced the non-English primary language variable to English (0) and non-English (1):

In [25]:

demographic["non_english"] = None
demographic.loc[demographic.prim_lang.notnull(), 'non_english'] = demographic.prim_lang[demographic.prim_lang.notnull()]>0
print(demographic.non_english.value_counts())
print("There are {0} null values for non_english".format(sum(demographic.non_english.isnull())))

False    11068
True      2450
dtype: int64
There are 694 null values for non_english

Mother's education (mother_ed) and father's education (father_ed) were both recoded to:

0=no high school diploma
1=high school
2=undergraduate
3=graduate

Category 6 (unknown) was recoded as missing.

In [26]:

demographic = demographic.rename(columns={"mother_ed":"_mother_ed"})
demographic["mother_ed"] = demographic._mother_ed.copy()
demographic.loc[demographic._mother_ed==1, 'mother_ed'] = 0
demographic.loc[(demographic._mother_ed==2) | (demographic.mother_ed==3), 'mother_ed'] = 1
demographic.loc[demographic._mother_ed==4, 'mother_ed'] = 2
demographic.loc[demographic._mother_ed==5, 'mother_ed'] = 3
demographic.loc[demographic._mother_ed==6, 'mother_ed'] = None
print("_mother_ed:")
print(demographic._mother_ed.value_counts())
print("mother_ed:")
print(demographic.mother_ed.value_counts())
print("\nThere are {0} null values for mother_ed".format(sum(demographic.mother_ed.isnull())))

_mother_ed:
6    4960
4    2876
3    1922
5    1527
2    1320
1     470
0     192
dtype: int64
mother_ed:
1    3242
2    2876
3    1527
0     662
dtype: int64

There are 5905 null values for mother_ed

Secondary diagnosis

In [27]:

demographic.shape

Out[27]:

(14212, 48)

In [28]:

demographic['secondary_diagnosis'] = demographic.etiology==0
# Suspected or unknown treated as missing
demographic.loc[demographic.etiology > 1, 'secondary_diagnosis'] = None

In [29]:

demographic.secondary_diagnosis.value_counts()

Out[29]:

0    10371
1     2394
dtype: int64

In [30]:

demographic.secondary_diagnosis.mean()

Out[30]:

0.18754406580493538

Premature status was recoded to True (premature) and False (full-term). Here, premature indicates <36 weeks.

In [31]:

demographic['premature_weeks'] = demographic.premature_age.copy()
demographic.loc[demographic.premature_age==9, 'premature_weeks'] = None
demographic.premature_weeks = abs(demographic.premature_weeks-8)*2
print("There are {0} null values for premature_weeks".format(sum(demographic.premature_weeks.isnull())))

There are 3410 null values for premature_weeks

In [32]:

demographic.premature_weeks.value_counts()

Out[32]:

0     9214
2      554
4      358
12     193
6      178
10     148
8      112
14      42
16       3
dtype: int64

Recode impant technology variables for each ear to one of four categories (None, Baha, Hearing aid, Cochlear implant):

In [33]:

demographic.tech_ad.value_counts()

Out[33]:

1    4813
0    4212
7    1462
5     970
2     474
6     413
8      69
9      57
3      27
4      25
dtype: int64

In [34]:

tech_cats = ["None", "OAD", "Hearing aid", "Cochlear", "Other"]

demographic["tech_right"] = 4
demographic.loc[demographic.tech_ad==7, 'tech_right'] = 0
demographic.loc[demographic.tech_ad==3, 'tech_right'] = 1
demographic.loc[demographic.tech_ad.isin([1,2,4,5,10]), 'tech_right'] = 2
demographic.loc[demographic.tech_ad.isin([0,8,6]), 'tech_right'] = 3
demographic.loc[demographic.tech_ad.isnull(), 'tech_right'] = None

demographic["tech_left"] = 4
demographic.loc[demographic.tech_as==7, 'tech_left'] = 0
demographic.loc[demographic.tech_as==3, 'tech_left'] = 1
demographic.loc[demographic.tech_as.isin([1,2,4,5,10]), 'tech_left'] = 2
demographic.loc[demographic.tech_as.isin([0,8,6]), 'tech_left'] = 3
demographic.loc[demographic.tech_as.isnull(), 'tech_left'] = None

In [35]:

demographic.tech_left.value_counts()

Out[35]:

2    6360
3    4272
0    1789
4      56
1      19
dtype: int64

In [36]:

demographic.tech_right.value_counts()

Out[36]:

2    6282
3    4694
0    1462
4      57
1      27
dtype: int64

Substitute valid missing values for hearing loss:

In [37]:

demographic.loc[demographic.type_hl_ad==5, 'type_hl_ad'] = None
demographic.loc[demographic.type_hl_as==5, 'type_hl_ad'] = None

Create degree_hl, which is the maximum level of hearing loss in either ear:

In [38]:

demographic["degree_hl"] = np.maximum(demographic.degree_hl_ad, demographic.degree_hl_as)

Create compound indicator variable for each technology (Baha, Hearing aid, Chochlear implant):

0=none
1=one ear
2=both ears.

In [39]:

demographic.columns

Out[39]:

Index(['redcap_event_name', 'academic_year', 'hl', 'male', 'race', 'prim_lang',
       'sib', '_mother_ed', 'father_ed', 'premature_age', 'onset_1', 'age_amp',
       'age_int', 'age', 'synd_cause', 'etiology', 'etiology_2',
       'hearing_changes', 'ae', 'ad_250', 'ad_500', 'degree_hl_ad',
       'type_hl_ad', 'tech_ad', 'age_ci', 'as_250', 'as_500', 'degree_hl_as',
       'type_hl_as', 'tech_as', 'age_ci_2', 'time', 'age_disenrolled',
       'funct_out_age', 'slc_fo', 'sle_fo', 'a_fo', 'fam_age', 'family_inv',
       'att_days_sch', 'att_days_st2_417', 'att_days_hr', 'demo_ses',
       'school_lunch', 'medicaid', 'study_id', 'non_english', 'mother_ed',
       'secondary_diagnosis', 'premature_weeks', 'tech_right', 'tech_left',
       'degree_hl'],
      dtype='object')

In [40]:

demographic["oad"] = 0
demographic.oad = demographic.oad.astype(object)
demographic.loc[(demographic.tech_right==1) | (demographic.tech_left==1), 'oad'] = 1
demographic.loc[(demographic.tech_right==1) & (demographic.tech_left==1), 'oad'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'oad'] = None
print("oad:")
print(demographic.drop_duplicates(subset='study_id').oad.value_counts())
print("There are {0} null values for OAD".format(sum(demographic.oad.isnull())))

demographic["hearing_aid"] = 0
demographic.hearing_aid = demographic.hearing_aid.astype(object)
demographic.loc[(demographic.tech_right==2) | (demographic.tech_left==2), 'hearing_aid'] = 1
demographic.loc[(demographic.tech_right==2) & (demographic.tech_left==2), 'hearing_aid'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_right.isnull()), 'hearing_aid'] = None
print("\nhearing_aid:")
print(demographic.drop_duplicates(subset='study_id').hearing_aid.value_counts())
print("There are {0} null values for hearing_aid".format(sum(demographic.hearing_aid.isnull())))

demographic["cochlear"] = 0
demographic.cochlear = demographic.cochlear.astype(object)
demographic.loc[(demographic.tech_right==3) | (demographic.tech_left==3), 'cochlear'] = 1
demographic.loc[(demographic.tech_right==3) & (demographic.tech_left==3), 'cochlear'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'cochlear'] = None
print("\ncochlear:")
print(demographic.drop_duplicates(subset='study_id').cochlear.value_counts())
print("There are {0} null values for cochlear".format(sum(demographic.cochlear.isnull())))
print(len(demographic))

oad:
0    4384
1       4
2       2
dtype: int64
There are 1639 null values for OAD

hearing_aid:
2    2031
0    1593
1     737
dtype: int64
There are 1690 null values for hearing_aid

cochlear:
0    2868
2     897
1     625
dtype: int64
There are 1639 null values for cochlear
14212

Identify bilateral and bimodal individuals:

In [41]:

demographic["unilateral_ci"] = demographic.cochlear==1
demographic["bilateral_ci"] = demographic.cochlear==2
demographic["bilateral_ha"] = demographic.hearing_aid==2
demographic["bimodal"] = (demographic.cochlear==1) & (demographic.hearing_aid==1)

In [42]:

demographic.bilateral_ci.sum(), demographic.bilateral_ha.sum(), demographic.bimodal.sum(), demographic.unilateral_ci.sum()

Out[42]:

(3445, 5169, 1385, 2076)

In [43]:

demographic.drop_duplicates(subset='study_id')[['unilateral_ci','bilateral_ci', 
                                               'bilateral_ha',
                                               'bimodal']].sum()

Out[43]:

unilateral_ci     625
bilateral_ci      897
bilateral_ha     2031
bimodal           375
dtype: int64

Create variable that identifies bilateral (0), bilateral HA left (1), bilateral HA right (2)

In [44]:

demographic['tech'] = 0
demographic.loc[(demographic.bimodal) & (demographic.tech_left==2), 'tech'] = 1
demographic.loc[(demographic.bimodal) & (demographic.tech_right==2), 'tech'] = 2
print("There are {0} null values for tech".format(sum(demographic.tech.isnull())))

There are 0 null values for tech

In [45]:

demographic["implant_category"] = None
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==0), 
                'implant_category'] = 0
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==0), 
                'implant_category'] = 1
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==1), 
                'implant_category'] = 2
demographic.loc[(demographic.cochlear==2) & (demographic.hearing_aid==0) & (demographic.oad==0), 
                'implant_category'] = 3
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==1) & (demographic.oad==0), 
                'implant_category'] = 4
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==1), 
                'implant_category'] = 5
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==2) & (demographic.oad==0), 
                'implant_category'] = 6
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==1), 
                'implant_category'] = 7
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==2), 
                'implant_category'] = 8
demographic.implant_category.value_counts()

Out[45]:

6    5169
3    3445
4    1385
1     895
0     672
8      14
2      12
7       5
5       1
dtype: int64

Age when hearing loss diagnosed Data are entered inconsistently here, so we have to go in and replace non-numeric values.

In [46]:

demographic.onset_1.unique()

Out[46]:

array([   1. ,   18. ,   17. ,   22. ,    nan,    3. ,    2. ,    5. ,
         13. ,   36. ,   28. ,   21. ,   41. ,   26. ,    0. ,   67. ,
         20. ,   24. ,    4. ,   40. ,   60. ,   10. ,    6. ,   25. ,
          7. ,   27. ,   15. ,   35. ,   14. ,   42. ,   34. ,   12. ,
          9. ,   32. ,   50. ,    8. ,   33. ,   23. ,   11. ,   31. ,
         30. ,   49. ,   48. ,    1.5,   19. ,    2.5,   39. ,   52. ,
         16. ,   38. ,   29. ,   51. ,   46. ,   45. ,   54. ,   88. ,
         65. ,   44. ,   81. ,  116. ,   72. ,   57. ,   62. ,   43. ,
         78. ,   83. ,   61. ,  107. ,   64. ,   74. ,   37. ,   77. ,
         96. ,   97. ,   79. ,   47. ,   53. ,   59. ,   84. ,   95. ,
         80. ,    0.5,   58. ,   56. ,   86. ,   98. ,   85. ,   75. ,
        119. ,   66. ,   70. ,   63. ,  140. ,  126. ,  133. ,  103. ,
         87. ,   76. ,   55. ,   68. ,   92. ,   71. ,  154. ,   89. ,
        152. ])

In [47]:

# Don't need this anymore
# demographic['age_diag'] = demographic.onset_1.replace({'birth': 0, 'R- Birth L-16mo': 0, 'birth - 3': 0, 'at birth': 0, 'NBHS': 0, 
#                              'at Birth': 0, '1-2': 1.5, '2-3': 2.5, '0-3': 1.5}).astype(float)
demographic['age_diag'] = demographic.onset_1

Number of null values for age_diag

In [48]:

demographic.age_diag.isnull().sum()

Out[48]:

In [49]:

demographic['sex'] = demographic.male.replace({0:'Female', 1:'Male'})

In [50]:

import seaborn as sb

unique_students = demographic.dropna(subset=['sex']).groupby('study_id').first()

# ag = sb.factorplot("sex", data=unique_students, 
#               palette="PuBuGn_d", kind='count')
# ag.set_xticklabels(['Female ({})'.format((unique_students.male==0).sum()), 
#                     'Male ({})'.format((unique_students.male==1).sum())])
# ag.set_xlabels('')

Child has another diagnosed disability

In [51]:

demographic['known_synd'] = (demographic.synd_cause == 0)
# Unknown or suspected
demographic.loc[demographic.synd_cause > 1, 'known_synd'] = None

In [52]:

# If either known syndrome or secondary diagnosis
demographic['synd_or_disab'] = demographic.apply(lambda x: x['secondary_diagnosis'] or x['known_synd'], axis=1)

Missing sibling counts were properly encoded as None (missing).

In [53]:

demographic.loc[demographic.sib==4, 'sib'] = None

We reduced the number of race categories, pooling those that were neither caucasian, black, hispanic or asian to "other", due to small sample sizes for these categories. Category 7 (unknown) was recoded as missing.

In [54]:

races = ["Caucasian", "Black or African American", "Hispanic or Latino", "Asian", "Other"]
demographic = demographic.rename(columns={"race":"_race"})
demographic["race"] = demographic._race.copy()
demographic.loc[demographic.race==7, 'race'] = None
demographic.loc[demographic.race>3, 'race'] = 4
print("_race:")
print(demographic._race.value_counts())
print("race:")
print(demographic.race.value_counts())
print("There are {0} null values for race".format(sum(demographic.race.isnull())))
# Replace with recoded column

_race:
0    7442
2    2374
1    1287
3    1006
6     691
8     519
7     240
4      64
5      28
dtype: int64
race:
0    7442
2    2374
4    1302
1    1287
3    1006
dtype: int64
There are 801 null values for race

Recode implant technology variables

In [55]:

tech_cats = ["None", "Baha", "Hearing aid", "Cochlear", "Other"]

demographic["tech_right"] = demographic.tech_ad.copy()
demographic.loc[demographic.tech_right==6, 'tech_right'] = 0
demographic.loc[demographic.tech_right==4, 'tech_right'] = 1
demographic.loc[demographic.tech_right==5, 'tech_right'] = 1
demographic.loc[demographic.tech_right==3, 'tech_right'] = 2
demographic.loc[demographic.tech_right==7, 'tech_right'] = 3
demographic.loc[demographic.tech_right==8, 'tech_right'] = 3
demographic.loc[demographic.tech_right==9, 'tech_right'] = 4
demographic.tech_right = np.abs(demographic.tech_right - 3)

demographic["tech_left"] = demographic.tech_as.copy()
demographic.loc[demographic.tech_left==6, 'tech_left'] = 0
demographic.loc[demographic.tech_left==4, 'tech_left'] = 1
demographic.loc[demographic.tech_left==5, 'tech_left'] = 1
demographic.loc[demographic.tech_left==3, 'tech_left'] = 2
demographic.loc[demographic.tech_left==7, 'tech_left'] = 3
demographic.loc[demographic.tech_left==8, 'tech_left'] = 3
demographic.loc[demographic.tech_left==9, 'tech_left'] = 4
demographic.tech_left = np.abs(demographic.tech_left - 3)

In [ ]:

# Don't need this anymore
# demographic['age_amp'] = demographic.age_amp.replace({'22 mo': 22, 'unknown': np.nan, 'none': np.nan, 
#                              'n/a unilateral loss': np.nan, 'not amplified yet': np.nan,
#                              'not amplified': np.nan, 'n/a': np.nan, '4 months-6 months': 5,
#                              '6 weeks': 0.5, '13-18': 15.5, '0-3': 1.5, '24-36': 30}).astype(float)

In [57]:

demographic.academic_year.replace(
                         {'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011', 
                          '2020-2011': '2010-2011', '2012-20013': '2012-2013', 
                          '642014-2015': '2014-2015', '20114-2015': '2014-2015',
                          '2011-012': '2011-2012',
                                   '0000-0000': np.nan}).str.replace('*', '-').unique()

Out[57]:

array(['2009-2010', '2010-2011', '2005-2006', nan, '2007-2008',
       '2006-2007', '2008-2009', '2003-2004', '2012-2013', '2013-2014',
       '2002-2003', '2011-2012', '2014-2015', 'June 2014', '2004-2005',
       '1997-1998', '2006-2007 ', '1998-1999', '2000-2001', '2001-2002',
       '2103-2014', '1999-2000', '2012', '2009-2011', '2015-2016',
       '1995-1996', '2014', '2013', '2011', '2010', '2009',
       '                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 2010-2011',
       ' 2014-2015', '2012=2013', '2015', '2015-2015', '2009 - 2010',
       '2010 - 2011', '2011 - 2012', '2014-2105', '2014-2015 ', '65',
       '2005-2004', '2012 - 2013', '2014-205', '2013 - 2014', '2017-2015',
       '2014-1015', '2012-2013 '], dtype=object)

In [87]:

demographic['academic_year'] = demographic.academic_year.replace(
                         {'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011', 
                          '2020-2011': '2010-2011', '2012-20013': '2012-2013', 
                          '642014-2015': '2014-2015', '20114-2015': '2014-2015',
                          '2011-012': '2011-2012', '2014-2105': '2014-2015', '2005-2004': '2004-2005',
                          '2014-205': '2014-2015', '2017-2015': '2014-2015', '2014-1015': '2014-2015',
                          '2015-2015': '2014-2015', '2009-2011': '2009-2010',
                                   '0000-0000': np.nan}).str.replace('*', '-')

Removed entries that don't contain dashes

In [88]:

demographic.loc[~(demographic.academic_year.notnull() & demographic.academic_year.str.contains('-')), 
                'academic_year'] = np.nan

In [92]:

demographic.loc[demographic.academic_year.notnull(), 'academic_year'] = demographic.academic_year[demographic.academic_year.notnull()].apply(lambda x: ''.join(x.split()))

In [93]:

demographic.age_amp.hist()

Out[93]:

<matplotlib.axes._subplots.AxesSubplot at 0x114b379b0>

Cleaning expressive vocabulary dataset¶

We converted the expressive vocabulary dataset to "long" format:

In [94]:

# Test type
expressive["test_type"] = None
EOWPVT = expressive.eowpvt_ss.notnull()
EVT = expressive.evt_ss.notnull()
expressive = expressive[EOWPVT | EVT]
expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT"
expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT"
expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT"
print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull())))

expressive["score"] = expressive.eowpvt_ss
expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT]

There are 0 null values for test_type

In [95]:

expressive.test_type.value_counts()

Out[95]:

EVT               3642
EOWPVT            2624
EOWPVT and EVT     147
dtype: int64

A school variable was added, which is the first four columns of the study_id:

In [96]:

expressive["school"] = expressive.study_id.str.slice(0,4)

The age was taken to be the EOWPVT age if there are both test types:

In [97]:

expressive["age_test"] = expressive.age_test_eowpvt
expressive.loc[expressive.age_test.isnull(), 'age_test'] = expressive.age_test_evt[expressive.age_test.isnull()]

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [98]:

expressive = expressive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
expressive["domain"] = "Expressive Vocabulary"
expressive.head()

Out[98]:

	study_id	redcap_event_name	score	test_type	school	age_test	domain
0	0101-2003-0101	initial_assessment_arm_1	58	EOWPVT	0101	54	Expressive Vocabulary
2	0101-2003-0101	year_2_complete_71_arm_1	84	EOWPVT	0101	80	Expressive Vocabulary
5	0101-2003-0101	year_5_complete_71_arm_1	90	EOWPVT	0101	113	Expressive Vocabulary
14	0101-2004-0101	year_2_complete_71_arm_1	90	EOWPVT	0101	53	Expressive Vocabulary
15	0101-2004-0101	year_3_complete_71_arm_1	87	EOWPVT	0101	66	Expressive Vocabulary

Cleaning receptive vocabulary dataset¶

We converted the receptive vocabulary data table to "long" format:

In [99]:

receptive.columns

Out[99]:

Index(['study_id', 'redcap_event_name', 'age_test_ppvt', 'ppvt_ss',
       'age_test_rowpvt', 'rowpvt_ss'],
      dtype='object')

In [100]:

# Test type
receptive["test_type"] = None
PPVT = receptive.ppvt_ss.notnull()
ROWPVT = receptive.rowpvt_ss.notnull()
receptive = receptive[PPVT | ROWPVT]
receptive.loc[PPVT & ROWPVT, "test_type"] = "PPVT and ROWPVT"
receptive.loc[PPVT & ~ROWPVT, "test_type"] = "PPVT"
receptive.loc[~PPVT & ROWPVT, "test_type"] = "ROWPVT"
print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull())))

receptive["score"] = receptive.ppvt_ss
receptive.loc[~PPVT & ROWPVT, "score"] = receptive.rowpvt_ss[~PPVT & ROWPVT]

There are 0 null values for test_type

A school variable was added, which is the first four columns of the study_id:

In [101]:

receptive["school"] = receptive.study_id.str.slice(0,4)

The age was taken to be the PPVT age if there are both test types:

In [102]:

receptive["age_test"] = receptive.age_test_ppvt
receptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()]

In [103]:

print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull())))

There are 27 null values for age_test

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [104]:

receptive = receptive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
receptive["domain"] = "Receptive Vocabulary"
receptive.head()

Out[104]:

	study_id	redcap_event_name	score	test_type	school	age_test	domain
2	0101-2003-0101	year_2_complete_71_arm_1	90	PPVT	0101	80	Receptive Vocabulary
5	0101-2003-0101	year_5_complete_71_arm_1	101	ROWPVT	0101	113	Receptive Vocabulary
9	0101-2003-0102	initial_assessment_arm_1	55	PPVT	0101	44	Receptive Vocabulary
10	0101-2003-0102	year_1_complete_71_arm_1	80	PPVT	0101	54	Receptive Vocabulary
11	0101-2003-0102	year_2_complete_71_arm_1	101	PPVT	0101	68	Receptive Vocabulary

In [105]:

receptive.study_id.unique().shape

Out[105]:

(2959,)

Merge datasets¶

The four datasets were mereged into a single table. First, we concatenate the test scores data:

In [106]:

test_scores = pd.concat([articulation, expressive, receptive, language])

Then we perform a merge between the demographic data and the test scores data:

In [107]:

lsl_dr = pd.merge(demographic, test_scores, on=["study_id", "redcap_event_name"], how='left')

In [108]:

lsl_dr.tail()

Out[108]:

	redcap_event_name	academic_year	male	_race	sib	_mother_ed	father_ed	premature_age	...	sex	race	age_test	domain	school	score	test_name	test_type
36577	year_9_complete_71_arm_1	2012-2013	0	8	2	6	4	8	...	Female	4	136	Expressive Vocabulary	1147	92	NaN	EVT
36578	year_9_complete_71_arm_1	2012-2013	0	8	2	6	4	8	...	Female	4	136	Receptive Vocabulary	1147	84	NaN	PPVT
36579	year_9_complete_71_arm_1	2012-2013	0	0	3	6	6	9	...	Female	0	NaN	NaN	NaN	NaN	NaN	NaN
36580	year_9_complete_71_arm_1	2013-2014	1	0	2	NaN	6	8	...	Male	0	185	Expressive Vocabulary	1147	125	NaN	EVT
36581	year_9_complete_71_arm_1	2013-2014	1	0	2	NaN	6	8	...	Male	0	186	Receptive Vocabulary	1147	101	NaN	PPVT

5 rows × 73 columns

In [109]:

lsl_dr['academic_year_start'] = lsl_dr.academic_year.apply(lambda x: str(x).strip()[:4])
lsl_dr.academic_year_start.value_counts()

Out[109]:

2013    6924
2012    6628
2014    5480
2011    5205
2010    4419
nan     3177
2009    2358
2008     825
2007     531
2006     343
2005     285
2004     173
2003      90
2002      47
2001      35
1998      16
1999      15
2000      12
2015      11
1997       6
1995       1
2103       1
dtype: int64

In [110]:

current_year_only = False

if current_year_only:
    lsl_dr = lsl_dr[lsl_dr.academic_year_start=='2013']

In [111]:

expressive_scores = lsl_dr[(lsl_dr.domain=='Expressive Vocabulary') & (lsl_dr.score>=20)].score
expressive_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');

In [112]:

expressive_lang_scores = lsl_dr[(lsl_dr.domain=='Language') 
                               & (lsl_dr.test_type=='expressive')].score
expressive_lang_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');

Export dataset

In [113]:

if current_year_only:

    lsl_dr.to_csv('lsl_dr_current_year.csv')

else:
    lsl_dr.to_csv('lsl_dr.csv')

In [114]:

lsl_dr.shape

Out[114]:

(36582, 74)

In [115]:

lsl_dr.study_id.unique().shape

Out[115]:

(5440,)

In [116]:

demographic.study_id.unique().shape

Out[116]:

(5440,)

Convert score to floating-point number

In [117]:

lsl_dr.score = lsl_dr.score.astype(float)

In [119]:

lsl_dr['tech_class'] = 'Bimodal'
lsl_dr.loc[lsl_dr.bilateral_ci==True, 'tech_class'] = 'Bilateral CI'
lsl_dr.loc[lsl_dr.bilateral_ha==True, 'tech_class'] = 'Bilateral HA'

In [120]:

lsl_dr['age_year'] = np.floor(lsl_dr.age/12.)

In [121]:

lsl_dr.domain.dropna().unique()

Out[121]:

array(['Articulation', 'Expressive Vocabulary', 'Language',
       'Receptive Vocabulary'], dtype=object)

In [122]:

lsl_dr.groupby('tech_class').prim_lang.mean().round(2)

Out[122]:

tech_class
Bilateral CI    0.43
Bilateral HA    0.58
Bimodal         0.51
Name: prim_lang, dtype: float64

In [123]:

lsl_dr['non_profound'] = lsl_dr.degree_hl<6

In [124]:

lsl_dr.groupby('tech_class').non_profound.mean().round(2)

Out[124]:

tech_class
Bilateral CI    0.08
Bilateral HA    0.86
Bimodal         0.30
Name: non_profound, dtype: float64

In [125]:

f, axes = plt.subplots(2, 2, figsize=(14,10))
for ax, dom in zip(np.ravel(axes), lsl_dr.domain.dropna().unique()):
    plot_data = lsl_dr[lsl_dr.domain==dom].pivot_table(index='age_year', columns='tech_class', values='score', aggfunc='mean')
    plot_data[(plot_data.index>1) & (plot_data.index<7)].plot(ax=ax)
    ax.set_ylim(40, 120)
    ax.set_xticks(range(2,7))
    ax.set_title(dom)

In [126]:

lsl_dr.pivot_table?

Plots of Demographic Data¶

In [127]:

plot_color = "#64AAE8"

In [128]:

def plot_demo_data(series, labels=None, color=plot_color, rot=0, label_offset=20, xlim=None, 
                   ylim=None, title=None, **kwargs):
    ax = kwargs.get('ax')
    if ax is None:
        f, ax = plt.subplots()
    counts = series.value_counts().sort_index(1)
    counts.plot(kind='bar', grid=False, rot=rot, color=color, **kwargs)
    if xlim is None:
        ax.set_xlim(-0.5, len(counts)-0.5)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_ylabel('Count')
    if labels is not None:
        ax.set_xticklabels(labels)
    if title:
        ax.set_title(title)
    for i,x in enumerate(counts):
        ax.annotate('%i' % x, (i, x + label_offset))
        
#     plt.gca().tight_layout()

In [129]:

unique_students = demographic.drop_duplicates('study_id')

In [130]:

unique_students.shape

Out[130]:

(5440, 67)

In [131]:

unique_students.age.describe()

Out[131]:

count    4949.000000
mean       30.353708
std        28.072288
min         0.000000
25%         9.000000
50%        25.000000
75%        41.000000
max       298.000000
Name: age, dtype: float64

In [132]:

plot_demo_data(unique_students.male, ('Female', 'Male'), label_offset=20, ylim=(0, 2600), color=plot_color)

In [133]:

plot_demo_data(unique_students.prim_lang, 
               ('English', 'Spanish', 'Chinese', 'French', 'German', 'Tagalong', 'Other'), 
               rot=70, color=plot_color)

In [134]:

unique_students.prim_lang.count()

Out[134]:

In [135]:

plot_demo_data(unique_students.sib, ('1', '2', '3', '4+'), 
               color=plot_color)

In [136]:

unique_students.sib.count()

Out[136]:

In [137]:

amp_ages = ["Birth-3 months", "4 months - 6 months", "7 months - 9 months", "10 months- 12 months", 
"13 months - 18 months", "19 months - 24 months", "2 years 1 day - 3 years", 
"3 years 1 day - 4 years", "4 years 1 day - 5 years", "5 years 1 day - 6 years", "6 years"]

demographic.loc[demographic.age_amp==11, 'age_amp'] = None
print("There are {0} null values for age_amp".format(sum(demographic.age_amp.isnull())))

There are 4857 null values for age_amp

In [138]:

age_classes = pd.Series(pd.cut(unique_students.sort('age_amp').age_amp.dropna(), [-1,3,6,9,12,18,24,36,48,60,72,1000],
                               labels=amp_ages))

In [139]:

age_amp_counts = age_classes.value_counts()[amp_ages]
age_amp_counts.plot(kind='bar', grid=False, rot=90, color=plot_color)
plt.xlim(-0.5, len(age_amp_counts)-0.5)
plt.ylabel('Count')
for i,x in enumerate(age_amp_counts):
    plt.annotate('%i' % x, (i, x + 10))

In [140]:

age_amp_counts.sum()

Out[140]:

In [141]:

unique_students.age_amp.max()

Out[141]:

666.0

In [142]:

(unique_students.age_amp/12.).hist(bins=16, grid=False, color=plot_color)
plt.ylabel('Count')
plt.xlabel('Age at amplification')

Out[142]:

<matplotlib.text.Text at 0x10f927e80>

In [143]:

plot_demo_data(unique_students.tech_left, tech_cats, rot=90, color=plot_color)

In [144]:

plot_demo_data(unique_students.tech_right, tech_cats, rot=90, color=plot_color, ylim=(0, 2500))

In [145]:

f, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.tech_right, [""]*len(tech_cats), rot=90, 
               ax=axes[0], title='Right ear', color=plot_color, ylim=(0, 2500))
plot_demo_data(unique_students.tech_left, tech_cats, rot=90, 
               ax=axes[1], title='Left ear', color=plot_color)

In [146]:

unique_students.tech_right.count()

Out[146]:

In [147]:

unique_students.tech_left.count()

Out[147]:

In [148]:

degree_hl_cats = 'Normal (0-14)', 'Slight (15-25)', 'Mild (26-40)', \
'Moderate (41-55)', 'Moderately Severe (56-70)', 'Severe (71-90)', 'Profound (90+)'
plot_demo_data(unique_students.degree_hl_ad, degree_hl_cats, rot=90, color=plot_color)

In [149]:

_, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.degree_hl_ad, [""]*7, rot=90, 
               color=plot_color, ax=axes[0], title='Right ear')
plot_demo_data(unique_students.degree_hl_as, degree_hl_cats, rot=90, 
               color=plot_color, ylim=(0,2000), ax=axes[1], title='Left ear');

In [150]:

unique_students.degree_hl_as.count()

Out[150]:

In [151]:

type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
plot_demo_data(unique_students.type_hl_ad, type_hl_cats, rot=90, color=plot_color)

In [152]:

plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, color=plot_color)

In [153]:

unique_students.type_hl_ad.count()

Out[153]:

In [154]:

unique_students.type_hl_as.count()

Out[154]:

In [155]:

type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
f, axes = plt.subplots(2,1)
plot_demo_data(unique_students.type_hl_ad, [""]*len(type_hl_cats), rot=90, 
               title='Right ear', ax=axes[0], color=plot_color)
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, 
               title='Left ear', ax=axes[1], color=plot_color)

In [156]:

demographic[demographic.study_id=='1147-2010-0064']

Out[156]:

	redcap_event_name	academic_year	sib	_mother_ed	father_ed	premature_age	...	bilateral_ci	bilateral_ha	bimodal	implant_category	age_diag	sex
13328	initial_assessment_arm_1	2010-2011	1	3	3	8	...	False	True	False	6	51	Female
13329	year_1_complete_71_arm_1	2011-2012	1	3	3	8	...	False	True	False	6	51	Female
13330	year_2_complete_71_arm_1	2012-2013	1	3	3	8	...	False	True	False	6	51	Female
13331	year_3_complete_71_arm_1	2013-2014	1	3	3	8	...	False	True	False	6	51	Female

4 rows × 67 columns

In [157]:

receptive[receptive.study_id=='1147-2010-0064']

Out[157]:

	study_id	redcap_event_name	score	test_type	school	age_test	domain
13328	1147-2010-0064	initial_assessment_arm_1	96	PPVT	1147	63	Receptive Vocabulary
13329	1147-2010-0064	year_1_complete_71_arm_1	91	PPVT	1147	73	Receptive Vocabulary
13330	1147-2010-0064	year_2_complete_71_arm_1	93	PPVT	1147	85	Receptive Vocabulary

In [158]:

lsl_dr[lsl_dr.study_id=='1147-2010-0064']

Out[158]:

	redcap_event_name	academic_year	sib	_mother_ed	father_ed	premature_age	...	age_test	domain	school	score	test_name	test_type	academic_year_start	tech_class	age_year	non_profound
8588	initial_assessment_arm_1	2010-2011	1	3	3	8	...	63	Expressive Vocabulary	1147	91	NaN	EVT	2010	Bilateral HA	4	True
8589	initial_assessment_arm_1	2010-2011	1	3	3	8	...	63	Receptive Vocabulary	1147	96	NaN	PPVT	2010	Bilateral HA	4	True
8590	initial_assessment_arm_1	2010-2011	1	3	3	8	...	59	Language	1147	101	PLS	receptive	2010	Bilateral HA	4	True
8591	initial_assessment_arm_1	2010-2011	1	3	3	8	...	59	Language	1147	87	PLS	expressive	2010	Bilateral HA	4	True
19250	year_1_complete_71_arm_1	2011-2012	1	3	3	8	...	72	Expressive Vocabulary	1147	86	NaN	EVT	2011	Bilateral HA	4	True
19251	year_1_complete_71_arm_1	2011-2012	1	3	3	8	...	73	Receptive Vocabulary	1147	91	NaN	PPVT	2011	Bilateral HA	4	True
20546	year_2_complete_71_arm_1	2012-2013	1	3	3	8	...	88	Expressive Vocabulary	1147	95	NaN	EVT	2012	Bilateral HA	4	True
20547	year_2_complete_71_arm_1	2012-2013	1	3	3	8	...	85	Receptive Vocabulary	1147	93	NaN	PPVT	2012	Bilateral HA	4	True
29121	year_3_complete_71_arm_1	2013-2014	1	3	3	8	...	NaN	NaN	NaN	NaN	NaN	NaN	2013	Bilateral HA	4	True

9 rows × 77 columns

In [159]:

unique_students.type_hl_ad.count()

Out[159]:

In [160]:

receptive[receptive.domain=="Receptive Vocabulary"].study_id.unique().shape

Out[160]:

(2959,)

In [161]:

demographic.study_id.unique().shape

Out[161]:

(5440,)

In [162]:

receptive.study_id.unique().shape

Out[162]:

(2959,)

In [163]:

lsl_dr[lsl_dr.domain=="Receptive Vocabulary"].study_id.unique().shape

Out[163]:

(2959,)

In [164]:

receptive_ids = receptive.study_id.unique()

In [165]:

demographic_ids = demographic.study_id.unique()

In [166]:

[s for s in receptive_ids if s not in demographic_ids]

Out[166]:

[]

In [167]:

def score_summary(domain, test_type=None):
    subset = lsl_dr[lsl_dr.domain==domain].copy()
    if test_type is not None:
        subset = subset[subset.test_type==test_type]
    subset['age_test'] = (subset.age_test/12).dropna().astype(int)
    subset.loc[subset.age_test > 11, 'age_test'] = 11
    subset = subset[subset.age_test>1]
    byage = subset.groupby('age_test')
    n = byage.study_id.count()
    mean = byage.score.mean()
    sd = byage.score.std()
    min = byage.score.min()
    max = byage.score.max()
    summary = pd.DataFrame({'Sample Size':n, 'Mean':mean, 
    'SD':sd, 'Min':min, 'Max':max})
    summary.index = summary.index.values.astype(int)
    return summary[['Sample Size','Mean','SD','Min','Max']]

In [168]:

receptive_summary = score_summary("Receptive Vocabulary")
receptive_summary

Out[168]:

	Sample Size	Mean	SD	Min	Max
2	395	93.101266	18.115391	40	144
3	1363	91.903155	19.384986	0	150
4	1497	90.636607	20.373490	0	149
5	1122	89.802139	18.191470	0	142
6	620	85.562903	16.478926	40	154
7	411	82.951338	15.993827	40	130
8	292	80.458904	17.713500	20	132
9	216	77.805556	17.770086	25	160
10	185	75.875676	17.296981	20	123
11	449	78.632517	19.118076	20	134

In [169]:

receptive_summary.describe()

Out[169]:

	Sample Size	Mean	SD	Min	Max
count	10.000000	10.000000	10.000000	10.0000	10.000000
mean	655.000000	84.673006	18.043673	20.5000	141.800000
std	488.137503	6.392185	1.324983	16.4063	11.802071
min	185.000000	75.875676	15.993827	0.0000	123.000000
25%	317.750000	79.089114	17.401111	5.0000	132.500000
50%	430.000000	84.257121	17.942739	20.0000	143.000000
75%	996.500000	90.427990	18.886424	36.2500	149.750000
max	1497.000000	93.101266	20.373490	40.0000	160.000000

In [170]:

receptive_summary['Sample Size'].sum()

Out[170]:

In [171]:

receptive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')

Out[171]:

<matplotlib.text.Text at 0x10f8e2a58>

In [172]:

expressive_summary = score_summary("Expressive Vocabulary")
expressive_summary

Out[172]:

	Sample Size	Mean	SD	Min	Max
2	375	92.362667	22.025792	23	141
3	1310	93.083206	21.785851	0	145
4	1473	92.156144	21.912841	0	146
5	1097	91.375570	20.134114	0	145
6	618	86.344660	18.457169	20	146
7	414	83.814010	15.712203	38	131
8	283	83.893993	16.552176	34	122
9	202	81.321782	16.143647	36	145
10	179	81.564246	15.327513	40	122
11	450	84.935556	17.521297	18	146

In [173]:

expressive_summary['Sample Size'].sum()

Out[173]:

In [174]:

expressive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.xlim(-0.5, 9.5)
if current_year_only:
    plt.ylim(0, 400)
else:
    plt.ylim(0, 1400)

In [175]:

articulation_summary = score_summary("Articulation")
articulation_summary

Out[175]:

	Sample Size	Mean	SD	Min	Max
2	282	85.124113	15.168643	50	122
3	1131	83.458002	18.272973	40	126
4	1309	83.409473	20.701802	0	121
5	1026	83.880117	35.350375	39	999
6	583	78.895369	21.792075	39	112
7	389	80.095116	51.718446	3	999
8	244	78.959016	21.168498	40	107
9	169	81.118343	20.524903	40	108
10	131	80.954198	20.111177	40	105
11	305	81.947541	19.283989	39	105

In [176]:

articulation_summary['Sample Size'].sum()

Out[176]:

In [178]:

sample_size = articulation_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(articulation_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);

Language scores

In [179]:

lsl_dr.domain.unique()

Out[179]:

array([nan, 'Articulation', 'Expressive Vocabulary', 'Language',
       'Receptive Vocabulary'], dtype=object)

In [180]:

lsl_dr.test_type.unique()

Out[180]:

array([nan, 'Goldman', 'EVT', 'receptive', 'expressive', 'EOWPVT',
       'ROWPVT', 'PPVT', 'EOWPVT and EVT', 'Arizonia',
       'Arizonia and Goldman', 'PPVT and ROWPVT'], dtype=object)

In [181]:

receptive_language_summary = score_summary("Language", "receptive")
receptive_language_summary

Out[181]:

	Sample Size	Mean	SD	Min	Max
2	932	85.894850	21.996331	50	150
3	1315	84.855513	19.730693	50	144
4	1301	85.104535	19.598740	43	145
5	930	83.733333	18.762635	47	140
6	479	77.805846	17.642483	11	127
7	318	75.877358	18.713363	40	123
8	195	74.641026	19.685003	40	123
9	52	69.846154	20.649631	40	109
10	42	77.000000	20.167591	40	119
11	68	75.602941	21.490810	40	139

In [182]:

receptive_language_summary['Sample Size'].sum()

Out[182]:

In [183]:

sample_size = receptive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_language_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);

In [184]:

expressive_language_summary = score_summary("Language", "expressive")
expressive_language_summary

Out[184]:

	Sample Size	Mean	SD	Min	Max
2	926	88.035637	18.268384	50	150
3	1316	82.285714	17.541316	20	147
4	1293	80.304718	19.572599	45	141
5	948	78.530591	20.053469	45	144
6	496	71.512097	19.214889	6	140
7	338	66.789941	20.660322	40	124
8	200	67.520000	21.275168	40	118
9	51	64.725490	20.570929	40	106
10	42	74.595238	23.463578	40	119
11	67	73.417910	22.766369	40	132

In [185]:

expressive_language_summary['Sample Size'].sum()

Out[185]:

In [186]:

sample_size = expressive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_language_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);

In [187]:

(unique_students.age/12.).hist(grid=False, bins=np.sqrt(unique_students.shape[0]))
plt.ylabel('Count')
plt.xlabel('Age at enrollment')

Out[187]:

<matplotlib.text.Text at 0x10f933d30>

In [188]:

(unique_students.age/12.).describe()

Out[188]:

count    4949.000000
mean        2.529476
std         2.339357
min         0.000000
25%         0.750000
50%         2.083333
75%         3.416667
max        24.833333
Name: age, dtype: float64

In [189]:

def calc_difference(x, col='a_fo', jitter=True):
    if (len(x)<2 or x[col].isnull().sum() or x['funct_out_age'].isnull().sum()):
        return None
    diff = x[col][x.funct_out_age.argmax()] - x[col][x.funct_out_age.argmin()]
    if jitter:
        diff += np.random.normal(scale=0.05)
    if (x.funct_out_age.max() - x.funct_out_age.min()) > 1000:
        print(x['funct_out_age'])
    return({'difference':diff, 'months': x.funct_out_age.max() - x.funct_out_age.min()})

In [190]:

audition = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference).dropna().values.tolist())

In [191]:

plt.figure(figsize=(10,6))
plt.scatter(audition.months, audition.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Audition')

Out[191]:

<matplotlib.text.Text at 0x10f8dd5c0>

In [192]:

slc = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='slc_fo').dropna().values.tolist())

In [193]:

plt.figure(figsize=(10,6))
plt.scatter(slc.months, slc.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (comprehension)')

Out[193]:

<matplotlib.text.Text at 0x110ea1da0>

In [194]:

sle = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='sle_fo').dropna().values.tolist())

In [195]:

plt.figure(figsize=(10,6))
plt.scatter(sle.months, sle.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (expression)')

Out[195]:

<matplotlib.text.Text at 0x10f94ab00>

In [196]:

lsl_dr.degree_hl.dropna().value_counts()

Out[196]:

6    16633
4     4334
3     4159
5     4058
2     1653
0     1270
1      281
dtype: int64

In [197]:

ax = lsl_dr.degree_hl.hist(bins=7)

In [198]:

diff = (lsl_dr['age'] - lsl_dr['age_int'])
diff[diff>0].hist(bins=50)

Out[198]:

<matplotlib.axes._subplots.AxesSubplot at 0x10e5722b0>

In [199]:

(lsl_dr.age_int<6).mean()

Out[199]:

0.19320977529932754

In [200]:

(lsl_dr.age<6).mean()

Out[200]:

0.12798644141927723

Counts by year

In [201]:

lsl_dr.groupby('study_id').first()

Out[201]:

	redcap_event_name	academic_year	hl	male	_race	prim_lang	sib	_mother_ed	father_ed	premature_age	...	age_test	domain	school	score	test_name	test_type	academic_year_start	tech_class	age_year	non_profound
study_id
00624-2010-0049	initial_assessment_arm_1	2013-2014	0	1	2	0	NaN	6	6	9	...	24	Language	0062	50	PLS	receptive	2013	Bilateral CI	2	False
0101-2003-0101	initial_assessment_arm_1	2002-2003	0	0	0	0	1	6	6	9	...	54	Expressive Vocabulary	0101	58	PLS	EOWPVT	2002	Bimodal	4	False
0101-2003-0102	initial_assessment_arm_1	2003-2004	0	0	0	0	1	2	2	8	...	44	Articulation	0101	72	PLS	Goldman	2003	Bilateral HA	3	True
0101-2004-0101	initial_assessment_arm_1	2006-2007	0	1	0	0	0	6	6	8	...	37	Receptive Vocabulary	0101	62	PLS	PPVT	2006	Bimodal	2	True
0101-2004-0102	initial_assessment_arm_1	2004-2005	0	0	0	0	1	5	6	9	...	NaN	NaN	NaN	NaN	NaN	NaN	2004	Bimodal	0	True
0101-2004-0103	initial_assessment_arm_1	2012-2013	0	1	0	0	1	4	4	8	...	96	Expressive Vocabulary	0101	104	CELF-4	EVT	2012	Bilateral CI	0	False
0101-2004-0104	initial_assessment_arm_1	2004-2005	0	1	0	0	1	6	6	8	...	32	Articulation	0101	84	PLS	Goldman	2004	Bilateral HA	0	True
0101-2004-0105	initial_assessment_arm_1	2004-2005	0	0	0	0	2	6	6	9	...	47	Articulation	0101	78	CELF-P2	Goldman	2004	Bimodal	2	False
0101-2005-0101	initial_assessment_arm_1	2006-2007	0	1	0	0	2	5	4	8	...	28	Articulation	0101	61	PLS	Goldman	2006	Bilateral HA	2	True
0101-2005-0102	initial_assessment_arm_1	2004-2005	0	1	0	0	2	3	2	9	...	63	Articulation	0101	87	CELF-P2	Goldman	2004	Bilateral HA	4	True
0101-2006-0101	initial_assessment_arm_1	2005-2006	0	0	0	0	1	6	6	8	...	NaN	NaN	NaN	NaN	NaN	NaN	2005	Bimodal	0	False
0101-2006-0104	initial_assessment_arm_1	2006-2007	0	0	0	0	0	5	5	9	...	NaN	NaN	NaN	NaN	NaN	NaN	2006	Bilateral CI	2	False
0101-2007-0104	initial_assessment_arm_1	2007-2008	0	0	0	0	1	4	6	9	...	41	Articulation	0101	122	NaN	Goldman	2007	Bimodal	4	True
0101-2007-0105	initial_assessment_arm_1	2007-2008	0	1	0	0	0	6	6	9	...	NaN	NaN	NaN	NaN	NaN	NaN	2007	Bimodal	11	False
0101-2007-0107	initial_assessment_arm_1	2005-2006	0	0	0	0	1	4	4	8	...	NaN	NaN	NaN	NaN	NaN	NaN	2005	Bilateral HA	0	True
0101-2008-0102	initial_assessment_arm_1	2008-2009	0	0	0	0	1	6	6	9	...	NaN	NaN	NaN	NaN	NaN	NaN	2008	Bimodal	14	False
0101-2008-0106	initial_assessment_arm_1	2007-2008	0	0	0	0	1	4	4	8	...	NaN	NaN	NaN	NaN	NaN	NaN	2007	Bilateral HA	0	True
0101-2009-0101	initial_assessment_arm_1	2008-2009	0	0	0	0	1	6	6	9	...	NaN	NaN	NaN	NaN	NaN	NaN	2008	Bimodal	6	False
0101-2010-0101	initial_assessment_arm_1	2008-2009	0	1	0	0	1	6	6	9	...	104	Articulation	0101	90	CELF-4	Arizonia	2008	Bilateral HA	8	True
0101-2010-0103	initial_assessment_arm_1	2010-2011	0	0	0	0	2	4	3	8	...	25	Language	0101	63	PLS	receptive	2010	Bilateral HA	0	False
0101-2010-0104	initial_assessment_arm_1	2010-2011	0	1	3	0	1	2	2	8	...	30	Expressive Vocabulary	0101	90	PLS	EOWPVT	2010	Bilateral HA	0	False
0101-2010-0105	initial_assessment_arm_1	2011-2012	0	1	0	0	0	5	6	6	...	30	Language	0101	66	PLS	receptive	2011	Bilateral CI	2	False
0101-2012-0101	initial_assessment_arm_1	2013-2014	0	1	0	0	0	6	6	9	...	NaN	NaN	NaN	NaN	NaN	NaN	2013	Bimodal	2	False
0101-2013-0101	initial_assessment_arm_1	2012-2013	0	1	0	0	0	3	2	8	...	12	Language	0101	58	PLS	receptive	2012	Bilateral HA	0	True
0101-2013-0103	initial_assessment_arm_1	2012-2013	0	1	0	0	2	6	6	9	...	NaN	NaN	NaN	NaN	NaN	NaN	2012	Bilateral CI	4	False
0101-2013-0104	initial_assessment_arm_1	2012-2013	0	1	0	0	1	4	4	8	...	12	Language	0101	83	PLS	receptive	2012	Bilateral HA	0	True
0101-2013-0112	initial_assessment_arm_1	2012-2013	0	1	0	0	1	3	6	9	...	11	Language	0101	90	PLS	receptive	2012	Bilateral HA	0	True
0101-2013-0113	initial_assessment_arm_1	2013-2014	0	0	0	0	1	2	2	8	...	4	Language	0101	96	PLS	receptive	2013	Bimodal	0	True
0101-2013-0114	initial_assessment_arm_1	2013-2014	0	1	0	0	0	3	3	8	...	6	Language	0101	50	PLS	receptive	2013	Bimodal	0	True
0101-2013-0115	initial_assessment_arm_1	2013-2014	0	1	0	0	2	2	2	8	...	11	Language	0101	79	PLS	receptive	2013	Bilateral HA	0	True
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1151-2012-0008	initial_assessment_arm_1	NaN	0	0	2	1	2	4	2	8	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bimodal	3	False
1151-2012-0009	initial_assessment_arm_1	NaN	0	1	2	1	1	4	6	8	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bilateral HA	1	True
1151-2012-0010	initial_assessment_arm_1	NaN	0	1	2	1	1	6	6	8	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bimodal	2	True
1151-2012-0011	initial_assessment_arm_1	NaN	0	1	2	1	0	6	6	8	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bimodal	4	False
1151-2012-0012	initial_assessment_arm_1	NaN	0	1	2	1	0	6	6	8	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bilateral HA	5	False
1151-2012-0013	initial_assessment_arm_1	NaN	0	0	2	1	0	2	2	8	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bimodal	5	False
1151-2012-0014	initial_assessment_arm_1	NaN	0	0	2	1	2	1	1	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bimodal	NaN	False
1151-2013-0001	initial_assessment_arm_1	NaN	0	0	2	1	0	2	2	8	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bilateral HA	4	False
1151-2013-0002	initial_assessment_arm_1	NaN	0	0	2	1	2	2	3	8	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bilateral CI	2	False
1151-2013-0003	initial_assessment_arm_1	NaN	0	0	2	1	0	1	1	8	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bilateral CI	2	False
1151-2013-0004	initial_assessment_arm_1	NaN	0	0	2	1	2	0	1	8	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bimodal	3	False
1151-2013-0005	initial_assessment_arm_1	NaN	0	1	2	1	0	1	6	8	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bimodal	3	False
1151-2013-0006	initial_assessment_arm_1	NaN	0	0	2	1	0	1	1	4	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bilateral HA	3	False
1151-2013-0007	initial_assessment_arm_1	NaN	0	1	2	1	0	4	2	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bimodal	NaN	False
1151-2013-0008	initial_assessment_arm_1	NaN	0	1	2	1	0	3	2	8	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bilateral CI	2	False
1151-2013-0009	initial_assessment_arm_1	NaN	0	1	2	1	0	2	6	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bimodal	NaN	False
1151-2013-0010	initial_assessment_arm_1	NaN	0	1	2	1	0	4	4	8	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bilateral CI	6	False
1151-2013-0011	initial_assessment_arm_1	NaN	0	1	2	1	0	6	6	8	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bimodal	1	False
1151-2013-0012	initial_assessment_arm_1	NaN	0	1	2	1	2	3	3	5	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bimodal	3	False
1151-2014-0001	initial_assessment_arm_1	NaN	0	0	2	1	3	6	4	8	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bilateral CI	2	False
1151-2014-0002	initial_assessment_arm_1	NaN	0	1	2	1	1	2	4	7	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bilateral HA	3	False
1151-2014-0003	initial_assessment_arm_1	NaN	0	0	2	1	0	6	6	8	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bilateral HA	3	False
1151-2014-0004	initial_assessment_arm_1	NaN	0	1	2	1	1	2	1	8	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bilateral HA	3	False
1151-2014-0005	initial_assessment_arm_1	NaN	0	0	2	1	3	6	6	8	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bimodal	4	False
1151-2014-0006	initial_assessment_arm_1	NaN	0	1	2	1	0	4	0	8	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bilateral CI	3	False
1151-2014-0007	initial_assessment_arm_1	NaN	0	0	2	1	2	6	6	8	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bilateral HA	4	False
1151-2014-0008	initial_assessment_arm_1	NaN	0	0	2	1	1	1	1	8	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bimodal	5	False
1151-2014-0009	initial_assessment_arm_1	NaN	0	1	2	1	2	4	2	8	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bilateral HA	6	True
1151-2014-0010	initial_assessment_arm_1	NaN	0	1	2	1	1	2	2	8	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bilateral CI	6	False
9308-2015-0002	initial_assessment_arm_1	NaN	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	nan	Bimodal	NaN	False

5440 rows × 76 columns

In [202]:

unique_students = lsl_dr.groupby('study_id').first()
unique_students.academic_year_start.value_counts().sort_index()[:-1].plot(kind='bar')
plt.ylabel('Frequency'); plt.xlabel('Academic year');

In [203]:

disab_by_year = unique_students.groupby('academic_year_start')['synd_or_disab'].value_counts().unstack().fillna(0)
disab_by_year.columns = ['No', 'Yes']
disab_by_year[disab_by_year.index!='nan'].plot(kind='bar', stacked=True)

Out[203]:

<matplotlib.axes._subplots.AxesSubplot at 0x10f9f9198>

In [ ]: