In [1]:
# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

Connect to database to import data for the three test domains and demographic information:

In [2]:
from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
api_key = open("/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt").read()

lsl_dr_project = Project(api_url, api_key)
In [3]:
metadata = lsl_dr_project.export_metadata()
In [4]:
# for i,j in zip(lsl_dr_project.field_names, 
#                lsl_dr_project.field_labels):
#     print('{0}: \t{1}'.format(i,j))

Import each database from REDCap:

In [5]:
articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss']
articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', df_kwargs={'index_col':None})
In [6]:
records = lsl_dr_project.export_records(fields=articulation_fields)
In [7]:
print(records[0]['study_id'])
0101-2003-0101
In [8]:
expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss']
expressive = lsl_dr_project.export_records(fields=expressive_fields, format='df', 
                                           df_kwargs={'index_col':None,
                                                      'na_values':[999, 9999]})
In [9]:
receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss']
receptive = lsl_dr_project.export_records(fields=receptive_fields, format='df', 
                                          df_kwargs={'index_col':None,
                                                     'na_values':[999, 9999]})
In [10]:
language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls',
                   'owls_lc_ss','owls_oe_ss','age_test_owls',
                   'celfp_rl_ss','celfp_el_ss','age_test_celp',
                   'celf_elss','celf_rlss','age_test_celf']
language_raw = lsl_dr_project.export_records(fields=language_fields, format='df', 
                                             df_kwargs={'index_col':None, 
                                                        'na_values':[999, 9999]})
In [11]:
demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year',
'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race',
'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as',
'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae',
'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes',
'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age',
'att_days_hr', 'att_days_sch', 'att_days_st2_417']
demographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df', 
                                            df_kwargs={'index_col':None, 
                                                       'na_values':[888, 999, 9999]})
In [12]:
demographic_raw[demographic_raw.study_id=='1147-2010-0064']
Out[12]:
study_id redcap_event_name academic_year hl gender race prim_lang sib mother_ed father_ed ... sle_fo a_fo fam_age family_inv att_days_sch att_days_st2_417 att_days_hr demo_ses school_lunch medicaid
13328 1147-2010-0064 initial_assessment_arm_1 2010-2011 0 0 0 0 1 3 3 ... 3 6 65 0 NaN NaN NaN NaN NaN NaN
13329 1147-2010-0064 year_1_complete_71_arm_1 2011-2012 0 NaN NaN NaN NaN NaN NaN ... 3 5 77 2 NaN NaN NaN NaN NaN NaN
13330 1147-2010-0064 year_2_complete_71_arm_1 2012-2013 0 NaN NaN NaN NaN NaN NaN ... 3 5 89 2 NaN NaN NaN NaN NaN NaN
13331 1147-2010-0064 year_3_complete_71_arm_1 2013-2014 0 NaN NaN NaN NaN NaN NaN ... 4 5 101 2 NaN NaN NaN NaN NaN NaN

4 rows × 46 columns

Attendance information

Several fields in the demographic data have missing values.

In [13]:
demographic_raw.head()
Out[13]:
study_id redcap_event_name academic_year hl gender race prim_lang sib mother_ed father_ed ... sle_fo a_fo fam_age family_inv att_days_sch att_days_st2_417 att_days_hr demo_ses school_lunch medicaid
0 0101-2003-0101 initial_assessment_arm_1 2002-2003 0 0 0 0 1 6 6 ... 2 2 54 2 NaN NaN NaN NaN NaN NaN
1 0101-2003-0101 year_1_complete_71_arm_1 2003-2004 0 NaN NaN NaN NaN NaN NaN ... 4 4 80 1 NaN NaN NaN NaN NaN NaN
2 0101-2003-0101 year_2_complete_71_arm_1 2004-2005 0 NaN NaN NaN NaN NaN NaN ... 4 4 80 2 NaN NaN NaN NaN NaN NaN
3 0101-2003-0101 year_3_complete_71_arm_1 2005-2006 0 NaN NaN NaN NaN NaN NaN ... 5 5 96 3 NaN NaN NaN NaN NaN NaN
4 0101-2003-0101 year_4_complete_71_arm_1 2006-2007 0 NaN NaN NaN NaN NaN NaN ... 5 5 109 2 NaN NaN NaN NaN NaN NaN

5 rows × 46 columns

We can fill missing values forward from previous observation (by study_id)

In [14]:
demographic = demographic_raw.sort(columns='redcap_event_name').groupby('study_id').transform(
                                    lambda recs: recs.fillna(method='ffill'))#.reset_index()
demographic["study_id"] = demographic_raw.sort(columns='redcap_event_name').study_id

Random check to make sure this worked

In [15]:
demographic[demographic.study_id=='1147-2010-0064']
Out[15]:
redcap_event_name academic_year hl gender race prim_lang sib mother_ed father_ed premature_age ... a_fo fam_age family_inv att_days_sch att_days_st2_417 att_days_hr demo_ses school_lunch medicaid study_id
13328 initial_assessment_arm_1 2010-2011 0 0 0 0 1 3 3 8 ... 6 65 0 NaN NaN NaN NaN NaN NaN 1147-2010-0064
13329 year_1_complete_71_arm_1 2011-2012 0 0 0 0 1 3 3 8 ... 5 77 2 NaN NaN NaN NaN NaN NaN 1147-2010-0064
13330 year_2_complete_71_arm_1 2012-2013 0 0 0 0 1 3 3 8 ... 5 89 2 NaN NaN NaN NaN NaN NaN 1147-2010-0064
13331 year_3_complete_71_arm_1 2013-2014 0 0 0 0 1 3 3 8 ... 5 101 2 NaN NaN NaN NaN NaN NaN 1147-2010-0064

4 rows × 46 columns

Demographic data without missing values:

In [16]:
demographic.head()
Out[16]:
redcap_event_name academic_year hl gender race prim_lang sib mother_ed father_ed premature_age ... a_fo fam_age family_inv att_days_sch att_days_st2_417 att_days_hr demo_ses school_lunch medicaid study_id
9459 initial_assessment_arm_1 2009-2010 0 0 0 0 0 5 4 9 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0735-2010-0017
9451 initial_assessment_arm_1 2010-2011 0 0 7 0 2 6 6 7 ... 2 17 1 NaN NaN NaN NaN NaN NaN 0735-2010-0015
9447 initial_assessment_arm_1 2010-2011 0 0 7 0 2 6 6 7 ... 2 17 1 NaN NaN NaN NaN NaN NaN 0735-2010-0014
9443 initial_assessment_arm_1 2009-2010 0 1 7 0 1 6 6 8 ... 2 14 0 NaN NaN NaN NaN NaN NaN 0735-2010-0013
5594 initial_assessment_arm_1 2005-2006 0 1 0 0 3 2 2 8 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0521-2006-0025

5 rows × 46 columns

Cleaning languge dataset

5 language measures:

  • 3 versions of CELF
  • PLS
    • pls_ac_rs: PLS: Auditory Comprehension Raw Score
    • pls_ac_ss: PLS: Auditory Comprehension Standard Score
    • pls_ec_rs: PLS: Expressive Communication Raw Score
    • pls_ec_ss: PLS: Expressive Communication Standard Score
    • pls_tl_rs: PLS: Total Language Score Standard Score Total
    • pls_tl_ss: PLS: Total Language Score Standard Score
  • OWLS
    • age_test_owls: Age at time of testing (OWLS)
    • owls_lc_rs: OWLS: Listening Comprehension Raw Score
    • owls_lc_ss: OWLS: Listening Comprehension Standard Score
    • owls_oe_rs: OWLS: Oral Expression Raw Score
    • owls_oe_ss: OWLS: Oral Expression Standard Score
    • owls_oc_sss: OWLS: Oral Composite Sum of Listening Comprehension and Oral Expression Standard Scores
    • owls_oc_ss: OWLS: Oral Composite Standard Score
    • owls_wes_trs: OWLS: Written Expression Scale Total Raw Score
    • owls_wes_as: OWLS: Written Expression Scale Ability Score
    • owls_wes_ss: OWLS: Written Expression Scale Standard Score
    • owsl_lc: OWLS: Written Expression Scale Language Composite (Sum of written expression age-based standard score, listening comprehension standard score and oral expression standard score)
    • owls_lcss: OWLS: Language Composite Standard Score
In [17]:
# Test type
language_raw["test_name"] = None
language_raw["test_type"] = None
language_raw["score"] = None
CELP = language_raw.age_test_celp.notnull()
CELF = language_raw.age_test_celf.notnull()
PLS = language_raw.age_test_pls.notnull()
OWLS = language_raw.age_test_owls.notnull()

language_raw['age_test'] = None
language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp
language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf
language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls
language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls

language1 = language_raw[CELP | CELF | PLS | OWLS].copy()
language2 = language1.copy()

language1["test_type"] = "receptive"

language1.loc[CELP, "test_name"] = "CELF-P2"
language1.loc[CELF, "test_name"] = "CELF-4"
language1.loc[PLS, "test_name"] = "PLS"
language1.loc[OWLS, "test_name"] = "OWLS"

language1.loc[CELP, "score"] = language1.celfp_rl_ss
language1.loc[CELF, "score"] = language1.celf_rlss
language1.loc[PLS, "score"] = language1.pls_ac_ss
language1.loc[OWLS, "score"] = language1.owls_lc_ss


language2["test_type"] = "expressive"

language2.loc[CELP, "test_name"] = "CELF-P2"
language2.loc[CELF, "test_name"] = "CELF-4"
language2.loc[PLS, "test_name"] = "PLS"
language2.loc[OWLS, "test_name"] = "OWLS"

language2.loc[CELP, "score"] = language1.celfp_el_ss
language2.loc[CELF, "score"] = language1.celf_elss
language2.loc[PLS, "score"] = language1.pls_ec_ss
language2.loc[OWLS, "score"] = language1.owls_oe_ss

language = pd.concat([language1, language2])
language = language[language.score.notnull()]
print(pd.crosstab(language.test_name, language.test_type))
print("There are {0} null values for score".format(sum(language["score"].isnull())))
test_type  expressive  receptive
test_name                       
CELF-4            591        523
CELF-P2          1357       1363
OWLS             1058       1066
PLS              3349       3359
There are 0 null values for score

A school variable was added, which is the first four columns of the study_id:

In [18]:
language["school"] = language.study_id.str.slice(0,4)
In [19]:
language = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test"]]
language["domain"] = "Language"
language.head()
Out[19]:
study_id redcap_event_name score test_type test_name school age_test domain
0 0101-2003-0101 initial_assessment_arm_1 51 receptive PLS 0101 54 Language
5 0101-2003-0101 year_5_complete_71_arm_1 61 receptive OWLS 0101 113 Language
9 0101-2003-0102 initial_assessment_arm_1 55 receptive PLS 0101 44 Language
10 0101-2003-0102 year_1_complete_71_arm_1 77 receptive PLS 0101 54 Language
11 0101-2003-0102 year_2_complete_71_arm_1 93 receptive CELF-P2 0101 68 Language

Cleaning articulation dataset

We converted the articulation dataset into a "long" format:

In [20]:
# Test type
articulation["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
articulation = articulation[ARIZ | GF]
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"

print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))

# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]
Goldman                 5008
Arizonia                 493
Arizonia and Goldman      73
dtype: int64
There are 0 null values for test_type

A school variable was added, which is the first four columns of the study_id:

In [21]:
articulation["school"] = articulation.study_id.str.slice(0,4)

The age was taken to be the Arizonia age if there are both test types:

In [22]:
articulation["age_test"] = articulation.age_test_aaps
articulation.loc[articulation.age_test.isnull(), 'age_test'] = articulation.age_test_gf2[articulation.age_test.isnull()]
print(articulation.age_test.describe())
count    5571.000000
mean       68.695028
std        30.661547
min        23.000000
25%        47.000000
50%        60.000000
75%        80.000000
max       243.000000
Name: age_test, dtype: float64

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [23]:
articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1)
articulation["domain"] = "Articulation"
articulation.head()
Out[23]:
study_id redcap_event_name test_type score school age_test domain
1 0101-2003-0101 year_1_complete_71_arm_1 Goldman 78 0101 80 Articulation
9 0101-2003-0102 initial_assessment_arm_1 Goldman 72 0101 44 Articulation
10 0101-2003-0102 year_1_complete_71_arm_1 Goldman 97 0101 54 Articulation
14 0101-2004-0101 year_2_complete_71_arm_1 Goldman 75 0101 53 Articulation
15 0101-2004-0101 year_3_complete_71_arm_1 Goldman 80 0101 66 Articulation

Cleaning demographic dataset

We excluded unwanted columns and rows for which age, gender or race were missing:

In [24]:
# Retain only subset of columns
#demographic = demographic[demographic.gender.notnull()]
demographic = demographic.rename(columns={'gender':'male'})

Due to sample size considerations, we reduced the non-English primary language variable to English (0) and non-English (1):

In [25]:
demographic["non_english"] = None
demographic.loc[demographic.prim_lang.notnull(), 'non_english'] = demographic.prim_lang[demographic.prim_lang.notnull()]>0
print(demographic.non_english.value_counts())
print("There are {0} null values for non_english".format(sum(demographic.non_english.isnull())))
False    11068
True      2450
dtype: int64
There are 694 null values for non_english

Mother's education (mother_ed) and father's education (father_ed) were both recoded to:

  • 0=no high school diploma
  • 1=high school
  • 2=undergraduate
  • 3=graduate

Category 6 (unknown) was recoded as missing.

In [26]:
demographic = demographic.rename(columns={"mother_ed":"_mother_ed"})
demographic["mother_ed"] = demographic._mother_ed.copy()
demographic.loc[demographic._mother_ed==1, 'mother_ed'] = 0
demographic.loc[(demographic._mother_ed==2) | (demographic.mother_ed==3), 'mother_ed'] = 1
demographic.loc[demographic._mother_ed==4, 'mother_ed'] = 2
demographic.loc[demographic._mother_ed==5, 'mother_ed'] = 3
demographic.loc[demographic._mother_ed==6, 'mother_ed'] = None
print("_mother_ed:")
print(demographic._mother_ed.value_counts())
print("mother_ed:")
print(demographic.mother_ed.value_counts())
print("\nThere are {0} null values for mother_ed".format(sum(demographic.mother_ed.isnull())))
_mother_ed:
6    4960
4    2876
3    1922
5    1527
2    1320
1     470
0     192
dtype: int64
mother_ed:
1    3242
2    2876
3    1527
0     662
dtype: int64

There are 5905 null values for mother_ed

Secondary diagnosis

In [27]:
demographic.shape
Out[27]:
(14212, 48)
In [28]:
demographic['secondary_diagnosis'] = demographic.etiology==0
# Suspected or unknown treated as missing
demographic.loc[demographic.etiology > 1, 'secondary_diagnosis'] = None
In [29]:
demographic.secondary_diagnosis.value_counts()
Out[29]:
0    10371
1     2394
dtype: int64
In [30]:
demographic.secondary_diagnosis.mean()
Out[30]:
0.18754406580493538

Premature status was recoded to True (premature) and False (full-term). Here, premature indicates <36 weeks.

In [31]:
demographic['premature_weeks'] = demographic.premature_age.copy()
demographic.loc[demographic.premature_age==9, 'premature_weeks'] = None
demographic.premature_weeks = abs(demographic.premature_weeks-8)*2
print("There are {0} null values for premature_weeks".format(sum(demographic.premature_weeks.isnull())))
There are 3410 null values for premature_weeks
In [32]:
demographic.premature_weeks.value_counts()
Out[32]:
0     9214
2      554
4      358
12     193
6      178
10     148
8      112
14      42
16       3
dtype: int64

Recode impant technology variables for each ear to one of four categories (None, Baha, Hearing aid, Cochlear implant):

In [33]:
demographic.tech_ad.value_counts()
Out[33]:
1    4813
0    4212
7    1462
5     970
2     474
6     413
8      69
9      57
3      27
4      25
dtype: int64
In [34]:
tech_cats = ["None", "OAD", "Hearing aid", "Cochlear", "Other"]

demographic["tech_right"] = 4
demographic.loc[demographic.tech_ad==7, 'tech_right'] = 0
demographic.loc[demographic.tech_ad==3, 'tech_right'] = 1
demographic.loc[demographic.tech_ad.isin([1,2,4,5,10]), 'tech_right'] = 2
demographic.loc[demographic.tech_ad.isin([0,8,6]), 'tech_right'] = 3
demographic.loc[demographic.tech_ad.isnull(), 'tech_right'] = None

demographic["tech_left"] = 4
demographic.loc[demographic.tech_as==7, 'tech_left'] = 0
demographic.loc[demographic.tech_as==3, 'tech_left'] = 1
demographic.loc[demographic.tech_as.isin([1,2,4,5,10]), 'tech_left'] = 2
demographic.loc[demographic.tech_as.isin([0,8,6]), 'tech_left'] = 3
demographic.loc[demographic.tech_as.isnull(), 'tech_left'] = None
In [35]:
demographic.tech_left.value_counts()
Out[35]:
2    6360
3    4272
0    1789
4      56
1      19
dtype: int64
In [36]:
demographic.tech_right.value_counts()
Out[36]:
2    6282
3    4694
0    1462
4      57
1      27
dtype: int64

Substitute valid missing values for hearing loss:

In [37]:
demographic.loc[demographic.type_hl_ad==5, 'type_hl_ad'] = None
demographic.loc[demographic.type_hl_as==5, 'type_hl_ad'] = None

Create degree_hl, which is the maximum level of hearing loss in either ear:

In [38]:
demographic["degree_hl"] = np.maximum(demographic.degree_hl_ad, demographic.degree_hl_as)

Create compound indicator variable for each technology (Baha, Hearing aid, Chochlear implant):

  • 0=none
  • 1=one ear
  • 2=both ears.
In [39]:
demographic.columns
Out[39]:
Index(['redcap_event_name', 'academic_year', 'hl', 'male', 'race', 'prim_lang',
       'sib', '_mother_ed', 'father_ed', 'premature_age', 'onset_1', 'age_amp',
       'age_int', 'age', 'synd_cause', 'etiology', 'etiology_2',
       'hearing_changes', 'ae', 'ad_250', 'ad_500', 'degree_hl_ad',
       'type_hl_ad', 'tech_ad', 'age_ci', 'as_250', 'as_500', 'degree_hl_as',
       'type_hl_as', 'tech_as', 'age_ci_2', 'time', 'age_disenrolled',
       'funct_out_age', 'slc_fo', 'sle_fo', 'a_fo', 'fam_age', 'family_inv',
       'att_days_sch', 'att_days_st2_417', 'att_days_hr', 'demo_ses',
       'school_lunch', 'medicaid', 'study_id', 'non_english', 'mother_ed',
       'secondary_diagnosis', 'premature_weeks', 'tech_right', 'tech_left',
       'degree_hl'],
      dtype='object')
In [40]:
demographic["oad"] = 0
demographic.oad = demographic.oad.astype(object)
demographic.loc[(demographic.tech_right==1) | (demographic.tech_left==1), 'oad'] = 1
demographic.loc[(demographic.tech_right==1) & (demographic.tech_left==1), 'oad'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'oad'] = None
print("oad:")
print(demographic.drop_duplicates(subset='study_id').oad.value_counts())
print("There are {0} null values for OAD".format(sum(demographic.oad.isnull())))

demographic["hearing_aid"] = 0
demographic.hearing_aid = demographic.hearing_aid.astype(object)
demographic.loc[(demographic.tech_right==2) | (demographic.tech_left==2), 'hearing_aid'] = 1
demographic.loc[(demographic.tech_right==2) & (demographic.tech_left==2), 'hearing_aid'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_right.isnull()), 'hearing_aid'] = None
print("\nhearing_aid:")
print(demographic.drop_duplicates(subset='study_id').hearing_aid.value_counts())
print("There are {0} null values for hearing_aid".format(sum(demographic.hearing_aid.isnull())))

demographic["cochlear"] = 0
demographic.cochlear = demographic.cochlear.astype(object)
demographic.loc[(demographic.tech_right==3) | (demographic.tech_left==3), 'cochlear'] = 1
demographic.loc[(demographic.tech_right==3) & (demographic.tech_left==3), 'cochlear'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'cochlear'] = None
print("\ncochlear:")
print(demographic.drop_duplicates(subset='study_id').cochlear.value_counts())
print("There are {0} null values for cochlear".format(sum(demographic.cochlear.isnull())))
print(len(demographic))
oad:
0    4384
1       4
2       2
dtype: int64
There are 1639 null values for OAD

hearing_aid:
2    2031
0    1593
1     737
dtype: int64
There are 1690 null values for hearing_aid

cochlear:
0    2868
2     897
1     625
dtype: int64
There are 1639 null values for cochlear
14212

Identify bilateral and bimodal individuals:

In [41]:
demographic["unilateral_ci"] = demographic.cochlear==1
demographic["bilateral_ci"] = demographic.cochlear==2
demographic["bilateral_ha"] = demographic.hearing_aid==2
demographic["bimodal"] = (demographic.cochlear==1) & (demographic.hearing_aid==1)
In [42]:
demographic.bilateral_ci.sum(), demographic.bilateral_ha.sum(), demographic.bimodal.sum(), demographic.unilateral_ci.sum()
Out[42]:
(3445, 5169, 1385, 2076)
In [43]:
demographic.drop_duplicates(subset='study_id')[['unilateral_ci','bilateral_ci', 
                                               'bilateral_ha',
                                               'bimodal']].sum()
Out[43]:
unilateral_ci     625
bilateral_ci      897
bilateral_ha     2031
bimodal           375
dtype: int64

Create variable that identifies bilateral (0), bilateral HA left (1), bilateral HA right (2)

In [44]:
demographic['tech'] = 0
demographic.loc[(demographic.bimodal) & (demographic.tech_left==2), 'tech'] = 1
demographic.loc[(demographic.bimodal) & (demographic.tech_right==2), 'tech'] = 2
print("There are {0} null values for tech".format(sum(demographic.tech.isnull())))
There are 0 null values for tech
In [45]:
demographic["implant_category"] = None
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==0), 
                'implant_category'] = 0
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==0), 
                'implant_category'] = 1
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==1), 
                'implant_category'] = 2
demographic.loc[(demographic.cochlear==2) & (demographic.hearing_aid==0) & (demographic.oad==0), 
                'implant_category'] = 3
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==1) & (demographic.oad==0), 
                'implant_category'] = 4
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==1), 
                'implant_category'] = 5
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==2) & (demographic.oad==0), 
                'implant_category'] = 6
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==1), 
                'implant_category'] = 7
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==2), 
                'implant_category'] = 8
demographic.implant_category.value_counts()
Out[45]:
6    5169
3    3445
4    1385
1     895
0     672
8      14
2      12
7       5
5       1
dtype: int64

Age when hearing loss diagnosed Data are entered inconsistently here, so we have to go in and replace non-numeric values.

In [46]:
demographic.onset_1.unique()
Out[46]:
array([   1. ,   18. ,   17. ,   22. ,    nan,    3. ,    2. ,    5. ,
         13. ,   36. ,   28. ,   21. ,   41. ,   26. ,    0. ,   67. ,
         20. ,   24. ,    4. ,   40. ,   60. ,   10. ,    6. ,   25. ,
          7. ,   27. ,   15. ,   35. ,   14. ,   42. ,   34. ,   12. ,
          9. ,   32. ,   50. ,    8. ,   33. ,   23. ,   11. ,   31. ,
         30. ,   49. ,   48. ,    1.5,   19. ,    2.5,   39. ,   52. ,
         16. ,   38. ,   29. ,   51. ,   46. ,   45. ,   54. ,   88. ,
         65. ,   44. ,   81. ,  116. ,   72. ,   57. ,   62. ,   43. ,
         78. ,   83. ,   61. ,  107. ,   64. ,   74. ,   37. ,   77. ,
         96. ,   97. ,   79. ,   47. ,   53. ,   59. ,   84. ,   95. ,
         80. ,    0.5,   58. ,   56. ,   86. ,   98. ,   85. ,   75. ,
        119. ,   66. ,   70. ,   63. ,  140. ,  126. ,  133. ,  103. ,
         87. ,   76. ,   55. ,   68. ,   92. ,   71. ,  154. ,   89. ,
        152. ])
In [47]:
# Don't need this anymore
# demographic['age_diag'] = demographic.onset_1.replace({'birth': 0, 'R- Birth L-16mo': 0, 'birth - 3': 0, 'at birth': 0, 'NBHS': 0, 
#                              'at Birth': 0, '1-2': 1.5, '2-3': 2.5, '0-3': 1.5}).astype(float)
demographic['age_diag'] = demographic.onset_1

Number of null values for age_diag

In [48]:
demographic.age_diag.isnull().sum()
Out[48]:
3970
In [49]:
demographic['sex'] = demographic.male.replace({0:'Female', 1:'Male'})
In [50]:
import seaborn as sb

unique_students = demographic.dropna(subset=['sex']).groupby('study_id').first()

# ag = sb.factorplot("sex", data=unique_students, 
#               palette="PuBuGn_d", kind='count')
# ag.set_xticklabels(['Female ({})'.format((unique_students.male==0).sum()), 
#                     'Male ({})'.format((unique_students.male==1).sum())])
# ag.set_xlabels('')

Child has another diagnosed disability

In [51]:
demographic['known_synd'] = (demographic.synd_cause == 0)
# Unknown or suspected
demographic.loc[demographic.synd_cause > 1, 'known_synd'] = None
In [52]:
# If either known syndrome or secondary diagnosis
demographic['synd_or_disab'] = demographic.apply(lambda x: x['secondary_diagnosis'] or x['known_synd'], axis=1)

Missing sibling counts were properly encoded as None (missing).

In [53]:
demographic.loc[demographic.sib==4, 'sib'] = None

We reduced the number of race categories, pooling those that were neither caucasian, black, hispanic or asian to "other", due to small sample sizes for these categories. Category 7 (unknown) was recoded as missing.

In [54]:
races = ["Caucasian", "Black or African American", "Hispanic or Latino", "Asian", "Other"]
demographic = demographic.rename(columns={"race":"_race"})
demographic["race"] = demographic._race.copy()
demographic.loc[demographic.race==7, 'race'] = None
demographic.loc[demographic.race>3, 'race'] = 4
print("_race:")
print(demographic._race.value_counts())
print("race:")
print(demographic.race.value_counts())
print("There are {0} null values for race".format(sum(demographic.race.isnull())))
# Replace with recoded column
_race:
0    7442
2    2374
1    1287
3    1006
6     691
8     519
7     240
4      64
5      28
dtype: int64
race:
0    7442
2    2374
4    1302
1    1287
3    1006
dtype: int64
There are 801 null values for race

Recode implant technology variables

In [55]:
tech_cats = ["None", "Baha", "Hearing aid", "Cochlear", "Other"]

demographic["tech_right"] = demographic.tech_ad.copy()
demographic.loc[demographic.tech_right==6, 'tech_right'] = 0
demographic.loc[demographic.tech_right==4, 'tech_right'] = 1
demographic.loc[demographic.tech_right==5, 'tech_right'] = 1
demographic.loc[demographic.tech_right==3, 'tech_right'] = 2
demographic.loc[demographic.tech_right==7, 'tech_right'] = 3
demographic.loc[demographic.tech_right==8, 'tech_right'] = 3
demographic.loc[demographic.tech_right==9, 'tech_right'] = 4
demographic.tech_right = np.abs(demographic.tech_right - 3)

demographic["tech_left"] = demographic.tech_as.copy()
demographic.loc[demographic.tech_left==6, 'tech_left'] = 0
demographic.loc[demographic.tech_left==4, 'tech_left'] = 1
demographic.loc[demographic.tech_left==5, 'tech_left'] = 1
demographic.loc[demographic.tech_left==3, 'tech_left'] = 2
demographic.loc[demographic.tech_left==7, 'tech_left'] = 3
demographic.loc[demographic.tech_left==8, 'tech_left'] = 3
demographic.loc[demographic.tech_left==9, 'tech_left'] = 4
demographic.tech_left = np.abs(demographic.tech_left - 3)
In [ ]:
# Don't need this anymore
# demographic['age_amp'] = demographic.age_amp.replace({'22 mo': 22, 'unknown': np.nan, 'none': np.nan, 
#                              'n/a unilateral loss': np.nan, 'not amplified yet': np.nan,
#                              'not amplified': np.nan, 'n/a': np.nan, '4 months-6 months': 5,
#                              '6 weeks': 0.5, '13-18': 15.5, '0-3': 1.5, '24-36': 30}).astype(float)
In [57]:
demographic.academic_year.replace(
                         {'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011', 
                          '2020-2011': '2010-2011', '2012-20013': '2012-2013', 
                          '642014-2015': '2014-2015', '20114-2015': '2014-2015',
                          '2011-012': '2011-2012',
                                   '0000-0000': np.nan}).str.replace('*', '-').unique()
Out[57]:
array(['2009-2010', '2010-2011', '2005-2006', nan, '2007-2008',
       '2006-2007', '2008-2009', '2003-2004', '2012-2013', '2013-2014',
       '2002-2003', '2011-2012', '2014-2015', 'June 2014', '2004-2005',
       '1997-1998', '2006-2007 ', '1998-1999', '2000-2001', '2001-2002',
       '2103-2014', '1999-2000', '2012', '2009-2011', '2015-2016',
       '1995-1996', '2014', '2013', '2011', '2010', '2009',
       '                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 2010-2011',
       ' 2014-2015', '2012=2013', '2015', '2015-2015', '2009 - 2010',
       '2010 - 2011', '2011 - 2012', '2014-2105', '2014-2015 ', '65',
       '2005-2004', '2012 - 2013', '2014-205', '2013 - 2014', '2017-2015',
       '2014-1015', '2012-2013 '], dtype=object)
In [87]:
demographic['academic_year'] = demographic.academic_year.replace(
                         {'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011', 
                          '2020-2011': '2010-2011', '2012-20013': '2012-2013', 
                          '642014-2015': '2014-2015', '20114-2015': '2014-2015',
                          '2011-012': '2011-2012', '2014-2105': '2014-2015', '2005-2004': '2004-2005',
                          '2014-205': '2014-2015', '2017-2015': '2014-2015', '2014-1015': '2014-2015',
                          '2015-2015': '2014-2015', '2009-2011': '2009-2010',
                                   '0000-0000': np.nan}).str.replace('*', '-')

Removed entries that don't contain dashes

In [88]:
demographic.loc[~(demographic.academic_year.notnull() & demographic.academic_year.str.contains('-')), 
                'academic_year'] = np.nan
In [92]:
demographic.loc[demographic.academic_year.notnull(), 'academic_year'] = demographic.academic_year[demographic.academic_year.notnull()].apply(lambda x: ''.join(x.split()))
In [93]:
demographic.age_amp.hist()
Out[93]:
<matplotlib.axes._subplots.AxesSubplot at 0x114b379b0>

Cleaning expressive vocabulary dataset

We converted the expressive vocabulary dataset to "long" format:

In [94]:
# Test type
expressive["test_type"] = None
EOWPVT = expressive.eowpvt_ss.notnull()
EVT = expressive.evt_ss.notnull()
expressive = expressive[EOWPVT | EVT]
expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT"
expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT"
expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT"
print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull())))

expressive["score"] = expressive.eowpvt_ss
expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT]
There are 0 null values for test_type
In [95]:
expressive.test_type.value_counts()
Out[95]:
EVT               3642
EOWPVT            2624
EOWPVT and EVT     147
dtype: int64

A school variable was added, which is the first four columns of the study_id:

In [96]:
expressive["school"] = expressive.study_id.str.slice(0,4)

The age was taken to be the EOWPVT age if there are both test types:

In [97]:
expressive["age_test"] = expressive.age_test_eowpvt
expressive.loc[expressive.age_test.isnull(), 'age_test'] = expressive.age_test_evt[expressive.age_test.isnull()]

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [98]:
expressive = expressive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
expressive["domain"] = "Expressive Vocabulary"
expressive.head()
Out[98]:
study_id redcap_event_name score test_type school age_test domain
0 0101-2003-0101 initial_assessment_arm_1 58 EOWPVT 0101 54 Expressive Vocabulary
2 0101-2003-0101 year_2_complete_71_arm_1 84 EOWPVT 0101 80 Expressive Vocabulary
5 0101-2003-0101 year_5_complete_71_arm_1 90 EOWPVT 0101 113 Expressive Vocabulary
14 0101-2004-0101 year_2_complete_71_arm_1 90 EOWPVT 0101 53 Expressive Vocabulary
15 0101-2004-0101 year_3_complete_71_arm_1 87 EOWPVT 0101 66 Expressive Vocabulary

Cleaning receptive vocabulary dataset

We converted the receptive vocabulary data table to "long" format:

In [99]:
receptive.columns
Out[99]:
Index(['study_id', 'redcap_event_name', 'age_test_ppvt', 'ppvt_ss',
       'age_test_rowpvt', 'rowpvt_ss'],
      dtype='object')
In [100]:
# Test type
receptive["test_type"] = None
PPVT = receptive.ppvt_ss.notnull()
ROWPVT = receptive.rowpvt_ss.notnull()
receptive = receptive[PPVT | ROWPVT]
receptive.loc[PPVT & ROWPVT, "test_type"] = "PPVT and ROWPVT"
receptive.loc[PPVT & ~ROWPVT, "test_type"] = "PPVT"
receptive.loc[~PPVT & ROWPVT, "test_type"] = "ROWPVT"
print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull())))

receptive["score"] = receptive.ppvt_ss
receptive.loc[~PPVT & ROWPVT, "score"] = receptive.rowpvt_ss[~PPVT & ROWPVT]
There are 0 null values for test_type

A school variable was added, which is the first four columns of the study_id:

In [101]:
receptive["school"] = receptive.study_id.str.slice(0,4)

The age was taken to be the PPVT age if there are both test types:

In [102]:
receptive["age_test"] = receptive.age_test_ppvt
receptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()]
In [103]:
print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull())))
There are 27 null values for age_test

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [104]:
receptive = receptive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
receptive["domain"] = "Receptive Vocabulary"
receptive.head()
Out[104]:
study_id redcap_event_name score test_type school age_test domain
2 0101-2003-0101 year_2_complete_71_arm_1 90 PPVT 0101 80 Receptive Vocabulary
5 0101-2003-0101 year_5_complete_71_arm_1 101 ROWPVT 0101 113 Receptive Vocabulary
9 0101-2003-0102 initial_assessment_arm_1 55 PPVT 0101 44 Receptive Vocabulary
10 0101-2003-0102 year_1_complete_71_arm_1 80 PPVT 0101 54 Receptive Vocabulary
11 0101-2003-0102 year_2_complete_71_arm_1 101 PPVT 0101 68 Receptive Vocabulary
In [105]:
receptive.study_id.unique().shape
Out[105]:
(2959,)

Merge datasets

The four datasets were mereged into a single table. First, we concatenate the test scores data:

In [106]:
test_scores = pd.concat([articulation, expressive, receptive, language])

Then we perform a merge between the demographic data and the test scores data:

In [107]:
lsl_dr = pd.merge(demographic, test_scores, on=["study_id", "redcap_event_name"], how='left')
In [108]:
lsl_dr.tail()
Out[108]:
redcap_event_name academic_year hl male _race prim_lang sib _mother_ed father_ed premature_age ... sex known_synd synd_or_disab race age_test domain school score test_name test_type
36577 year_9_complete_71_arm_1 2012-2013 0 0 8 0 2 6 4 8 ... Female 0 0 4 136 Expressive Vocabulary 1147 92 NaN EVT
36578 year_9_complete_71_arm_1 2012-2013 0 0 8 0 2 6 4 8 ... Female 0 0 4 136 Receptive Vocabulary 1147 84 NaN PPVT
36579 year_9_complete_71_arm_1 2012-2013 0 0 0 0 3 6 6 9 ... Female 0 0 0 NaN NaN NaN NaN NaN NaN
36580 year_9_complete_71_arm_1 2013-2014 0 1 0 0 2 NaN 6 8 ... Male 0 0 0 185 Expressive Vocabulary 1147 125 NaN EVT
36581 year_9_complete_71_arm_1 2013-2014 0 1 0 0 2 NaN 6 8 ... Male 0 0 0 186 Receptive Vocabulary 1147 101 NaN PPVT

5 rows × 73 columns

In [109]:
lsl_dr['academic_year_start'] = lsl_dr.academic_year.apply(lambda x: str(x).strip()[:4])
lsl_dr.academic_year_start.value_counts()
Out[109]:
2013    6924
2012    6628
2014    5480
2011    5205
2010    4419
nan     3177
2009    2358
2008     825
2007     531
2006     343
2005     285
2004     173
2003      90
2002      47
2001      35
1998      16
1999      15
2000      12
2015      11
1997       6
1995       1
2103       1
dtype: int64
In [110]:
current_year_only = False

if current_year_only:
    lsl_dr = lsl_dr[lsl_dr.academic_year_start=='2013']
In [111]:
expressive_scores = lsl_dr[(lsl_dr.domain=='Expressive Vocabulary') & (lsl_dr.score>=20)].score
expressive_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');
In [112]:
expressive_lang_scores = lsl_dr[(lsl_dr.domain=='Language') 
                               & (lsl_dr.test_type=='expressive')].score
expressive_lang_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');

Export dataset

In [113]:
if current_year_only:

    lsl_dr.to_csv('lsl_dr_current_year.csv')

else:
    lsl_dr.to_csv('lsl_dr.csv')
In [114]:
lsl_dr.shape
Out[114]:
(36582, 74)
In [115]:
lsl_dr.study_id.unique().shape
Out[115]:
(5440,)
In [116]:
demographic.study_id.unique().shape
Out[116]:
(5440,)

Convert score to floating-point number

In [117]:
lsl_dr.score = lsl_dr.score.astype(float)
In [119]:
lsl_dr['tech_class'] = 'Bimodal'
lsl_dr.loc[lsl_dr.bilateral_ci==True, 'tech_class'] = 'Bilateral CI'
lsl_dr.loc[lsl_dr.bilateral_ha==True, 'tech_class'] = 'Bilateral HA'
In [120]:
lsl_dr['age_year'] = np.floor(lsl_dr.age/12.)
In [121]:
lsl_dr.domain.dropna().unique()
Out[121]:
array(['Articulation', 'Expressive Vocabulary', 'Language',
       'Receptive Vocabulary'], dtype=object)
In [122]:
lsl_dr.groupby('tech_class').prim_lang.mean().round(2)
Out[122]:
tech_class
Bilateral CI    0.43
Bilateral HA    0.58
Bimodal         0.51
Name: prim_lang, dtype: float64
In [123]:
lsl_dr['non_profound'] = lsl_dr.degree_hl<6
In [124]:
lsl_dr.groupby('tech_class').non_profound.mean().round(2)
Out[124]:
tech_class
Bilateral CI    0.08
Bilateral HA    0.86
Bimodal         0.30
Name: non_profound, dtype: float64
In [125]:
f, axes = plt.subplots(2, 2, figsize=(14,10))
for ax, dom in zip(np.ravel(axes), lsl_dr.domain.dropna().unique()):
    plot_data = lsl_dr[lsl_dr.domain==dom].pivot_table(index='age_year', columns='tech_class', values='score', aggfunc='mean')
    plot_data[(plot_data.index>1) & (plot_data.index<7)].plot(ax=ax)
    ax.set_ylim(40, 120)
    ax.set_xticks(range(2,7))
    ax.set_title(dom)
In [126]:
lsl_dr.pivot_table?

Plots of Demographic Data

In [127]:
plot_color = "#64AAE8"
In [128]:
def plot_demo_data(series, labels=None, color=plot_color, rot=0, label_offset=20, xlim=None, 
                   ylim=None, title=None, **kwargs):
    ax = kwargs.get('ax')
    if ax is None:
        f, ax = plt.subplots()
    counts = series.value_counts().sort_index(1)
    counts.plot(kind='bar', grid=False, rot=rot, color=color, **kwargs)
    if xlim is None:
        ax.set_xlim(-0.5, len(counts)-0.5)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_ylabel('Count')
    if labels is not None:
        ax.set_xticklabels(labels)
    if title:
        ax.set_title(title)
    for i,x in enumerate(counts):
        ax.annotate('%i' % x, (i, x + label_offset))
        
#     plt.gca().tight_layout()
In [129]:
unique_students = demographic.drop_duplicates('study_id')
In [130]:
unique_students.shape
Out[130]:
(5440, 67)
In [131]:
unique_students.age.describe()
Out[131]:
count    4949.000000
mean       30.353708
std        28.072288
min         0.000000
25%         9.000000
50%        25.000000
75%        41.000000
max       298.000000
Name: age, dtype: float64
In [132]:
plot_demo_data(unique_students.male, ('Female', 'Male'), label_offset=20, ylim=(0, 2600), color=plot_color)
In [133]:
plot_demo_data(unique_students.prim_lang, 
               ('English', 'Spanish', 'Chinese', 'French', 'German', 'Tagalong', 'Other'), 
               rot=70, color=plot_color)
In [134]:
unique_students.prim_lang.count()
Out[134]:
4889
In [135]:
plot_demo_data(unique_students.sib, ('1', '2', '3', '4+'), 
               color=plot_color)
In [136]:
unique_students.sib.count()
Out[136]:
4515
In [137]:
amp_ages = ["Birth-3 months", "4 months - 6 months", "7 months - 9 months", "10 months- 12 months", 
"13 months - 18 months", "19 months - 24 months", "2 years 1 day - 3 years", 
"3 years 1 day - 4 years", "4 years 1 day - 5 years", "5 years 1 day - 6 years", "6 years"]

demographic.loc[demographic.age_amp==11, 'age_amp'] = None
print("There are {0} null values for age_amp".format(sum(demographic.age_amp.isnull())))
There are 4857 null values for age_amp
In [138]:
age_classes = pd.Series(pd.cut(unique_students.sort('age_amp').age_amp.dropna(), [-1,3,6,9,12,18,24,36,48,60,72,1000],
                               labels=amp_ages))
In [139]:
age_amp_counts = age_classes.value_counts()[amp_ages]
age_amp_counts.plot(kind='bar', grid=False, rot=90, color=plot_color)
plt.xlim(-0.5, len(age_amp_counts)-0.5)
plt.ylabel('Count')
for i,x in enumerate(age_amp_counts):
    plt.annotate('%i' % x, (i, x + 10))
In [140]:
age_amp_counts.sum()
Out[140]:
3316
In [141]:
unique_students.age_amp.max()
Out[141]:
666.0
In [142]:
(unique_students.age_amp/12.).hist(bins=16, grid=False, color=plot_color)
plt.ylabel('Count')
plt.xlabel('Age at amplification')
Out[142]:
<matplotlib.text.Text at 0x10f927e80>
In [143]:
plot_demo_data(unique_students.tech_left, tech_cats, rot=90, color=plot_color)
In [144]:
plot_demo_data(unique_students.tech_right, tech_cats, rot=90, color=plot_color, ylim=(0, 2500))
In [145]:
f, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.tech_right, [""]*len(tech_cats), rot=90, 
               ax=axes[0], title='Right ear', color=plot_color, ylim=(0, 2500))
plot_demo_data(unique_students.tech_left, tech_cats, rot=90, 
               ax=axes[1], title='Left ear', color=plot_color)
In [146]:
unique_students.tech_right.count()
Out[146]:
4361
In [147]:
unique_students.tech_left.count()
Out[147]:
4352
In [148]:
degree_hl_cats = 'Normal (0-14)', 'Slight (15-25)', 'Mild (26-40)', \
'Moderate (41-55)', 'Moderately Severe (56-70)', 'Severe (71-90)', 'Profound (90+)'
plot_demo_data(unique_students.degree_hl_ad, degree_hl_cats, rot=90, color=plot_color)
In [149]:
_, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.degree_hl_ad, [""]*7, rot=90, 
               color=plot_color, ax=axes[0], title='Right ear')
plot_demo_data(unique_students.degree_hl_as, degree_hl_cats, rot=90, 
               color=plot_color, ylim=(0,2000), ax=axes[1], title='Left ear');
In [150]:
unique_students.degree_hl_as.count()
Out[150]:
4265
In [151]:
type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
plot_demo_data(unique_students.type_hl_ad, type_hl_cats, rot=90, color=plot_color)
In [152]:
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, color=plot_color)
In [153]:
unique_students.type_hl_ad.count()
Out[153]:
4201
In [154]:
unique_students.type_hl_as.count()
Out[154]:
4286
In [155]:
type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
f, axes = plt.subplots(2,1)
plot_demo_data(unique_students.type_hl_ad, [""]*len(type_hl_cats), rot=90, 
               title='Right ear', ax=axes[0], color=plot_color)
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, 
               title='Left ear', ax=axes[1], color=plot_color)
In [156]:
demographic[demographic.study_id=='1147-2010-0064']
Out[156]:
redcap_event_name academic_year hl male _race prim_lang sib _mother_ed father_ed premature_age ... bilateral_ci bilateral_ha bimodal tech implant_category age_diag sex known_synd synd_or_disab race
13328 initial_assessment_arm_1 2010-2011 0 0 0 0 1 3 3 8 ... False True False 0 6 51 Female 0 0 0
13329 year_1_complete_71_arm_1 2011-2012 0 0 0 0 1 3 3 8 ... False True False 0 6 51 Female 0 0 0
13330 year_2_complete_71_arm_1 2012-2013 0 0 0 0 1 3 3 8 ... False True False 0 6 51 Female 0 0 0
13331 year_3_complete_71_arm_1 2013-2014 0 0 0 0 1 3 3 8 ... False True False 0 6 51 Female 0 0 0

4 rows × 67 columns

In [157]:
receptive[receptive.study_id=='1147-2010-0064']
Out[157]:
study_id redcap_event_name score test_type school age_test domain
13328 1147-2010-0064 initial_assessment_arm_1 96 PPVT 1147 63 Receptive Vocabulary
13329 1147-2010-0064 year_1_complete_71_arm_1 91 PPVT 1147 73 Receptive Vocabulary
13330 1147-2010-0064 year_2_complete_71_arm_1 93 PPVT 1147 85 Receptive Vocabulary
In [158]:
lsl_dr[lsl_dr.study_id=='1147-2010-0064']
Out[158]:
redcap_event_name academic_year hl male _race prim_lang sib _mother_ed father_ed premature_age ... age_test domain school score test_name test_type academic_year_start tech_class age_year non_profound
8588 initial_assessment_arm_1 2010-2011 0 0 0 0 1 3 3 8 ... 63 Expressive Vocabulary 1147 91 NaN EVT 2010 Bilateral HA 4 True
8589 initial_assessment_arm_1 2010-2011 0 0 0 0 1 3 3 8 ... 63 Receptive Vocabulary 1147 96 NaN PPVT 2010 Bilateral HA 4 True
8590 initial_assessment_arm_1 2010-2011 0 0 0 0 1 3 3 8 ... 59 Language 1147 101 PLS receptive 2010 Bilateral HA 4 True
8591 initial_assessment_arm_1 2010-2011 0 0 0 0 1 3 3 8 ... 59 Language 1147 87 PLS expressive 2010 Bilateral HA 4 True
19250 year_1_complete_71_arm_1 2011-2012 0 0 0 0 1 3 3 8 ... 72 Expressive Vocabulary 1147 86 NaN EVT 2011 Bilateral HA 4 True
19251 year_1_complete_71_arm_1 2011-2012 0 0 0 0 1 3 3 8 ... 73 Receptive Vocabulary 1147 91 NaN PPVT 2011 Bilateral HA 4 True
20546 year_2_complete_71_arm_1 2012-2013 0 0 0 0 1 3 3 8 ... 88 Expressive Vocabulary 1147 95 NaN EVT 2012 Bilateral HA 4 True
20547 year_2_complete_71_arm_1 2012-2013 0 0 0 0 1 3 3 8 ... 85 Receptive Vocabulary 1147 93 NaN PPVT 2012 Bilateral HA 4 True
29121 year_3_complete_71_arm_1 2013-2014 0 0 0 0 1 3 3 8 ... NaN NaN NaN NaN NaN NaN 2013 Bilateral HA 4 True

9 rows × 77 columns

In [159]:
unique_students.type_hl_ad.count()
Out[159]:
4201
In [160]:
receptive[receptive.domain=="Receptive Vocabulary"].study_id.unique().shape
Out[160]:
(2959,)
In [161]:
demographic.study_id.unique().shape
Out[161]:
(5440,)
In [162]:
receptive.study_id.unique().shape
Out[162]:
(2959,)
In [163]:
lsl_dr[lsl_dr.domain=="Receptive Vocabulary"].study_id.unique().shape
Out[163]:
(2959,)
In [164]:
receptive_ids = receptive.study_id.unique()
In [165]:
demographic_ids = demographic.study_id.unique()
In [166]:
[s for s in receptive_ids if s not in demographic_ids]
Out[166]:
[]
In [167]:
def score_summary(domain, test_type=None):
    subset = lsl_dr[lsl_dr.domain==domain].copy()
    if test_type is not None:
        subset = subset[subset.test_type==test_type]
    subset['age_test'] = (subset.age_test/12).dropna().astype(int)
    subset.loc[subset.age_test > 11, 'age_test'] = 11
    subset = subset[subset.age_test>1]
    byage = subset.groupby('age_test')
    n = byage.study_id.count()
    mean = byage.score.mean()
    sd = byage.score.std()
    min = byage.score.min()
    max = byage.score.max()
    summary = pd.DataFrame({'Sample Size':n, 'Mean':mean, 
    'SD':sd, 'Min':min, 'Max':max})
    summary.index = summary.index.values.astype(int)
    return summary[['Sample Size','Mean','SD','Min','Max']]
In [168]:
receptive_summary = score_summary("Receptive Vocabulary")
receptive_summary
Out[168]:
Sample Size Mean SD Min Max
2 395 93.101266 18.115391 40 144
3 1363 91.903155 19.384986 0 150
4 1497 90.636607 20.373490 0 149
5 1122 89.802139 18.191470 0 142
6 620 85.562903 16.478926 40 154
7 411 82.951338 15.993827 40 130
8 292 80.458904 17.713500 20 132
9 216 77.805556 17.770086 25 160
10 185 75.875676 17.296981 20 123
11 449 78.632517 19.118076 20 134
In [169]:
receptive_summary.describe()
Out[169]:
Sample Size Mean SD Min Max
count 10.000000 10.000000 10.000000 10.0000 10.000000
mean 655.000000 84.673006 18.043673 20.5000 141.800000
std 488.137503 6.392185 1.324983 16.4063 11.802071
min 185.000000 75.875676 15.993827 0.0000 123.000000
25% 317.750000 79.089114 17.401111 5.0000 132.500000
50% 430.000000 84.257121 17.942739 20.0000 143.000000
75% 996.500000 90.427990 18.886424 36.2500 149.750000
max 1497.000000 93.101266 20.373490 40.0000 160.000000
In [170]:
receptive_summary['Sample Size'].sum()
Out[170]:
6550
In [171]:
receptive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
Out[171]:
<matplotlib.text.Text at 0x10f8e2a58>
In [172]:
expressive_summary = score_summary("Expressive Vocabulary")
expressive_summary
Out[172]:
Sample Size Mean SD Min Max
2 375 92.362667 22.025792 23 141
3 1310 93.083206 21.785851 0 145
4 1473 92.156144 21.912841 0 146
5 1097 91.375570 20.134114 0 145
6 618 86.344660 18.457169 20 146
7 414 83.814010 15.712203 38 131
8 283 83.893993 16.552176 34 122
9 202 81.321782 16.143647 36 145
10 179 81.564246 15.327513 40 122
11 450 84.935556 17.521297 18 146
In [173]:
expressive_summary['Sample Size'].sum()
Out[173]:
6401
In [174]:
expressive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.xlim(-0.5, 9.5)
if current_year_only:
    plt.ylim(0, 400)
else:
    plt.ylim(0, 1400)
In [175]:
articulation_summary = score_summary("Articulation")
articulation_summary
Out[175]:
Sample Size Mean SD Min Max
2 282 85.124113 15.168643 50 122
3 1131 83.458002 18.272973 40 126
4 1309 83.409473 20.701802 0 121
5 1026 83.880117 35.350375 39 999
6 583 78.895369 21.792075 39 112
7 389 80.095116 51.718446 3 999
8 244 78.959016 21.168498 40 107
9 169 81.118343 20.524903 40 108
10 131 80.954198 20.111177 40 105
11 305 81.947541 19.283989 39 105
In [176]:
articulation_summary['Sample Size'].sum()
Out[176]:
5569
In [178]:
sample_size = articulation_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(articulation_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);

Language scores

In [179]:
lsl_dr.domain.unique()
Out[179]:
array([nan, 'Articulation', 'Expressive Vocabulary', 'Language',
       'Receptive Vocabulary'], dtype=object)
In [180]:
lsl_dr.test_type.unique()
Out[180]:
array([nan, 'Goldman', 'EVT', 'receptive', 'expressive', 'EOWPVT',
       'ROWPVT', 'PPVT', 'EOWPVT and EVT', 'Arizonia',
       'Arizonia and Goldman', 'PPVT and ROWPVT'], dtype=object)
In [181]:
receptive_language_summary = score_summary("Language", "receptive")
receptive_language_summary
Out[181]:
Sample Size Mean SD Min Max
2 932 85.894850 21.996331 50 150
3 1315 84.855513 19.730693 50 144
4 1301 85.104535 19.598740 43 145
5 930 83.733333 18.762635 47 140
6 479 77.805846 17.642483 11 127
7 318 75.877358 18.713363 40 123
8 195 74.641026 19.685003 40 123
9 52 69.846154 20.649631 40 109
10 42 77.000000 20.167591 40 119
11 68 75.602941 21.490810 40 139
In [182]:
receptive_language_summary['Sample Size'].sum()
Out[182]:
5632
In [183]:
sample_size = receptive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_language_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
In [184]:
expressive_language_summary = score_summary("Language", "expressive")
expressive_language_summary
Out[184]:
Sample Size Mean SD Min Max
2 926 88.035637 18.268384 50 150
3 1316 82.285714 17.541316 20 147
4 1293 80.304718 19.572599 45 141
5 948 78.530591 20.053469 45 144
6 496 71.512097 19.214889 6 140
7 338 66.789941 20.660322 40 124
8 200 67.520000 21.275168 40 118
9 51 64.725490 20.570929 40 106
10 42 74.595238 23.463578 40 119
11 67 73.417910 22.766369 40 132
In [185]:
expressive_language_summary['Sample Size'].sum()
Out[185]:
5677
In [186]:
sample_size = expressive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_language_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
In [187]:
(unique_students.age/12.).hist(grid=False, bins=np.sqrt(unique_students.shape[0]))
plt.ylabel('Count')
plt.xlabel('Age at enrollment')
Out[187]:
<matplotlib.text.Text at 0x10f933d30>
In [188]:
(unique_students.age/12.).describe()
Out[188]:
count    4949.000000
mean        2.529476
std         2.339357
min         0.000000
25%         0.750000
50%         2.083333
75%         3.416667
max        24.833333
Name: age, dtype: float64
In [189]:
def calc_difference(x, col='a_fo', jitter=True):
    if (len(x)<2 or x[col].isnull().sum() or x['funct_out_age'].isnull().sum()):
        return None
    diff = x[col][x.funct_out_age.argmax()] - x[col][x.funct_out_age.argmin()]
    if jitter:
        diff += np.random.normal(scale=0.05)
    if (x.funct_out_age.max() - x.funct_out_age.min()) > 1000:
        print(x['funct_out_age'])
    return({'difference':diff, 'months': x.funct_out_age.max() - x.funct_out_age.min()})
In [190]:
audition = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference).dropna().values.tolist())
In [191]:
plt.figure(figsize=(10,6))
plt.scatter(audition.months, audition.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Audition')
Out[191]:
<matplotlib.text.Text at 0x10f8dd5c0>
In [192]:
slc = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='slc_fo').dropna().values.tolist())
In [193]:
plt.figure(figsize=(10,6))
plt.scatter(slc.months, slc.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (comprehension)')
Out[193]:
<matplotlib.text.Text at 0x110ea1da0>
In [194]:
sle = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='sle_fo').dropna().values.tolist())
In [195]:
plt.figure(figsize=(10,6))
plt.scatter(sle.months, sle.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (expression)')
Out[195]:
<matplotlib.text.Text at 0x10f94ab00>
In [196]:
lsl_dr.degree_hl.dropna().value_counts()
Out[196]:
6    16633
4     4334
3     4159
5     4058
2     1653
0     1270
1      281
dtype: int64
In [197]:
ax = lsl_dr.degree_hl.hist(bins=7)
In [198]:
diff = (lsl_dr['age'] - lsl_dr['age_int'])
diff[diff>0].hist(bins=50)
Out[198]:
<matplotlib.axes._subplots.AxesSubplot at 0x10e5722b0>
In [199]:
(lsl_dr.age_int<6).mean()
Out[199]:
0.19320977529932754
In [200]:
(lsl_dr.age<6).mean()
Out[200]:
0.12798644141927723

Counts by year

In [201]:
lsl_dr.groupby('study_id').first()
Out[201]:
redcap_event_name academic_year hl male _race prim_lang sib _mother_ed father_ed premature_age ... age_test domain school score test_name test_type academic_year_start tech_class age_year non_profound
study_id
00624-2010-0049 initial_assessment_arm_1 2013-2014 0 1 2 0 NaN 6 6 9 ... 24 Language 0062 50 PLS receptive 2013 Bilateral CI 2 False
0101-2003-0101 initial_assessment_arm_1 2002-2003 0 0 0 0 1 6 6 9 ... 54 Expressive Vocabulary 0101 58 PLS EOWPVT 2002 Bimodal 4 False
0101-2003-0102 initial_assessment_arm_1 2003-2004 0 0 0 0 1 2 2 8 ... 44 Articulation 0101 72 PLS Goldman 2003 Bilateral HA 3 True
0101-2004-0101 initial_assessment_arm_1 2006-2007 0 1 0 0 0 6 6 8 ... 37 Receptive Vocabulary 0101 62 PLS PPVT 2006 Bimodal 2 True
0101-2004-0102 initial_assessment_arm_1 2004-2005 0 0 0 0 1 5 6 9 ... NaN NaN NaN NaN NaN NaN 2004 Bimodal 0 True
0101-2004-0103 initial_assessment_arm_1 2012-2013 0 1 0 0 1 4 4 8 ... 96 Expressive Vocabulary 0101 104 CELF-4 EVT 2012 Bilateral CI 0 False
0101-2004-0104 initial_assessment_arm_1 2004-2005 0 1 0 0 1 6 6 8 ... 32 Articulation 0101 84 PLS Goldman 2004 Bilateral HA 0 True
0101-2004-0105 initial_assessment_arm_1 2004-2005 0 0 0 0 2 6 6 9 ... 47 Articulation 0101 78 CELF-P2 Goldman 2004 Bimodal 2 False
0101-2005-0101 initial_assessment_arm_1 2006-2007 0 1 0 0 2 5 4 8 ... 28 Articulation 0101 61 PLS Goldman 2006 Bilateral HA 2 True
0101-2005-0102 initial_assessment_arm_1 2004-2005 0 1 0 0 2 3 2 9 ... 63 Articulation 0101 87 CELF-P2 Goldman 2004 Bilateral HA 4 True
0101-2006-0101 initial_assessment_arm_1 2005-2006 0 0 0 0 1 6 6 8 ... NaN NaN NaN NaN NaN NaN 2005 Bimodal 0 False
0101-2006-0104 initial_assessment_arm_1 2006-2007 0 0 0 0 0 5 5 9 ... NaN NaN NaN NaN NaN NaN 2006 Bilateral CI 2 False
0101-2007-0104 initial_assessment_arm_1 2007-2008 0 0 0 0 1 4 6 9 ... 41 Articulation 0101 122 NaN Goldman 2007 Bimodal 4 True
0101-2007-0105 initial_assessment_arm_1 2007-2008 0 1 0 0 0 6 6 9 ... NaN NaN NaN NaN NaN NaN 2007 Bimodal 11 False
0101-2007-0107 initial_assessment_arm_1 2005-2006 0 0 0 0 1 4 4 8 ... NaN NaN NaN NaN NaN NaN 2005 Bilateral HA 0 True
0101-2008-0102 initial_assessment_arm_1 2008-2009 0 0 0 0 1 6 6 9 ... NaN NaN NaN NaN NaN NaN 2008 Bimodal 14 False
0101-2008-0106 initial_assessment_arm_1 2007-2008 0 0 0 0 1 4 4 8 ... NaN NaN NaN NaN NaN NaN 2007 Bilateral HA 0 True
0101-2009-0101 initial_assessment_arm_1 2008-2009 0 0 0 0 1 6 6 9 ... NaN NaN NaN NaN NaN NaN 2008 Bimodal 6 False
0101-2010-0101 initial_assessment_arm_1 2008-2009 0 1 0 0 1 6 6 9 ... 104 Articulation 0101 90 CELF-4 Arizonia 2008 Bilateral HA 8 True
0101-2010-0103 initial_assessment_arm_1 2010-2011 0 0 0 0 2 4 3 8 ... 25 Language 0101 63 PLS receptive 2010 Bilateral HA 0 False
0101-2010-0104 initial_assessment_arm_1 2010-2011 0 1 3 0 1 2 2 8 ... 30 Expressive Vocabulary 0101 90 PLS EOWPVT 2010 Bilateral HA 0 False
0101-2010-0105 initial_assessment_arm_1 2011-2012 0 1 0 0 0 5 6 6 ... 30 Language 0101 66 PLS receptive 2011 Bilateral CI 2 False
0101-2012-0101 initial_assessment_arm_1 2013-2014 0 1 0 0 0 6 6 9 ... NaN NaN NaN NaN NaN NaN 2013 Bimodal 2 False
0101-2013-0101 initial_assessment_arm_1 2012-2013 0 1 0 0 0 3 2 8 ... 12 Language 0101 58 PLS receptive 2012 Bilateral HA 0 True
0101-2013-0103 initial_assessment_arm_1 2012-2013 0 1 0 0 2 6 6 9 ... NaN NaN NaN NaN NaN NaN 2012 Bilateral CI 4 False
0101-2013-0104 initial_assessment_arm_1 2012-2013 0 1 0 0 1 4 4 8 ... 12 Language 0101 83 PLS receptive 2012 Bilateral HA 0 True
0101-2013-0112 initial_assessment_arm_1 2012-2013 0 1 0 0 1 3 6 9 ... 11 Language 0101 90 PLS receptive 2012 Bilateral HA 0 True
0101-2013-0113 initial_assessment_arm_1 2013-2014 0 0 0 0 1 2 2 8 ... 4 Language 0101 96 PLS receptive 2013 Bimodal 0 True
0101-2013-0114 initial_assessment_arm_1 2013-2014 0 1 0 0 0 3 3 8 ... 6 Language 0101 50 PLS receptive 2013 Bimodal 0 True
0101-2013-0115 initial_assessment_arm_1 2013-2014 0 1 0 0 2 2 2 8 ... 11 Language 0101 79 PLS receptive 2013 Bilateral HA 0 True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1151-2012-0008 initial_assessment_arm_1 NaN 0 0 2 1 2 4 2 8 ... NaN NaN NaN NaN NaN NaN nan Bimodal 3 False
1151-2012-0009 initial_assessment_arm_1 NaN 0 1 2 1 1 4 6 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral HA 1 True
1151-2012-0010 initial_assessment_arm_1 NaN 0 1 2 1 1 6 6 8 ... NaN NaN NaN NaN NaN NaN nan Bimodal 2 True
1151-2012-0011 initial_assessment_arm_1 NaN 0 1 2 1 0 6 6 8 ... NaN NaN NaN NaN NaN NaN nan Bimodal 4 False
1151-2012-0012 initial_assessment_arm_1 NaN 0 1 2 1 0 6 6 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral HA 5 False
1151-2012-0013 initial_assessment_arm_1 NaN 0 0 2 1 0 2 2 8 ... NaN NaN NaN NaN NaN NaN nan Bimodal 5 False
1151-2012-0014 initial_assessment_arm_1 NaN 0 0 2 1 2 1 1 NaN ... NaN NaN NaN NaN NaN NaN nan Bimodal NaN False
1151-2013-0001 initial_assessment_arm_1 NaN 0 0 2 1 0 2 2 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral HA 4 False
1151-2013-0002 initial_assessment_arm_1 NaN 0 0 2 1 2 2 3 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral CI 2 False
1151-2013-0003 initial_assessment_arm_1 NaN 0 0 2 1 0 1 1 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral CI 2 False
1151-2013-0004 initial_assessment_arm_1 NaN 0 0 2 1 2 0 1 8 ... NaN NaN NaN NaN NaN NaN nan Bimodal 3 False
1151-2013-0005 initial_assessment_arm_1 NaN 0 1 2 1 0 1 6 8 ... NaN NaN NaN NaN NaN NaN nan Bimodal 3 False
1151-2013-0006 initial_assessment_arm_1 NaN 0 0 2 1 0 1 1 4 ... NaN NaN NaN NaN NaN NaN nan Bilateral HA 3 False
1151-2013-0007 initial_assessment_arm_1 NaN 0 1 2 1 0 4 2 NaN ... NaN NaN NaN NaN NaN NaN nan Bimodal NaN False
1151-2013-0008 initial_assessment_arm_1 NaN 0 1 2 1 0 3 2 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral CI 2 False
1151-2013-0009 initial_assessment_arm_1 NaN 0 1 2 1 0 2 6 NaN ... NaN NaN NaN NaN NaN NaN nan Bimodal NaN False
1151-2013-0010 initial_assessment_arm_1 NaN 0 1 2 1 0 4 4 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral CI 6 False
1151-2013-0011 initial_assessment_arm_1 NaN 0 1 2 1 0 6 6 8 ... NaN NaN NaN NaN NaN NaN nan Bimodal 1 False
1151-2013-0012 initial_assessment_arm_1 NaN 0 1 2 1 2 3 3 5 ... NaN NaN NaN NaN NaN NaN nan Bimodal 3 False
1151-2014-0001 initial_assessment_arm_1 NaN 0 0 2 1 3 6 4 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral CI 2 False
1151-2014-0002 initial_assessment_arm_1 NaN 0 1 2 1 1 2 4 7 ... NaN NaN NaN NaN NaN NaN nan Bilateral HA 3 False
1151-2014-0003 initial_assessment_arm_1 NaN 0 0 2 1 0 6 6 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral HA 3 False
1151-2014-0004 initial_assessment_arm_1 NaN 0 1 2 1 1 2 1 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral HA 3 False
1151-2014-0005 initial_assessment_arm_1 NaN 0 0 2 1 3 6 6 8 ... NaN NaN NaN NaN NaN NaN nan Bimodal 4 False
1151-2014-0006 initial_assessment_arm_1 NaN 0 1 2 1 0 4 0 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral CI 3 False
1151-2014-0007 initial_assessment_arm_1 NaN 0 0 2 1 2 6 6 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral HA 4 False
1151-2014-0008 initial_assessment_arm_1 NaN 0 0 2 1 1 1 1 8 ... NaN NaN NaN NaN NaN NaN nan Bimodal 5 False
1151-2014-0009 initial_assessment_arm_1 NaN 0 1 2 1 2 4 2 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral HA 6 True
1151-2014-0010 initial_assessment_arm_1 NaN 0 1 2 1 1 2 2 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral CI 6 False
9308-2015-0002 initial_assessment_arm_1 NaN 0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN nan Bimodal NaN False

5440 rows × 76 columns

In [202]:
unique_students = lsl_dr.groupby('study_id').first()
unique_students.academic_year_start.value_counts().sort_index()[:-1].plot(kind='bar')
plt.ylabel('Frequency'); plt.xlabel('Academic year');
In [203]:
disab_by_year = unique_students.groupby('academic_year_start')['synd_or_disab'].value_counts().unstack().fillna(0)
disab_by_year.columns = ['No', 'Yes']
disab_by_year[disab_by_year.index!='nan'].plot(kind='bar', stacked=True)
Out[203]:
<matplotlib.axes._subplots.AxesSubplot at 0x10f9f9198>