In [1]:
# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

Connect to database to import data for the three test domains and demographic information:

In [2]:
from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
api_key = open("/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt").read()

lsl_dr_project = Project(api_url, api_key)
In [3]:
metadata = lsl_dr_project.export_metadata()
In [4]:
# for i,j in zip(lsl_dr_project.field_names, 
#                lsl_dr_project.field_labels):
#     print('{0}: \t{1}'.format(i,j))

Import each database from REDCap:

In [5]:
articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss']
articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', df_kwargs={'index_col':None})
In [6]:
records = lsl_dr_project.export_records(fields=articulation_fields)
In [7]:
print(records[0]['study_id'])
0101-2003-0101
In [8]:
expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss']
expressive = lsl_dr_project.export_records(fields=expressive_fields, format='df', 
                                           df_kwargs={'index_col':None,
                                                      'na_values':[999, 9999]})
In [9]:
receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss']
receptive = lsl_dr_project.export_records(fields=receptive_fields, format='df', 
                                          df_kwargs={'index_col':None,
                                                     'na_values':[999, 9999]})
In [10]:
language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls',
                   'owls_lc_ss','owls_oe_ss','age_test_owls',
                   'celfp_rl_ss','celfp_el_ss','age_test_celp',
                   'celf_elss','celf_rlss','age_test_celf']
language_raw = lsl_dr_project.export_records(fields=language_fields, format='df', 
                                             df_kwargs={'index_col':None, 
                                                        'na_values':[999, 9999]})
In [11]:
demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year',
'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race',
'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as',
'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae',
'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes',
'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age',
'att_days_hr', 'att_days_sch', 'att_days_st2_417']
demographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df', 
                                            df_kwargs={'index_col':None, 
                                                       'na_values':[888, 999, 9999]})
In [12]:
demographic_raw[demographic_raw.study_id=='1147-2010-0064']
Out[12]:
study_id redcap_event_name academic_year hl gender race prim_lang sib mother_ed father_ed ... sle_fo a_fo fam_age family_inv att_days_sch att_days_st2_417 att_days_hr demo_ses school_lunch medicaid
13565 1147-2010-0064 initial_assessment_arm_1 2010-2011 0 0 0 0 1 3 3 ... 3 6 65 0 NaN NaN NaN NaN NaN NaN
13566 1147-2010-0064 year_1_complete_71_arm_1 2011-2012 0 NaN NaN NaN NaN NaN NaN ... 3 5 77 2 NaN NaN NaN NaN NaN NaN
13567 1147-2010-0064 year_2_complete_71_arm_1 2012-2013 0 NaN NaN NaN NaN NaN NaN ... 3 5 89 2 NaN NaN NaN NaN NaN NaN
13568 1147-2010-0064 year_3_complete_71_arm_1 2013-2014 0 NaN NaN NaN NaN NaN NaN ... 4 5 101 2 NaN NaN NaN NaN NaN NaN

4 rows × 46 columns

Attendance information

Several fields in the demographic data have missing values.

In [13]:
demographic_raw.head()
Out[13]:
study_id redcap_event_name academic_year hl gender race prim_lang sib mother_ed father_ed ... sle_fo a_fo fam_age family_inv att_days_sch att_days_st2_417 att_days_hr demo_ses school_lunch medicaid
0 0101-2003-0101 initial_assessment_arm_1 2002-2003 0 0 0 0 1 6 6 ... 2 2 54 2 NaN NaN NaN NaN NaN NaN
1 0101-2003-0101 year_1_complete_71_arm_1 2003-2004 0 NaN NaN NaN NaN NaN NaN ... 4 4 80 1 NaN NaN NaN NaN NaN NaN
2 0101-2003-0101 year_2_complete_71_arm_1 2004-2005 0 NaN NaN NaN NaN NaN NaN ... 4 4 80 2 NaN NaN NaN NaN NaN NaN
3 0101-2003-0101 year_3_complete_71_arm_1 2005-2006 0 NaN NaN NaN NaN NaN NaN ... 5 5 96 3 NaN NaN NaN NaN NaN NaN
4 0101-2003-0101 year_4_complete_71_arm_1 2006-2007 0 NaN NaN NaN NaN NaN NaN ... 5 5 109 2 NaN NaN NaN NaN NaN NaN

5 rows × 46 columns

We can fill missing values forward from previous observation (by study_id)

In [14]:
demographic = demographic_raw.sort(columns='redcap_event_name').groupby('study_id').transform(
                                    lambda recs: recs.fillna(method='ffill'))#.reset_index()
demographic["study_id"] = demographic_raw.sort(columns='redcap_event_name').study_id
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  if __name__ == '__main__':
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:3: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  app.launch_new_instance()

Random check to make sure this worked

In [15]:
demographic[demographic.study_id=='1147-2010-0064']
Out[15]:
redcap_event_name academic_year hl gender race prim_lang sib mother_ed father_ed premature_age ... a_fo fam_age family_inv att_days_sch att_days_st2_417 att_days_hr demo_ses school_lunch medicaid study_id
13565 initial_assessment_arm_1 2010-2011 0 0 0 0 1 3 3 8 ... 6 65 0 NaN NaN NaN NaN NaN NaN 1147-2010-0064
13566 year_1_complete_71_arm_1 2011-2012 0 0 0 0 1 3 3 8 ... 5 77 2 NaN NaN NaN NaN NaN NaN 1147-2010-0064
13567 year_2_complete_71_arm_1 2012-2013 0 0 0 0 1 3 3 8 ... 5 89 2 NaN NaN NaN NaN NaN NaN 1147-2010-0064
13568 year_3_complete_71_arm_1 2013-2014 0 0 0 0 1 3 3 8 ... 5 101 2 NaN NaN NaN NaN NaN NaN 1147-2010-0064

4 rows × 46 columns

Demographic data without missing values:

In [16]:
demographic.head()
Out[16]:
redcap_event_name academic_year hl gender race prim_lang sib mother_ed father_ed premature_age ... a_fo fam_age family_inv att_days_sch att_days_st2_417 att_days_hr demo_ses school_lunch medicaid study_id
0 initial_assessment_arm_1 2002-2003 0 0 0 0 1 6 6 9 ... 2 54 2 NaN NaN NaN NaN NaN NaN 0101-2003-0101
7486 initial_assessment_arm_1 2013-2014 0 0 1 0 0 2 2 8 ... 1 7 3 NaN NaN NaN NaN NaN NaN 0626-2014-0035
7484 initial_assessment_arm_1 2014-2015 0 1 6 0 1 4 4 8 ... 3 56 1 NaN NaN NaN NaN NaN NaN 0626-2014-0034
7483 initial_assessment_arm_1 2014-2015 0 1 3 0 1 4 5 8 ... 0 29 2 NaN NaN NaN NaN NaN NaN 0626-2014-0033
7482 initial_assessment_arm_1 2014-2015 0 1 0 0 1 3 5 8 ... 1 11 1 NaN NaN NaN NaN NaN NaN 0626-2014-0032

5 rows × 46 columns

Cleaning languge dataset

5 language measures:

  • 3 versions of CELF
  • PLS
    • pls_ac_rs: PLS: Auditory Comprehension Raw Score
    • pls_ac_ss: PLS: Auditory Comprehension Standard Score
    • pls_ec_rs: PLS: Expressive Communication Raw Score
    • pls_ec_ss: PLS: Expressive Communication Standard Score
    • pls_tl_rs: PLS: Total Language Score Standard Score Total
    • pls_tl_ss: PLS: Total Language Score Standard Score
  • OWLS
    • age_test_owls: Age at time of testing (OWLS)
    • owls_lc_rs: OWLS: Listening Comprehension Raw Score
    • owls_lc_ss: OWLS: Listening Comprehension Standard Score
    • owls_oe_rs: OWLS: Oral Expression Raw Score
    • owls_oe_ss: OWLS: Oral Expression Standard Score
    • owls_oc_sss: OWLS: Oral Composite Sum of Listening Comprehension and Oral Expression Standard Scores
    • owls_oc_ss: OWLS: Oral Composite Standard Score
    • owls_wes_trs: OWLS: Written Expression Scale Total Raw Score
    • owls_wes_as: OWLS: Written Expression Scale Ability Score
    • owls_wes_ss: OWLS: Written Expression Scale Standard Score
    • owsl_lc: OWLS: Written Expression Scale Language Composite (Sum of written expression age-based standard score, listening comprehension standard score and oral expression standard score)
    • owls_lcss: OWLS: Language Composite Standard Score
In [17]:
# Test type
language_raw["test_name"] = None
language_raw["test_type"] = None
language_raw["score"] = None
CELP = language_raw.age_test_celp.notnull()
CELF = language_raw.age_test_celf.notnull()
PLS = language_raw.age_test_pls.notnull()
OWLS = language_raw.age_test_owls.notnull()

language_raw['age_test'] = None
language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp
language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf
language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls
language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls

language1 = language_raw[CELP | CELF | PLS | OWLS].copy()
language2 = language1.copy()

language1["test_type"] = "receptive"

language1.loc[CELP, "test_name"] = "CELF-P2"
language1.loc[CELF, "test_name"] = "CELF-4"
language1.loc[PLS, "test_name"] = "PLS"
language1.loc[OWLS, "test_name"] = "OWLS"

language1.loc[CELP, "score"] = language1.celfp_rl_ss
language1.loc[CELF, "score"] = language1.celf_rlss
language1.loc[PLS, "score"] = language1.pls_ac_ss
language1.loc[OWLS, "score"] = language1.owls_lc_ss


language2["test_type"] = "expressive"

language2.loc[CELP, "test_name"] = "CELF-P2"
language2.loc[CELF, "test_name"] = "CELF-4"
language2.loc[PLS, "test_name"] = "PLS"
language2.loc[OWLS, "test_name"] = "OWLS"

language2.loc[CELP, "score"] = language1.celfp_el_ss
language2.loc[CELF, "score"] = language1.celf_elss
language2.loc[PLS, "score"] = language1.pls_ec_ss
language2.loc[OWLS, "score"] = language1.owls_oe_ss

language = pd.concat([language1, language2])
language = language[language.score.notnull()]
print(pd.crosstab(language.test_name, language.test_type))
print("There are {0} null values for score".format(sum(language["score"].isnull())))
test_type  expressive  receptive
test_name                       
CELF-4            593        525
CELF-P2          1374       1379
OWLS             1065       1072
PLS              3387       3397
There are 0 null values for score

A school variable was added, which is the first four columns of the study_id:

In [18]:
language["school"] = language.study_id.str.slice(0,4)
In [19]:
language = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test"]]
language["domain"] = "Language"
language.head()
Out[19]:
study_id redcap_event_name score test_type test_name school age_test domain
0 0101-2003-0101 initial_assessment_arm_1 51 receptive PLS 0101 54 Language
5 0101-2003-0101 year_5_complete_71_arm_1 61 receptive OWLS 0101 113 Language
9 0101-2003-0102 initial_assessment_arm_1 55 receptive PLS 0101 44 Language
10 0101-2003-0102 year_1_complete_71_arm_1 77 receptive PLS 0101 54 Language
11 0101-2003-0102 year_2_complete_71_arm_1 93 receptive CELF-P2 0101 68 Language

Cleaning articulation dataset

We converted the articulation dataset into a "long" format:

In [20]:
# Test type
articulation["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
articulation = articulation[ARIZ | GF]
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"

print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))

# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]
Goldman                 5098
Arizonia                 498
Arizonia and Goldman      73
Name: test_type, dtype: int64
There are 0 null values for test_type

A school variable was added, which is the first four columns of the study_id:

In [21]:
articulation["school"] = articulation.study_id.str.slice(0,4)

The age was taken to be the Arizonia age if there are both test types:

In [22]:
articulation["age_test"] = articulation.age_test_aaps
articulation.loc[articulation.age_test.isnull(), 'age_test'] = articulation.age_test_gf2[articulation.age_test.isnull()]
print(articulation.age_test.describe())
count    5666.000000
mean       68.598835
std        30.694788
min        23.000000
25%        47.000000
50%        60.000000
75%        80.000000
max       243.000000
Name: age_test, dtype: float64

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [23]:
articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1)
articulation["domain"] = "Articulation"
articulation.head()
Out[23]:
study_id redcap_event_name test_type score school age_test domain
1 0101-2003-0101 year_1_complete_71_arm_1 Goldman 78 0101 80 Articulation
9 0101-2003-0102 initial_assessment_arm_1 Goldman 72 0101 44 Articulation
10 0101-2003-0102 year_1_complete_71_arm_1 Goldman 97 0101 54 Articulation
14 0101-2004-0101 year_2_complete_71_arm_1 Goldman 75 0101 53 Articulation
15 0101-2004-0101 year_3_complete_71_arm_1 Goldman 80 0101 66 Articulation

Cleaning demographic dataset

We excluded unwanted columns and rows for which age, gender or race were missing:

In [24]:
# Retain only subset of columns
#demographic = demographic[demographic.gender.notnull()]
demographic = demographic.rename(columns={'gender':'male'})

Due to sample size considerations, we reduced the non-English primary language variable to English (0) and non-English (1):

In [25]:
demographic["non_english"] = None
demographic.loc[demographic.prim_lang.notnull(), 'non_english'] = demographic.prim_lang[demographic.prim_lang.notnull()]>0
print(demographic.non_english.value_counts())
print("There are {0} null values for non_english".format(sum(demographic.non_english.isnull())))
False    11198
True      2478
Name: non_english, dtype: int64
There are 691 null values for non_english

Mother's education (mother_ed) and father's education (father_ed) were both recoded to:

  • 0=no high school diploma
  • 1=high school
  • 2=undergraduate
  • 3=graduate

Category 6 (unknown) was recoded as missing.

In [26]:
demographic = demographic.rename(columns={"mother_ed":"_mother_ed"})
demographic["mother_ed"] = demographic._mother_ed.copy()
demographic.loc[demographic._mother_ed==1, 'mother_ed'] = 0
demographic.loc[(demographic._mother_ed==2) | (demographic.mother_ed==3), 'mother_ed'] = 1
demographic.loc[demographic._mother_ed==4, 'mother_ed'] = 2
demographic.loc[demographic._mother_ed==5, 'mother_ed'] = 3
demographic.loc[demographic._mother_ed==6, 'mother_ed'] = None
print("_mother_ed:")
print(demographic._mother_ed.value_counts())
print("mother_ed:")
print(demographic.mother_ed.value_counts())
print("\nThere are {0} null values for mother_ed".format(sum(demographic.mother_ed.isnull())))
_mother_ed:
6    5001
4    2921
3    1950
5    1545
2    1342
1     474
0     194
Name: _mother_ed, dtype: int64
mother_ed:
1    3292
2    2921
3    1545
0     668
Name: mother_ed, dtype: int64

There are 5941 null values for mother_ed

Secondary diagnosis

In [27]:
demographic.shape
Out[27]:
(14367, 48)
In [28]:
demographic['secondary_diagnosis'] = demographic.etiology==0
# Suspected or unknown treated as missing
demographic.loc[demographic.etiology > 1, 'secondary_diagnosis'] = None
In [29]:
demographic.secondary_diagnosis.value_counts()
Out[29]:
0    10492
1     2416
Name: secondary_diagnosis, dtype: int64
In [30]:
demographic.secondary_diagnosis.mean()
Out[30]:
0.18717074682367524

Premature status was recoded to True (premature) and False (full-term). Here, premature indicates <36 weeks.

In [31]:
demographic['premature_weeks'] = demographic.premature_age.copy()
demographic.loc[demographic.premature_age==9, 'premature_weeks'] = None
demographic.premature_weeks = abs(demographic.premature_weeks-8)*2
print("There are {0} null values for premature_weeks".format(sum(demographic.premature_weeks.isnull())))
There are 3437 null values for premature_weeks
In [32]:
demographic.premature_weeks.value_counts()
Out[32]:
0     9331
2      560
4      356
12     195
6      181
10     149
8      113
14      42
16       3
Name: premature_weeks, dtype: int64

Recode impant technology variables for each ear to one of four categories (None, Baha, Hearing aid, Cochlear implant):

In [33]:
demographic.tech_ad.value_counts()
Out[33]:
1     4853
0     4246
7     1475
5      988
2      481
6      414
8       71
9       58
3       27
4       25
10       2
Name: tech_ad, dtype: int64
In [34]:
tech_cats = ["None", "OAD", "Hearing aid", "Cochlear", "Other"]

demographic["tech_right"] = 4
demographic.loc[demographic.tech_ad==7, 'tech_right'] = 0
demographic.loc[demographic.tech_ad==3, 'tech_right'] = 1
demographic.loc[demographic.tech_ad.isin([1,2,4,5,10]), 'tech_right'] = 2
demographic.loc[demographic.tech_ad.isin([0,8,6]), 'tech_right'] = 3
demographic.loc[demographic.tech_ad.isnull(), 'tech_right'] = None

demographic["tech_left"] = 4
demographic.loc[demographic.tech_as==7, 'tech_left'] = 0
demographic.loc[demographic.tech_as==3, 'tech_left'] = 1
demographic.loc[demographic.tech_as.isin([1,2,4,5,10]), 'tech_left'] = 2
demographic.loc[demographic.tech_as.isin([0,8,6]), 'tech_left'] = 3
demographic.loc[demographic.tech_as.isnull(), 'tech_left'] = None
In [35]:
demographic.tech_left.value_counts()
Out[35]:
2    6423
3    4309
0    1802
4      57
1      19
Name: tech_left, dtype: int64
In [36]:
demographic.tech_right.value_counts()
Out[36]:
2    6349
3    4731
0    1475
4      58
1      27
Name: tech_right, dtype: int64

Substitute valid missing values for hearing loss:

In [37]:
demographic.loc[demographic.type_hl_ad==5, 'type_hl_ad'] = None
demographic.loc[demographic.type_hl_as==5, 'type_hl_ad'] = None

Create degree_hl, which is the maximum level of hearing loss in either ear:

In [38]:
demographic["degree_hl"] = np.maximum(demographic.degree_hl_ad, demographic.degree_hl_as)

Create compound indicator variable for each technology (Baha, Hearing aid, Chochlear implant):

  • 0=none
  • 1=one ear
  • 2=both ears.
In [39]:
demographic.columns
Out[39]:
Index(['redcap_event_name', 'academic_year', 'hl', 'male', 'race', 'prim_lang',
       'sib', '_mother_ed', 'father_ed', 'premature_age', 'onset_1', 'age_amp',
       'age_int', 'age', 'synd_cause', 'etiology', 'etiology_2',
       'hearing_changes', 'ae', 'ad_250', 'ad_500', 'degree_hl_ad',
       'type_hl_ad', 'tech_ad', 'age_ci', 'as_250', 'as_500', 'degree_hl_as',
       'type_hl_as', 'tech_as', 'age_ci_2', 'time', 'age_disenrolled',
       'funct_out_age', 'slc_fo', 'sle_fo', 'a_fo', 'fam_age', 'family_inv',
       'att_days_sch', 'att_days_st2_417', 'att_days_hr', 'demo_ses',
       'school_lunch', 'medicaid', 'study_id', 'non_english', 'mother_ed',
       'secondary_diagnosis', 'premature_weeks', 'tech_right', 'tech_left',
       'degree_hl'],
      dtype='object')
In [40]:
demographic["oad"] = 0
demographic.oad = demographic.oad.astype(object)
demographic.loc[(demographic.tech_right==1) | (demographic.tech_left==1), 'oad'] = 1
demographic.loc[(demographic.tech_right==1) & (demographic.tech_left==1), 'oad'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'oad'] = None
print("oad:")
print(demographic.drop_duplicates(subset='study_id').oad.value_counts())
print("There are {0} null values for OAD".format(sum(demographic.oad.isnull())))

demographic["hearing_aid"] = 0
demographic.hearing_aid = demographic.hearing_aid.astype(object)
demographic.loc[(demographic.tech_right==2) | (demographic.tech_left==2), 'hearing_aid'] = 1
demographic.loc[(demographic.tech_right==2) & (demographic.tech_left==2), 'hearing_aid'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_right.isnull()), 'hearing_aid'] = None
print("\nhearing_aid:")
print(demographic.drop_duplicates(subset='study_id').hearing_aid.value_counts())
print("There are {0} null values for hearing_aid".format(sum(demographic.hearing_aid.isnull())))

demographic["cochlear"] = 0
demographic.cochlear = demographic.cochlear.astype(object)
demographic.loc[(demographic.tech_right==3) | (demographic.tech_left==3), 'cochlear'] = 1
demographic.loc[(demographic.tech_right==3) & (demographic.tech_left==3), 'cochlear'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'cochlear'] = None
print("\ncochlear:")
print(demographic.drop_duplicates(subset='study_id').cochlear.value_counts())
print("There are {0} null values for cochlear".format(sum(demographic.cochlear.isnull())))
print(len(demographic))
oad:
0    4417
1       4
2       2
Name: oad, dtype: int64
There are 1674 null values for OAD

hearing_aid:
2    2048
0    1604
1     741
Name: hearing_aid, dtype: int64
There are 1727 null values for hearing_aid

cochlear:
0    2894
2     903
1     626
Name: cochlear, dtype: int64
There are 1674 null values for cochlear
14367

Identify bilateral and bimodal individuals:

In [41]:
demographic["unilateral_ci"] = demographic.cochlear==1
demographic["bilateral_ci"] = demographic.cochlear==2
demographic["bilateral_ha"] = demographic.hearing_aid==2
demographic["bimodal"] = (demographic.cochlear==1) & (demographic.hearing_aid==1)
In [42]:
demographic.bilateral_ci.sum(), demographic.bilateral_ha.sum(), demographic.bimodal.sum(), demographic.unilateral_ci.sum()
Out[42]:
(3479, 5224, 1387, 2082)
In [43]:
demographic.drop_duplicates(subset='study_id')[['unilateral_ci','bilateral_ci', 
                                               'bilateral_ha',
                                               'bimodal']].sum()
Out[43]:
unilateral_ci     626
bilateral_ci      903
bilateral_ha     2048
bimodal           375
dtype: int64

Create variable that identifies bilateral (0), bilateral HA left (1), bilateral HA right (2)

In [44]:
demographic['tech'] = 0
demographic.loc[(demographic.bimodal) & (demographic.tech_left==2), 'tech'] = 1
demographic.loc[(demographic.bimodal) & (demographic.tech_right==2), 'tech'] = 2
print("There are {0} null values for tech".format(sum(demographic.tech.isnull())))
There are 0 null values for tech
In [45]:
demographic["implant_category"] = None
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==0), 
                'implant_category'] = 0
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==0), 
                'implant_category'] = 1
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==1), 
                'implant_category'] = 2
demographic.loc[(demographic.cochlear==2) & (demographic.hearing_aid==0) & (demographic.oad==0), 
                'implant_category'] = 3
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==1) & (demographic.oad==0), 
                'implant_category'] = 4
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==1), 
                'implant_category'] = 5
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==2) & (demographic.oad==0), 
                'implant_category'] = 6
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==1), 
                'implant_category'] = 7
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==2), 
                'implant_category'] = 8
demographic.implant_category.value_counts()
Out[45]:
6    5224
3    3479
4    1387
1     911
0     676
8      14
2      12
7       5
5       1
Name: implant_category, dtype: int64

Age when hearing loss diagnosed Data are entered inconsistently here, so we have to go in and replace non-numeric values.

In [46]:
demographic.onset_1.unique()
Out[46]:
array([  15. ,    1. ,   37. ,   10. ,    0. ,   19. ,    6. ,   33. ,
         26. ,    2. ,   60. ,   16. ,   50. ,   39. ,   28. ,   17. ,
          4. ,   18. ,    nan,    3. ,   35. ,   38. ,   95. ,   42. ,
          7. ,   13. ,   12. ,   31. ,   14. ,   27. ,   11. ,   36. ,
         41. ,   22. ,   24. ,   51. ,   84. ,   61. ,    5. ,   30. ,
         88. ,   46. ,   23. ,   80. ,    9. ,    8. ,   83. ,   74. ,
         25. ,   64. ,  107. ,   21. ,   72. ,  116. ,   40. ,   57. ,
         78. ,   65. ,   43. ,   47. ,   79. ,   34. ,   62. ,   77. ,
         48. ,   96. ,   52. ,   97. ,   67. ,   20. ,   45. ,   29. ,
         59. ,   53. ,    1.5,   81. ,   55. ,   54. ,   49. ,   70. ,
         58. ,   44. ,   32. ,   71. ,   63. ,  140. ,   66. ,   87. ,
         76. ,   68. ,   92. ,   86. ,  126. ,   85. ,  133. ,  103. ,
         56. ,  119. ,    2.5,   98. ,   75. ,    0.5,  152. ,   89. ,
        154. ])
In [47]:
# Don't need this anymore
# demographic['age_diag'] = demographic.onset_1.replace({'birth': 0, 'R- Birth L-16mo': 0, 'birth - 3': 0, 'at birth': 0, 'NBHS': 0, 
#                              'at Birth': 0, '1-2': 1.5, '2-3': 2.5, '0-3': 1.5}).astype(float)
demographic['age_diag'] = demographic.onset_1

Number of null values for age_diag

In [48]:
demographic.age_diag.isnull().sum()
Out[48]:
3994
In [49]:
demographic['sex'] = demographic.male.replace({0:'Female', 1:'Male'})
In [50]:
import seaborn as sb

unique_students = demographic.dropna(subset=['sex']).groupby('study_id').first()

# ag = sb.factorplot("sex", data=unique_students, 
#               palette="PuBuGn_d", kind='count')
# ag.set_xticklabels(['Female ({})'.format((unique_students.male==0).sum()), 
#                     'Male ({})'.format((unique_students.male==1).sum())])
# ag.set_xlabels('')

Child has another diagnosed disability

In [51]:
demographic['known_synd'] = (demographic.synd_cause == 0)
# Unknown or suspected
demographic.loc[demographic.synd_cause > 1, 'known_synd'] = None
In [52]:
# If either known syndrome or secondary diagnosis
demographic['synd_or_disab'] = demographic.apply(lambda x: x['secondary_diagnosis'] or x['known_synd'], axis=1)

Missing sibling counts were properly encoded as None (missing).

In [53]:
demographic.loc[demographic.sib==4, 'sib'] = None

We reduced the number of race categories, pooling those that were neither caucasian, black, hispanic or asian to "other", due to small sample sizes for these categories. Category 7 (unknown) was recoded as missing.

In [54]:
races = ["Caucasian", "Black or African American", "Hispanic or Latino", "Asian", "Other"]
demographic = demographic.rename(columns={"race":"_race"})
demographic["race"] = demographic._race.copy()
demographic.loc[demographic.race==7, 'race'] = None
demographic.loc[demographic.race>3, 'race'] = 4
print("_race:")
print(demographic._race.value_counts())
print("race:")
print(demographic.race.value_counts())
print("There are {0} null values for race".format(sum(demographic.race.isnull())))
# Replace with recoded column
_race:
0    7531
2    2412
1    1300
3    1011
6     698
8     521
7     242
4      65
5      28
Name: _race, dtype: int64
race:
0    7531
2    2412
4    1312
1    1300
3    1011
Name: race, dtype: int64
There are 801 null values for race

Recode implant technology variables

In [55]:
tech_cats = ["None", "Baha", "Hearing aid", "Cochlear", "Other"]

demographic["tech_right"] = demographic.tech_ad.copy()
demographic.loc[demographic.tech_right==6, 'tech_right'] = 0
demographic.loc[demographic.tech_right==4, 'tech_right'] = 1
demographic.loc[demographic.tech_right==5, 'tech_right'] = 1
demographic.loc[demographic.tech_right==3, 'tech_right'] = 2
demographic.loc[demographic.tech_right==7, 'tech_right'] = 3
demographic.loc[demographic.tech_right==8, 'tech_right'] = 3
demographic.loc[demographic.tech_right==9, 'tech_right'] = 4
demographic.tech_right = np.abs(demographic.tech_right - 3)

demographic["tech_left"] = demographic.tech_as.copy()
demographic.loc[demographic.tech_left==6, 'tech_left'] = 0
demographic.loc[demographic.tech_left==4, 'tech_left'] = 1
demographic.loc[demographic.tech_left==5, 'tech_left'] = 1
demographic.loc[demographic.tech_left==3, 'tech_left'] = 2
demographic.loc[demographic.tech_left==7, 'tech_left'] = 3
demographic.loc[demographic.tech_left==8, 'tech_left'] = 3
demographic.loc[demographic.tech_left==9, 'tech_left'] = 4
demographic.tech_left = np.abs(demographic.tech_left - 3)
In [56]:
# Don't need this anymore
# demographic['age_amp'] = demographic.age_amp.replace({'22 mo': 22, 'unknown': np.nan, 'none': np.nan, 
#                              'n/a unilateral loss': np.nan, 'not amplified yet': np.nan,
#                              'not amplified': np.nan, 'n/a': np.nan, '4 months-6 months': 5,
#                              '6 weeks': 0.5, '13-18': 15.5, '0-3': 1.5, '24-36': 30}).astype(float)
In [57]:
demographic.academic_year.replace(
                         {'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011', 
                          '2020-2011': '2010-2011', '2012-20013': '2012-2013', 
                          '642014-2015': '2014-2015', '20114-2015': '2014-2015',
                          '2011-012': '2011-2012',
                                   '0000-0000': np.nan}).str.replace('*', '-').unique()
Out[57]:
array(['2002-2003', '2013-2014', '2014-2015', '2012-2013', '2011-2012',
       '2009-2010', '2010-2011', '2007-2008', '2008-2009', nan,
       '2009-2011', '2006-2007', '2005-2006', '2012', '2006-2007 ',
       '2004-2005', '2003-2004', '2015-2016', '2015', '2014', '2001-2002',
       '2000-2001', '1995-1996', '1998-1999', '1999-2000', '1997-1998',
       '2013', '2010', '2009', '2011',
       '                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 2010-2011',
       '2015-2015', '2014-2015 ', '2012-2013 '], dtype=object)
In [58]:
demographic['academic_year'] = demographic.academic_year.replace(
                         {'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011', 
                          '2020-2011': '2010-2011', '2012-20013': '2012-2013', 
                          '642014-2015': '2014-2015', '20114-2015': '2014-2015',
                          '2011-012': '2011-2012', '2014-2105': '2014-2015', '2005-2004': '2004-2005',
                          '2014-205': '2014-2015', '2017-2015': '2014-2015', '2014-1015': '2014-2015',
                          '2015-2015': '2014-2015', '2009-2011': '2009-2010',
                                   '0000-0000': np.nan}).str.replace('*', '-')

Removed entries that don't contain dashes

In [59]:
demographic.loc[~(demographic.academic_year.notnull() & demographic.academic_year.str.contains('-')), 
                'academic_year'] = np.nan
In [60]:
demographic.loc[demographic.academic_year.notnull(), 'academic_year'] = demographic.academic_year[demographic.academic_year.notnull()].apply(lambda x: ''.join(x.split()))
In [61]:
demographic.age_amp.hist()
Out[61]:
<matplotlib.axes._subplots.AxesSubplot at 0x10fd84cc0>

Cleaning expressive vocabulary dataset

We converted the expressive vocabulary dataset to "long" format:

In [62]:
# Test type
expressive["test_type"] = None
EOWPVT = expressive.eowpvt_ss.notnull()
EVT = expressive.evt_ss.notnull()
expressive = expressive[EOWPVT | EVT]
expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT"
expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT"
expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT"
print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull())))

expressive["score"] = expressive.eowpvt_ss
expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT]
There are 0 null values for test_type
In [63]:
expressive.test_type.value_counts()
Out[63]:
EVT               3691
EOWPVT            2657
EOWPVT and EVT     147
Name: test_type, dtype: int64

A school variable was added, which is the first four columns of the study_id:

In [64]:
expressive["school"] = expressive.study_id.str.slice(0,4)

The age was taken to be the EOWPVT age if there are both test types:

In [65]:
expressive["age_test"] = expressive.age_test_eowpvt
expressive.loc[expressive.age_test.isnull(), 'age_test'] = expressive.age_test_evt[expressive.age_test.isnull()]

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [66]:
expressive = expressive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
expressive["domain"] = "Expressive Vocabulary"
expressive.head()
Out[66]:
study_id redcap_event_name score test_type school age_test domain
0 0101-2003-0101 initial_assessment_arm_1 58 EOWPVT 0101 54 Expressive Vocabulary
2 0101-2003-0101 year_2_complete_71_arm_1 84 EOWPVT 0101 80 Expressive Vocabulary
5 0101-2003-0101 year_5_complete_71_arm_1 90 EOWPVT 0101 113 Expressive Vocabulary
14 0101-2004-0101 year_2_complete_71_arm_1 90 EOWPVT 0101 53 Expressive Vocabulary
15 0101-2004-0101 year_3_complete_71_arm_1 87 EOWPVT 0101 66 Expressive Vocabulary

Cleaning receptive vocabulary dataset

We converted the receptive vocabulary data table to "long" format:

In [67]:
receptive.columns
Out[67]:
Index(['study_id', 'redcap_event_name', 'age_test_ppvt', 'ppvt_ss',
       'age_test_rowpvt', 'rowpvt_ss'],
      dtype='object')
In [68]:
# Test type
receptive["test_type"] = None
PPVT = receptive.ppvt_ss.notnull()
ROWPVT = receptive.rowpvt_ss.notnull()
receptive = receptive[PPVT | ROWPVT]
receptive.loc[PPVT & ROWPVT, "test_type"] = "PPVT and ROWPVT"
receptive.loc[PPVT & ~ROWPVT, "test_type"] = "PPVT"
receptive.loc[~PPVT & ROWPVT, "test_type"] = "ROWPVT"
print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull())))

receptive["score"] = receptive.ppvt_ss
receptive.loc[~PPVT & ROWPVT, "score"] = receptive.rowpvt_ss[~PPVT & ROWPVT]
There are 0 null values for test_type

A school variable was added, which is the first four columns of the study_id:

In [69]:
receptive["school"] = receptive.study_id.str.slice(0,4)

The age was taken to be the PPVT age if there are both test types:

In [70]:
receptive["age_test"] = receptive.age_test_ppvt
receptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()]
In [71]:
print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull())))
There are 28 null values for age_test

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [72]:
receptive = receptive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
receptive["domain"] = "Receptive Vocabulary"
receptive.head()
Out[72]:
study_id redcap_event_name score test_type school age_test domain
2 0101-2003-0101 year_2_complete_71_arm_1 90 PPVT 0101 80 Receptive Vocabulary
5 0101-2003-0101 year_5_complete_71_arm_1 101 ROWPVT 0101 113 Receptive Vocabulary
9 0101-2003-0102 initial_assessment_arm_1 55 PPVT 0101 44 Receptive Vocabulary
10 0101-2003-0102 year_1_complete_71_arm_1 80 PPVT 0101 54 Receptive Vocabulary
11 0101-2003-0102 year_2_complete_71_arm_1 101 PPVT 0101 68 Receptive Vocabulary
In [73]:
receptive.study_id.unique().shape
Out[73]:
(3021,)

Merge datasets

The four datasets were mereged into a single table. First, we concatenate the test scores data:

In [74]:
test_scores = pd.concat([articulation, expressive, receptive, language])

Then we perform a merge between the demographic data and the test scores data:

In [75]:
lsl_dr = pd.merge(demographic, test_scores, on=["study_id", "redcap_event_name"], how='left')
In [76]:
lsl_dr.tail()
Out[76]:
redcap_event_name academic_year hl male _race prim_lang sib _mother_ed father_ed premature_age ... sex known_synd synd_or_disab race age_test domain school score test_name test_type
36993 year_9_complete_71_arm_1 2011-2012 0 1 0 0 3 6 6 8 ... Male 0 0 0 162 Receptive Vocabulary 0102 84 NaN ROWPVT
36994 year_9_complete_71_arm_1 NaN 0 0 0 0 NaN 6 6 9 ... Female NaN NaN 0 NaN NaN NaN NaN NaN NaN
36995 year_9_complete_71_arm_1 2013-2014 0 1 3 0 1 5 5 8 ... Male 0 1 3 123 Articulation 1147 102 NaN Goldman
36996 year_9_complete_71_arm_1 2013-2014 0 1 3 0 1 5 5 8 ... Male 0 1 3 125 Expressive Vocabulary 1147 102 NaN EVT
36997 year_9_complete_71_arm_1 2013-2014 0 1 3 0 1 5 5 8 ... Male 0 1 3 123 Receptive Vocabulary 1147 95 NaN PPVT

5 rows × 73 columns

In [77]:
lsl_dr['academic_year_start'] = lsl_dr.academic_year.apply(lambda x: str(x).strip()[:4])
lsl_dr.academic_year_start.value_counts()
Out[77]:
2013    6928
2012    6630
2014    5560
2011    5216
2010    4425
nan     3157
2009    2362
2008     830
2007     531
2006     343
2015     336
2005     286
2004     172
2003      90
2002      47
2001      35
1998      16
1999      15
2000      12
1997       6
1995       1
Name: academic_year_start, dtype: int64
In [78]:
current_year_only = False

if current_year_only:
    lsl_dr = lsl_dr[lsl_dr.academic_year_start=='2013']
In [79]:
expressive_scores = lsl_dr[(lsl_dr.domain=='Expressive Vocabulary') & (lsl_dr.score>=20)].score
expressive_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');
In [80]:
expressive_lang_scores = lsl_dr[(lsl_dr.domain=='Language') 
                               & (lsl_dr.test_type=='expressive')].score
expressive_lang_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');

Export dataset

In [81]:
if current_year_only:

    lsl_dr.to_csv('lsl_dr_current_year.csv')

else:
    lsl_dr.to_csv('lsl_dr.csv')
In [82]:
lsl_dr.shape
Out[82]:
(36998, 74)
In [83]:
lsl_dr.study_id.unique().shape
Out[83]:
(5511,)
In [84]:
demographic.study_id.unique().shape
Out[84]:
(5511,)

Convert score to floating-point number

In [85]:
lsl_dr.score = lsl_dr.score.astype(float)
In [86]:
lsl_dr['tech_class'] = 'Bimodal'
lsl_dr.loc[lsl_dr.bilateral_ci==True, 'tech_class'] = 'Bilateral CI'
lsl_dr.loc[lsl_dr.bilateral_ha==True, 'tech_class'] = 'Bilateral HA'
In [87]:
lsl_dr['age_year'] = np.floor(lsl_dr.age/12.)
In [88]:
lsl_dr.domain.dropna().unique()
Out[88]:
array(['Expressive Vocabulary', 'Language', 'Articulation',
       'Receptive Vocabulary'], dtype=object)
In [89]:
lsl_dr.groupby('tech_class').prim_lang.mean().round(2)
Out[89]:
tech_class
Bilateral CI    0.43
Bilateral HA    0.59
Bimodal         0.50
Name: prim_lang, dtype: float64
In [90]:
lsl_dr['non_profound'] = lsl_dr.degree_hl<6
In [91]:
lsl_dr.groupby('tech_class').non_profound.mean().round(2)
Out[91]:
tech_class
Bilateral CI    0.08
Bilateral HA    0.86
Bimodal         0.30
Name: non_profound, dtype: float64
In [92]:
f, axes = plt.subplots(2, 2, figsize=(14,10))
for ax, dom in zip(np.ravel(axes), lsl_dr.domain.dropna().unique()):
    plot_data = lsl_dr[lsl_dr.domain==dom].pivot_table(index='age_year', columns='tech_class', values='score', aggfunc='mean')
    plot_data[(plot_data.index>1) & (plot_data.index<7)].plot(ax=ax)
    ax.set_ylim(40, 120)
    ax.set_xticks(range(2,7))
    ax.set_title(dom)
In [93]:
lsl_dr.pivot_table?

Plots of Demographic Data

In [94]:
plot_color = "#64AAE8"
In [161]:
def plot_demo_data(series, labels=None, color=plot_color, rot=0, label_offset=20, xlim=None, 
                   ylim=None, title=None, **kwargs):
    ax = kwargs.get('ax')
    if ax is None:
        f, ax = plt.subplots()
    counts = series.value_counts().sort_index()
    counts.plot(kind='bar', grid=False, rot=rot, color=color, **kwargs)
    if xlim is None:
        ax.set_xlim(-0.5, len(counts)-0.5)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_ylabel('Count')
    if labels is not None:
        ax.set_xticklabels(labels)
    if title:
        ax.set_title(title)
    for i,x in enumerate(counts):
        ax.annotate('%i' % x, (i, x + label_offset))
        
#     plt.gca().tight_layout()
In [162]:
unique_students = demographic.drop_duplicates('study_id')
In [163]:
unique_students.shape
Out[163]:
(5511, 67)
In [164]:
unique_students.age.describe()
Out[164]:
count    5025.000000
mean       30.382886
std        27.944080
min         0.000000
25%         9.000000
50%        25.000000
75%        41.000000
max       298.000000
Name: age, dtype: float64
In [166]:
plot_demo_data(unique_students.male, ('Female', 'Male'), label_offset=20, ylim=(0, 2800), color=plot_color)
In [167]:
plot_demo_data(unique_students.prim_lang, 
               ('English', 'Spanish', 'Chinese', 'French', 'German', 'Tagalong', 'Other'), 
               rot=70, color=plot_color)
In [168]:
unique_students.prim_lang.count()
Out[168]:
4964
In [169]:
plot_demo_data(unique_students.sib, ('1', '2', '3', '4+'), 
               color=plot_color)
In [170]:
unique_students.sib.count()
Out[170]:
4587
In [171]:
amp_ages = ["Birth-3 months", "4 months - 6 months", "7 months - 9 months", "10 months- 12 months", 
"13 months - 18 months", "19 months - 24 months", "2 years 1 day - 3 years", 
"3 years 1 day - 4 years", "4 years 1 day - 5 years", "5 years 1 day - 6 years", "6 years"]

demographic.loc[demographic.age_amp==11, 'age_amp'] = None
print("There are {0} null values for age_amp".format(sum(demographic.age_amp.isnull())))
There are 4883 null values for age_amp
In [172]:
age_classes = pd.Series(pd.cut(unique_students.sort('age_amp').age_amp.dropna(), [-1,3,6,9,12,18,24,36,48,60,72,1000],
                               labels=amp_ages))
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  if __name__ == '__main__':
In [173]:
age_amp_counts = age_classes.value_counts()[amp_ages]
age_amp_counts.plot(kind='bar', grid=False, rot=90, color=plot_color)
plt.xlim(-0.5, len(age_amp_counts)-0.5)
plt.ylabel('Count')
for i,x in enumerate(age_amp_counts):
    plt.annotate('%i' % x, (i, x + 10))
In [104]:
age_amp_counts.sum()
Out[104]:
3355
In [105]:
unique_students.age_amp.max()
Out[105]:
666.0
In [106]:
(unique_students.age_amp/12.).hist(bins=16, grid=False, color=plot_color)
plt.ylabel('Count')
plt.xlabel('Age at amplification')
Out[106]:
<matplotlib.text.Text at 0x109c03518>
In [174]:
plot_demo_data(unique_students.tech_left, tech_cats, rot=90, color=plot_color)
In [175]:
plot_demo_data(unique_students.tech_right, tech_cats, rot=90, color=plot_color, ylim=(0, 2500))
In [176]:
f, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.tech_right, [""]*len(tech_cats), rot=90, 
               ax=axes[0], title='Right ear', color=plot_color, ylim=(0, 2500))
plot_demo_data(unique_students.tech_left, tech_cats, rot=90, 
               ax=axes[1], title='Left ear', color=plot_color)
In [108]:
unique_students.tech_right.count()
Out[108]:
4393
In [109]:
unique_students.tech_left.count()
Out[109]:
4384
In [177]:
degree_hl_cats = 'Normal (0-14)', 'Slight (15-25)', 'Mild (26-40)', \
'Moderate (41-55)', 'Moderately Severe (56-70)', 'Severe (71-90)', 'Profound (90+)'
plot_demo_data(unique_students.degree_hl_ad, degree_hl_cats, rot=90, color=plot_color)
In [178]:
_, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.degree_hl_ad, [""]*7, rot=90, 
               color=plot_color, ax=axes[0], title='Right ear')
plot_demo_data(unique_students.degree_hl_as, degree_hl_cats, rot=90, 
               color=plot_color, ylim=(0,2000), ax=axes[1], title='Left ear');
In [179]:
unique_students.degree_hl_as.count()
Out[179]:
4298
In [180]:
type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
plot_demo_data(unique_students.type_hl_ad, type_hl_cats, rot=90, color=plot_color)
In [181]:
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, color=plot_color)
In [113]:
unique_students.type_hl_ad.count()
Out[113]:
4236
In [114]:
unique_students.type_hl_as.count()
Out[114]:
4320
In [182]:
type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
f, axes = plt.subplots(2,1)
plot_demo_data(unique_students.type_hl_ad, [""]*len(type_hl_cats), rot=90, 
               title='Right ear', ax=axes[0], color=plot_color)
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, 
               title='Left ear', ax=axes[1], color=plot_color)
In [183]:
demographic[demographic.study_id=='1147-2010-0064']
Out[183]:
redcap_event_name academic_year hl male _race prim_lang sib _mother_ed father_ed premature_age ... bilateral_ci bilateral_ha bimodal tech implant_category age_diag sex known_synd synd_or_disab race
13565 initial_assessment_arm_1 2010-2011 0 0 0 0 1 3 3 8 ... False True False 0 6 51 Female 0 0 0
13566 year_1_complete_71_arm_1 2011-2012 0 0 0 0 1 3 3 8 ... False True False 0 6 51 Female 0 0 0
13567 year_2_complete_71_arm_1 2012-2013 0 0 0 0 1 3 3 8 ... False True False 0 6 51 Female 0 0 0
13568 year_3_complete_71_arm_1 2013-2014 0 0 0 0 1 3 3 8 ... False True False 0 6 51 Female 0 0 0

4 rows × 67 columns

In [184]:
receptive[receptive.study_id=='1147-2010-0064']
Out[184]:
study_id redcap_event_name score test_type school age_test domain
13565 1147-2010-0064 initial_assessment_arm_1 96 PPVT 1147 63 Receptive Vocabulary
13566 1147-2010-0064 year_1_complete_71_arm_1 91 PPVT 1147 73 Receptive Vocabulary
13567 1147-2010-0064 year_2_complete_71_arm_1 93 PPVT 1147 85 Receptive Vocabulary
In [116]:
lsl_dr[lsl_dr.study_id=='1147-2010-0064']
Out[116]:
redcap_event_name academic_year hl male _race prim_lang sib _mother_ed father_ed premature_age ... age_test domain school score test_name test_type academic_year_start tech_class age_year non_profound
5777 initial_assessment_arm_1 2010-2011 0 0 0 0 1 3 3 8 ... 63 Expressive Vocabulary 1147 91 NaN EVT 2010 Bilateral HA 4 True
5778 initial_assessment_arm_1 2010-2011 0 0 0 0 1 3 3 8 ... 63 Receptive Vocabulary 1147 96 NaN PPVT 2010 Bilateral HA 4 True
5779 initial_assessment_arm_1 2010-2011 0 0 0 0 1 3 3 8 ... 59 Language 1147 101 PLS receptive 2010 Bilateral HA 4 True
5780 initial_assessment_arm_1 2010-2011 0 0 0 0 1 3 3 8 ... 59 Language 1147 87 PLS expressive 2010 Bilateral HA 4 True
13901 year_1_complete_71_arm_1 2011-2012 0 0 0 0 1 3 3 8 ... 72 Expressive Vocabulary 1147 86 NaN EVT 2011 Bilateral HA 4 True
13902 year_1_complete_71_arm_1 2011-2012 0 0 0 0 1 3 3 8 ... 73 Receptive Vocabulary 1147 91 NaN PPVT 2011 Bilateral HA 4 True
21515 year_2_complete_71_arm_1 2012-2013 0 0 0 0 1 3 3 8 ... 88 Expressive Vocabulary 1147 95 NaN EVT 2012 Bilateral HA 4 True
21516 year_2_complete_71_arm_1 2012-2013 0 0 0 0 1 3 3 8 ... 85 Receptive Vocabulary 1147 93 NaN PPVT 2012 Bilateral HA 4 True
27748 year_3_complete_71_arm_1 2013-2014 0 0 0 0 1 3 3 8 ... NaN NaN NaN NaN NaN NaN 2013 Bilateral HA 4 True

9 rows × 77 columns

In [117]:
unique_students.type_hl_ad.count()
Out[117]:
4236
In [118]:
receptive[receptive.domain=="Receptive Vocabulary"].study_id.unique().shape
Out[118]:
(3021,)
In [119]:
demographic.study_id.unique().shape
Out[119]:
(5511,)
In [120]:
receptive.study_id.unique().shape
Out[120]:
(3021,)
In [121]:
lsl_dr[lsl_dr.domain=="Receptive Vocabulary"].study_id.unique().shape
Out[121]:
(3021,)
In [122]:
receptive_ids = receptive.study_id.unique()
In [123]:
demographic_ids = demographic.study_id.unique()
In [124]:
[s for s in receptive_ids if s not in demographic_ids]
Out[124]:
[]
In [125]:
def score_summary(domain, test_type=None):
    subset = lsl_dr[lsl_dr.domain==domain].copy()
    if test_type is not None:
        subset = subset[subset.test_type==test_type]
    subset['age_test'] = (subset.age_test/12).dropna().astype(int)
    subset.loc[subset.age_test > 11, 'age_test'] = 11
    subset = subset[subset.age_test>1]
    byage = subset.groupby('age_test')
    n = byage.study_id.count()
    mean = byage.score.mean()
    sd = byage.score.std()
    min = byage.score.min()
    max = byage.score.max()
    summary = pd.DataFrame({'Sample Size':n, 'Mean':mean, 
    'SD':sd, 'Min':min, 'Max':max})
    summary.index = summary.index.values.astype(int)
    return summary[['Sample Size','Mean','SD','Min','Max']]
In [126]:
receptive_summary = score_summary("Receptive Vocabulary")
receptive_summary
Out[126]:
Sample Size Mean SD Min Max
2 403 93.265509 18.062536 40 144
3 1397 92.099499 19.389263 0 150
4 1515 90.675248 20.314116 0 149
5 1128 89.859043 18.185493 0 142
6 624 85.669872 16.504194 40 154
7 413 83.053269 16.021892 40 130
8 295 80.610169 17.686631 20 132
9 218 77.885321 17.816542 25 160
10 188 76.303191 17.550191 20 123
11 450 78.668889 19.112355 20 134
In [127]:
receptive_summary.describe()
Out[127]:
Sample Size Mean SD Min Max
count 10.000000 10.000000 10.000000 10.0000 10.000000
mean 663.100000 84.809001 18.064321 20.5000 141.800000
std 496.314069 6.359775 1.291166 16.4063 11.802071
min 188.000000 76.303191 16.021892 0.0000 123.000000
25% 322.000000 79.154209 17.584301 5.0000 132.500000
50% 431.500000 84.361570 17.939539 20.0000 143.000000
75% 1002.000000 90.471196 18.880639 36.2500 149.750000
max 1515.000000 93.265509 20.314116 40.0000 160.000000
In [128]:
receptive_summary['Sample Size'].sum()
Out[128]:
6631
In [129]:
receptive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
Out[129]:
<matplotlib.text.Text at 0x109bac2b0>
In [130]:
expressive_summary = score_summary("Expressive Vocabulary")
expressive_summary
Out[130]:
Sample Size Mean SD Min Max
2 383 92.488251 21.829641 23 141
3 1343 93.293373 21.700653 0 145
4 1492 92.269437 21.873205 0 146
5 1103 91.451496 20.127388 0 145
6 623 86.491172 18.464257 20 146
7 416 83.899038 15.723956 38 131
8 286 84.006993 16.518993 34 122
9 204 81.431373 16.195243 36 145
10 182 81.758242 15.388049 40 122
11 451 84.944568 17.502864 18 146
In [131]:
expressive_summary['Sample Size'].sum()
Out[131]:
6483
In [186]:
expressive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.xlim(-0.5, 9.5)
if current_year_only:
    plt.ylim(0, 400)
else:
    plt.ylim(0, 1600)
In [133]:
articulation_summary = score_summary("Articulation")
articulation_summary
Out[133]:
Sample Size Mean SD Min Max
2 288 85.180556 15.086812 50 122
3 1167 83.655527 18.397543 40 126
4 1333 83.588147 20.725702 0 123
5 1032 83.908915 35.255688 39 999
6 589 79.049236 21.785893 39 112
7 391 80.191816 51.611731 3 999
8 248 79.084677 21.061047 40 107
9 172 81.412791 20.488435 40 108
10 134 81.052239 19.973786 40 105
11 310 84.835484 55.537870 39 999
In [134]:
articulation_summary['Sample Size'].sum()
Out[134]:
5664
In [135]:
sample_size = articulation_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(articulation_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);

Language scores

In [136]:
lsl_dr.domain.unique()
Out[136]:
array(['Expressive Vocabulary', 'Language', nan, 'Articulation',
       'Receptive Vocabulary'], dtype=object)
In [137]:
lsl_dr.test_type.unique()
Out[137]:
array(['EOWPVT', 'receptive', 'expressive', nan, 'Goldman', 'EVT', 'PPVT',
       'Arizonia', 'ROWPVT', 'Arizonia and Goldman', 'EOWPVT and EVT',
       'PPVT and ROWPVT'], dtype=object)
In [138]:
receptive_language_summary = score_summary("Language", "receptive")
receptive_language_summary
Out[138]:
Sample Size Mean SD Min Max
2 942 86.061571 22.053419 50 150
3 1336 84.869760 19.694166 50 144
4 1310 85.103817 19.572003 43 145
5 934 83.780514 18.783587 47 140
6 481 77.860707 17.628083 11 127
7 318 75.877358 18.713363 40 123
8 197 74.817259 19.682871 40 123
9 53 70.792453 21.579333 40 120
10 44 77.954545 20.185137 40 119
11 69 76.014493 21.604393 40 139
In [139]:
receptive_language_summary['Sample Size'].sum()
Out[139]:
5684
In [140]:
sample_size = receptive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_language_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
In [141]:
expressive_language_summary = score_summary("Language", "expressive")
expressive_language_summary
Out[141]:
Sample Size Mean SD Min Max
2 936 88.157051 18.278298 50 150
3 1337 82.311892 17.566191 20 147
4 1303 80.346124 19.558155 45 141
5 952 78.564076 20.089026 45 144
6 499 71.647295 19.240286 6 140
7 338 66.789941 20.660322 40 124
8 202 67.787129 21.338290 40 118
9 52 65.557692 21.233911 40 108
10 44 75.750000 23.544243 40 119
11 68 73.794118 22.807801 40 132
In [142]:
expressive_language_summary['Sample Size'].sum()
Out[142]:
5731
In [143]:
sample_size = expressive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_language_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
In [145]:
(unique_students.age/12.).describe()
Out[145]:
count    5025.000000
mean        2.531907
std         2.328673
min         0.000000
25%         0.750000
50%         2.083333
75%         3.416667
max        24.833333
Name: age, dtype: float64
In [146]:
def calc_difference(x, col='a_fo', jitter=True):
    if (len(x)<2 or x[col].isnull().sum() or x['funct_out_age'].isnull().sum()):
        return None
    diff = x[col][x.funct_out_age.argmax()] - x[col][x.funct_out_age.argmin()]
    if jitter:
        diff += np.random.normal(scale=0.05)
    if (x.funct_out_age.max() - x.funct_out_age.min()) > 1000:
        print(x['funct_out_age'])
    return({'difference':diff, 'months': x.funct_out_age.max() - x.funct_out_age.min()})
In [147]:
audition = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference).dropna().values.tolist())
In [148]:
plt.figure(figsize=(10,6))
plt.scatter(audition.months, audition.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Audition')
Out[148]:
<matplotlib.text.Text at 0x109bee080>
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):
In [149]:
slc = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='slc_fo').dropna().values.tolist())
In [150]:
plt.figure(figsize=(10,6))
plt.scatter(slc.months, slc.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (comprehension)')
Out[150]:
<matplotlib.text.Text at 0x10aaa6da0>
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):
In [151]:
sle = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='sle_fo').dropna().values.tolist())
In [152]:
plt.figure(figsize=(10,6))
plt.scatter(sle.months, sle.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (expression)')
Out[152]:
<matplotlib.text.Text at 0x10aaa2908>
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):
In [153]:
lsl_dr.degree_hl.dropna().value_counts()
Out[153]:
6    16788
4     4377
3     4228
5     4090
2     1667
0     1267
1      285
Name: degree_hl, dtype: int64
In [154]:
ax = lsl_dr.degree_hl.hist(bins=7)
In [155]:
diff = (lsl_dr['age'] - lsl_dr['age_int'])
diff[diff>0].hist(bins=50)
Out[155]:
<matplotlib.axes._subplots.AxesSubplot at 0x10d4d0780>
In [156]:
(lsl_dr.age_int<6).mean()
Out[156]:
0.19517271203848857
In [157]:
(lsl_dr.age<6).mean()
Out[157]:
0.1296826855505703

Counts by year

In [158]:
lsl_dr.groupby('study_id').first()
Out[158]:
redcap_event_name academic_year hl male _race prim_lang sib _mother_ed father_ed premature_age ... age_test domain school score test_name test_type academic_year_start tech_class age_year non_profound
study_id
0101-2003-0101 initial_assessment_arm_1 2002-2003 0 0 0 0 1 6 6 9 ... 54 Expressive Vocabulary 0101 58 PLS EOWPVT 2002 Bimodal 4 False
0101-2003-0102 initial_assessment_arm_1 2003-2004 0 0 0 0 1 2 2 8 ... 44 Articulation 0101 72 PLS Goldman 2003 Bilateral HA 3 True
0101-2004-0101 initial_assessment_arm_1 2006-2007 0 1 0 0 0 6 6 8 ... 37 Receptive Vocabulary 0101 62 PLS PPVT 2006 Bimodal 2 True
0101-2004-0102 initial_assessment_arm_1 2004-2005 0 0 0 0 1 5 6 9 ... NaN NaN NaN NaN NaN NaN 2004 Bimodal 0 True
0101-2004-0103 initial_assessment_arm_1 2012-2013 0 1 0 0 1 4 4 8 ... 96 Expressive Vocabulary 0101 104 CELF-4 EVT 2012 Bilateral CI 0 False
0101-2004-0104 initial_assessment_arm_1 2004-2005 0 1 0 0 1 6 6 8 ... 32 Articulation 0101 84 PLS Goldman 2004 Bilateral HA 0 True
0101-2004-0105 initial_assessment_arm_1 2004-2005 0 0 0 0 2 6 6 9 ... 47 Articulation 0101 78 CELF-P2 Goldman 2004 Bimodal 2 False
0101-2005-0101 initial_assessment_arm_1 2006-2007 0 1 0 0 2 5 4 8 ... 28 Articulation 0101 61 PLS Goldman 2006 Bilateral HA 2 True
0101-2005-0102 initial_assessment_arm_1 2004-2005 0 1 0 0 2 3 2 9 ... 63 Articulation 0101 87 CELF-P2 Goldman 2004 Bilateral HA 4 True
0101-2006-0101 initial_assessment_arm_1 2005-2006 0 0 0 0 1 6 6 8 ... NaN NaN NaN NaN NaN NaN 2005 Bimodal 0 False
0101-2006-0104 initial_assessment_arm_1 2006-2007 0 0 0 0 0 5 5 9 ... NaN NaN NaN NaN NaN NaN 2006 Bilateral CI 2 False
0101-2007-0104 initial_assessment_arm_1 2007-2008 0 0 0 0 1 4 6 9 ... 41 Articulation 0101 122 NaN Goldman 2007 Bimodal 4 True
0101-2007-0105 initial_assessment_arm_1 2007-2008 0 1 0 0 0 6 6 9 ... NaN NaN NaN NaN NaN NaN 2007 Bimodal 11 False
0101-2007-0107 initial_assessment_arm_1 2005-2006 0 0 0 0 1 4 4 8 ... NaN NaN NaN NaN NaN NaN 2005 Bilateral HA 0 True
0101-2008-0102 initial_assessment_arm_1 2008-2009 0 0 0 0 1 6 6 9 ... NaN NaN NaN NaN NaN NaN 2008 Bimodal 14 False
0101-2008-0106 initial_assessment_arm_1 2007-2008 0 0 0 0 1 4 4 8 ... NaN NaN NaN NaN NaN NaN 2007 Bilateral HA 0 True
0101-2009-0101 initial_assessment_arm_1 2008-2009 0 0 0 0 1 6 6 9 ... NaN NaN NaN NaN NaN NaN 2008 Bimodal 6 False
0101-2010-0101 initial_assessment_arm_1 2008-2009 0 1 0 0 1 6 6 9 ... 104 Articulation 0101 90 CELF-4 Arizonia 2008 Bilateral HA 8 True
0101-2010-0103 initial_assessment_arm_1 2010-2011 0 0 0 0 2 4 3 8 ... 25 Language 0101 63 PLS receptive 2010 Bilateral HA 0 False
0101-2010-0104 initial_assessment_arm_1 2010-2011 0 1 3 0 1 2 2 8 ... 30 Expressive Vocabulary 0101 90 PLS EOWPVT 2010 Bilateral HA 0 False
0101-2010-0105 initial_assessment_arm_1 2011-2012 0 1 0 0 0 5 6 6 ... 30 Language 0101 66 PLS receptive 2011 Bilateral CI 2 False
0101-2012-0101 initial_assessment_arm_1 2013-2014 0 1 0 0 0 6 6 9 ... NaN NaN NaN NaN NaN NaN 2013 Bimodal 2 False
0101-2013-0101 initial_assessment_arm_1 2012-2013 0 1 0 0 0 3 2 8 ... 12 Language 0101 58 PLS receptive 2012 Bilateral HA 0 True
0101-2013-0103 initial_assessment_arm_1 2012-2013 0 1 0 0 2 6 6 9 ... NaN NaN NaN NaN NaN NaN 2012 Bilateral CI 4 False
0101-2013-0104 initial_assessment_arm_1 2012-2013 0 1 0 0 1 4 4 8 ... 12 Language 0101 83 PLS receptive 2012 Bilateral HA 0 True
0101-2013-0112 initial_assessment_arm_1 2012-2013 0 1 0 0 1 3 6 9 ... 11 Language 0101 90 PLS receptive 2012 Bilateral HA 0 True
0101-2013-0113 initial_assessment_arm_1 2013-2014 0 0 0 0 1 2 2 8 ... 4 Language 0101 96 PLS receptive 2013 Bimodal 0 True
0101-2013-0114 initial_assessment_arm_1 2013-2014 0 1 0 0 0 3 3 8 ... 6 Language 0101 50 PLS receptive 2013 Bimodal 0 True
0101-2013-0115 initial_assessment_arm_1 2013-2014 0 1 0 0 2 2 2 8 ... 11 Language 0101 79 PLS receptive 2013 Bilateral HA 0 True
0101-2013-0116 initial_assessment_arm_1 2013-2014 2 0 0 0 3 4 4 8 ... NaN NaN NaN NaN NaN NaN 2013 Bimodal 1 False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1151-2012-0008 initial_assessment_arm_1 NaN 0 0 2 1 2 4 2 8 ... NaN NaN NaN NaN NaN NaN nan Bimodal 3 False
1151-2012-0009 initial_assessment_arm_1 NaN 0 1 2 1 1 4 6 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral HA 1 True
1151-2012-0010 initial_assessment_arm_1 NaN 0 1 2 1 1 6 6 8 ... NaN NaN NaN NaN NaN NaN nan Bimodal 2 True
1151-2012-0011 initial_assessment_arm_1 NaN 0 1 2 1 0 6 6 8 ... NaN NaN NaN NaN NaN NaN nan Bimodal 4 False
1151-2012-0012 initial_assessment_arm_1 NaN 0 1 2 1 0 6 6 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral HA 5 False
1151-2012-0013 initial_assessment_arm_1 NaN 0 0 2 1 0 2 2 8 ... NaN NaN NaN NaN NaN NaN nan Bimodal 5 False
1151-2012-0014 initial_assessment_arm_1 NaN 0 0 2 1 2 1 1 NaN ... NaN NaN NaN NaN NaN NaN nan Bimodal NaN False
1151-2013-0001 initial_assessment_arm_1 NaN 0 0 2 1 0 2 2 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral HA 4 False
1151-2013-0002 initial_assessment_arm_1 NaN 0 0 2 1 2 2 3 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral CI 2 False
1151-2013-0003 initial_assessment_arm_1 NaN 0 0 2 1 0 1 1 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral CI 2 False
1151-2013-0004 initial_assessment_arm_1 NaN 0 0 2 1 2 0 1 8 ... NaN NaN NaN NaN NaN NaN nan Bimodal 3 False
1151-2013-0005 initial_assessment_arm_1 NaN 0 1 2 1 0 1 6 8 ... NaN NaN NaN NaN NaN NaN nan Bimodal 3 False
1151-2013-0006 initial_assessment_arm_1 NaN 0 0 2 1 0 1 1 4 ... NaN NaN NaN NaN NaN NaN nan Bilateral HA 3 False
1151-2013-0007 initial_assessment_arm_1 NaN 0 1 2 1 0 4 2 NaN ... NaN NaN NaN NaN NaN NaN nan Bimodal NaN False
1151-2013-0008 initial_assessment_arm_1 NaN 0 1 2 1 0 3 2 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral CI 2 False
1151-2013-0009 initial_assessment_arm_1 NaN 0 1 2 1 0 2 6 NaN ... NaN NaN NaN NaN NaN NaN nan Bimodal NaN False
1151-2013-0010 initial_assessment_arm_1 NaN 0 1 2 1 0 4 4 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral CI 6 False
1151-2013-0011 initial_assessment_arm_1 NaN 0 1 2 1 0 6 6 8 ... NaN NaN NaN NaN NaN NaN nan Bimodal 1 False
1151-2013-0012 initial_assessment_arm_1 NaN 0 1 2 1 2 3 3 5 ... NaN NaN NaN NaN NaN NaN nan Bimodal 3 False
1151-2014-0001 initial_assessment_arm_1 NaN 0 0 2 1 3 6 4 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral CI 2 False
1151-2014-0002 initial_assessment_arm_1 NaN 0 1 2 1 1 2 4 7 ... NaN NaN NaN NaN NaN NaN nan Bilateral HA 3 False
1151-2014-0003 initial_assessment_arm_1 NaN 0 0 2 1 0 6 6 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral HA 3 False
1151-2014-0004 initial_assessment_arm_1 NaN 0 1 2 1 1 2 1 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral HA 3 False
1151-2014-0005 initial_assessment_arm_1 NaN 0 0 2 1 3 6 6 8 ... NaN NaN NaN NaN NaN NaN nan Bimodal 4 False
1151-2014-0006 initial_assessment_arm_1 NaN 0 1 2 1 0 4 0 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral CI 3 False
1151-2014-0007 initial_assessment_arm_1 NaN 0 0 2 1 2 6 6 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral HA 4 False
1151-2014-0008 initial_assessment_arm_1 NaN 0 0 2 1 1 1 1 8 ... NaN NaN NaN NaN NaN NaN nan Bimodal 5 False
1151-2014-0009 initial_assessment_arm_1 NaN 0 1 2 1 2 4 2 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral HA 6 True
1151-2014-0010 initial_assessment_arm_1 NaN 0 1 2 1 1 2 2 8 ... NaN NaN NaN NaN NaN NaN nan Bilateral CI 6 False
9308-2015-0002 initial_assessment_arm_1 NaN 0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN nan Bimodal NaN False

5511 rows × 76 columns

In [159]:
unique_students = lsl_dr.groupby('study_id').first()
unique_students.academic_year_start.value_counts().sort_index()[:-1].plot(kind='bar')
plt.ylabel('Frequency'); plt.xlabel('Academic year');
In [160]:
disab_by_year = unique_students.groupby('academic_year_start')['synd_or_disab'].value_counts().unstack().fillna(0)
disab_by_year.columns = ['No', 'Yes']
disab_by_year[disab_by_year.index!='nan'].plot(kind='bar', stacked=True)
Out[160]:
<matplotlib.axes._subplots.AxesSubplot at 0x1096d47b8>