# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
Connect to database to import data for the three test domains and demographic information:
from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
api_key = open("/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt").read()
lsl_dr_project = Project(api_url, api_key)
metadata = lsl_dr_project.export_metadata()
# for i,j in zip(lsl_dr_project.field_names,
# lsl_dr_project.field_labels):
# print('{0}: \t{1}'.format(i,j))
Import each database from REDCap:
articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss']
articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', df_kwargs={'index_col':None})
records = lsl_dr_project.export_records(fields=articulation_fields)
print(records[0]['study_id'])
0101-2003-0101
expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss']
expressive = lsl_dr_project.export_records(fields=expressive_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[999, 9999]})
receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss']
receptive = lsl_dr_project.export_records(fields=receptive_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[999, 9999]})
language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls',
'owls_lc_ss','owls_oe_ss','age_test_owls',
'celfp_rl_ss','celfp_el_ss','age_test_celp',
'celf_elss','celf_rlss','age_test_celf']
language_raw = lsl_dr_project.export_records(fields=language_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[999, 9999]})
demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year',
'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race',
'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as',
'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae',
'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes',
'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age',
'att_days_hr', 'att_days_sch', 'att_days_st2_417']
demographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[888, 999, 9999]})
demographic_raw[demographic_raw.study_id=='1147-2010-0064']
study_id | redcap_event_name | academic_year | hl | gender | race | prim_lang | sib | mother_ed | father_ed | ... | sle_fo | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
13328 | 1147-2010-0064 | initial_assessment_arm_1 | 2010-2011 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | ... | 3 | 6 | 65 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
13329 | 1147-2010-0064 | year_1_complete_71_arm_1 | 2011-2012 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 3 | 5 | 77 | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
13330 | 1147-2010-0064 | year_2_complete_71_arm_1 | 2012-2013 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 3 | 5 | 89 | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
13331 | 1147-2010-0064 | year_3_complete_71_arm_1 | 2013-2014 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 4 | 5 | 101 | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
4 rows × 46 columns
Several fields in the demographic data have missing values.
demographic_raw.head()
study_id | redcap_event_name | academic_year | hl | gender | race | prim_lang | sib | mother_ed | father_ed | ... | sle_fo | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0101-2003-0101 | initial_assessment_arm_1 | 2002-2003 | 0 | 0 | 0 | 0 | 1 | 6 | 6 | ... | 2 | 2 | 54 | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 0101-2003-0101 | year_1_complete_71_arm_1 | 2003-2004 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 4 | 4 | 80 | 1 | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 0101-2003-0101 | year_2_complete_71_arm_1 | 2004-2005 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 4 | 4 | 80 | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 0101-2003-0101 | year_3_complete_71_arm_1 | 2005-2006 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 5 | 5 | 96 | 3 | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 0101-2003-0101 | year_4_complete_71_arm_1 | 2006-2007 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 5 | 5 | 109 | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 46 columns
We can fill missing values forward from previous observation (by study_id
)
demographic = demographic_raw.sort(columns='redcap_event_name').groupby('study_id').transform(
lambda recs: recs.fillna(method='ffill'))#.reset_index()
demographic["study_id"] = demographic_raw.sort(columns='redcap_event_name').study_id
Random check to make sure this worked
demographic[demographic.study_id=='1147-2010-0064']
redcap_event_name | academic_year | hl | gender | race | prim_lang | sib | mother_ed | father_ed | premature_age | ... | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | study_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
13328 | initial_assessment_arm_1 | 2010-2011 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 6 | 65 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
13329 | year_1_complete_71_arm_1 | 2011-2012 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 5 | 77 | 2 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
13330 | year_2_complete_71_arm_1 | 2012-2013 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 5 | 89 | 2 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
13331 | year_3_complete_71_arm_1 | 2013-2014 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 5 | 101 | 2 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
4 rows × 46 columns
Demographic data without missing values:
demographic.head()
redcap_event_name | academic_year | hl | gender | race | prim_lang | sib | mother_ed | father_ed | premature_age | ... | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | study_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9459 | initial_assessment_arm_1 | 2009-2010 | 0 | 0 | 0 | 0 | 0 | 5 | 4 | 9 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0735-2010-0017 |
9451 | initial_assessment_arm_1 | 2010-2011 | 0 | 0 | 7 | 0 | 2 | 6 | 6 | 7 | ... | 2 | 17 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | 0735-2010-0015 |
9447 | initial_assessment_arm_1 | 2010-2011 | 0 | 0 | 7 | 0 | 2 | 6 | 6 | 7 | ... | 2 | 17 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | 0735-2010-0014 |
9443 | initial_assessment_arm_1 | 2009-2010 | 0 | 1 | 7 | 0 | 1 | 6 | 6 | 8 | ... | 2 | 14 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | 0735-2010-0013 |
5594 | initial_assessment_arm_1 | 2005-2006 | 0 | 1 | 0 | 0 | 3 | 2 | 2 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0521-2006-0025 |
5 rows × 46 columns
5 language measures:
# Test type
language_raw["test_name"] = None
language_raw["test_type"] = None
language_raw["score"] = None
CELP = language_raw.age_test_celp.notnull()
CELF = language_raw.age_test_celf.notnull()
PLS = language_raw.age_test_pls.notnull()
OWLS = language_raw.age_test_owls.notnull()
language_raw['age_test'] = None
language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp
language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf
language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls
language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls
language1 = language_raw[CELP | CELF | PLS | OWLS].copy()
language2 = language1.copy()
language1["test_type"] = "receptive"
language1.loc[CELP, "test_name"] = "CELF-P2"
language1.loc[CELF, "test_name"] = "CELF-4"
language1.loc[PLS, "test_name"] = "PLS"
language1.loc[OWLS, "test_name"] = "OWLS"
language1.loc[CELP, "score"] = language1.celfp_rl_ss
language1.loc[CELF, "score"] = language1.celf_rlss
language1.loc[PLS, "score"] = language1.pls_ac_ss
language1.loc[OWLS, "score"] = language1.owls_lc_ss
language2["test_type"] = "expressive"
language2.loc[CELP, "test_name"] = "CELF-P2"
language2.loc[CELF, "test_name"] = "CELF-4"
language2.loc[PLS, "test_name"] = "PLS"
language2.loc[OWLS, "test_name"] = "OWLS"
language2.loc[CELP, "score"] = language1.celfp_el_ss
language2.loc[CELF, "score"] = language1.celf_elss
language2.loc[PLS, "score"] = language1.pls_ec_ss
language2.loc[OWLS, "score"] = language1.owls_oe_ss
language = pd.concat([language1, language2])
language = language[language.score.notnull()]
print(pd.crosstab(language.test_name, language.test_type))
print("There are {0} null values for score".format(sum(language["score"].isnull())))
test_type expressive receptive test_name CELF-4 591 523 CELF-P2 1357 1363 OWLS 1058 1066 PLS 3349 3359 There are 0 null values for score
A school
variable was added, which is the first four columns of the study_id
:
language["school"] = language.study_id.str.slice(0,4)
language = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test"]]
language["domain"] = "Language"
language.head()
study_id | redcap_event_name | score | test_type | test_name | school | age_test | domain | |
---|---|---|---|---|---|---|---|---|
0 | 0101-2003-0101 | initial_assessment_arm_1 | 51 | receptive | PLS | 0101 | 54 | Language |
5 | 0101-2003-0101 | year_5_complete_71_arm_1 | 61 | receptive | OWLS | 0101 | 113 | Language |
9 | 0101-2003-0102 | initial_assessment_arm_1 | 55 | receptive | PLS | 0101 | 44 | Language |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | 77 | receptive | PLS | 0101 | 54 | Language |
11 | 0101-2003-0102 | year_2_complete_71_arm_1 | 93 | receptive | CELF-P2 | 0101 | 68 | Language |
We converted the articulation dataset into a "long" format:
# Test type
articulation["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
articulation = articulation[ARIZ | GF]
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"
print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))
# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]
Goldman 5008 Arizonia 493 Arizonia and Goldman 73 dtype: int64 There are 0 null values for test_type
A school
variable was added, which is the first four columns of the study_id
:
articulation["school"] = articulation.study_id.str.slice(0,4)
The age was taken to be the Arizonia age if there are both test types:
articulation["age_test"] = articulation.age_test_aaps
articulation.loc[articulation.age_test.isnull(), 'age_test'] = articulation.age_test_gf2[articulation.age_test.isnull()]
print(articulation.age_test.describe())
count 5571.000000 mean 68.695028 std 30.661547 min 23.000000 25% 47.000000 50% 60.000000 75% 80.000000 max 243.000000 Name: age_test, dtype: float64
Finally, we dropped unwanted columns and added a domain identification column for merging:
articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1)
articulation["domain"] = "Articulation"
articulation.head()
study_id | redcap_event_name | test_type | score | school | age_test | domain | |
---|---|---|---|---|---|---|---|
1 | 0101-2003-0101 | year_1_complete_71_arm_1 | Goldman | 78 | 0101 | 80 | Articulation |
9 | 0101-2003-0102 | initial_assessment_arm_1 | Goldman | 72 | 0101 | 44 | Articulation |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | Goldman | 97 | 0101 | 54 | Articulation |
14 | 0101-2004-0101 | year_2_complete_71_arm_1 | Goldman | 75 | 0101 | 53 | Articulation |
15 | 0101-2004-0101 | year_3_complete_71_arm_1 | Goldman | 80 | 0101 | 66 | Articulation |
We excluded unwanted columns and rows for which age, gender or race were missing:
# Retain only subset of columns
#demographic = demographic[demographic.gender.notnull()]
demographic = demographic.rename(columns={'gender':'male'})
Due to sample size considerations, we reduced the non-English primary language variable to English (0) and non-English (1):
demographic["non_english"] = None
demographic.loc[demographic.prim_lang.notnull(), 'non_english'] = demographic.prim_lang[demographic.prim_lang.notnull()]>0
print(demographic.non_english.value_counts())
print("There are {0} null values for non_english".format(sum(demographic.non_english.isnull())))
False 11068 True 2450 dtype: int64 There are 694 null values for non_english
Mother's education (mother_ed
) and father's education (father_ed
) were both recoded to:
Category 6 (unknown) was recoded as missing.
demographic = demographic.rename(columns={"mother_ed":"_mother_ed"})
demographic["mother_ed"] = demographic._mother_ed.copy()
demographic.loc[demographic._mother_ed==1, 'mother_ed'] = 0
demographic.loc[(demographic._mother_ed==2) | (demographic.mother_ed==3), 'mother_ed'] = 1
demographic.loc[demographic._mother_ed==4, 'mother_ed'] = 2
demographic.loc[demographic._mother_ed==5, 'mother_ed'] = 3
demographic.loc[demographic._mother_ed==6, 'mother_ed'] = None
print("_mother_ed:")
print(demographic._mother_ed.value_counts())
print("mother_ed:")
print(demographic.mother_ed.value_counts())
print("\nThere are {0} null values for mother_ed".format(sum(demographic.mother_ed.isnull())))
_mother_ed: 6 4960 4 2876 3 1922 5 1527 2 1320 1 470 0 192 dtype: int64 mother_ed: 1 3242 2 2876 3 1527 0 662 dtype: int64 There are 5905 null values for mother_ed
Secondary diagnosis
demographic.shape
(14212, 48)
demographic['secondary_diagnosis'] = demographic.etiology==0
# Suspected or unknown treated as missing
demographic.loc[demographic.etiology > 1, 'secondary_diagnosis'] = None
demographic.secondary_diagnosis.value_counts()
0 10371 1 2394 dtype: int64
demographic.secondary_diagnosis.mean()
0.18754406580493538
Premature status was recoded to True (premature) and False (full-term). Here, premature indicates <36 weeks.
demographic['premature_weeks'] = demographic.premature_age.copy()
demographic.loc[demographic.premature_age==9, 'premature_weeks'] = None
demographic.premature_weeks = abs(demographic.premature_weeks-8)*2
print("There are {0} null values for premature_weeks".format(sum(demographic.premature_weeks.isnull())))
There are 3410 null values for premature_weeks
demographic.premature_weeks.value_counts()
0 9214 2 554 4 358 12 193 6 178 10 148 8 112 14 42 16 3 dtype: int64
Recode impant technology variables for each ear to one of four categories (None, Baha, Hearing aid, Cochlear implant):
demographic.tech_ad.value_counts()
1 4813 0 4212 7 1462 5 970 2 474 6 413 8 69 9 57 3 27 4 25 dtype: int64
tech_cats = ["None", "OAD", "Hearing aid", "Cochlear", "Other"]
demographic["tech_right"] = 4
demographic.loc[demographic.tech_ad==7, 'tech_right'] = 0
demographic.loc[demographic.tech_ad==3, 'tech_right'] = 1
demographic.loc[demographic.tech_ad.isin([1,2,4,5,10]), 'tech_right'] = 2
demographic.loc[demographic.tech_ad.isin([0,8,6]), 'tech_right'] = 3
demographic.loc[demographic.tech_ad.isnull(), 'tech_right'] = None
demographic["tech_left"] = 4
demographic.loc[demographic.tech_as==7, 'tech_left'] = 0
demographic.loc[demographic.tech_as==3, 'tech_left'] = 1
demographic.loc[demographic.tech_as.isin([1,2,4,5,10]), 'tech_left'] = 2
demographic.loc[demographic.tech_as.isin([0,8,6]), 'tech_left'] = 3
demographic.loc[demographic.tech_as.isnull(), 'tech_left'] = None
demographic.tech_left.value_counts()
2 6360 3 4272 0 1789 4 56 1 19 dtype: int64
demographic.tech_right.value_counts()
2 6282 3 4694 0 1462 4 57 1 27 dtype: int64
Substitute valid missing values for hearing loss:
demographic.loc[demographic.type_hl_ad==5, 'type_hl_ad'] = None
demographic.loc[demographic.type_hl_as==5, 'type_hl_ad'] = None
Create degree_hl
, which is the maximum level of hearing loss in either ear:
demographic["degree_hl"] = np.maximum(demographic.degree_hl_ad, demographic.degree_hl_as)
Create compound indicator variable for each technology (Baha, Hearing aid, Chochlear implant):
demographic.columns
Index(['redcap_event_name', 'academic_year', 'hl', 'male', 'race', 'prim_lang', 'sib', '_mother_ed', 'father_ed', 'premature_age', 'onset_1', 'age_amp', 'age_int', 'age', 'synd_cause', 'etiology', 'etiology_2', 'hearing_changes', 'ae', 'ad_250', 'ad_500', 'degree_hl_ad', 'type_hl_ad', 'tech_ad', 'age_ci', 'as_250', 'as_500', 'degree_hl_as', 'type_hl_as', 'tech_as', 'age_ci_2', 'time', 'age_disenrolled', 'funct_out_age', 'slc_fo', 'sle_fo', 'a_fo', 'fam_age', 'family_inv', 'att_days_sch', 'att_days_st2_417', 'att_days_hr', 'demo_ses', 'school_lunch', 'medicaid', 'study_id', 'non_english', 'mother_ed', 'secondary_diagnosis', 'premature_weeks', 'tech_right', 'tech_left', 'degree_hl'], dtype='object')
demographic["oad"] = 0
demographic.oad = demographic.oad.astype(object)
demographic.loc[(demographic.tech_right==1) | (demographic.tech_left==1), 'oad'] = 1
demographic.loc[(demographic.tech_right==1) & (demographic.tech_left==1), 'oad'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'oad'] = None
print("oad:")
print(demographic.drop_duplicates(subset='study_id').oad.value_counts())
print("There are {0} null values for OAD".format(sum(demographic.oad.isnull())))
demographic["hearing_aid"] = 0
demographic.hearing_aid = demographic.hearing_aid.astype(object)
demographic.loc[(demographic.tech_right==2) | (demographic.tech_left==2), 'hearing_aid'] = 1
demographic.loc[(demographic.tech_right==2) & (demographic.tech_left==2), 'hearing_aid'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_right.isnull()), 'hearing_aid'] = None
print("\nhearing_aid:")
print(demographic.drop_duplicates(subset='study_id').hearing_aid.value_counts())
print("There are {0} null values for hearing_aid".format(sum(demographic.hearing_aid.isnull())))
demographic["cochlear"] = 0
demographic.cochlear = demographic.cochlear.astype(object)
demographic.loc[(demographic.tech_right==3) | (demographic.tech_left==3), 'cochlear'] = 1
demographic.loc[(demographic.tech_right==3) & (demographic.tech_left==3), 'cochlear'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'cochlear'] = None
print("\ncochlear:")
print(demographic.drop_duplicates(subset='study_id').cochlear.value_counts())
print("There are {0} null values for cochlear".format(sum(demographic.cochlear.isnull())))
print(len(demographic))
oad: 0 4384 1 4 2 2 dtype: int64 There are 1639 null values for OAD hearing_aid: 2 2031 0 1593 1 737 dtype: int64 There are 1690 null values for hearing_aid cochlear: 0 2868 2 897 1 625 dtype: int64 There are 1639 null values for cochlear 14212
Identify bilateral and bimodal individuals:
demographic["unilateral_ci"] = demographic.cochlear==1
demographic["bilateral_ci"] = demographic.cochlear==2
demographic["bilateral_ha"] = demographic.hearing_aid==2
demographic["bimodal"] = (demographic.cochlear==1) & (demographic.hearing_aid==1)
demographic.bilateral_ci.sum(), demographic.bilateral_ha.sum(), demographic.bimodal.sum(), demographic.unilateral_ci.sum()
(3445, 5169, 1385, 2076)
demographic.drop_duplicates(subset='study_id')[['unilateral_ci','bilateral_ci',
'bilateral_ha',
'bimodal']].sum()
unilateral_ci 625 bilateral_ci 897 bilateral_ha 2031 bimodal 375 dtype: int64
Create variable that identifies bilateral (0), bilateral HA left (1), bilateral HA right (2)
demographic['tech'] = 0
demographic.loc[(demographic.bimodal) & (demographic.tech_left==2), 'tech'] = 1
demographic.loc[(demographic.bimodal) & (demographic.tech_right==2), 'tech'] = 2
print("There are {0} null values for tech".format(sum(demographic.tech.isnull())))
There are 0 null values for tech
demographic["implant_category"] = None
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==0),
'implant_category'] = 0
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==0),
'implant_category'] = 1
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==1),
'implant_category'] = 2
demographic.loc[(demographic.cochlear==2) & (demographic.hearing_aid==0) & (demographic.oad==0),
'implant_category'] = 3
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==1) & (demographic.oad==0),
'implant_category'] = 4
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==1),
'implant_category'] = 5
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==2) & (demographic.oad==0),
'implant_category'] = 6
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==1),
'implant_category'] = 7
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==2),
'implant_category'] = 8
demographic.implant_category.value_counts()
6 5169 3 3445 4 1385 1 895 0 672 8 14 2 12 7 5 5 1 dtype: int64
Age when hearing loss diagnosed Data are entered inconsistently here, so we have to go in and replace non-numeric values.
demographic.onset_1.unique()
array([ 1. , 18. , 17. , 22. , nan, 3. , 2. , 5. , 13. , 36. , 28. , 21. , 41. , 26. , 0. , 67. , 20. , 24. , 4. , 40. , 60. , 10. , 6. , 25. , 7. , 27. , 15. , 35. , 14. , 42. , 34. , 12. , 9. , 32. , 50. , 8. , 33. , 23. , 11. , 31. , 30. , 49. , 48. , 1.5, 19. , 2.5, 39. , 52. , 16. , 38. , 29. , 51. , 46. , 45. , 54. , 88. , 65. , 44. , 81. , 116. , 72. , 57. , 62. , 43. , 78. , 83. , 61. , 107. , 64. , 74. , 37. , 77. , 96. , 97. , 79. , 47. , 53. , 59. , 84. , 95. , 80. , 0.5, 58. , 56. , 86. , 98. , 85. , 75. , 119. , 66. , 70. , 63. , 140. , 126. , 133. , 103. , 87. , 76. , 55. , 68. , 92. , 71. , 154. , 89. , 152. ])
# Don't need this anymore
# demographic['age_diag'] = demographic.onset_1.replace({'birth': 0, 'R- Birth L-16mo': 0, 'birth - 3': 0, 'at birth': 0, 'NBHS': 0,
# 'at Birth': 0, '1-2': 1.5, '2-3': 2.5, '0-3': 1.5}).astype(float)
demographic['age_diag'] = demographic.onset_1
Number of null values for age_diag
demographic.age_diag.isnull().sum()
3970
demographic['sex'] = demographic.male.replace({0:'Female', 1:'Male'})
import seaborn as sb
unique_students = demographic.dropna(subset=['sex']).groupby('study_id').first()
# ag = sb.factorplot("sex", data=unique_students,
# palette="PuBuGn_d", kind='count')
# ag.set_xticklabels(['Female ({})'.format((unique_students.male==0).sum()),
# 'Male ({})'.format((unique_students.male==1).sum())])
# ag.set_xlabels('')
Child has another diagnosed disability
demographic['known_synd'] = (demographic.synd_cause == 0)
# Unknown or suspected
demographic.loc[demographic.synd_cause > 1, 'known_synd'] = None
# If either known syndrome or secondary diagnosis
demographic['synd_or_disab'] = demographic.apply(lambda x: x['secondary_diagnosis'] or x['known_synd'], axis=1)
Missing sibling counts were properly encoded as None
(missing).
demographic.loc[demographic.sib==4, 'sib'] = None
We reduced the number of race categories, pooling those that were neither caucasian, black, hispanic or asian to "other", due to small sample sizes for these categories. Category 7 (unknown) was recoded as missing.
races = ["Caucasian", "Black or African American", "Hispanic or Latino", "Asian", "Other"]
demographic = demographic.rename(columns={"race":"_race"})
demographic["race"] = demographic._race.copy()
demographic.loc[demographic.race==7, 'race'] = None
demographic.loc[demographic.race>3, 'race'] = 4
print("_race:")
print(demographic._race.value_counts())
print("race:")
print(demographic.race.value_counts())
print("There are {0} null values for race".format(sum(demographic.race.isnull())))
# Replace with recoded column
_race: 0 7442 2 2374 1 1287 3 1006 6 691 8 519 7 240 4 64 5 28 dtype: int64 race: 0 7442 2 2374 4 1302 1 1287 3 1006 dtype: int64 There are 801 null values for race
Recode implant technology variables
tech_cats = ["None", "Baha", "Hearing aid", "Cochlear", "Other"]
demographic["tech_right"] = demographic.tech_ad.copy()
demographic.loc[demographic.tech_right==6, 'tech_right'] = 0
demographic.loc[demographic.tech_right==4, 'tech_right'] = 1
demographic.loc[demographic.tech_right==5, 'tech_right'] = 1
demographic.loc[demographic.tech_right==3, 'tech_right'] = 2
demographic.loc[demographic.tech_right==7, 'tech_right'] = 3
demographic.loc[demographic.tech_right==8, 'tech_right'] = 3
demographic.loc[demographic.tech_right==9, 'tech_right'] = 4
demographic.tech_right = np.abs(demographic.tech_right - 3)
demographic["tech_left"] = demographic.tech_as.copy()
demographic.loc[demographic.tech_left==6, 'tech_left'] = 0
demographic.loc[demographic.tech_left==4, 'tech_left'] = 1
demographic.loc[demographic.tech_left==5, 'tech_left'] = 1
demographic.loc[demographic.tech_left==3, 'tech_left'] = 2
demographic.loc[demographic.tech_left==7, 'tech_left'] = 3
demographic.loc[demographic.tech_left==8, 'tech_left'] = 3
demographic.loc[demographic.tech_left==9, 'tech_left'] = 4
demographic.tech_left = np.abs(demographic.tech_left - 3)
# Don't need this anymore
# demographic['age_amp'] = demographic.age_amp.replace({'22 mo': 22, 'unknown': np.nan, 'none': np.nan,
# 'n/a unilateral loss': np.nan, 'not amplified yet': np.nan,
# 'not amplified': np.nan, 'n/a': np.nan, '4 months-6 months': 5,
# '6 weeks': 0.5, '13-18': 15.5, '0-3': 1.5, '24-36': 30}).astype(float)
demographic.academic_year.replace(
{'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011',
'2020-2011': '2010-2011', '2012-20013': '2012-2013',
'642014-2015': '2014-2015', '20114-2015': '2014-2015',
'2011-012': '2011-2012',
'0000-0000': np.nan}).str.replace('*', '-').unique()
array(['2009-2010', '2010-2011', '2005-2006', nan, '2007-2008', '2006-2007', '2008-2009', '2003-2004', '2012-2013', '2013-2014', '2002-2003', '2011-2012', '2014-2015', 'June 2014', '2004-2005', '1997-1998', '2006-2007 ', '1998-1999', '2000-2001', '2001-2002', '2103-2014', '1999-2000', '2012', '2009-2011', '2015-2016', '1995-1996', '2014', '2013', '2011', '2010', '2009', ' 2010-2011', ' 2014-2015', '2012=2013', '2015', '2015-2015', '2009 - 2010', '2010 - 2011', '2011 - 2012', '2014-2105', '2014-2015 ', '65', '2005-2004', '2012 - 2013', '2014-205', '2013 - 2014', '2017-2015', '2014-1015', '2012-2013 '], dtype=object)
demographic['academic_year'] = demographic.academic_year.replace(
{'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011',
'2020-2011': '2010-2011', '2012-20013': '2012-2013',
'642014-2015': '2014-2015', '20114-2015': '2014-2015',
'2011-012': '2011-2012', '2014-2105': '2014-2015', '2005-2004': '2004-2005',
'2014-205': '2014-2015', '2017-2015': '2014-2015', '2014-1015': '2014-2015',
'2015-2015': '2014-2015', '2009-2011': '2009-2010',
'0000-0000': np.nan}).str.replace('*', '-')
Removed entries that don't contain dashes
demographic.loc[~(demographic.academic_year.notnull() & demographic.academic_year.str.contains('-')),
'academic_year'] = np.nan
demographic.loc[demographic.academic_year.notnull(), 'academic_year'] = demographic.academic_year[demographic.academic_year.notnull()].apply(lambda x: ''.join(x.split()))
demographic.age_amp.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x114b379b0>
We converted the expressive vocabulary dataset to "long" format:
# Test type
expressive["test_type"] = None
EOWPVT = expressive.eowpvt_ss.notnull()
EVT = expressive.evt_ss.notnull()
expressive = expressive[EOWPVT | EVT]
expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT"
expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT"
expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT"
print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull())))
expressive["score"] = expressive.eowpvt_ss
expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT]
There are 0 null values for test_type
expressive.test_type.value_counts()
EVT 3642 EOWPVT 2624 EOWPVT and EVT 147 dtype: int64
A school
variable was added, which is the first four columns of the study_id
:
expressive["school"] = expressive.study_id.str.slice(0,4)
The age was taken to be the EOWPVT age if there are both test types:
expressive["age_test"] = expressive.age_test_eowpvt
expressive.loc[expressive.age_test.isnull(), 'age_test'] = expressive.age_test_evt[expressive.age_test.isnull()]
Finally, we dropped unwanted columns and added a domain identification column for merging:
expressive = expressive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
expressive["domain"] = "Expressive Vocabulary"
expressive.head()
study_id | redcap_event_name | score | test_type | school | age_test | domain | |
---|---|---|---|---|---|---|---|
0 | 0101-2003-0101 | initial_assessment_arm_1 | 58 | EOWPVT | 0101 | 54 | Expressive Vocabulary |
2 | 0101-2003-0101 | year_2_complete_71_arm_1 | 84 | EOWPVT | 0101 | 80 | Expressive Vocabulary |
5 | 0101-2003-0101 | year_5_complete_71_arm_1 | 90 | EOWPVT | 0101 | 113 | Expressive Vocabulary |
14 | 0101-2004-0101 | year_2_complete_71_arm_1 | 90 | EOWPVT | 0101 | 53 | Expressive Vocabulary |
15 | 0101-2004-0101 | year_3_complete_71_arm_1 | 87 | EOWPVT | 0101 | 66 | Expressive Vocabulary |
We converted the receptive vocabulary data table to "long" format:
receptive.columns
Index(['study_id', 'redcap_event_name', 'age_test_ppvt', 'ppvt_ss', 'age_test_rowpvt', 'rowpvt_ss'], dtype='object')
# Test type
receptive["test_type"] = None
PPVT = receptive.ppvt_ss.notnull()
ROWPVT = receptive.rowpvt_ss.notnull()
receptive = receptive[PPVT | ROWPVT]
receptive.loc[PPVT & ROWPVT, "test_type"] = "PPVT and ROWPVT"
receptive.loc[PPVT & ~ROWPVT, "test_type"] = "PPVT"
receptive.loc[~PPVT & ROWPVT, "test_type"] = "ROWPVT"
print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull())))
receptive["score"] = receptive.ppvt_ss
receptive.loc[~PPVT & ROWPVT, "score"] = receptive.rowpvt_ss[~PPVT & ROWPVT]
There are 0 null values for test_type
A school
variable was added, which is the first four columns of the study_id
:
receptive["school"] = receptive.study_id.str.slice(0,4)
The age was taken to be the PPVT age if there are both test types:
receptive["age_test"] = receptive.age_test_ppvt
receptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()]
print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull())))
There are 27 null values for age_test
Finally, we dropped unwanted columns and added a domain identification column for merging:
receptive = receptive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
receptive["domain"] = "Receptive Vocabulary"
receptive.head()
study_id | redcap_event_name | score | test_type | school | age_test | domain | |
---|---|---|---|---|---|---|---|
2 | 0101-2003-0101 | year_2_complete_71_arm_1 | 90 | PPVT | 0101 | 80 | Receptive Vocabulary |
5 | 0101-2003-0101 | year_5_complete_71_arm_1 | 101 | ROWPVT | 0101 | 113 | Receptive Vocabulary |
9 | 0101-2003-0102 | initial_assessment_arm_1 | 55 | PPVT | 0101 | 44 | Receptive Vocabulary |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | 80 | PPVT | 0101 | 54 | Receptive Vocabulary |
11 | 0101-2003-0102 | year_2_complete_71_arm_1 | 101 | PPVT | 0101 | 68 | Receptive Vocabulary |
receptive.study_id.unique().shape
(2959,)
The four datasets were mereged into a single table. First, we concatenate the test scores data:
test_scores = pd.concat([articulation, expressive, receptive, language])
Then we perform a merge between the demographic data and the test scores data:
lsl_dr = pd.merge(demographic, test_scores, on=["study_id", "redcap_event_name"], how='left')
lsl_dr.tail()
redcap_event_name | academic_year | hl | male | _race | prim_lang | sib | _mother_ed | father_ed | premature_age | ... | sex | known_synd | synd_or_disab | race | age_test | domain | school | score | test_name | test_type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
36577 | year_9_complete_71_arm_1 | 2012-2013 | 0 | 0 | 8 | 0 | 2 | 6 | 4 | 8 | ... | Female | 0 | 0 | 4 | 136 | Expressive Vocabulary | 1147 | 92 | NaN | EVT |
36578 | year_9_complete_71_arm_1 | 2012-2013 | 0 | 0 | 8 | 0 | 2 | 6 | 4 | 8 | ... | Female | 0 | 0 | 4 | 136 | Receptive Vocabulary | 1147 | 84 | NaN | PPVT |
36579 | year_9_complete_71_arm_1 | 2012-2013 | 0 | 0 | 0 | 0 | 3 | 6 | 6 | 9 | ... | Female | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
36580 | year_9_complete_71_arm_1 | 2013-2014 | 0 | 1 | 0 | 0 | 2 | NaN | 6 | 8 | ... | Male | 0 | 0 | 0 | 185 | Expressive Vocabulary | 1147 | 125 | NaN | EVT |
36581 | year_9_complete_71_arm_1 | 2013-2014 | 0 | 1 | 0 | 0 | 2 | NaN | 6 | 8 | ... | Male | 0 | 0 | 0 | 186 | Receptive Vocabulary | 1147 | 101 | NaN | PPVT |
5 rows × 73 columns
lsl_dr['academic_year_start'] = lsl_dr.academic_year.apply(lambda x: str(x).strip()[:4])
lsl_dr.academic_year_start.value_counts()
2013 6924 2012 6628 2014 5480 2011 5205 2010 4419 nan 3177 2009 2358 2008 825 2007 531 2006 343 2005 285 2004 173 2003 90 2002 47 2001 35 1998 16 1999 15 2000 12 2015 11 1997 6 1995 1 2103 1 dtype: int64
current_year_only = False
if current_year_only:
lsl_dr = lsl_dr[lsl_dr.academic_year_start=='2013']
expressive_scores = lsl_dr[(lsl_dr.domain=='Expressive Vocabulary') & (lsl_dr.score>=20)].score
expressive_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');
expressive_lang_scores = lsl_dr[(lsl_dr.domain=='Language')
& (lsl_dr.test_type=='expressive')].score
expressive_lang_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');
Export dataset
if current_year_only:
lsl_dr.to_csv('lsl_dr_current_year.csv')
else:
lsl_dr.to_csv('lsl_dr.csv')
lsl_dr.shape
(36582, 74)
lsl_dr.study_id.unique().shape
(5440,)
demographic.study_id.unique().shape
(5440,)
Convert score to floating-point number
lsl_dr.score = lsl_dr.score.astype(float)
lsl_dr['tech_class'] = 'Bimodal'
lsl_dr.loc[lsl_dr.bilateral_ci==True, 'tech_class'] = 'Bilateral CI'
lsl_dr.loc[lsl_dr.bilateral_ha==True, 'tech_class'] = 'Bilateral HA'
lsl_dr['age_year'] = np.floor(lsl_dr.age/12.)
lsl_dr.domain.dropna().unique()
array(['Articulation', 'Expressive Vocabulary', 'Language', 'Receptive Vocabulary'], dtype=object)
lsl_dr.groupby('tech_class').prim_lang.mean().round(2)
tech_class Bilateral CI 0.43 Bilateral HA 0.58 Bimodal 0.51 Name: prim_lang, dtype: float64
lsl_dr['non_profound'] = lsl_dr.degree_hl<6
lsl_dr.groupby('tech_class').non_profound.mean().round(2)
tech_class Bilateral CI 0.08 Bilateral HA 0.86 Bimodal 0.30 Name: non_profound, dtype: float64
f, axes = plt.subplots(2, 2, figsize=(14,10))
for ax, dom in zip(np.ravel(axes), lsl_dr.domain.dropna().unique()):
plot_data = lsl_dr[lsl_dr.domain==dom].pivot_table(index='age_year', columns='tech_class', values='score', aggfunc='mean')
plot_data[(plot_data.index>1) & (plot_data.index<7)].plot(ax=ax)
ax.set_ylim(40, 120)
ax.set_xticks(range(2,7))
ax.set_title(dom)
lsl_dr.pivot_table?
plot_color = "#64AAE8"
def plot_demo_data(series, labels=None, color=plot_color, rot=0, label_offset=20, xlim=None,
ylim=None, title=None, **kwargs):
ax = kwargs.get('ax')
if ax is None:
f, ax = plt.subplots()
counts = series.value_counts().sort_index(1)
counts.plot(kind='bar', grid=False, rot=rot, color=color, **kwargs)
if xlim is None:
ax.set_xlim(-0.5, len(counts)-0.5)
if ylim is not None:
ax.set_ylim(*ylim)
ax.set_ylabel('Count')
if labels is not None:
ax.set_xticklabels(labels)
if title:
ax.set_title(title)
for i,x in enumerate(counts):
ax.annotate('%i' % x, (i, x + label_offset))
# plt.gca().tight_layout()
unique_students = demographic.drop_duplicates('study_id')
unique_students.shape
(5440, 67)
unique_students.age.describe()
count 4949.000000 mean 30.353708 std 28.072288 min 0.000000 25% 9.000000 50% 25.000000 75% 41.000000 max 298.000000 Name: age, dtype: float64
plot_demo_data(unique_students.male, ('Female', 'Male'), label_offset=20, ylim=(0, 2600), color=plot_color)
plot_demo_data(unique_students.prim_lang,
('English', 'Spanish', 'Chinese', 'French', 'German', 'Tagalong', 'Other'),
rot=70, color=plot_color)
unique_students.prim_lang.count()
4889
plot_demo_data(unique_students.sib, ('1', '2', '3', '4+'),
color=plot_color)
unique_students.sib.count()
4515
amp_ages = ["Birth-3 months", "4 months - 6 months", "7 months - 9 months", "10 months- 12 months",
"13 months - 18 months", "19 months - 24 months", "2 years 1 day - 3 years",
"3 years 1 day - 4 years", "4 years 1 day - 5 years", "5 years 1 day - 6 years", "6 years"]
demographic.loc[demographic.age_amp==11, 'age_amp'] = None
print("There are {0} null values for age_amp".format(sum(demographic.age_amp.isnull())))
There are 4857 null values for age_amp
age_classes = pd.Series(pd.cut(unique_students.sort('age_amp').age_amp.dropna(), [-1,3,6,9,12,18,24,36,48,60,72,1000],
labels=amp_ages))
age_amp_counts = age_classes.value_counts()[amp_ages]
age_amp_counts.plot(kind='bar', grid=False, rot=90, color=plot_color)
plt.xlim(-0.5, len(age_amp_counts)-0.5)
plt.ylabel('Count')
for i,x in enumerate(age_amp_counts):
plt.annotate('%i' % x, (i, x + 10))
age_amp_counts.sum()
3316
unique_students.age_amp.max()
666.0
(unique_students.age_amp/12.).hist(bins=16, grid=False, color=plot_color)
plt.ylabel('Count')
plt.xlabel('Age at amplification')
<matplotlib.text.Text at 0x10f927e80>
plot_demo_data(unique_students.tech_left, tech_cats, rot=90, color=plot_color)
plot_demo_data(unique_students.tech_right, tech_cats, rot=90, color=plot_color, ylim=(0, 2500))
f, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.tech_right, [""]*len(tech_cats), rot=90,
ax=axes[0], title='Right ear', color=plot_color, ylim=(0, 2500))
plot_demo_data(unique_students.tech_left, tech_cats, rot=90,
ax=axes[1], title='Left ear', color=plot_color)
unique_students.tech_right.count()
4361
unique_students.tech_left.count()
4352
degree_hl_cats = 'Normal (0-14)', 'Slight (15-25)', 'Mild (26-40)', \
'Moderate (41-55)', 'Moderately Severe (56-70)', 'Severe (71-90)', 'Profound (90+)'
plot_demo_data(unique_students.degree_hl_ad, degree_hl_cats, rot=90, color=plot_color)
_, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.degree_hl_ad, [""]*7, rot=90,
color=plot_color, ax=axes[0], title='Right ear')
plot_demo_data(unique_students.degree_hl_as, degree_hl_cats, rot=90,
color=plot_color, ylim=(0,2000), ax=axes[1], title='Left ear');
unique_students.degree_hl_as.count()
4265
type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
plot_demo_data(unique_students.type_hl_ad, type_hl_cats, rot=90, color=plot_color)
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, color=plot_color)
unique_students.type_hl_ad.count()
4201
unique_students.type_hl_as.count()
4286
type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
f, axes = plt.subplots(2,1)
plot_demo_data(unique_students.type_hl_ad, [""]*len(type_hl_cats), rot=90,
title='Right ear', ax=axes[0], color=plot_color)
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90,
title='Left ear', ax=axes[1], color=plot_color)
demographic[demographic.study_id=='1147-2010-0064']
redcap_event_name | academic_year | hl | male | _race | prim_lang | sib | _mother_ed | father_ed | premature_age | ... | bilateral_ci | bilateral_ha | bimodal | tech | implant_category | age_diag | sex | known_synd | synd_or_disab | race | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
13328 | initial_assessment_arm_1 | 2010-2011 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | False | True | False | 0 | 6 | 51 | Female | 0 | 0 | 0 |
13329 | year_1_complete_71_arm_1 | 2011-2012 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | False | True | False | 0 | 6 | 51 | Female | 0 | 0 | 0 |
13330 | year_2_complete_71_arm_1 | 2012-2013 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | False | True | False | 0 | 6 | 51 | Female | 0 | 0 | 0 |
13331 | year_3_complete_71_arm_1 | 2013-2014 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | False | True | False | 0 | 6 | 51 | Female | 0 | 0 | 0 |
4 rows × 67 columns
receptive[receptive.study_id=='1147-2010-0064']
study_id | redcap_event_name | score | test_type | school | age_test | domain | |
---|---|---|---|---|---|---|---|
13328 | 1147-2010-0064 | initial_assessment_arm_1 | 96 | PPVT | 1147 | 63 | Receptive Vocabulary |
13329 | 1147-2010-0064 | year_1_complete_71_arm_1 | 91 | PPVT | 1147 | 73 | Receptive Vocabulary |
13330 | 1147-2010-0064 | year_2_complete_71_arm_1 | 93 | PPVT | 1147 | 85 | Receptive Vocabulary |
lsl_dr[lsl_dr.study_id=='1147-2010-0064']
redcap_event_name | academic_year | hl | male | _race | prim_lang | sib | _mother_ed | father_ed | premature_age | ... | age_test | domain | school | score | test_name | test_type | academic_year_start | tech_class | age_year | non_profound | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
8588 | initial_assessment_arm_1 | 2010-2011 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 63 | Expressive Vocabulary | 1147 | 91 | NaN | EVT | 2010 | Bilateral HA | 4 | True |
8589 | initial_assessment_arm_1 | 2010-2011 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 63 | Receptive Vocabulary | 1147 | 96 | NaN | PPVT | 2010 | Bilateral HA | 4 | True |
8590 | initial_assessment_arm_1 | 2010-2011 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 59 | Language | 1147 | 101 | PLS | receptive | 2010 | Bilateral HA | 4 | True |
8591 | initial_assessment_arm_1 | 2010-2011 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 59 | Language | 1147 | 87 | PLS | expressive | 2010 | Bilateral HA | 4 | True |
19250 | year_1_complete_71_arm_1 | 2011-2012 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 72 | Expressive Vocabulary | 1147 | 86 | NaN | EVT | 2011 | Bilateral HA | 4 | True |
19251 | year_1_complete_71_arm_1 | 2011-2012 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 73 | Receptive Vocabulary | 1147 | 91 | NaN | PPVT | 2011 | Bilateral HA | 4 | True |
20546 | year_2_complete_71_arm_1 | 2012-2013 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 88 | Expressive Vocabulary | 1147 | 95 | NaN | EVT | 2012 | Bilateral HA | 4 | True |
20547 | year_2_complete_71_arm_1 | 2012-2013 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 85 | Receptive Vocabulary | 1147 | 93 | NaN | PPVT | 2012 | Bilateral HA | 4 | True |
29121 | year_3_complete_71_arm_1 | 2013-2014 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2013 | Bilateral HA | 4 | True |
9 rows × 77 columns
unique_students.type_hl_ad.count()
4201
receptive[receptive.domain=="Receptive Vocabulary"].study_id.unique().shape
(2959,)
demographic.study_id.unique().shape
(5440,)
receptive.study_id.unique().shape
(2959,)
lsl_dr[lsl_dr.domain=="Receptive Vocabulary"].study_id.unique().shape
(2959,)
receptive_ids = receptive.study_id.unique()
demographic_ids = demographic.study_id.unique()
[s for s in receptive_ids if s not in demographic_ids]
[]
def score_summary(domain, test_type=None):
subset = lsl_dr[lsl_dr.domain==domain].copy()
if test_type is not None:
subset = subset[subset.test_type==test_type]
subset['age_test'] = (subset.age_test/12).dropna().astype(int)
subset.loc[subset.age_test > 11, 'age_test'] = 11
subset = subset[subset.age_test>1]
byage = subset.groupby('age_test')
n = byage.study_id.count()
mean = byage.score.mean()
sd = byage.score.std()
min = byage.score.min()
max = byage.score.max()
summary = pd.DataFrame({'Sample Size':n, 'Mean':mean,
'SD':sd, 'Min':min, 'Max':max})
summary.index = summary.index.values.astype(int)
return summary[['Sample Size','Mean','SD','Min','Max']]
receptive_summary = score_summary("Receptive Vocabulary")
receptive_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 395 | 93.101266 | 18.115391 | 40 | 144 |
3 | 1363 | 91.903155 | 19.384986 | 0 | 150 |
4 | 1497 | 90.636607 | 20.373490 | 0 | 149 |
5 | 1122 | 89.802139 | 18.191470 | 0 | 142 |
6 | 620 | 85.562903 | 16.478926 | 40 | 154 |
7 | 411 | 82.951338 | 15.993827 | 40 | 130 |
8 | 292 | 80.458904 | 17.713500 | 20 | 132 |
9 | 216 | 77.805556 | 17.770086 | 25 | 160 |
10 | 185 | 75.875676 | 17.296981 | 20 | 123 |
11 | 449 | 78.632517 | 19.118076 | 20 | 134 |
receptive_summary.describe()
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
count | 10.000000 | 10.000000 | 10.000000 | 10.0000 | 10.000000 |
mean | 655.000000 | 84.673006 | 18.043673 | 20.5000 | 141.800000 |
std | 488.137503 | 6.392185 | 1.324983 | 16.4063 | 11.802071 |
min | 185.000000 | 75.875676 | 15.993827 | 0.0000 | 123.000000 |
25% | 317.750000 | 79.089114 | 17.401111 | 5.0000 | 132.500000 |
50% | 430.000000 | 84.257121 | 17.942739 | 20.0000 | 143.000000 |
75% | 996.500000 | 90.427990 | 18.886424 | 36.2500 | 149.750000 |
max | 1497.000000 | 93.101266 | 20.373490 | 40.0000 | 160.000000 |
receptive_summary['Sample Size'].sum()
6550
receptive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
<matplotlib.text.Text at 0x10f8e2a58>
expressive_summary = score_summary("Expressive Vocabulary")
expressive_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 375 | 92.362667 | 22.025792 | 23 | 141 |
3 | 1310 | 93.083206 | 21.785851 | 0 | 145 |
4 | 1473 | 92.156144 | 21.912841 | 0 | 146 |
5 | 1097 | 91.375570 | 20.134114 | 0 | 145 |
6 | 618 | 86.344660 | 18.457169 | 20 | 146 |
7 | 414 | 83.814010 | 15.712203 | 38 | 131 |
8 | 283 | 83.893993 | 16.552176 | 34 | 122 |
9 | 202 | 81.321782 | 16.143647 | 36 | 145 |
10 | 179 | 81.564246 | 15.327513 | 40 | 122 |
11 | 450 | 84.935556 | 17.521297 | 18 | 146 |
expressive_summary['Sample Size'].sum()
6401
expressive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.xlim(-0.5, 9.5)
if current_year_only:
plt.ylim(0, 400)
else:
plt.ylim(0, 1400)
articulation_summary = score_summary("Articulation")
articulation_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 282 | 85.124113 | 15.168643 | 50 | 122 |
3 | 1131 | 83.458002 | 18.272973 | 40 | 126 |
4 | 1309 | 83.409473 | 20.701802 | 0 | 121 |
5 | 1026 | 83.880117 | 35.350375 | 39 | 999 |
6 | 583 | 78.895369 | 21.792075 | 39 | 112 |
7 | 389 | 80.095116 | 51.718446 | 3 | 999 |
8 | 244 | 78.959016 | 21.168498 | 40 | 107 |
9 | 169 | 81.118343 | 20.524903 | 40 | 108 |
10 | 131 | 80.954198 | 20.111177 | 40 | 105 |
11 | 305 | 81.947541 | 19.283989 | 39 | 105 |
articulation_summary['Sample Size'].sum()
5569
sample_size = articulation_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(articulation_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);
Language scores
lsl_dr.domain.unique()
array([nan, 'Articulation', 'Expressive Vocabulary', 'Language', 'Receptive Vocabulary'], dtype=object)
lsl_dr.test_type.unique()
array([nan, 'Goldman', 'EVT', 'receptive', 'expressive', 'EOWPVT', 'ROWPVT', 'PPVT', 'EOWPVT and EVT', 'Arizonia', 'Arizonia and Goldman', 'PPVT and ROWPVT'], dtype=object)
receptive_language_summary = score_summary("Language", "receptive")
receptive_language_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 932 | 85.894850 | 21.996331 | 50 | 150 |
3 | 1315 | 84.855513 | 19.730693 | 50 | 144 |
4 | 1301 | 85.104535 | 19.598740 | 43 | 145 |
5 | 930 | 83.733333 | 18.762635 | 47 | 140 |
6 | 479 | 77.805846 | 17.642483 | 11 | 127 |
7 | 318 | 75.877358 | 18.713363 | 40 | 123 |
8 | 195 | 74.641026 | 19.685003 | 40 | 123 |
9 | 52 | 69.846154 | 20.649631 | 40 | 109 |
10 | 42 | 77.000000 | 20.167591 | 40 | 119 |
11 | 68 | 75.602941 | 21.490810 | 40 | 139 |
receptive_language_summary['Sample Size'].sum()
5632
sample_size = receptive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_language_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
expressive_language_summary = score_summary("Language", "expressive")
expressive_language_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 926 | 88.035637 | 18.268384 | 50 | 150 |
3 | 1316 | 82.285714 | 17.541316 | 20 | 147 |
4 | 1293 | 80.304718 | 19.572599 | 45 | 141 |
5 | 948 | 78.530591 | 20.053469 | 45 | 144 |
6 | 496 | 71.512097 | 19.214889 | 6 | 140 |
7 | 338 | 66.789941 | 20.660322 | 40 | 124 |
8 | 200 | 67.520000 | 21.275168 | 40 | 118 |
9 | 51 | 64.725490 | 20.570929 | 40 | 106 |
10 | 42 | 74.595238 | 23.463578 | 40 | 119 |
11 | 67 | 73.417910 | 22.766369 | 40 | 132 |
expressive_language_summary['Sample Size'].sum()
5677
sample_size = expressive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_language_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
(unique_students.age/12.).hist(grid=False, bins=np.sqrt(unique_students.shape[0]))
plt.ylabel('Count')
plt.xlabel('Age at enrollment')
<matplotlib.text.Text at 0x10f933d30>
(unique_students.age/12.).describe()
count 4949.000000 mean 2.529476 std 2.339357 min 0.000000 25% 0.750000 50% 2.083333 75% 3.416667 max 24.833333 Name: age, dtype: float64
def calc_difference(x, col='a_fo', jitter=True):
if (len(x)<2 or x[col].isnull().sum() or x['funct_out_age'].isnull().sum()):
return None
diff = x[col][x.funct_out_age.argmax()] - x[col][x.funct_out_age.argmin()]
if jitter:
diff += np.random.normal(scale=0.05)
if (x.funct_out_age.max() - x.funct_out_age.min()) > 1000:
print(x['funct_out_age'])
return({'difference':diff, 'months': x.funct_out_age.max() - x.funct_out_age.min()})
audition = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference).dropna().values.tolist())
plt.figure(figsize=(10,6))
plt.scatter(audition.months, audition.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Audition')
<matplotlib.text.Text at 0x10f8dd5c0>
slc = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='slc_fo').dropna().values.tolist())
plt.figure(figsize=(10,6))
plt.scatter(slc.months, slc.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (comprehension)')
<matplotlib.text.Text at 0x110ea1da0>
sle = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='sle_fo').dropna().values.tolist())
plt.figure(figsize=(10,6))
plt.scatter(sle.months, sle.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (expression)')
<matplotlib.text.Text at 0x10f94ab00>
lsl_dr.degree_hl.dropna().value_counts()
6 16633 4 4334 3 4159 5 4058 2 1653 0 1270 1 281 dtype: int64
ax = lsl_dr.degree_hl.hist(bins=7)
diff = (lsl_dr['age'] - lsl_dr['age_int'])
diff[diff>0].hist(bins=50)
<matplotlib.axes._subplots.AxesSubplot at 0x10e5722b0>
(lsl_dr.age_int<6).mean()
0.19320977529932754
(lsl_dr.age<6).mean()
0.12798644141927723
Counts by year
lsl_dr.groupby('study_id').first()
redcap_event_name | academic_year | hl | male | _race | prim_lang | sib | _mother_ed | father_ed | premature_age | ... | age_test | domain | school | score | test_name | test_type | academic_year_start | tech_class | age_year | non_profound | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
study_id | |||||||||||||||||||||
00624-2010-0049 | initial_assessment_arm_1 | 2013-2014 | 0 | 1 | 2 | 0 | NaN | 6 | 6 | 9 | ... | 24 | Language | 0062 | 50 | PLS | receptive | 2013 | Bilateral CI | 2 | False |
0101-2003-0101 | initial_assessment_arm_1 | 2002-2003 | 0 | 0 | 0 | 0 | 1 | 6 | 6 | 9 | ... | 54 | Expressive Vocabulary | 0101 | 58 | PLS | EOWPVT | 2002 | Bimodal | 4 | False |
0101-2003-0102 | initial_assessment_arm_1 | 2003-2004 | 0 | 0 | 0 | 0 | 1 | 2 | 2 | 8 | ... | 44 | Articulation | 0101 | 72 | PLS | Goldman | 2003 | Bilateral HA | 3 | True |
0101-2004-0101 | initial_assessment_arm_1 | 2006-2007 | 0 | 1 | 0 | 0 | 0 | 6 | 6 | 8 | ... | 37 | Receptive Vocabulary | 0101 | 62 | PLS | PPVT | 2006 | Bimodal | 2 | True |
0101-2004-0102 | initial_assessment_arm_1 | 2004-2005 | 0 | 0 | 0 | 0 | 1 | 5 | 6 | 9 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2004 | Bimodal | 0 | True |
0101-2004-0103 | initial_assessment_arm_1 | 2012-2013 | 0 | 1 | 0 | 0 | 1 | 4 | 4 | 8 | ... | 96 | Expressive Vocabulary | 0101 | 104 | CELF-4 | EVT | 2012 | Bilateral CI | 0 | False |
0101-2004-0104 | initial_assessment_arm_1 | 2004-2005 | 0 | 1 | 0 | 0 | 1 | 6 | 6 | 8 | ... | 32 | Articulation | 0101 | 84 | PLS | Goldman | 2004 | Bilateral HA | 0 | True |
0101-2004-0105 | initial_assessment_arm_1 | 2004-2005 | 0 | 0 | 0 | 0 | 2 | 6 | 6 | 9 | ... | 47 | Articulation | 0101 | 78 | CELF-P2 | Goldman | 2004 | Bimodal | 2 | False |
0101-2005-0101 | initial_assessment_arm_1 | 2006-2007 | 0 | 1 | 0 | 0 | 2 | 5 | 4 | 8 | ... | 28 | Articulation | 0101 | 61 | PLS | Goldman | 2006 | Bilateral HA | 2 | True |
0101-2005-0102 | initial_assessment_arm_1 | 2004-2005 | 0 | 1 | 0 | 0 | 2 | 3 | 2 | 9 | ... | 63 | Articulation | 0101 | 87 | CELF-P2 | Goldman | 2004 | Bilateral HA | 4 | True |
0101-2006-0101 | initial_assessment_arm_1 | 2005-2006 | 0 | 0 | 0 | 0 | 1 | 6 | 6 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2005 | Bimodal | 0 | False |
0101-2006-0104 | initial_assessment_arm_1 | 2006-2007 | 0 | 0 | 0 | 0 | 0 | 5 | 5 | 9 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2006 | Bilateral CI | 2 | False |
0101-2007-0104 | initial_assessment_arm_1 | 2007-2008 | 0 | 0 | 0 | 0 | 1 | 4 | 6 | 9 | ... | 41 | Articulation | 0101 | 122 | NaN | Goldman | 2007 | Bimodal | 4 | True |
0101-2007-0105 | initial_assessment_arm_1 | 2007-2008 | 0 | 1 | 0 | 0 | 0 | 6 | 6 | 9 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2007 | Bimodal | 11 | False |
0101-2007-0107 | initial_assessment_arm_1 | 2005-2006 | 0 | 0 | 0 | 0 | 1 | 4 | 4 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2005 | Bilateral HA | 0 | True |
0101-2008-0102 | initial_assessment_arm_1 | 2008-2009 | 0 | 0 | 0 | 0 | 1 | 6 | 6 | 9 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2008 | Bimodal | 14 | False |
0101-2008-0106 | initial_assessment_arm_1 | 2007-2008 | 0 | 0 | 0 | 0 | 1 | 4 | 4 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2007 | Bilateral HA | 0 | True |
0101-2009-0101 | initial_assessment_arm_1 | 2008-2009 | 0 | 0 | 0 | 0 | 1 | 6 | 6 | 9 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2008 | Bimodal | 6 | False |
0101-2010-0101 | initial_assessment_arm_1 | 2008-2009 | 0 | 1 | 0 | 0 | 1 | 6 | 6 | 9 | ... | 104 | Articulation | 0101 | 90 | CELF-4 | Arizonia | 2008 | Bilateral HA | 8 | True |
0101-2010-0103 | initial_assessment_arm_1 | 2010-2011 | 0 | 0 | 0 | 0 | 2 | 4 | 3 | 8 | ... | 25 | Language | 0101 | 63 | PLS | receptive | 2010 | Bilateral HA | 0 | False |
0101-2010-0104 | initial_assessment_arm_1 | 2010-2011 | 0 | 1 | 3 | 0 | 1 | 2 | 2 | 8 | ... | 30 | Expressive Vocabulary | 0101 | 90 | PLS | EOWPVT | 2010 | Bilateral HA | 0 | False |
0101-2010-0105 | initial_assessment_arm_1 | 2011-2012 | 0 | 1 | 0 | 0 | 0 | 5 | 6 | 6 | ... | 30 | Language | 0101 | 66 | PLS | receptive | 2011 | Bilateral CI | 2 | False |
0101-2012-0101 | initial_assessment_arm_1 | 2013-2014 | 0 | 1 | 0 | 0 | 0 | 6 | 6 | 9 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2013 | Bimodal | 2 | False |
0101-2013-0101 | initial_assessment_arm_1 | 2012-2013 | 0 | 1 | 0 | 0 | 0 | 3 | 2 | 8 | ... | 12 | Language | 0101 | 58 | PLS | receptive | 2012 | Bilateral HA | 0 | True |
0101-2013-0103 | initial_assessment_arm_1 | 2012-2013 | 0 | 1 | 0 | 0 | 2 | 6 | 6 | 9 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2012 | Bilateral CI | 4 | False |
0101-2013-0104 | initial_assessment_arm_1 | 2012-2013 | 0 | 1 | 0 | 0 | 1 | 4 | 4 | 8 | ... | 12 | Language | 0101 | 83 | PLS | receptive | 2012 | Bilateral HA | 0 | True |
0101-2013-0112 | initial_assessment_arm_1 | 2012-2013 | 0 | 1 | 0 | 0 | 1 | 3 | 6 | 9 | ... | 11 | Language | 0101 | 90 | PLS | receptive | 2012 | Bilateral HA | 0 | True |
0101-2013-0113 | initial_assessment_arm_1 | 2013-2014 | 0 | 0 | 0 | 0 | 1 | 2 | 2 | 8 | ... | 4 | Language | 0101 | 96 | PLS | receptive | 2013 | Bimodal | 0 | True |
0101-2013-0114 | initial_assessment_arm_1 | 2013-2014 | 0 | 1 | 0 | 0 | 0 | 3 | 3 | 8 | ... | 6 | Language | 0101 | 50 | PLS | receptive | 2013 | Bimodal | 0 | True |
0101-2013-0115 | initial_assessment_arm_1 | 2013-2014 | 0 | 1 | 0 | 0 | 2 | 2 | 2 | 8 | ... | 11 | Language | 0101 | 79 | PLS | receptive | 2013 | Bilateral HA | 0 | True |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1151-2012-0008 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 2 | 4 | 2 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | 3 | False |
1151-2012-0009 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 1 | 4 | 6 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral HA | 1 | True |
1151-2012-0010 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 1 | 6 | 6 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | 2 | True |
1151-2012-0011 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 0 | 6 | 6 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | 4 | False |
1151-2012-0012 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 0 | 6 | 6 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral HA | 5 | False |
1151-2012-0013 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 0 | 2 | 2 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | 5 | False |
1151-2012-0014 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 2 | 1 | 1 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | NaN | False |
1151-2013-0001 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 0 | 2 | 2 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral HA | 4 | False |
1151-2013-0002 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 2 | 2 | 3 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral CI | 2 | False |
1151-2013-0003 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 0 | 1 | 1 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral CI | 2 | False |
1151-2013-0004 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 2 | 0 | 1 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | 3 | False |
1151-2013-0005 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 0 | 1 | 6 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | 3 | False |
1151-2013-0006 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 0 | 1 | 1 | 4 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral HA | 3 | False |
1151-2013-0007 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 0 | 4 | 2 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | NaN | False |
1151-2013-0008 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 0 | 3 | 2 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral CI | 2 | False |
1151-2013-0009 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 0 | 2 | 6 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | NaN | False |
1151-2013-0010 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 0 | 4 | 4 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral CI | 6 | False |
1151-2013-0011 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 0 | 6 | 6 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | 1 | False |
1151-2013-0012 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 2 | 3 | 3 | 5 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | 3 | False |
1151-2014-0001 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 3 | 6 | 4 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral CI | 2 | False |
1151-2014-0002 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 1 | 2 | 4 | 7 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral HA | 3 | False |
1151-2014-0003 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 0 | 6 | 6 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral HA | 3 | False |
1151-2014-0004 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 1 | 2 | 1 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral HA | 3 | False |
1151-2014-0005 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 3 | 6 | 6 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | 4 | False |
1151-2014-0006 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 0 | 4 | 0 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral CI | 3 | False |
1151-2014-0007 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 2 | 6 | 6 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral HA | 4 | False |
1151-2014-0008 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 1 | 1 | 1 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | 5 | False |
1151-2014-0009 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 2 | 4 | 2 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral HA | 6 | True |
1151-2014-0010 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 1 | 2 | 2 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral CI | 6 | False |
9308-2015-0002 | initial_assessment_arm_1 | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | NaN | False |
5440 rows × 76 columns
unique_students = lsl_dr.groupby('study_id').first()
unique_students.academic_year_start.value_counts().sort_index()[:-1].plot(kind='bar')
plt.ylabel('Frequency'); plt.xlabel('Academic year');
disab_by_year = unique_students.groupby('academic_year_start')['synd_or_disab'].value_counts().unstack().fillna(0)
disab_by_year.columns = ['No', 'Yes']
disab_by_year[disab_by_year.index!='nan'].plot(kind='bar', stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x10f9f9198>