# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
Connect to database to import data for the three test domains and demographic information:
# Connect to the LSL-DR REDCap project and pull its field metadata.
from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
# Read the API token from disk; 'with' guarantees the file handle is
# closed (the original open(...).read() leaked it).
with open("/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt") as token_file:
    api_key = token_file.read()
lsl_dr_project = Project(api_url, api_key)
metadata = lsl_dr_project.export_metadata()
# for i,j in zip(lsl_dr_project.field_names,
# lsl_dr_project.field_labels):
# print('{0}: \t{1}'.format(i,j))
Import each database from REDCap:
# Articulation test scores: Arizona (AAPS) and Goldman-Fristoe (GF2)
# standard scores plus age at testing, keyed by study_id / event.
articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss']
articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', df_kwargs={'index_col':None})
# Raw (list-of-dicts) export, used here only to spot-check the first record.
records = lsl_dr_project.export_records(fields=articulation_fields)
print(records[0]['study_id'])
0101-2003-0101
# Expressive vocabulary (EOWPVT, EVT) and receptive vocabulary
# (PPVT, ROWPVT) standard scores; 999/9999 are missing-value codes.
expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss']
expressive = lsl_dr_project.export_records(fields=expressive_fields, format='df',
                                           df_kwargs={'index_col':None,
                                                      'na_values':[999, 9999]})
receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss']
receptive = lsl_dr_project.export_records(fields=receptive_fields, format='df',
                                          df_kwargs={'index_col':None,
                                                     'na_values':[999, 9999]})
# Language test scores: PLS, OWLS, CELF-P2 and CELF-4 subscale standard
# scores plus age at testing; 999/9999 are missing-value codes.
language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls',
                   'owls_lc_ss','owls_oe_ss','age_test_owls',
                   'celfp_rl_ss','celfp_el_ss','age_test_celp',
                   'celf_elss','celf_rlss','age_test_celf']
language_raw = lsl_dr_project.export_records(fields=language_fields, format='df',
                                             df_kwargs={'index_col':None,
                                                        'na_values':[999, 9999]})
# Demographic and audiological covariates, one row per study_id/event;
# 888/999/9999 are missing-value codes.
demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year',
                      'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race',
                      'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as',
                      'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae',
                      'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes',
                      'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age',
                      'att_days_hr', 'att_days_sch', 'att_days_st2_417']
demographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df',
                                                df_kwargs={'index_col':None,
                                                           'na_values':[888, 999, 9999]})
# Spot-check one student's longitudinal records.
demographic_raw[demographic_raw.study_id=='1147-2010-0064']
study_id | redcap_event_name | academic_year | hl | gender | race | prim_lang | sib | mother_ed | father_ed | ... | sle_fo | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
13565 | 1147-2010-0064 | initial_assessment_arm_1 | 2010-2011 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | ... | 3 | 6 | 65 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
13566 | 1147-2010-0064 | year_1_complete_71_arm_1 | 2011-2012 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 3 | 5 | 77 | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
13567 | 1147-2010-0064 | year_2_complete_71_arm_1 | 2012-2013 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 3 | 5 | 89 | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
13568 | 1147-2010-0064 | year_3_complete_71_arm_1 | 2013-2014 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 4 | 5 | 101 | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
4 rows × 46 columns
Several fields in the demographic data have missing values.
demographic_raw.head()
study_id | redcap_event_name | academic_year | hl | gender | race | prim_lang | sib | mother_ed | father_ed | ... | sle_fo | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0101-2003-0101 | initial_assessment_arm_1 | 2002-2003 | 0 | 0 | 0 | 0 | 1 | 6 | 6 | ... | 2 | 2 | 54 | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 0101-2003-0101 | year_1_complete_71_arm_1 | 2003-2004 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 4 | 4 | 80 | 1 | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 0101-2003-0101 | year_2_complete_71_arm_1 | 2004-2005 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 4 | 4 | 80 | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 0101-2003-0101 | year_3_complete_71_arm_1 | 2005-2006 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 5 | 5 | 96 | 3 | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 0101-2003-0101 | year_4_complete_71_arm_1 | 2006-2007 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 5 | 5 | 109 | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 46 columns
We can fill missing values forward from the previous observation (by study_id):
# Sort by event name so earlier visits precede later ones, then
# forward-fill missing demographics within each student (study_id).
# DataFrame.sort(columns=...) and fillna(method='ffill') are deprecated
# (the FutureWarnings below came from the old calls); use
# sort_values(by=...) and ffill() instead.
demographic = demographic_raw.sort_values(by='redcap_event_name').groupby('study_id').transform(
    lambda recs: recs.ffill())#.reset_index()
# groupby().transform() drops the grouping key, so restore study_id in
# the same (sorted) row order.
demographic["study_id"] = demographic_raw.sort_values(by='redcap_event_name').study_id
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....) if __name__ == '__main__': /Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:3: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....) app.launch_new_instance()
Random check to make sure this worked
demographic[demographic.study_id=='1147-2010-0064']
redcap_event_name | academic_year | hl | gender | race | prim_lang | sib | mother_ed | father_ed | premature_age | ... | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | study_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
13565 | initial_assessment_arm_1 | 2010-2011 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 6 | 65 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
13566 | year_1_complete_71_arm_1 | 2011-2012 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 5 | 77 | 2 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
13567 | year_2_complete_71_arm_1 | 2012-2013 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 5 | 89 | 2 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
13568 | year_3_complete_71_arm_1 | 2013-2014 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 5 | 101 | 2 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
4 rows × 46 columns
Demographic data without missing values:
demographic.head()
redcap_event_name | academic_year | hl | gender | race | prim_lang | sib | mother_ed | father_ed | premature_age | ... | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | study_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | initial_assessment_arm_1 | 2002-2003 | 0 | 0 | 0 | 0 | 1 | 6 | 6 | 9 | ... | 2 | 54 | 2 | NaN | NaN | NaN | NaN | NaN | NaN | 0101-2003-0101 |
7486 | initial_assessment_arm_1 | 2013-2014 | 0 | 0 | 1 | 0 | 0 | 2 | 2 | 8 | ... | 1 | 7 | 3 | NaN | NaN | NaN | NaN | NaN | NaN | 0626-2014-0035 |
7484 | initial_assessment_arm_1 | 2014-2015 | 0 | 1 | 6 | 0 | 1 | 4 | 4 | 8 | ... | 3 | 56 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | 0626-2014-0034 |
7483 | initial_assessment_arm_1 | 2014-2015 | 0 | 1 | 3 | 0 | 1 | 4 | 5 | 8 | ... | 0 | 29 | 2 | NaN | NaN | NaN | NaN | NaN | NaN | 0626-2014-0033 |
7482 | initial_assessment_arm_1 | 2014-2015 | 0 | 1 | 0 | 0 | 1 | 3 | 5 | 8 | ... | 1 | 11 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | 0626-2014-0032 |
5 rows × 46 columns
5 language measures:
# Test type
# Flag which language instrument each record contains (the age_test_*
# field is non-null only for the instrument actually administered).
language_raw["test_name"] = None
language_raw["test_type"] = None
language_raw["score"] = None
CELP = language_raw.age_test_celp.notnull()
CELF = language_raw.age_test_celf.notnull()
PLS = language_raw.age_test_pls.notnull()
OWLS = language_raw.age_test_owls.notnull()
# Unified age-at-test column, taken from whichever instrument was given.
language_raw['age_test'] = None
language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp
language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf
language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls
language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls
# Keep only rows with at least one language test administered.
language1 = language_raw[CELP | CELF | PLS | OWLS].copy()
# language2 is copied BEFORE language1 is mutated; language1 carries the
# receptive-language scores, language2 the expressive ones (below).
language2 = language1.copy()
language1["test_type"] = "receptive"
language1.loc[CELP, "test_name"] = "CELF-P2"
language1.loc[CELF, "test_name"] = "CELF-4"
language1.loc[PLS, "test_name"] = "PLS"
language1.loc[OWLS, "test_name"] = "OWLS"
# Receptive subscale score from whichever instrument was given.
language1.loc[CELP, "score"] = language1.celfp_rl_ss
language1.loc[CELF, "score"] = language1.celf_rlss
language1.loc[PLS, "score"] = language1.pls_ac_ss
language1.loc[OWLS, "score"] = language1.owls_lc_ss
# Expressive-language scores (mirror of the receptive block above).
language2["test_type"] = "expressive"
language2.loc[CELP, "test_name"] = "CELF-P2"
language2.loc[CELF, "test_name"] = "CELF-4"
language2.loc[PLS, "test_name"] = "PLS"
language2.loc[OWLS, "test_name"] = "OWLS"
# Consistency fix: read the expressive score columns from language2
# itself. The original read them from language1, which only gave the
# same values because language2 was an unmodified copy of it.
language2.loc[CELP, "score"] = language2.celfp_el_ss
language2.loc[CELF, "score"] = language2.celf_elss
language2.loc[PLS, "score"] = language2.pls_ec_ss
language2.loc[OWLS, "score"] = language2.owls_oe_ss
# Stack receptive + expressive into long format and drop missing scores.
language = pd.concat([language1, language2])
language = language[language.score.notnull()]
print(pd.crosstab(language.test_name, language.test_type))
print("There are {0} null values for score".format(sum(language["score"].isnull())))
test_type expressive receptive test_name CELF-4 593 525 CELF-P2 1374 1379 OWLS 1065 1072 PLS 3387 3397 There are 0 null values for score
A school variable was added, which is the first four characters of the study_id:
# School code is the first four characters of study_id; keep only the
# long-format columns and tag the domain for the later merge.
language["school"] = language.study_id.str.slice(0,4)
language = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test"]]
language["domain"] = "Language"
language.head()
study_id | redcap_event_name | score | test_type | test_name | school | age_test | domain | |
---|---|---|---|---|---|---|---|---|
0 | 0101-2003-0101 | initial_assessment_arm_1 | 51 | receptive | PLS | 0101 | 54 | Language |
5 | 0101-2003-0101 | year_5_complete_71_arm_1 | 61 | receptive | OWLS | 0101 | 113 | Language |
9 | 0101-2003-0102 | initial_assessment_arm_1 | 55 | receptive | PLS | 0101 | 44 | Language |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | 77 | receptive | PLS | 0101 | 54 | Language |
11 | 0101-2003-0102 | year_2_complete_71_arm_1 | 93 | receptive | CELF-P2 | 0101 | 68 | Language |
We converted the articulation dataset into a "long" format:
# Test type
# Long format for articulation: one test_type/score pair per row.
# NOTE: "Arizonia" is a misspelling of "Arizona" (AAPS) but is kept
# as-is because downstream outputs match this label.
articulation["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
articulation = articulation[ARIZ | GF]
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"
print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))
# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]
Goldman 5098 Arizonia 498 Arizonia and Goldman 73 Name: test_type, dtype: int64 There are 0 null values for test_type
A school variable was added, which is the first four characters of the study_id:
articulation["school"] = articulation.study_id.str.slice(0,4)
The age was taken to be the Arizonia age if there are both test types:
# Age at test: prefer the Arizona (AAPS) age; fall back to the
# Goldman-Fristoe age where the former is missing.
articulation["age_test"] = articulation.age_test_aaps
articulation.loc[articulation.age_test.isnull(), 'age_test'] = articulation.age_test_gf2[articulation.age_test.isnull()]
print(articulation.age_test.describe())
count 5666.000000 mean 68.598835 std 30.694788 min 23.000000 25% 47.000000 50% 60.000000 75% 80.000000 max 243.000000 Name: age_test, dtype: float64
Finally, we dropped unwanted columns and added a domain identification column for merging:
# Drop the per-instrument columns (now folded into score/age_test) and
# tag the domain for the later merge.
articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1)
articulation["domain"] = "Articulation"
articulation.head()
study_id | redcap_event_name | test_type | score | school | age_test | domain | |
---|---|---|---|---|---|---|---|
1 | 0101-2003-0101 | year_1_complete_71_arm_1 | Goldman | 78 | 0101 | 80 | Articulation |
9 | 0101-2003-0102 | initial_assessment_arm_1 | Goldman | 72 | 0101 | 44 | Articulation |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | Goldman | 97 | 0101 | 54 | Articulation |
14 | 0101-2004-0101 | year_2_complete_71_arm_1 | Goldman | 75 | 0101 | 53 | Articulation |
15 | 0101-2004-0101 | year_3_complete_71_arm_1 | Goldman | 80 | 0101 | 66 | Articulation |
We excluded unwanted columns and rows for which age, gender or race were missing:
# Retain only subset of columns
#demographic = demographic[demographic.gender.notnull()]
# Rename gender to male (coding retained from the raw field).
demographic = demographic.rename(columns={'gender':'male'})
Due to sample size considerations, we reduced the non-English primary language variable to English (0) and non-English (1):
# non_english: True when the primary-language code is > 0 (not English);
# left as None when prim_lang itself is missing.
demographic["non_english"] = None
demographic.loc[demographic.prim_lang.notnull(), 'non_english'] = demographic.prim_lang[demographic.prim_lang.notnull()]>0
print(demographic.non_english.value_counts())
print("There are {0} null values for non_english".format(sum(demographic.non_english.isnull())))
False 11198 True 2478 Name: non_english, dtype: int64 There are 691 null values for non_english
Mother's education (mother_ed
) and father's education (father_ed
) were both recoded to:
Category 6 (unknown) was recoded as missing.
# Recode mother's education, keeping the raw values in _mother_ed:
# raw 1 -> 0, raw 2/3 -> 1, raw 4 -> 2, raw 5 -> 3,
# raw 6 (unknown) -> missing; raw 0 passes through unchanged.
demographic = demographic.rename(columns={"mother_ed":"_mother_ed"})
demographic["mother_ed"] = demographic._mother_ed.copy()
demographic.loc[demographic._mother_ed==1, 'mother_ed'] = 0
# Consistency fix: test the raw _mother_ed column on both sides of the
# disjunction. The original tested the partially-recoded mother_ed for
# ==3, which was only accidentally equivalent (no earlier remap
# produces a 3).
demographic.loc[(demographic._mother_ed==2) | (demographic._mother_ed==3), 'mother_ed'] = 1
demographic.loc[demographic._mother_ed==4, 'mother_ed'] = 2
demographic.loc[demographic._mother_ed==5, 'mother_ed'] = 3
demographic.loc[demographic._mother_ed==6, 'mother_ed'] = None
print("_mother_ed:")
print(demographic._mother_ed.value_counts())
print("mother_ed:")
print(demographic.mother_ed.value_counts())
print("\nThere are {0} null values for mother_ed".format(sum(demographic.mother_ed.isnull())))
_mother_ed: 6 5001 4 2921 3 1950 5 1545 2 1342 1 474 0 194 Name: _mother_ed, dtype: int64 mother_ed: 1 3292 2 2921 3 1545 0 668 Name: mother_ed, dtype: int64 There are 5941 null values for mother_ed
Secondary diagnosis
demographic.shape
(14367, 48)
# secondary_diagnosis: True when etiology code == 0; codes > 1
# (suspected/unknown) treated as missing.
demographic['secondary_diagnosis'] = demographic.etiology==0
# Suspected or unknown treated as missing
demographic.loc[demographic.etiology > 1, 'secondary_diagnosis'] = None
demographic.secondary_diagnosis.value_counts()
0 10492 1 2416 Name: secondary_diagnosis, dtype: int64
demographic.secondary_diagnosis.mean()
0.18717074682367524
Premature status was recoded to True (premature) and False (full-term). Here, premature indicates <36 weeks.
# premature_weeks: weeks premature derived from the coded premature_age
# via abs(code-8)*2 — code 8 appears to mean full term (0 weeks) and
# code 9 unknown (set to missing). TODO confirm against the codebook.
demographic['premature_weeks'] = demographic.premature_age.copy()
demographic.loc[demographic.premature_age==9, 'premature_weeks'] = None
demographic.premature_weeks = abs(demographic.premature_weeks-8)*2
print("There are {0} null values for premature_weeks".format(sum(demographic.premature_weeks.isnull())))
There are 3437 null values for premature_weeks
demographic.premature_weeks.value_counts()
0 9331 2 560 4 356 12 195 6 181 10 149 8 113 14 42 16 3 Name: premature_weeks, dtype: int64
Recode implant technology variables for each ear to one of the categories (None, OAD, Hearing aid, Cochlear implant, Other):
demographic.tech_ad.value_counts()
1 4853 0 4246 7 1475 5 988 2 481 6 414 8 71 9 58 3 27 4 25 10 2 Name: tech_ad, dtype: int64
# Map raw ear-technology codes (tech_ad = right ear, tech_as = left ear)
# onto tech_cats indices: 0=None, 1=OAD, 2=Hearing aid, 3=Cochlear,
# 4=Other (the default for any unmapped code).
tech_cats = ["None", "OAD", "Hearing aid", "Cochlear", "Other"]
demographic["tech_right"] = 4
demographic.loc[demographic.tech_ad==7, 'tech_right'] = 0
demographic.loc[demographic.tech_ad==3, 'tech_right'] = 1
demographic.loc[demographic.tech_ad.isin([1,2,4,5,10]), 'tech_right'] = 2
demographic.loc[demographic.tech_ad.isin([0,8,6]), 'tech_right'] = 3
demographic.loc[demographic.tech_ad.isnull(), 'tech_right'] = None
demographic["tech_left"] = 4
demographic.loc[demographic.tech_as==7, 'tech_left'] = 0
demographic.loc[demographic.tech_as==3, 'tech_left'] = 1
demographic.loc[demographic.tech_as.isin([1,2,4,5,10]), 'tech_left'] = 2
demographic.loc[demographic.tech_as.isin([0,8,6]), 'tech_left'] = 3
demographic.loc[demographic.tech_as.isnull(), 'tech_left'] = None
demographic.tech_left.value_counts()
2 6423 3 4309 0 1802 4 57 1 19 Name: tech_left, dtype: int64
demographic.tech_right.value_counts()
2 6349 3 4731 0 1475 4 58 1 27 Name: tech_right, dtype: int64
Substitute valid missing values for hearing loss:
# Treat type-of-hearing-loss code 5 as missing, for each ear.
demographic.loc[demographic.type_hl_ad==5, 'type_hl_ad'] = None
# Bug fix: the original assigned to 'type_hl_ad' here as well,
# clobbering the right-ear column from the left-ear mask while never
# cleaning type_hl_as itself.
demographic.loc[demographic.type_hl_as==5, 'type_hl_as'] = None
Create degree_hl
, which is the maximum level of hearing loss in either ear:
demographic["degree_hl"] = np.maximum(demographic.degree_hl_ad, demographic.degree_hl_as)
Create compound indicator variable for each technology (Baha, Hearing aid, Cochlear implant):
demographic.columns
Index(['redcap_event_name', 'academic_year', 'hl', 'male', 'race', 'prim_lang', 'sib', '_mother_ed', 'father_ed', 'premature_age', 'onset_1', 'age_amp', 'age_int', 'age', 'synd_cause', 'etiology', 'etiology_2', 'hearing_changes', 'ae', 'ad_250', 'ad_500', 'degree_hl_ad', 'type_hl_ad', 'tech_ad', 'age_ci', 'as_250', 'as_500', 'degree_hl_as', 'type_hl_as', 'tech_as', 'age_ci_2', 'time', 'age_disenrolled', 'funct_out_age', 'slc_fo', 'sle_fo', 'a_fo', 'fam_age', 'family_inv', 'att_days_sch', 'att_days_st2_417', 'att_days_hr', 'demo_ses', 'school_lunch', 'medicaid', 'study_id', 'non_english', 'mother_ed', 'secondary_diagnosis', 'premature_weeks', 'tech_right', 'tech_left', 'degree_hl'], dtype='object')
# oad: 0 = neither ear, 1 = one ear, 2 = both ears; missing when both
# ears' technology codes are missing.
demographic["oad"] = 0
demographic.oad = demographic.oad.astype(object)
demographic.loc[(demographic.tech_right==1) | (demographic.tech_left==1), 'oad'] = 1
demographic.loc[(demographic.tech_right==1) & (demographic.tech_left==1), 'oad'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'oad'] = None
print("oad:")
print(demographic.drop_duplicates(subset='study_id').oad.value_counts())
print("There are {0} null values for OAD".format(sum(demographic.oad.isnull())))
# hearing_aid: 0 = neither ear, 1 = one ear, 2 = both ears; missing when
# both ears' technology codes are missing.
demographic["hearing_aid"] = 0
demographic.hearing_aid = demographic.hearing_aid.astype(object)
demographic.loc[(demographic.tech_right==2) | (demographic.tech_left==2), 'hearing_aid'] = 1
demographic.loc[(demographic.tech_right==2) & (demographic.tech_left==2), 'hearing_aid'] = 2
# Bug fix: the original tested tech_right.isnull() twice; the second
# term must be tech_left, matching the oad and cochlear blocks.
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'hearing_aid'] = None
print("\nhearing_aid:")
print(demographic.drop_duplicates(subset='study_id').hearing_aid.value_counts())
print("There are {0} null values for hearing_aid".format(sum(demographic.hearing_aid.isnull())))
# cochlear: 0 = neither ear, 1 = one ear, 2 = both ears; missing when
# both ears' technology codes are missing.
demographic["cochlear"] = 0
demographic.cochlear = demographic.cochlear.astype(object)
demographic.loc[(demographic.tech_right==3) | (demographic.tech_left==3), 'cochlear'] = 1
demographic.loc[(demographic.tech_right==3) & (demographic.tech_left==3), 'cochlear'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'cochlear'] = None
print("\ncochlear:")
print(demographic.drop_duplicates(subset='study_id').cochlear.value_counts())
print("There are {0} null values for cochlear".format(sum(demographic.cochlear.isnull())))
print(len(demographic))
oad: 0 4417 1 4 2 2 Name: oad, dtype: int64 There are 1674 null values for OAD hearing_aid: 2 2048 0 1604 1 741 Name: hearing_aid, dtype: int64 There are 1727 null values for hearing_aid cochlear: 0 2894 2 903 1 626 Name: cochlear, dtype: int64 There are 1674 null values for cochlear 14367
Identify bilateral and bimodal individuals:
# Boolean flags for implant configuration, derived from the counts above:
# bimodal = one cochlear implant plus one hearing aid.
demographic["unilateral_ci"] = demographic.cochlear==1
demographic["bilateral_ci"] = demographic.cochlear==2
demographic["bilateral_ha"] = demographic.hearing_aid==2
demographic["bimodal"] = (demographic.cochlear==1) & (demographic.hearing_aid==1)
demographic.bilateral_ci.sum(), demographic.bilateral_ha.sum(), demographic.bimodal.sum(), demographic.unilateral_ci.sum()
(3479, 5224, 1387, 2082)
demographic.drop_duplicates(subset='study_id')[['unilateral_ci','bilateral_ci',
'bilateral_ha',
'bimodal']].sum()
unilateral_ci 626 bilateral_ci 903 bilateral_ha 2048 bimodal 375 dtype: int64
Create variable that identifies bilateral (0), bilateral HA left (1), bilateral HA right (2)
# tech: 0 = default, 1 = bimodal with the hearing aid on the left ear,
# 2 = bimodal with the hearing aid on the right ear.
demographic['tech'] = 0
demographic.loc[(demographic.bimodal) & (demographic.tech_left==2), 'tech'] = 1
demographic.loc[(demographic.bimodal) & (demographic.tech_right==2), 'tech'] = 2
print("There are {0} null values for tech".format(sum(demographic.tech.isnull())))
There are 0 null values for tech
# implant_category: single code for the (cochlear, hearing_aid, oad)
# count combination; combinations not listed remain None.
#   0: 1 CI            3: 2 CIs           6: 2 HAs
#   1: 1 HA            4: 1 CI + 1 HA     7: 1 HA + 1 OAD
#   2: 1 OAD           5: 1 CI + 1 OAD    8: 2 OADs
demographic["implant_category"] = None
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==0),
                'implant_category'] = 0
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==0),
                'implant_category'] = 1
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==1),
                'implant_category'] = 2
demographic.loc[(demographic.cochlear==2) & (demographic.hearing_aid==0) & (demographic.oad==0),
                'implant_category'] = 3
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==1) & (demographic.oad==0),
                'implant_category'] = 4
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==1),
                'implant_category'] = 5
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==2) & (demographic.oad==0),
                'implant_category'] = 6
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==1),
                'implant_category'] = 7
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==2),
                'implant_category'] = 8
demographic.implant_category.value_counts()
6 5224 3 3479 4 1387 1 911 0 676 8 14 2 12 7 5 5 1 Name: implant_category, dtype: int64
Age when hearing loss was diagnosed. Data are entered inconsistently here, so we have to go in and replace non-numeric values.
demographic.onset_1.unique()
array([ 15. , 1. , 37. , 10. , 0. , 19. , 6. , 33. , 26. , 2. , 60. , 16. , 50. , 39. , 28. , 17. , 4. , 18. , nan, 3. , 35. , 38. , 95. , 42. , 7. , 13. , 12. , 31. , 14. , 27. , 11. , 36. , 41. , 22. , 24. , 51. , 84. , 61. , 5. , 30. , 88. , 46. , 23. , 80. , 9. , 8. , 83. , 74. , 25. , 64. , 107. , 21. , 72. , 116. , 40. , 57. , 78. , 65. , 43. , 47. , 79. , 34. , 62. , 77. , 48. , 96. , 52. , 97. , 67. , 20. , 45. , 29. , 59. , 53. , 1.5, 81. , 55. , 54. , 49. , 70. , 58. , 44. , 32. , 71. , 63. , 140. , 66. , 87. , 76. , 68. , 92. , 86. , 126. , 85. , 133. , 103. , 56. , 119. , 2.5, 98. , 75. , 0.5, 152. , 89. , 154. ])
# Don't need this anymore
# demographic['age_diag'] = demographic.onset_1.replace({'birth': 0, 'R- Birth L-16mo': 0, 'birth - 3': 0, 'at birth': 0, 'NBHS': 0,
#                                                        'at Birth': 0, '1-2': 1.5, '2-3': 2.5, '0-3': 1.5}).astype(float)
# onset_1 now arrives numeric (see unique() above), so carry it over
# directly as the age-at-diagnosis column.
demographic['age_diag'] = demographic.onset_1
Number of null values for age_diag
demographic.age_diag.isnull().sum()
3994
# Human-readable sex label derived from the male indicator (0/1).
demographic['sex'] = demographic.male.replace({0:'Female', 1:'Male'})
import seaborn as sb
# One record per student (first event), restricted to known sex.
unique_students = demographic.dropna(subset=['sex']).groupby('study_id').first()
# ag = sb.factorplot("sex", data=unique_students,
#                    palette="PuBuGn_d", kind='count')
# ag.set_xticklabels(['Female ({})'.format((unique_students.male==0).sum()),
#                     'Male ({})'.format((unique_students.male==1).sum())])
# ag.set_xlabels('')
Child has another diagnosed disability
# known_synd: True when the syndromic-cause code == 0; codes > 1
# (suspected/unknown) are treated as missing.
demographic['known_synd'] = (demographic.synd_cause == 0)
# Unknown or suspected
demographic.loc[demographic.synd_cause > 1, 'known_synd'] = None
# If either known syndrome or secondary diagnosis
# NOTE(review): Python 'or' returns the first truthy operand, and NaN is
# truthy, so a NaN secondary_diagnosis propagates here — confirm this
# missing-value behavior is intended.
demographic['synd_or_disab'] = demographic.apply(lambda x: x['secondary_diagnosis'] or x['known_synd'], axis=1)
Missing sibling counts were properly encoded as None
(missing).
demographic.loc[demographic.sib==4, 'sib'] = None
We reduced the number of race categories, pooling those that were neither caucasian, black, hispanic or asian to "other", due to small sample sizes for these categories. Category 7 (unknown) was recoded as missing.
# Pool race codes > 3 into "Other" (4) and treat code 7 (unknown) as
# missing; raw values kept in _race. Note the ==7 -> None assignment
# must run before the >3 -> 4 pooling (NaN > 3 is False, so it is safe).
races = ["Caucasian", "Black or African American", "Hispanic or Latino", "Asian", "Other"]
demographic = demographic.rename(columns={"race":"_race"})
demographic["race"] = demographic._race.copy()
demographic.loc[demographic.race==7, 'race'] = None
demographic.loc[demographic.race>3, 'race'] = 4
print("_race:")
print(demographic._race.value_counts())
print("race:")
print(demographic.race.value_counts())
print("There are {0} null values for race".format(sum(demographic.race.isnull())))
# Replace with recoded column
_race: 0 7531 2 2412 1 1300 3 1011 6 698 8 521 7 242 4 65 5 28 Name: _race, dtype: int64 race: 0 7531 2 2412 4 1312 1 1300 3 1011 Name: race, dtype: int64 There are 801 null values for race
Recode implant technology variables
# Second recode of ear technology: overwrites the tech_right/tech_left
# columns created earlier from the raw tech_ad/tech_as codes, then
# applies abs(x - 3) to the intermediate coding.
# NOTE(review): for some raw codes (e.g. 1, 2, 8, 9, 10) the resulting
# values disagree with the earlier recode — confirm which mapping is
# the intended one.
tech_cats = ["None", "Baha", "Hearing aid", "Cochlear", "Other"]
demographic["tech_right"] = demographic.tech_ad.copy()
demographic.loc[demographic.tech_right==6, 'tech_right'] = 0
demographic.loc[demographic.tech_right==4, 'tech_right'] = 1
demographic.loc[demographic.tech_right==5, 'tech_right'] = 1
demographic.loc[demographic.tech_right==3, 'tech_right'] = 2
demographic.loc[demographic.tech_right==7, 'tech_right'] = 3
demographic.loc[demographic.tech_right==8, 'tech_right'] = 3
demographic.loc[demographic.tech_right==9, 'tech_right'] = 4
demographic.tech_right = np.abs(demographic.tech_right - 3)
demographic["tech_left"] = demographic.tech_as.copy()
demographic.loc[demographic.tech_left==6, 'tech_left'] = 0
demographic.loc[demographic.tech_left==4, 'tech_left'] = 1
demographic.loc[demographic.tech_left==5, 'tech_left'] = 1
demographic.loc[demographic.tech_left==3, 'tech_left'] = 2
demographic.loc[demographic.tech_left==7, 'tech_left'] = 3
demographic.loc[demographic.tech_left==8, 'tech_left'] = 3
demographic.loc[demographic.tech_left==9, 'tech_left'] = 4
demographic.tech_left = np.abs(demographic.tech_left - 3)
# Don't need this anymore
# demographic['age_amp'] = demographic.age_amp.replace({'22 mo': 22, 'unknown': np.nan, 'none': np.nan,
# 'n/a unilateral loss': np.nan, 'not amplified yet': np.nan,
# 'not amplified': np.nan, 'n/a': np.nan, '4 months-6 months': 5,
# '6 weeks': 0.5, '13-18': 15.5, '0-3': 1.5, '24-36': 30}).astype(float)
demographic.academic_year.replace(
{'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011',
'2020-2011': '2010-2011', '2012-20013': '2012-2013',
'642014-2015': '2014-2015', '20114-2015': '2014-2015',
'2011-012': '2011-2012',
'0000-0000': np.nan}).str.replace('*', '-').unique()
array(['2002-2003', '2013-2014', '2014-2015', '2012-2013', '2011-2012', '2009-2010', '2010-2011', '2007-2008', '2008-2009', nan, '2009-2011', '2006-2007', '2005-2006', '2012', '2006-2007 ', '2004-2005', '2003-2004', '2015-2016', '2015', '2014', '2001-2002', '2000-2001', '1995-1996', '1998-1999', '1999-2000', '1997-1998', '2013', '2010', '2009', '2011', ' 2010-2011', '2015-2015', '2014-2015 ', '2012-2013 '], dtype=object)
# Normalize academic_year typos to the 'YYYY-YYYY' form; '0000-0000' is
# a missing-value marker, and '*' was sometimes typed in place of '-'.
# NOTE(review): under pandas versions where str.replace defaults to
# regex=True, the bare '*' pattern is invalid — confirm behavior against
# the pinned pandas version (regex=False or escaping may be required).
demographic['academic_year'] = demographic.academic_year.replace(
    {'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011',
     '2020-2011': '2010-2011', '2012-20013': '2012-2013',
     '642014-2015': '2014-2015', '20114-2015': '2014-2015',
     '2011-012': '2011-2012', '2014-2105': '2014-2015', '2005-2004': '2004-2005',
     '2014-205': '2014-2015', '2017-2015': '2014-2015', '2014-1015': '2014-2015',
     '2015-2015': '2014-2015', '2009-2011': '2009-2010',
     '0000-0000': np.nan}).str.replace('*', '-')
Removed entries that don't contain dashes
# Null out entries without a dash (single years etc.), then strip all
# internal whitespace from the remaining values.
demographic.loc[~(demographic.academic_year.notnull() & demographic.academic_year.str.contains('-')),
                'academic_year'] = np.nan
demographic.loc[demographic.academic_year.notnull(), 'academic_year'] = demographic.academic_year[demographic.academic_year.notnull()].apply(lambda x: ''.join(x.split()))
demographic.age_amp.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x10fd84cc0>
We converted the expressive vocabulary dataset to "long" format:
# Test type
# Long format for expressive vocabulary; score prefers EOWPVT when both
# instruments were given.
expressive["test_type"] = None
EOWPVT = expressive.eowpvt_ss.notnull()
EVT = expressive.evt_ss.notnull()
expressive = expressive[EOWPVT | EVT]
expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT"
expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT"
expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT"
print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull())))
expressive["score"] = expressive.eowpvt_ss
expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT]
There are 0 null values for test_type
expressive.test_type.value_counts()
EVT 3691 EOWPVT 2657 EOWPVT and EVT 147 Name: test_type, dtype: int64
A school variable was added, which is the first four characters of the study_id:
expressive["school"] = expressive.study_id.str.slice(0,4)
The age was taken to be the EOWPVT age if there are both test types:
# Age at test: prefer the EOWPVT age; fall back to the EVT age.
expressive["age_test"] = expressive.age_test_eowpvt
expressive.loc[expressive.age_test.isnull(), 'age_test'] = expressive.age_test_evt[expressive.age_test.isnull()]
Finally, we dropped unwanted columns and added a domain identification column for merging:
# Keep only the long-format columns and tag the domain for merging.
expressive = expressive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
expressive["domain"] = "Expressive Vocabulary"
expressive.head()
study_id | redcap_event_name | score | test_type | school | age_test | domain | |
---|---|---|---|---|---|---|---|
0 | 0101-2003-0101 | initial_assessment_arm_1 | 58 | EOWPVT | 0101 | 54 | Expressive Vocabulary |
2 | 0101-2003-0101 | year_2_complete_71_arm_1 | 84 | EOWPVT | 0101 | 80 | Expressive Vocabulary |
5 | 0101-2003-0101 | year_5_complete_71_arm_1 | 90 | EOWPVT | 0101 | 113 | Expressive Vocabulary |
14 | 0101-2004-0101 | year_2_complete_71_arm_1 | 90 | EOWPVT | 0101 | 53 | Expressive Vocabulary |
15 | 0101-2004-0101 | year_3_complete_71_arm_1 | 87 | EOWPVT | 0101 | 66 | Expressive Vocabulary |
We converted the receptive vocabulary data table to "long" format:
receptive.columns
Index(['study_id', 'redcap_event_name', 'age_test_ppvt', 'ppvt_ss', 'age_test_rowpvt', 'rowpvt_ss'], dtype='object')
# Label each receptive-vocabulary record with the instrument(s) given
# and collect a single standard score (PPVT preferred when both exist).
has_ppvt = receptive.ppvt_ss.notnull()
has_rowpvt = receptive.rowpvt_ss.notnull()
receptive["test_type"] = None
# Drop rows where neither instrument was administered.
receptive = receptive[has_ppvt | has_rowpvt]
receptive.loc[has_ppvt & has_rowpvt, "test_type"] = "PPVT and ROWPVT"
receptive.loc[has_ppvt & ~has_rowpvt, "test_type"] = "PPVT"
receptive.loc[~has_ppvt & has_rowpvt, "test_type"] = "ROWPVT"
print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull())))
# Score: PPVT where available, otherwise ROWPVT.
receptive["score"] = receptive.ppvt_ss
receptive.loc[~has_ppvt & has_rowpvt, "score"] = receptive.rowpvt_ss[~has_ppvt & has_rowpvt]
There are 0 null values for test_type
A school variable was added, which is the first four characters of the study_id:
receptive["school"] = receptive.study_id.str.slice(0,4)
The age was taken to be the PPVT age if there are both test types:
# Age at test: prefer the PPVT age; fall back to the ROWPVT age.
receptive["age_test"] = receptive.age_test_ppvt
receptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()]
print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull())))
There are 28 null values for age_test
Finally, we dropped unwanted columns and added a domain identification column for merging:
# Keep only the long-format columns and tag the domain for merging.
receptive = receptive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
receptive["domain"] = "Receptive Vocabulary"
receptive.head()
study_id | redcap_event_name | score | test_type | school | age_test | domain | |
---|---|---|---|---|---|---|---|
2 | 0101-2003-0101 | year_2_complete_71_arm_1 | 90 | PPVT | 0101 | 80 | Receptive Vocabulary |
5 | 0101-2003-0101 | year_5_complete_71_arm_1 | 101 | ROWPVT | 0101 | 113 | Receptive Vocabulary |
9 | 0101-2003-0102 | initial_assessment_arm_1 | 55 | PPVT | 0101 | 44 | Receptive Vocabulary |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | 80 | PPVT | 0101 | 54 | Receptive Vocabulary |
11 | 0101-2003-0102 | year_2_complete_71_arm_1 | 101 | PPVT | 0101 | 68 | Receptive Vocabulary |
receptive.study_id.unique().shape
(3021,)
The four datasets were merged into a single table. First, we concatenate the test scores data:
test_scores = pd.concat([articulation, expressive, receptive, language])
Then we perform a merge between the demographic data and the test scores data:
# Left merge keeps every demographic row, attaching any matching test
# scores by student and event.
lsl_dr = pd.merge(demographic, test_scores, on=["study_id", "redcap_event_name"], how='left')
lsl_dr.tail()
redcap_event_name | academic_year | hl | male | _race | prim_lang | sib | _mother_ed | father_ed | premature_age | ... | sex | known_synd | synd_or_disab | race | age_test | domain | school | score | test_name | test_type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
36993 | year_9_complete_71_arm_1 | 2011-2012 | 0 | 1 | 0 | 0 | 3 | 6 | 6 | 8 | ... | Male | 0 | 0 | 0 | 162 | Receptive Vocabulary | 0102 | 84 | NaN | ROWPVT |
36994 | year_9_complete_71_arm_1 | NaN | 0 | 0 | 0 | 0 | NaN | 6 | 6 | 9 | ... | Female | NaN | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
36995 | year_9_complete_71_arm_1 | 2013-2014 | 0 | 1 | 3 | 0 | 1 | 5 | 5 | 8 | ... | Male | 0 | 1 | 3 | 123 | Articulation | 1147 | 102 | NaN | Goldman |
36996 | year_9_complete_71_arm_1 | 2013-2014 | 0 | 1 | 3 | 0 | 1 | 5 | 5 | 8 | ... | Male | 0 | 1 | 3 | 125 | Expressive Vocabulary | 1147 | 102 | NaN | EVT |
36997 | year_9_complete_71_arm_1 | 2013-2014 | 0 | 1 | 3 | 0 | 1 | 5 | 5 | 8 | ... | Male | 0 | 1 | 3 | 123 | Receptive Vocabulary | 1147 | 95 | NaN | PPVT |
5 rows × 73 columns
# First calendar year of the academic year, e.g. '2013-2014' -> '2013'.
# Note str(x) maps NaN to the literal string 'nan' (visible in the
# counts below).
lsl_dr['academic_year_start'] = lsl_dr.academic_year.apply(lambda x: str(x).strip()[:4])
lsl_dr.academic_year_start.value_counts()
2013 6928 2012 6630 2014 5560 2011 5216 2010 4425 nan 3157 2009 2362 2008 830 2007 531 2006 343 2015 336 2005 286 2004 172 2003 90 2002 47 2001 35 1998 16 1999 15 2000 12 1997 6 1995 1 Name: academic_year_start, dtype: int64
# Optional switch to restrict the analysis to the 2013-2014 year.
current_year_only = False
if current_year_only:
    lsl_dr = lsl_dr[lsl_dr.academic_year_start=='2013']
# Histogram of expressive-vocabulary standard scores (scores < 20
# excluded) and of expressive language scores.
expressive_scores = lsl_dr[(lsl_dr.domain=='Expressive Vocabulary') & (lsl_dr.score>=20)].score
expressive_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');
expressive_lang_scores = lsl_dr[(lsl_dr.domain=='Language')
                                & (lsl_dr.test_type=='expressive')].score
expressive_lang_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');
Export dataset
# Write the merged dataset to CSV (filename depends on the year filter).
if current_year_only:
    lsl_dr.to_csv('lsl_dr_current_year.csv')
else:
    lsl_dr.to_csv('lsl_dr.csv')
lsl_dr.shape
(36998, 74)
lsl_dr.study_id.unique().shape
(5511,)
demographic.study_id.unique().shape
(5511,)
Convert score to floating-point number
# Scores arrive as objects from the merge; coerce to float for analysis.
lsl_dr.score = lsl_dr.score.astype(float)
# tech_class: default 'Bimodal', overridden by the bilateral flags.
# A record flagged both bilateral_ci and bilateral_ha ends up
# 'Bilateral HA' because that assignment runs last.
lsl_dr['tech_class'] = 'Bimodal'
lsl_dr.loc[lsl_dr.bilateral_ci==True, 'tech_class'] = 'Bilateral CI'
lsl_dr.loc[lsl_dr.bilateral_ha==True, 'tech_class'] = 'Bilateral HA'
# Age in whole years from age in months.
lsl_dr['age_year'] = np.floor(lsl_dr.age/12.)
lsl_dr.domain.dropna().unique()
array(['Expressive Vocabulary', 'Language', 'Articulation', 'Receptive Vocabulary'], dtype=object)
lsl_dr.groupby('tech_class').prim_lang.mean().round(2)
tech_class Bilateral CI 0.43 Bilateral HA 0.59 Bimodal 0.50 Name: prim_lang, dtype: float64
# non_profound: degree of hearing loss below 6; a missing degree
# compares False here and therefore counts as False (profound).
lsl_dr['non_profound'] = lsl_dr.degree_hl<6
lsl_dr.groupby('tech_class').non_profound.mean().round(2)
tech_class Bilateral CI 0.08 Bilateral HA 0.86 Bimodal 0.30 Name: non_profound, dtype: float64
# 2x2 panel: mean score by age (years) and technology class, one panel
# per test domain, restricted to ages 2-6.
f, axes = plt.subplots(2, 2, figsize=(14,10))
for ax, dom in zip(np.ravel(axes), lsl_dr.domain.dropna().unique()):
    plot_data = lsl_dr[lsl_dr.domain==dom].pivot_table(index='age_year', columns='tech_class', values='score', aggfunc='mean')
    plot_data[(plot_data.index>1) & (plot_data.index<7)].plot(ax=ax)
    ax.set_ylim(40, 120)
    ax.set_xticks(range(2,7))
    ax.set_title(dom)
lsl_dr.pivot_table?
plot_color = "#64AAE8"
def plot_demo_data(series, labels=None, color=plot_color, rot=0, label_offset=20, xlim=None,
                   ylim=None, title=None, **kwargs):
    """Bar-plot the value counts of a categorical demographic series.

    Parameters
    ----------
    series : pandas Series whose value_counts (sorted by category code) are plotted.
    labels : optional sequence of x tick labels, one per category.
    color : bar color.
    rot : rotation (degrees) of the x tick labels.
    label_offset : vertical offset of the count annotation above each bar.
    xlim, ylim : optional (low, high) axis limits; by default x is tightened
        to the bars and y is left to matplotlib.
    title : optional axes title.
    kwargs : forwarded to Series.plot; may include `ax` to draw on existing axes.
    """
    # Draw on caller-supplied axes when given; otherwise create a new figure.
    # pop() so `ax` is not forwarded twice through **kwargs.
    ax = kwargs.pop('ax', None)
    if ax is None:
        f, ax = plt.subplots()
    counts = series.value_counts().sort_index()
    counts.plot(kind='bar', grid=False, rot=rot, color=color, ax=ax, **kwargs)
    # BUG FIX: an explicitly passed xlim was previously ignored (only the
    # default branch existed); now it is applied.
    if xlim is None:
        ax.set_xlim(-0.5, len(counts)-0.5)
    else:
        ax.set_xlim(*xlim)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_ylabel('Count')
    if labels is not None:
        ax.set_xticklabels(labels)
    if title:
        ax.set_title(title)
    # Annotate each bar with its raw count.
    for i, x in enumerate(counts):
        ax.annotate('%i' % x, (i, x + label_offset))
# One row per student (first record wins) for demographic summaries.
unique_students = demographic.drop_duplicates('study_id')
unique_students.shape
(5511, 67)
unique_students.age.describe()
count 5025.000000 mean 30.382886 std 27.944080 min 0.000000 25% 9.000000 50% 25.000000 75% 41.000000 max 298.000000 Name: age, dtype: float64
plot_demo_data(unique_students.male, ('Female', 'Male'), label_offset=20, ylim=(0, 2800), color=plot_color)
# NOTE(review): 'Tagalong' looks like a typo for 'Tagalog' -- verify the
# REDCap category label before changing this plot label.
plot_demo_data(unique_students.prim_lang,
               ('English', 'Spanish', 'Chinese', 'French', 'German', 'Tagalong', 'Other'),
               rot=70, color=plot_color)
unique_students.prim_lang.count()
4964
plot_demo_data(unique_students.sib, ('1', '2', '3', '4+'),
               color=plot_color)
unique_students.sib.count()
4587
# Labels for the age-at-amplification bins used below.
amp_ages = ["Birth-3 months", "4 months - 6 months", "7 months - 9 months", "10 months- 12 months",
            "13 months - 18 months", "19 months - 24 months", "2 years 1 day - 3 years",
            "3 years 1 day - 4 years", "4 years 1 day - 5 years", "5 years 1 day - 6 years", "6 years"]
# age_amp == 11 is recoded to missing -- presumably an "unknown" sentinel
# code rather than a real age; TODO confirm with the data dictionary.
demographic.loc[demographic.age_amp==11, 'age_amp'] = None
print("There are {0} null values for age_amp".format(sum(demographic.age_amp.isnull())))
There are 4883 null values for age_amp
# Bin each student's age at amplification (months) into the amp_ages
# categories. sort_values replaces the deprecated DataFrame.sort() that
# triggered the FutureWarning shown below.
age_classes = pd.Series(pd.cut(unique_students.sort_values('age_amp').age_amp.dropna(),
                               [-1,3,6,9,12,18,24,36,48,60,72,1000],
                               labels=amp_ages))
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....) if __name__ == '__main__':
# Counts per bin, reindexed into chronological (not frequency) order.
age_amp_counts = age_classes.value_counts()[amp_ages]
age_amp_counts.plot(kind='bar', grid=False, rot=90, color=plot_color)
plt.xlim(-0.5, len(age_amp_counts)-0.5)
plt.ylabel('Count')
# Annotate each bar with its count.
for i,x in enumerate(age_amp_counts):
    plt.annotate('%i' % x, (i, x + 10))
age_amp_counts.sum()
3355
unique_students.age_amp.max()
666.0
# Continuous view: age at amplification in years.
(unique_students.age_amp/12.).hist(bins=16, grid=False, color=plot_color)
plt.ylabel('Count')
plt.xlabel('Age at amplification')
<matplotlib.text.Text at 0x109c03518>
# Hearing-technology distribution per ear (tech_cats defined earlier in the file).
plot_demo_data(unique_students.tech_left, tech_cats, rot=90, color=plot_color)
plot_demo_data(unique_students.tech_right, tech_cats, rot=90, color=plot_color, ylim=(0, 2500))
# Stacked panels: right ear on top (tick labels suppressed), left ear below.
f, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.tech_right, [""]*len(tech_cats), rot=90,
               ax=axes[0], title='Right ear', color=plot_color, ylim=(0, 2500))
plot_demo_data(unique_students.tech_left, tech_cats, rot=90,
               ax=axes[1], title='Left ear', color=plot_color)
unique_students.tech_right.count()
4393
unique_students.tech_left.count()
4384
# Degree-of-hearing-loss category labels (codes 0-6).
degree_hl_cats = 'Normal (0-14)', 'Slight (15-25)', 'Mild (26-40)', \
    'Moderate (41-55)', 'Moderately Severe (56-70)', 'Severe (71-90)', 'Profound (90+)'
plot_demo_data(unique_students.degree_hl_ad, degree_hl_cats, rot=90, color=plot_color)
_, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.degree_hl_ad, [""]*7, rot=90,
               color=plot_color, ax=axes[0], title='Right ear')
plot_demo_data(unique_students.degree_hl_as, degree_hl_cats, rot=90,
               color=plot_color, ylim=(0,2000), ax=axes[1], title='Left ear');
unique_students.degree_hl_as.count()
4298
# Type-of-hearing-loss category labels.
type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
plot_demo_data(unique_students.type_hl_ad, type_hl_cats, rot=90, color=plot_color)
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, color=plot_color)
unique_students.type_hl_ad.count()
4236
unique_students.type_hl_as.count()
4320
# NOTE: identical re-definition of type_hl_cats (harmless, notebook artifact).
type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
f, axes = plt.subplots(2,1)
plot_demo_data(unique_students.type_hl_ad, [""]*len(type_hl_cats), rot=90,
               title='Right ear', ax=axes[0], color=plot_color)
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90,
               title='Left ear', ax=axes[1], color=plot_color)
demographic[demographic.study_id=='1147-2010-0064']
redcap_event_name | academic_year | hl | male | _race | prim_lang | sib | _mother_ed | father_ed | premature_age | ... | bilateral_ci | bilateral_ha | bimodal | tech | implant_category | age_diag | sex | known_synd | synd_or_disab | race | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
13565 | initial_assessment_arm_1 | 2010-2011 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | False | True | False | 0 | 6 | 51 | Female | 0 | 0 | 0 |
13566 | year_1_complete_71_arm_1 | 2011-2012 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | False | True | False | 0 | 6 | 51 | Female | 0 | 0 | 0 |
13567 | year_2_complete_71_arm_1 | 2012-2013 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | False | True | False | 0 | 6 | 51 | Female | 0 | 0 | 0 |
13568 | year_3_complete_71_arm_1 | 2013-2014 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | False | True | False | 0 | 6 | 51 | Female | 0 | 0 | 0 |
4 rows × 67 columns
receptive[receptive.study_id=='1147-2010-0064']
study_id | redcap_event_name | score | test_type | school | age_test | domain | |
---|---|---|---|---|---|---|---|
13565 | 1147-2010-0064 | initial_assessment_arm_1 | 96 | PPVT | 1147 | 63 | Receptive Vocabulary |
13566 | 1147-2010-0064 | year_1_complete_71_arm_1 | 91 | PPVT | 1147 | 73 | Receptive Vocabulary |
13567 | 1147-2010-0064 | year_2_complete_71_arm_1 | 93 | PPVT | 1147 | 85 | Receptive Vocabulary |
lsl_dr[lsl_dr.study_id=='1147-2010-0064']
redcap_event_name | academic_year | hl | male | _race | prim_lang | sib | _mother_ed | father_ed | premature_age | ... | age_test | domain | school | score | test_name | test_type | academic_year_start | tech_class | age_year | non_profound | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5777 | initial_assessment_arm_1 | 2010-2011 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 63 | Expressive Vocabulary | 1147 | 91 | NaN | EVT | 2010 | Bilateral HA | 4 | True |
5778 | initial_assessment_arm_1 | 2010-2011 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 63 | Receptive Vocabulary | 1147 | 96 | NaN | PPVT | 2010 | Bilateral HA | 4 | True |
5779 | initial_assessment_arm_1 | 2010-2011 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 59 | Language | 1147 | 101 | PLS | receptive | 2010 | Bilateral HA | 4 | True |
5780 | initial_assessment_arm_1 | 2010-2011 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 59 | Language | 1147 | 87 | PLS | expressive | 2010 | Bilateral HA | 4 | True |
13901 | year_1_complete_71_arm_1 | 2011-2012 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 72 | Expressive Vocabulary | 1147 | 86 | NaN | EVT | 2011 | Bilateral HA | 4 | True |
13902 | year_1_complete_71_arm_1 | 2011-2012 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 73 | Receptive Vocabulary | 1147 | 91 | NaN | PPVT | 2011 | Bilateral HA | 4 | True |
21515 | year_2_complete_71_arm_1 | 2012-2013 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 88 | Expressive Vocabulary | 1147 | 95 | NaN | EVT | 2012 | Bilateral HA | 4 | True |
21516 | year_2_complete_71_arm_1 | 2012-2013 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 85 | Receptive Vocabulary | 1147 | 93 | NaN | PPVT | 2012 | Bilateral HA | 4 | True |
27748 | year_3_complete_71_arm_1 | 2013-2014 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2013 | Bilateral HA | 4 | True |
9 rows × 77 columns
unique_students.type_hl_ad.count()
4236
# Cross-check: students with receptive-vocabulary tests should all appear
# in the demographic table, and counts should agree across tables.
receptive[receptive.domain=="Receptive Vocabulary"].study_id.unique().shape
(3021,)
demographic.study_id.unique().shape
(5511,)
receptive.study_id.unique().shape
(3021,)
lsl_dr[lsl_dr.domain=="Receptive Vocabulary"].study_id.unique().shape
(3021,)
receptive_ids = receptive.study_id.unique()
demographic_ids = demographic.study_id.unique()
# Empty list confirms every receptive-test ID exists in demographics.
[s for s in receptive_ids if s not in demographic_ids]
[]
def score_summary(domain, test_type=None):
    """Summarize standard scores by age (in whole years) for one test domain.

    Parameters
    ----------
    domain : str, one of the values of lsl_dr.domain
        (e.g. 'Receptive Vocabulary', 'Language').
    test_type : optional str; when given, further restrict to this
        lsl_dr.test_type (e.g. 'receptive', 'expressive').

    Returns
    -------
    DataFrame indexed by age in years (2-11), with columns
    ['Sample Size', 'Mean', 'SD', 'Min', 'Max'].
    """
    subset = lsl_dr[lsl_dr.domain==domain].copy()
    if test_type is not None:
        subset = subset[subset.test_type==test_type]
    # Convert test age from months to truncated whole years. Rows whose
    # age is missing become NaN again on index-aligned assignment and are
    # dropped by the age filter below (NaN > 1 is False).
    subset['age_test'] = (subset.age_test/12).dropna().astype(int)
    # Pool everyone older than 11 into the 11-year bucket.
    subset.loc[subset.age_test > 11, 'age_test'] = 11
    subset = subset[subset.age_test>1]
    byage = subset.groupby('age_test')
    # Build all per-age statistics in one frame; no locals shadowing the
    # builtins min/max (the original bound `min`/`max` as variables).
    summary = pd.DataFrame({'Sample Size': byage.study_id.count(),
                            'Mean': byage.score.mean(),
                            'SD': byage.score.std(),
                            'Min': byage.score.min(),
                            'Max': byage.score.max()})
    summary.index = summary.index.values.astype(int)
    return summary[['Sample Size','Mean','SD','Min','Max']]
receptive_summary = score_summary("Receptive Vocabulary")
receptive_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 403 | 93.265509 | 18.062536 | 40 | 144 |
3 | 1397 | 92.099499 | 19.389263 | 0 | 150 |
4 | 1515 | 90.675248 | 20.314116 | 0 | 149 |
5 | 1128 | 89.859043 | 18.185493 | 0 | 142 |
6 | 624 | 85.669872 | 16.504194 | 40 | 154 |
7 | 413 | 83.053269 | 16.021892 | 40 | 130 |
8 | 295 | 80.610169 | 17.686631 | 20 | 132 |
9 | 218 | 77.885321 | 17.816542 | 25 | 160 |
10 | 188 | 76.303191 | 17.550191 | 20 | 123 |
11 | 450 | 78.668889 | 19.112355 | 20 | 134 |
receptive_summary.describe()
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
count | 10.000000 | 10.000000 | 10.000000 | 10.0000 | 10.000000 |
mean | 663.100000 | 84.809001 | 18.064321 | 20.5000 | 141.800000 |
std | 496.314069 | 6.359775 | 1.291166 | 16.4063 | 11.802071 |
min | 188.000000 | 76.303191 | 16.021892 | 0.0000 | 123.000000 |
25% | 322.000000 | 79.154209 | 17.584301 | 5.0000 | 132.500000 |
50% | 431.500000 | 84.361570 | 17.939539 | 20.0000 | 143.000000 |
75% | 1002.000000 | 90.471196 | 18.880639 | 36.2500 | 149.750000 |
max | 1515.000000 | 93.265509 | 20.314116 | 40.0000 | 160.000000 |
receptive_summary['Sample Size'].sum()
6631
# Sample size per age, annotated with counts.
receptive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_summary["Sample Size"]):
    plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
<matplotlib.text.Text at 0x109bac2b0>
expressive_summary = score_summary("Expressive Vocabulary")
expressive_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 383 | 92.488251 | 21.829641 | 23 | 141 |
3 | 1343 | 93.293373 | 21.700653 | 0 | 145 |
4 | 1492 | 92.269437 | 21.873205 | 0 | 146 |
5 | 1103 | 91.451496 | 20.127388 | 0 | 145 |
6 | 623 | 86.491172 | 18.464257 | 20 | 146 |
7 | 416 | 83.899038 | 15.723956 | 38 | 131 |
8 | 286 | 84.006993 | 16.518993 | 34 | 122 |
9 | 204 | 81.431373 | 16.195243 | 36 | 145 |
10 | 182 | 81.758242 | 15.388049 | 40 | 122 |
11 | 451 | 84.944568 | 17.502864 | 18 | 146 |
expressive_summary['Sample Size'].sum()
6483
# Sample size per age, annotated with counts; y-limit depends on whether
# the dataset was restricted to the current year.
expressive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_summary["Sample Size"]):
    plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.xlim(-0.5, 9.5)
if current_year_only:
    plt.ylim(0, 400)
else:
    plt.ylim(0, 1600)
articulation_summary = score_summary("Articulation")
articulation_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 288 | 85.180556 | 15.086812 | 50 | 122 |
3 | 1167 | 83.655527 | 18.397543 | 40 | 126 |
4 | 1333 | 83.588147 | 20.725702 | 0 | 123 |
5 | 1032 | 83.908915 | 35.255688 | 39 | 999 |
6 | 589 | 79.049236 | 21.785893 | 39 | 112 |
7 | 391 | 80.191816 | 51.611731 | 3 | 999 |
8 | 248 | 79.084677 | 21.061047 | 40 | 107 |
9 | 172 | 81.412791 | 20.488435 | 40 | 108 |
10 | 134 | 81.052239 | 19.973786 | 40 | 105 |
11 | 310 | 84.835484 | 55.537870 | 39 | 999 |
articulation_summary['Sample Size'].sum()
5664
# NOTE(review): the articulation Max column shows 999 for some ages -- the
# articulation export (unlike the others) was read without na_values, so
# 999 sentinels leak into the scores; verify upstream.
sample_size = articulation_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(articulation_summary["Sample Size"]):
    plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);
Language scores
# Inspect the domains and test types present in the merged dataset.
lsl_dr.domain.unique()
array(['Expressive Vocabulary', 'Language', nan, 'Articulation', 'Receptive Vocabulary'], dtype=object)
# NOTE(review): 'Arizonia' appears to be a data-entry misspelling of the
# Arizona articulation test -- present in the raw data, not introduced here.
lsl_dr.test_type.unique()
array(['EOWPVT', 'receptive', 'expressive', nan, 'Goldman', 'EVT', 'PPVT', 'Arizonia', 'ROWPVT', 'Arizonia and Goldman', 'EOWPVT and EVT', 'PPVT and ROWPVT'], dtype=object)
receptive_language_summary = score_summary("Language", "receptive")
receptive_language_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 942 | 86.061571 | 22.053419 | 50 | 150 |
3 | 1336 | 84.869760 | 19.694166 | 50 | 144 |
4 | 1310 | 85.103817 | 19.572003 | 43 | 145 |
5 | 934 | 83.780514 | 18.783587 | 47 | 140 |
6 | 481 | 77.860707 | 17.628083 | 11 | 127 |
7 | 318 | 75.877358 | 18.713363 | 40 | 123 |
8 | 197 | 74.817259 | 19.682871 | 40 | 123 |
9 | 53 | 70.792453 | 21.579333 | 40 | 120 |
10 | 44 | 77.954545 | 20.185137 | 40 | 119 |
11 | 69 | 76.014493 | 21.604393 | 40 | 139 |
receptive_language_summary['Sample Size'].sum()
5684
# Sample size per age for receptive language, annotated with counts.
sample_size = receptive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_language_summary["Sample Size"]):
    plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
expressive_language_summary = score_summary("Language", "expressive")
expressive_language_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 936 | 88.157051 | 18.278298 | 50 | 150 |
3 | 1337 | 82.311892 | 17.566191 | 20 | 147 |
4 | 1303 | 80.346124 | 19.558155 | 45 | 141 |
5 | 952 | 78.564076 | 20.089026 | 45 | 144 |
6 | 499 | 71.647295 | 19.240286 | 6 | 140 |
7 | 338 | 66.789941 | 20.660322 | 40 | 124 |
8 | 202 | 67.787129 | 21.338290 | 40 | 118 |
9 | 52 | 65.557692 | 21.233911 | 40 | 108 |
10 | 44 | 75.750000 | 23.544243 | 40 | 119 |
11 | 68 | 73.794118 | 22.807801 | 40 | 132 |
expressive_language_summary['Sample Size'].sum()
5731
# Sample size per age for expressive language, annotated with counts.
sample_size = expressive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_language_summary["Sample Size"]):
    plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
# Age at first record, in years.
(unique_students.age/12.).describe()
count 5025.000000 mean 2.531907 std 2.328673 min 0.000000 25% 0.750000 50% 2.083333 75% 3.416667 max 24.833333 Name: age, dtype: float64
def calc_difference(x, col='a_fo', jitter=True):
    """Change in functional-outcome rating `col` between a student's
    earliest and latest assessments.

    Parameters
    ----------
    x : DataFrame for one student (a groupby('study_id') group) with
        columns `col` and 'funct_out_age' (assessment age in months).
    col : rating column to difference (default audition, 'a_fo').
    jitter : add small Gaussian noise so overlapping scatter points
        remain distinguishable.

    Returns
    -------
    dict with 'difference' (latest minus earliest rating) and 'months'
    (time between the two assessments), or None when there are fewer
    than two records or any required value is missing.
    """
    # Need at least two assessments and no missing values in either column.
    if len(x) < 2 or x[col].isnull().sum() or x['funct_out_age'].isnull().sum():
        return None
    # idxmax/idxmin return row *labels*; the original Series.argmax/argmin
    # changed to positional semantics in modern pandas, which breaks
    # label-based indexing on groupby subsets.
    diff = x[col][x.funct_out_age.idxmax()] - x[col][x.funct_out_age.idxmin()]
    if jitter:
        # Small random offset for scatter-plot readability.
        diff += np.random.normal(scale=0.05)
    months = x.funct_out_age.max() - x.funct_out_age.min()
    if months > 1000:
        # Flag implausibly long spans (likely data-entry errors).
        print(x['funct_out_age'])
    return {'difference': diff, 'months': months}
# Progress (rating change) vs. time between assessments, per outcome scale.
# Audition ('a_fo', the calc_difference default).
audition = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference).dropna().values.tolist())
plt.figure(figsize=(10,6))
plt.scatter(audition.months, audition.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Audition')
<matplotlib.text.Text at 0x109bee080>
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison if self._edgecolors == str('face'):
# Spoken language comprehension ('slc_fo').
slc = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='slc_fo').dropna().values.tolist())
plt.figure(figsize=(10,6))
plt.scatter(slc.months, slc.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (comprehension)')
<matplotlib.text.Text at 0x10aaa6da0>
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison if self._edgecolors == str('face'):
# Spoken language expression ('sle_fo').
sle = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='sle_fo').dropna().values.tolist())
plt.figure(figsize=(10,6))
plt.scatter(sle.months, sle.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (expression)')
<matplotlib.text.Text at 0x10aaa2908>
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison if self._edgecolors == str('face'):
# Distribution of degree-of-hearing-loss codes (0-6; 6 = profound).
lsl_dr.degree_hl.dropna().value_counts()
6 16788 4 4377 3 4228 5 4090 2 1667 0 1267 1 285 Name: degree_hl, dtype: int64
ax = lsl_dr.degree_hl.hist(bins=7)
# Months between enrollment age and intervention age; only positive gaps
# are plotted (non-positive gaps excluded).
diff = (lsl_dr['age'] - lsl_dr['age_int'])
diff[diff>0].hist(bins=50)
<matplotlib.axes._subplots.AxesSubplot at 0x10d4d0780>
# Share of records with intervention / enrollment before 6 months of age.
(lsl_dr.age_int<6).mean()
0.19517271203848857
(lsl_dr.age<6).mean()
0.1296826855505703
Counts by year
lsl_dr.groupby('study_id').first()
redcap_event_name | academic_year | hl | male | _race | prim_lang | sib | _mother_ed | father_ed | premature_age | ... | age_test | domain | school | score | test_name | test_type | academic_year_start | tech_class | age_year | non_profound | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
study_id | |||||||||||||||||||||
0101-2003-0101 | initial_assessment_arm_1 | 2002-2003 | 0 | 0 | 0 | 0 | 1 | 6 | 6 | 9 | ... | 54 | Expressive Vocabulary | 0101 | 58 | PLS | EOWPVT | 2002 | Bimodal | 4 | False |
0101-2003-0102 | initial_assessment_arm_1 | 2003-2004 | 0 | 0 | 0 | 0 | 1 | 2 | 2 | 8 | ... | 44 | Articulation | 0101 | 72 | PLS | Goldman | 2003 | Bilateral HA | 3 | True |
0101-2004-0101 | initial_assessment_arm_1 | 2006-2007 | 0 | 1 | 0 | 0 | 0 | 6 | 6 | 8 | ... | 37 | Receptive Vocabulary | 0101 | 62 | PLS | PPVT | 2006 | Bimodal | 2 | True |
0101-2004-0102 | initial_assessment_arm_1 | 2004-2005 | 0 | 0 | 0 | 0 | 1 | 5 | 6 | 9 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2004 | Bimodal | 0 | True |
0101-2004-0103 | initial_assessment_arm_1 | 2012-2013 | 0 | 1 | 0 | 0 | 1 | 4 | 4 | 8 | ... | 96 | Expressive Vocabulary | 0101 | 104 | CELF-4 | EVT | 2012 | Bilateral CI | 0 | False |
0101-2004-0104 | initial_assessment_arm_1 | 2004-2005 | 0 | 1 | 0 | 0 | 1 | 6 | 6 | 8 | ... | 32 | Articulation | 0101 | 84 | PLS | Goldman | 2004 | Bilateral HA | 0 | True |
0101-2004-0105 | initial_assessment_arm_1 | 2004-2005 | 0 | 0 | 0 | 0 | 2 | 6 | 6 | 9 | ... | 47 | Articulation | 0101 | 78 | CELF-P2 | Goldman | 2004 | Bimodal | 2 | False |
0101-2005-0101 | initial_assessment_arm_1 | 2006-2007 | 0 | 1 | 0 | 0 | 2 | 5 | 4 | 8 | ... | 28 | Articulation | 0101 | 61 | PLS | Goldman | 2006 | Bilateral HA | 2 | True |
0101-2005-0102 | initial_assessment_arm_1 | 2004-2005 | 0 | 1 | 0 | 0 | 2 | 3 | 2 | 9 | ... | 63 | Articulation | 0101 | 87 | CELF-P2 | Goldman | 2004 | Bilateral HA | 4 | True |
0101-2006-0101 | initial_assessment_arm_1 | 2005-2006 | 0 | 0 | 0 | 0 | 1 | 6 | 6 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2005 | Bimodal | 0 | False |
0101-2006-0104 | initial_assessment_arm_1 | 2006-2007 | 0 | 0 | 0 | 0 | 0 | 5 | 5 | 9 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2006 | Bilateral CI | 2 | False |
0101-2007-0104 | initial_assessment_arm_1 | 2007-2008 | 0 | 0 | 0 | 0 | 1 | 4 | 6 | 9 | ... | 41 | Articulation | 0101 | 122 | NaN | Goldman | 2007 | Bimodal | 4 | True |
0101-2007-0105 | initial_assessment_arm_1 | 2007-2008 | 0 | 1 | 0 | 0 | 0 | 6 | 6 | 9 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2007 | Bimodal | 11 | False |
0101-2007-0107 | initial_assessment_arm_1 | 2005-2006 | 0 | 0 | 0 | 0 | 1 | 4 | 4 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2005 | Bilateral HA | 0 | True |
0101-2008-0102 | initial_assessment_arm_1 | 2008-2009 | 0 | 0 | 0 | 0 | 1 | 6 | 6 | 9 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2008 | Bimodal | 14 | False |
0101-2008-0106 | initial_assessment_arm_1 | 2007-2008 | 0 | 0 | 0 | 0 | 1 | 4 | 4 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2007 | Bilateral HA | 0 | True |
0101-2009-0101 | initial_assessment_arm_1 | 2008-2009 | 0 | 0 | 0 | 0 | 1 | 6 | 6 | 9 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2008 | Bimodal | 6 | False |
0101-2010-0101 | initial_assessment_arm_1 | 2008-2009 | 0 | 1 | 0 | 0 | 1 | 6 | 6 | 9 | ... | 104 | Articulation | 0101 | 90 | CELF-4 | Arizonia | 2008 | Bilateral HA | 8 | True |
0101-2010-0103 | initial_assessment_arm_1 | 2010-2011 | 0 | 0 | 0 | 0 | 2 | 4 | 3 | 8 | ... | 25 | Language | 0101 | 63 | PLS | receptive | 2010 | Bilateral HA | 0 | False |
0101-2010-0104 | initial_assessment_arm_1 | 2010-2011 | 0 | 1 | 3 | 0 | 1 | 2 | 2 | 8 | ... | 30 | Expressive Vocabulary | 0101 | 90 | PLS | EOWPVT | 2010 | Bilateral HA | 0 | False |
0101-2010-0105 | initial_assessment_arm_1 | 2011-2012 | 0 | 1 | 0 | 0 | 0 | 5 | 6 | 6 | ... | 30 | Language | 0101 | 66 | PLS | receptive | 2011 | Bilateral CI | 2 | False |
0101-2012-0101 | initial_assessment_arm_1 | 2013-2014 | 0 | 1 | 0 | 0 | 0 | 6 | 6 | 9 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2013 | Bimodal | 2 | False |
0101-2013-0101 | initial_assessment_arm_1 | 2012-2013 | 0 | 1 | 0 | 0 | 0 | 3 | 2 | 8 | ... | 12 | Language | 0101 | 58 | PLS | receptive | 2012 | Bilateral HA | 0 | True |
0101-2013-0103 | initial_assessment_arm_1 | 2012-2013 | 0 | 1 | 0 | 0 | 2 | 6 | 6 | 9 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2012 | Bilateral CI | 4 | False |
0101-2013-0104 | initial_assessment_arm_1 | 2012-2013 | 0 | 1 | 0 | 0 | 1 | 4 | 4 | 8 | ... | 12 | Language | 0101 | 83 | PLS | receptive | 2012 | Bilateral HA | 0 | True |
0101-2013-0112 | initial_assessment_arm_1 | 2012-2013 | 0 | 1 | 0 | 0 | 1 | 3 | 6 | 9 | ... | 11 | Language | 0101 | 90 | PLS | receptive | 2012 | Bilateral HA | 0 | True |
0101-2013-0113 | initial_assessment_arm_1 | 2013-2014 | 0 | 0 | 0 | 0 | 1 | 2 | 2 | 8 | ... | 4 | Language | 0101 | 96 | PLS | receptive | 2013 | Bimodal | 0 | True |
0101-2013-0114 | initial_assessment_arm_1 | 2013-2014 | 0 | 1 | 0 | 0 | 0 | 3 | 3 | 8 | ... | 6 | Language | 0101 | 50 | PLS | receptive | 2013 | Bimodal | 0 | True |
0101-2013-0115 | initial_assessment_arm_1 | 2013-2014 | 0 | 1 | 0 | 0 | 2 | 2 | 2 | 8 | ... | 11 | Language | 0101 | 79 | PLS | receptive | 2013 | Bilateral HA | 0 | True |
0101-2013-0116 | initial_assessment_arm_1 | 2013-2014 | 2 | 0 | 0 | 0 | 3 | 4 | 4 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2013 | Bimodal | 1 | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1151-2012-0008 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 2 | 4 | 2 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | 3 | False |
1151-2012-0009 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 1 | 4 | 6 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral HA | 1 | True |
1151-2012-0010 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 1 | 6 | 6 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | 2 | True |
1151-2012-0011 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 0 | 6 | 6 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | 4 | False |
1151-2012-0012 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 0 | 6 | 6 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral HA | 5 | False |
1151-2012-0013 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 0 | 2 | 2 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | 5 | False |
1151-2012-0014 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 2 | 1 | 1 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | NaN | False |
1151-2013-0001 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 0 | 2 | 2 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral HA | 4 | False |
1151-2013-0002 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 2 | 2 | 3 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral CI | 2 | False |
1151-2013-0003 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 0 | 1 | 1 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral CI | 2 | False |
1151-2013-0004 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 2 | 0 | 1 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | 3 | False |
1151-2013-0005 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 0 | 1 | 6 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | 3 | False |
1151-2013-0006 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 0 | 1 | 1 | 4 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral HA | 3 | False |
1151-2013-0007 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 0 | 4 | 2 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | NaN | False |
1151-2013-0008 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 0 | 3 | 2 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral CI | 2 | False |
1151-2013-0009 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 0 | 2 | 6 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | NaN | False |
1151-2013-0010 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 0 | 4 | 4 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral CI | 6 | False |
1151-2013-0011 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 0 | 6 | 6 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | 1 | False |
1151-2013-0012 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 2 | 3 | 3 | 5 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | 3 | False |
1151-2014-0001 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 3 | 6 | 4 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral CI | 2 | False |
1151-2014-0002 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 1 | 2 | 4 | 7 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral HA | 3 | False |
1151-2014-0003 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 0 | 6 | 6 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral HA | 3 | False |
1151-2014-0004 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 1 | 2 | 1 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral HA | 3 | False |
1151-2014-0005 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 3 | 6 | 6 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | 4 | False |
1151-2014-0006 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 0 | 4 | 0 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral CI | 3 | False |
1151-2014-0007 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 2 | 6 | 6 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral HA | 4 | False |
1151-2014-0008 | initial_assessment_arm_1 | NaN | 0 | 0 | 2 | 1 | 1 | 1 | 1 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | 5 | False |
1151-2014-0009 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 2 | 4 | 2 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral HA | 6 | True |
1151-2014-0010 | initial_assessment_arm_1 | NaN | 0 | 1 | 2 | 1 | 1 | 2 | 2 | 8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bilateral CI | 6 | False |
9308-2015-0002 | initial_assessment_arm_1 | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | nan | Bimodal | NaN | False |
5511 rows × 76 columns
# One row per student from the merged dataset (first record per study_id).
unique_students = lsl_dr.groupby('study_id').first()
# Enrollment counts by academic year; [:-1] drops the last index entry
# (the 'nan' year bucket, since academic_year_start is stored as strings).
unique_students.academic_year_start.value_counts().sort_index()[:-1].plot(kind='bar')
plt.ylabel('Frequency'); plt.xlabel('Academic year');
# Stacked counts of students with/without a syndrome or disability per year;
# the string comparison != 'nan' excludes the missing-year bucket.
disab_by_year = unique_students.groupby('academic_year_start')['synd_or_disab'].value_counts().unstack().fillna(0)
disab_by_year.columns = ['No', 'Yes']
disab_by_year[disab_by_year.index!='nan'].plot(kind='bar', stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x1096d47b8>