# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
Connect to database to import data for the three test domains and demographic information:
# Connect to the LSL-DR REDCap project via the PyCap API.
from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
# NOTE(review): the API token is read from a user-specific local path; this
# cell fails for anyone else running the notebook — consider an env var.
api_key = open("/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt").read()
lsl_dr_project = Project(api_url, api_key)
# Project field metadata (names, labels, types) for reference.
metadata = lsl_dr_project.export_metadata()
# for i,j in zip(lsl_dr_project.field_names,
# lsl_dr_project.field_labels):
# print('{0}: \t{1}'.format(i,j))
Import each database from REDCap:
# Articulation scores: Arizonia (aaps) and Goldman-Fristoe (gf2) standard
# scores plus age at testing. 999/9999 are sentinel codes for missing.
articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss']
articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', df_kwargs={'index_col':None,
'na_values':[999, 9999]})
# Raw (list-of-dicts) export, used here only to peek at the first record.
records = lsl_dr_project.export_records(fields=articulation_fields)
print(records[0]['study_id'])
0101-2002-0101
# Expressive vocabulary: EOWPVT and EVT standard scores + ages at testing.
expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss']
expressive = lsl_dr_project.export_records(fields=expressive_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[999, 9999]})
# Receptive vocabulary: PPVT and ROWPVT standard scores + ages at testing.
receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss']
receptive = lsl_dr_project.export_records(fields=receptive_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[999, 9999]})
# Language scores across four instruments (PLS, OWLS, CELF-P2, CELF-4),
# including the CELF-P2 subtest standard scores used later for the
# language_subtest table.
language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls',
'owls_lc_ss','owls_oe_ss','age_test_owls',
'celfp_rl_ss','celfp_el_ss','age_test_celp',
'celf_elss','celf_rlss','age_test_celf',
'celfp_ss_ss', 'celfp_ws_ss', 'celfp_ev_ss', 'celfp_fd_ss',
'celfp_rs_ss', 'celfp_bc_ss', 'celfp_wcr_ss', 'celfp_wce_ss',
'celfp_wct_ss']
language_raw = lsl_dr_project.export_records(fields=language_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[999, 9999]})
# Demographics and enrollment/outcome covariates. Note the extra 888
# sentinel treated as missing here (in addition to 999/9999).
demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year', 'academic_year_rv',
'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race',
'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as',
'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae',
'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes',
'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age',
'att_days_hr', 'att_days_sch', 'att_days_st2_417']
demographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[888, 999, 9999]})
# Quick sanity check on the academic-year distribution.
demographic_raw.academic_year_rv.value_counts()
2013.0 2501 2012.0 2429 2014.0 2170 2011.0 1901 2010.0 1609 2009.0 1021 2015.0 931 2008.0 436 2007.0 277 2006.0 189 2005.0 138 2004.0 89 2003.0 65 2002.0 36 2001.0 24 2000.0 12 1999.0 12 1998.0 9 15.0 3 1997.0 2 1995.0 1 Name: academic_year_rv, dtype: int64
demographic_raw[demographic_raw.study_id=='1147-2010-0064']
study_id | redcap_event_name | academic_year | academic_year_rv | hl | gender | race | prim_lang | sib | mother_ed | ... | sle_fo | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
14665 | 1147-2010-0064 | initial_assessment_arm_1 | 2010-2011 | 2010.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | ... | 3.0 | 6.0 | 65.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
14666 | 1147-2010-0064 | year_1_complete_71_arm_1 | 2011-2012 | 2011.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 3.0 | 5.0 | 77.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN |
14667 | 1147-2010-0064 | year_2_complete_71_arm_1 | 2012-2013 | 2012.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 3.0 | 5.0 | 89.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN |
14668 | 1147-2010-0064 | year_3_complete_71_arm_1 | 2013-2014 | 2013.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 4.0 | 5.0 | 101.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN |
4 rows × 47 columns
Several fields in the demographic data have missing values.
demographic_raw.head()
study_id | redcap_event_name | academic_year | academic_year_rv | hl | gender | race | prim_lang | sib | mother_ed | ... | sle_fo | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0101-2002-0101 | initial_assessment_arm_1 | 2002-2003 | 2002.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 6.0 | ... | 2.0 | 2.0 | 54.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 0101-2002-0101 | year_1_complete_71_arm_1 | 2003-2004 | 2003.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 4.0 | 4.0 | 80.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 0101-2002-0101 | year_2_complete_71_arm_1 | 2004-2005 | 2004.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 4.0 | 4.0 | 80.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 0101-2002-0101 | year_3_complete_71_arm_1 | 2005-2006 | 2005.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 5.0 | 5.0 | 96.0 | 3.0 | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 0101-2002-0101 | year_4_complete_71_arm_1 | 2006-2007 | 2006.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 5.0 | 5.0 | 109.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 47 columns
We can fill missing values forward from previous observation (by study_id
)
# Forward-fill missing demographic values within each student (study_id),
# ordered by the longitudinal event name so later visits inherit values
# recorded at earlier visits.
# FIX: DataFrame.sort(columns=...) was deprecated (see the FutureWarning
# captured below) and later removed; sort_values(by=...) is the supported
# equivalent with identical behavior.
demographic = demographic_raw.sort_values(by='redcap_event_name').groupby('study_id').transform(
    lambda recs: recs.fillna(method='ffill'))#.reset_index()
# transform() drops the grouping column, so re-attach study_id in the same
# (sorted) row order.
demographic["study_id"] = demographic_raw.sort_values(by='redcap_event_name').study_id
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....) if __name__ == '__main__': /Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:3: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....) app.launch_new_instance()
Random check to make sure this worked
demographic[demographic.study_id=='1147-2010-0064']
redcap_event_name | academic_year | academic_year_rv | hl | gender | race | prim_lang | sib | mother_ed | father_ed | ... | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | study_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
14665 | initial_assessment_arm_1 | 2010-2011 | 2010.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 6.0 | 65.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
14666 | year_1_complete_71_arm_1 | 2011-2012 | 2011.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 5.0 | 77.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
14667 | year_2_complete_71_arm_1 | 2012-2013 | 2012.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 5.0 | 89.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
14668 | year_3_complete_71_arm_1 | 2013-2014 | 2013.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 5.0 | 101.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
4 rows × 47 columns
Demographic data without missing values:
demographic.head()
redcap_event_name | academic_year | academic_year_rv | hl | gender | race | prim_lang | sib | mother_ed | father_ed | ... | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | study_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | initial_assessment_arm_1 | 2002-2003 | 2002.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 6.0 | 6.0 | ... | 2.0 | 54.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0101-2002-0101 |
8001 | initial_assessment_arm_1 | 2009-2010 | 2009.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 5.0 | 3.0 | ... | 5.0 | 138.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0628-2005-1814 |
7995 | initial_assessment_arm_1 | 2009-2010 | 2009.0 | 0.0 | 0.0 | 6.0 | 0.0 | 0.0 | 4.0 | 3.0 | ... | 4.0 | 78.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0628-2005-1756 |
7990 | initial_assessment_arm_1 | 2009-2010 | 2009.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 3.0 | 4.0 | ... | 4.0 | 77.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0628-2005-1744 |
7987 | initial_assessment_arm_1 | 2009-2010 | 2009.0 | 0.0 | 1.0 | 1.0 | 0.0 | 2.0 | 6.0 | 6.0 | ... | 4.0 | 118.0 | 4.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0628-2005-1741 |
5 rows × 47 columns
5 language measures:
# Test type
# Each row may carry scores from up to four language instruments; build
# boolean masks from the non-null age-at-test columns.
language_raw["test_name"] = None
language_raw["test_type"] = None
language_raw["score"] = None
CELP = language_raw.age_test_celp.notnull()  # CELF-Preschool-2 given
CELF = language_raw.age_test_celf.notnull()  # CELF-4 given
PLS = language_raw.age_test_pls.notnull()    # PLS given
OWLS = language_raw.age_test_owls.notnull()  # OWLS given
language_raw['age_test'] = None
# Unified age-at-test, taken from whichever instrument was administered.
language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp
language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf
language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls
language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls
# Split each testing record into a receptive row (language1) and an
# expressive row (language2), then stack them into long format.
language1 = language_raw[CELP | CELF | PLS | OWLS].copy()
language2 = language1.copy()

language1["test_type"] = "receptive"
language1.loc[CELP, "test_name"] = "CELF-P2"
language1.loc[CELF, "test_name"] = "CELF-4"
language1.loc[PLS, "test_name"] = "PLS"
language1.loc[OWLS, "test_name"] = "OWLS"
# Receptive standard score, per instrument.
language1.loc[CELP, "score"] = language1.celfp_rl_ss
language1.loc[CELF, "score"] = language1.celf_rlss
language1.loc[PLS, "score"] = language1.pls_ac_ss
language1.loc[OWLS, "score"] = language1.owls_lc_ss

language2["test_type"] = "expressive"
language2.loc[CELP, "test_name"] = "CELF-P2"
language2.loc[CELF, "test_name"] = "CELF-4"
language2.loc[PLS, "test_name"] = "PLS"
language2.loc[OWLS, "test_name"] = "OWLS"
# Expressive standard score, per instrument.
# FIX: read these from language2 itself; the original pulled them from
# language1. The values are identical (language2 is a copy and these raw
# columns are never modified in between), but referencing the frame being
# assigned removes a latent cross-frame aliasing hazard.
language2.loc[CELP, "score"] = language2.celfp_el_ss
language2.loc[CELF, "score"] = language2.celf_elss
language2.loc[PLS, "score"] = language2.pls_ec_ss
language2.loc[OWLS, "score"] = language2.owls_oe_ss

# Stack receptive + expressive rows and drop rows with no usable score.
language = pd.concat([language1, language2])
language = language[language.score.notnull()]
print(pd.crosstab(language.test_name, language.test_type))
print("There are {0} null values for score".format(sum(language["score"].isnull())))
test_type expressive receptive test_name CELF-4 627 545 CELF-P2 1511 1516 OWLS 1093 1099 PLS 3572 3584 There are 0 null values for score
A school
variable was added, which is the first four columns of the study_id
:
language["school"] = language.study_id.str.slice(0,4)
language_subtest = language[["study_id", "redcap_event_name", "score", "test_type",
"test_name", "school", "age_test",
'celfp_ss_ss', 'celfp_ws_ss',
'celfp_ev_ss', 'celfp_fd_ss',
'celfp_rs_ss', 'celfp_bc_ss',
'celfp_wcr_ss', 'celfp_wce_ss',
'celfp_wct_ss']]
language = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test"]]
language["domain"] = "Language"
language.head()
study_id | redcap_event_name | score | test_type | test_name | school | age_test | domain | |
---|---|---|---|---|---|---|---|---|
0 | 0101-2002-0101 | initial_assessment_arm_1 | 51 | receptive | PLS | 0101 | 54 | Language |
5 | 0101-2002-0101 | year_5_complete_71_arm_1 | 61 | receptive | OWLS | 0101 | 113 | Language |
9 | 0101-2003-0102 | initial_assessment_arm_1 | 55 | receptive | PLS | 0101 | 44 | Language |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | 77 | receptive | PLS | 0101 | 54 | Language |
11 | 0101-2003-0102 | year_2_complete_71_arm_1 | 93 | receptive | CELF-P2 | 0101 | 68 | Language |
We converted the articulation dataset into a "long" format:
# Test type
# Masks are built on the full frame BEFORE filtering; the .loc calls below
# rely on pandas aligning these boolean Series by index.
articulation["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()  # Arizonia score present
GF = articulation.gf2_ss.notnull()     # Goldman-Fristoe score present
articulation = articulation[ARIZ | GF]
# NOTE(review): "Arizonia" (sic) is kept as-is — these strings are data
# labels consumed downstream; renaming would change the dataset.
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"
print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))
# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]
Goldman 5437 Arizonia 503 Arizonia and Goldman 73 Name: test_type, dtype: int64 There are 0 null values for test_type
A school
variable was added, which is the first four columns of the study_id
:
articulation["school"] = articulation.study_id.str.slice(0,4)
The age was taken to be the Arizonia age if there are both test types:
articulation["age_test"] = articulation.age_test_aaps
articulation.loc[articulation.age_test.isnull(), 'age_test'] = articulation.age_test_gf2[articulation.age_test.isnull()]
print(articulation.age_test.describe())
count 6011.000000 mean 68.857095 std 30.613506 min 23.000000 25% 47.000000 50% 60.000000 75% 81.000000 max 243.000000 Name: age_test, dtype: float64
Finally, we dropped unwanted columns and added a domain identification column for merging:
# Drop the instrument-specific columns and tag rows for the merged dataset.
articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1)
articulation["domain"] = "Articulation"
articulation.head()
study_id | redcap_event_name | test_type | score | school | age_test | domain | |
---|---|---|---|---|---|---|---|
1 | 0101-2002-0101 | year_1_complete_71_arm_1 | Goldman | 78.0 | 0101 | 80.0 | Articulation |
9 | 0101-2003-0102 | initial_assessment_arm_1 | Goldman | 72.0 | 0101 | 44.0 | Articulation |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | Goldman | 97.0 | 0101 | 54.0 | Articulation |
14 | 0101-2004-0101 | year_2_complete_71_arm_1 | Goldman | 75.0 | 0101 | 53.0 | Articulation |
15 | 0101-2004-0101 | year_3_complete_71_arm_1 | Goldman | 80.0 | 0101 | 66.0 | Articulation |
We excluded unwanted columns and rows for which age, gender or race were missing:
# Retain only subset of columns
#demographic = demographic[demographic.gender.notnull()]
demographic = demographic.rename(columns={'gender':'male'})
Due to sample size considerations, we reduced the non-English primary language variable to English (0) and non-English (1):
demographic["non_english"] = None
demographic.loc[demographic.prim_lang.notnull(), 'non_english'] = demographic.prim_lang[demographic.prim_lang.notnull()]>0
print(demographic.non_english.value_counts())
print("There are {0} null values for non_english".format(sum(demographic.non_english.isnull())))
False 11986 True 2688 Name: non_english, dtype: int64 There are 622 null values for non_english
Mother's education (mother_ed
) and father's education (father_ed
) were both recoded to:
Category 6 (unknown) was recoded as missing.
# Recode mother's education onto a coarser scale, preserving the raw
# values in `_mother_ed`:
#   raw 0 -> 0 (kept by the initial copy)    raw 1 -> 0
#   raw 2, 3 -> 1    raw 4 -> 2    raw 5 -> 3
#   raw 6 (unknown) -> missing
demographic = demographic.rename(columns={"mother_ed":"_mother_ed"})
demographic["mother_ed"] = demographic._mother_ed.copy()
demographic.loc[demographic._mother_ed==1, 'mother_ed'] = 0
# FIX: compare against the raw `_mother_ed` column on BOTH sides of the
# disjunction; the original tested the partially-recoded `mother_ed` for
# the ==3 case, which only produced the right answer because 3 had not
# been remapped yet at that point in the sequence.
demographic.loc[(demographic._mother_ed==2) | (demographic._mother_ed==3), 'mother_ed'] = 1
demographic.loc[demographic._mother_ed==4, 'mother_ed'] = 2
demographic.loc[demographic._mother_ed==5, 'mother_ed'] = 3
demographic.loc[demographic._mother_ed==6, 'mother_ed'] = None
print("_mother_ed:")
print(demographic._mother_ed.value_counts())
print("mother_ed:")
print(demographic.mother_ed.value_counts())
print("\nThere are {0} null values for mother_ed".format(sum(demographic.mother_ed.isnull())))
_mother_ed: 6.0 5340 4.0 3140 3.0 2127 5.0 1696 2.0 1489 1.0 498 0.0 222 Name: _mother_ed, dtype: int64 mother_ed: 1.0 3616 2.0 3140 3.0 1696 0.0 720 Name: mother_ed, dtype: int64 There are 6124 null values for mother_ed
Secondary diagnosis
demographic.shape
(15296, 49)
# Secondary-diagnosis flag: boolean etiology==0; etiology codes above 1
# are treated as missing.
demographic['secondary_diagnosis'] = demographic.etiology==0
# Suspected or unknown treated as missing
demographic.loc[demographic.etiology > 1, 'secondary_diagnosis'] = None
demographic.secondary_diagnosis.value_counts()
0.0 11224 1.0 2526 Name: secondary_diagnosis, dtype: int64
demographic.secondary_diagnosis.mean()
0.18370909090909091
Premature status was recoded to True (premature) and False (full-term). Here, premature indicates <36 weeks.
# Convert the premature_age code to weeks premature via abs(code-8)*2,
# so code 8 -> 0 weeks and each step away from 8 adds two weeks; code 9
# is treated as missing.
# NOTE(review): mapping inferred from the arithmetic only — confirm the
# code meanings against the REDCap codebook.
demographic['premature_weeks'] = demographic.premature_age.copy()
demographic.loc[demographic.premature_age==9, 'premature_weeks'] = None
demographic.premature_weeks = abs(demographic.premature_weeks-8)*2
print("There are {0} null values for premature_weeks".format(sum(demographic.premature_weeks.isnull())))
There are 3394 null values for premature_weeks
demographic.premature_weeks.value_counts()
0.0 10190 2.0 609 4.0 386 12.0 202 6.0 186 10.0 160 8.0 124 14.0 42 16.0 3 Name: premature_weeks, dtype: int64
Recode implant technology variables for each ear to one of four categories (None, Baha, Hearing aid, Cochlear implant):
demographic.tech_ad.value_counts()
1.0 5221 0.0 4497 7.0 1588 5.0 1056 2.0 529 6.0 433 8.0 76 9.0 70 4.0 31 3.0 26 10.0 4 Name: tech_ad, dtype: int64
# Recode right/left device technology (tech_ad / tech_as) into the
# categories indexed by tech_cats: 0=None, 1=OAD, 2=Hearing aid,
# 3=Cochlear, 4=Other (the default for unmapped codes).
# NOTE(review): tech_right/tech_left are re-derived with a different
# scheme later in the notebook, which overwrites these columns.
tech_cats = ["None", "OAD", "Hearing aid", "Cochlear", "Other"]
demographic["tech_right"] = 4
demographic.loc[demographic.tech_ad==7, 'tech_right'] = 0
demographic.loc[demographic.tech_ad==3, 'tech_right'] = 1
demographic.loc[demographic.tech_ad.isin([1,2,4,5,10]), 'tech_right'] = 2
demographic.loc[demographic.tech_ad.isin([0,8,6]), 'tech_right'] = 3
demographic.loc[demographic.tech_ad.isnull(), 'tech_right'] = None
demographic["tech_left"] = 4
demographic.loc[demographic.tech_as==7, 'tech_left'] = 0
demographic.loc[demographic.tech_as==3, 'tech_left'] = 1
demographic.loc[demographic.tech_as.isin([1,2,4,5,10]), 'tech_left'] = 2
demographic.loc[demographic.tech_as.isin([0,8,6]), 'tech_left'] = 3
demographic.loc[demographic.tech_as.isnull(), 'tech_left'] = None
demographic.tech_left.value_counts()
2.0 6919 3.0 4579 0.0 1925 4.0 61 1.0 18 Name: tech_left, dtype: int64
demographic.tech_right.value_counts()
2.0 6841 3.0 5006 0.0 1588 4.0 70 1.0 26 Name: tech_right, dtype: int64
Substitute valid missing values for hearing loss:
# Treat code 5 as a true missing value for hearing-loss type in each ear.
demographic.loc[demographic.type_hl_ad==5, 'type_hl_ad'] = None
# FIX: the original assigned to 'type_hl_ad' here as well, clobbering
# right-ear values wherever the LEFT-ear code was 5 and never cleaning
# the left-ear column; the target must be 'type_hl_as'.
demographic.loc[demographic.type_hl_as==5, 'type_hl_as'] = None
Create degree_hl
, which is the maximum level of hearing loss in either ear:
demographic["degree_hl"] = np.maximum(demographic.degree_hl_ad, demographic.degree_hl_as)
Create compound indicator variable for each technology (Baha, Hearing aid, Cochlear implant):
demographic.columns
Index(['redcap_event_name', 'academic_year', 'academic_year_rv', 'hl', 'male', 'race', 'prim_lang', 'sib', '_mother_ed', 'father_ed', 'premature_age', 'onset_1', 'age_amp', 'age_int', 'age', 'synd_cause', 'etiology', 'etiology_2', 'hearing_changes', 'ae', 'ad_250', 'ad_500', 'degree_hl_ad', 'type_hl_ad', 'tech_ad', 'age_ci', 'as_250', 'as_500', 'degree_hl_as', 'type_hl_as', 'tech_as', 'age_ci_2', 'time', 'age_disenrolled', 'funct_out_age', 'slc_fo', 'sle_fo', 'a_fo', 'fam_age', 'family_inv', 'att_days_sch', 'att_days_st2_417', 'att_days_hr', 'demo_ses', 'school_lunch', 'medicaid', 'study_id', 'non_english', 'mother_ed', 'secondary_diagnosis', 'premature_weeks', 'tech_right', 'tech_left', 'degree_hl'], dtype='object')
demographic["oad"] = 0
demographic.oad = demographic.oad.astype(object)
demographic.loc[(demographic.tech_right==1) | (demographic.tech_left==1), 'oad'] = 1
demographic.loc[(demographic.tech_right==1) & (demographic.tech_left==1), 'oad'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'oad'] = None
print("oad:")
print(demographic.drop_duplicates(subset='study_id').oad.value_counts())
print("There are {0} null values for OAD".format(sum(demographic.oad.isnull())))
demographic["hearing_aid"] = 0
demographic.hearing_aid = demographic.hearing_aid.astype(object)
demographic.loc[(demographic.tech_right==2) | (demographic.tech_left==2), 'hearing_aid'] = 1
demographic.loc[(demographic.tech_right==2) & (demographic.tech_left==2), 'hearing_aid'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_right.isnull()), 'hearing_aid'] = None
print("\nhearing_aid:")
print(demographic.drop_duplicates(subset='study_id').hearing_aid.value_counts())
print("There are {0} null values for hearing_aid".format(sum(demographic.hearing_aid.isnull())))
demographic["cochlear"] = 0
demographic.cochlear = demographic.cochlear.astype(object)
demographic.loc[(demographic.tech_right==3) | (demographic.tech_left==3), 'cochlear'] = 1
demographic.loc[(demographic.tech_right==3) & (demographic.tech_left==3), 'cochlear'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'cochlear'] = None
print("\ncochlear:")
print(demographic.drop_duplicates(subset='study_id').cochlear.value_counts())
print("There are {0} null values for cochlear".format(sum(demographic.cochlear.isnull())))
print(len(demographic))
oad: 0 4770 1 4 Name: oad, dtype: int64 There are 1711 null values for OAD hearing_aid: 2 2249 0 1669 1 824 Name: hearing_aid, dtype: int64 There are 1765 null values for hearing_aid cochlear: 0 3203 2 935 1 636 Name: cochlear, dtype: int64 There are 1711 null values for cochlear 15296
Identify bilateral and bimodal individuals:
demographic["unilateral_ci"] = demographic.cochlear==1
demographic["bilateral_ci"] = demographic.cochlear==2
demographic["bilateral_ha"] = demographic.hearing_aid==2
demographic["bimodal"] = (demographic.cochlear==1) & (demographic.hearing_aid==1)
demographic.bilateral_ci.sum(), demographic.bilateral_ha.sum(), demographic.bimodal.sum(), demographic.unilateral_ci.sum()
(3718, 5632, 1437, 2149)
demographic.drop_duplicates(subset='study_id')[['unilateral_ci','bilateral_ci',
'bilateral_ha',
'bimodal']].sum()
unilateral_ci 636 bilateral_ci 935 bilateral_ha 2249 bimodal 384 dtype: int64
Create a variable that identifies the default configuration, including bilateral CI (0), bimodal with the hearing aid on the left (1), and bimodal with the hearing aid on the right (2)
# tech: 0 = default (everyone else, including bilateral CI),
#       1 = bimodal with the hearing aid on the left,
#       2 = bimodal with the hearing aid on the right.
demographic['tech'] = 0
demographic.loc[(demographic.bimodal) & (demographic.tech_left==2), 'tech'] = 1
demographic.loc[(demographic.bimodal) & (demographic.tech_right==2), 'tech'] = 2
print("There are {0} null values for tech".format(sum(demographic.tech.isnull())))
There are 0 null values for tech
demographic["implant_category"] = None
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==0),
'implant_category'] = 0
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==0),
'implant_category'] = 1
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==1),
'implant_category'] = 2
demographic.loc[(demographic.cochlear==2) & (demographic.hearing_aid==0) & (demographic.oad==0),
'implant_category'] = 3
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==1) & (demographic.oad==0),
'implant_category'] = 4
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==1),
'implant_category'] = 5
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==2) & (demographic.oad==0),
'implant_category'] = 6
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==1),
'implant_category'] = 7
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==2),
'implant_category'] = 8
demographic.implant_category.value_counts()
6 5632 3 3718 4 1437 1 1034 0 692 8 13 2 12 7 5 5 1 Name: implant_category, dtype: int64
Age when hearing loss was diagnosed. Data are entered inconsistently here, so we have to go in and replace non-numeric values.
demographic.onset_1.unique()
array([ 15. , 80. , 14. , 62. , 2. , 49. , 19. , 9. , 18. , 4. , 0. , 10. , 12. , 1. , 31. , 16. , 26. , 61. , 46. , 24. , 36. , 21. , 52. , 30. , 7. , 51. , 8. , 3. , 6. , 17. , 50. , 23. , 42. , 37. , 33. , 60. , 13. , nan, 22. , 28. , 82. , 34. , 35. , 38. , 95. , 5. , 59. , 25. , 48. , 1.5, 41. , 53. , 88. , 29. , 27. , 39. , 65. , 64. , 47. , 79. , 97. , 96. , 107. , 77. , 74. , 11. , 84. , 20. , 45. , 32. , 81. , 55. , 58. , 70. , 154. , 54. , 57. , 72. , 43. , 83. , 78. , 116. , 40. , 44. , 119. , 63. , 66. , 140. , 56. , 87. , 76. , 68. , 92. , 86. , 126. , 85. , 133. , 103. , 67. , 71. , 2.5, 98. , 75. , 0.5, 89. , 152. ])
# Don't need this anymore
# demographic['age_diag'] = demographic.onset_1.replace({'birth': 0, 'R- Birth L-16mo': 0, 'birth - 3': 0, 'at birth': 0, 'NBHS': 0,
# 'at Birth': 0, '1-2': 1.5, '2-3': 2.5, '0-3': 1.5}).astype(float)
# onset_1 is already numeric in this export (see the unique() check above),
# so it is used directly as the age at diagnosis.
demographic['age_diag'] = demographic.onset_1
Number of null values for age_diag
demographic.age_diag.isnull().sum()
3864
# Human-readable sex label derived from the male indicator (0/1).
demographic['sex'] = demographic.male.replace({0:'Female', 1:'Male'})
import seaborn as sb
# One row per student: first visit with a non-missing sex.
unique_students = demographic.dropna(subset=['sex']).groupby('study_id').first()
# ag = sb.factorplot("sex", data=unique_students,
# palette="PuBuGn_d", kind='count')
# ag.set_xticklabels(['Female ({})'.format((unique_students.male==0).sum()),
# 'Male ({})'.format((unique_students.male==1).sum())])
# ag.set_xlabels('')
unique_students.shape
(5522, 64)
Child has another diagnosed disability
# known_synd: True when synd_cause == 0; codes above 1 (unknown/suspected)
# are treated as missing.
demographic['known_synd'] = (demographic.synd_cause == 0)
# Unknown or suspected
demographic.loc[demographic.synd_cause > 1, 'known_synd'] = None
# If either known syndrome or secondary diagnosis
# NOTE(review): Python's `or` returns the first operand if it is truthy,
# otherwise the second — so a missing (None) first operand falls through
# to the second rather than propagating missingness symmetrically.
demographic['synd_or_disab'] = demographic.apply(lambda x: x['secondary_diagnosis'] or x['known_synd'], axis=1)
Missing sibling counts were properly encoded as None
(missing).
# Sibling-count code 4 denotes missing; recode to None.
demographic.loc[demographic.sib==4, 'sib'] = None
We reduced the number of race categories, pooling those that were neither caucasian, black, hispanic or asian to "other", due to small sample sizes for these categories. Category 7 (unknown) was recoded as missing.
# Pool race codes: keep 0-3 (Caucasian, Black, Hispanic, Asian), map all
# remaining codes above 3 to 4 ("Other"), and recode 7 (unknown) as
# missing. Raw values are preserved in _race. Order matters: the ==7
# assignment must precede the >3 pooling, or 7 would become "Other".
races = ["Caucasian", "Black or African American", "Hispanic or Latino", "Asian", "Other"]
demographic = demographic.rename(columns={"race":"_race"})
demographic["race"] = demographic._race.copy()
demographic.loc[demographic.race==7, 'race'] = None
demographic.loc[demographic.race>3, 'race'] = 4
print("_race:")
print(demographic._race.value_counts())
print("race:")
print(demographic.race.value_counts())
print("There are {0} null values for race".format(sum(demographic.race.isnull())))
# Replace with recoded column
_race: 0.0 7955 2.0 2649 1.0 1407 3.0 1074 6.0 751 8.0 542 7.0 241 4.0 66 5.0 37 Name: _race, dtype: int64 race: 0.0 7955 2.0 2649 1.0 1407 4.0 1396 3.0 1074 Name: race, dtype: int64 There are 815 null values for race
Recode implant technology variables
# Second recode of the technology columns — this OVERWRITES the earlier
# tech_right/tech_left derivation. Raw codes are first collapsed, then
# mirrored with abs(x - 3), mapping 0->3, 1->2, 2->1, 3->0, 4->1.
# NOTE(review): raw code 10 is not remapped here, so abs(10-3)=7 falls
# outside the 0-4 range of tech_cats (the earlier recode mapped 10 to
# "Hearing aid"); and the intermediate value 4 ends up at 1 ("Baha").
# Confirm the intended mapping against the REDCap codebook.
tech_cats = ["None", "Baha", "Hearing aid", "Cochlear", "Other"]
demographic["tech_right"] = demographic.tech_ad.copy()
demographic.loc[demographic.tech_right==6, 'tech_right'] = 0
demographic.loc[demographic.tech_right==4, 'tech_right'] = 1
demographic.loc[demographic.tech_right==5, 'tech_right'] = 1
demographic.loc[demographic.tech_right==3, 'tech_right'] = 2
demographic.loc[demographic.tech_right==7, 'tech_right'] = 3
demographic.loc[demographic.tech_right==8, 'tech_right'] = 3
demographic.loc[demographic.tech_right==9, 'tech_right'] = 4
demographic.tech_right = np.abs(demographic.tech_right - 3)
demographic["tech_left"] = demographic.tech_as.copy()
demographic.loc[demographic.tech_left==6, 'tech_left'] = 0
demographic.loc[demographic.tech_left==4, 'tech_left'] = 1
demographic.loc[demographic.tech_left==5, 'tech_left'] = 1
demographic.loc[demographic.tech_left==3, 'tech_left'] = 2
demographic.loc[demographic.tech_left==7, 'tech_left'] = 3
demographic.loc[demographic.tech_left==8, 'tech_left'] = 3
demographic.loc[demographic.tech_left==9, 'tech_left'] = 4
demographic.tech_left = np.abs(demographic.tech_left - 3)
# Don't need this anymore
# demographic['age_amp'] = demographic.age_amp.replace({'22 mo': 22, 'unknown': np.nan, 'none': np.nan,
# 'n/a unilateral loss': np.nan, 'not amplified yet': np.nan,
# 'not amplified': np.nan, 'n/a': np.nan, '4 months-6 months': 5,
# '6 weeks': 0.5, '13-18': 15.5, '0-3': 1.5, '24-36': 30}).astype(float)
# Preview of the academic_year cleanup: fix known typos via replace() and
# normalize '*' separators to '-'.
# NOTE(review): Series.str.replace() has historically treated the pattern
# as a regex, and a bare '*' is not a valid regex — this relies on
# version-specific behavior; verify (or pass regex=False / escape the
# pattern) when upgrading pandas.
demographic.academic_year.replace(
{'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011',
'2020-2011': '2010-2011', '2012-20013': '2012-2013',
'642014-2015': '2014-2015', '20114-2015': '2014-2015',
'2011-012': '2011-2012',
'0000-0000': np.nan}).str.replace('*', '-').unique()
array(['2002-2003', '2009-2010', '2011-2012', '2009-2011', '2006-2007', '2007-2008', '2008-2009', '2014-2015', '2013-2014', '2012-2013', nan, '2015-2016', '2010-2011', '2014', '2005-2006', '2004-2005', '2003-2004', '2010-2011 2010-2011', '2011', '2010', '2009', '2012', '2013', '1995-1996', '1999-2000', '2000-2001', '1998-1999', '1997-1998', '2001-2002', '2014-15', '2015-2015', '2015', '2041-2015', '2015-2106', '22014-2015', '2014-1015'], dtype=object)
# Apply the academic_year cleanup for real, with additional typo fixes
# discovered in the preview above ('2014-2105', '2015-2015', ...).
# NOTE(review): same str.replace('*', '-') regex caveat as the preview cell.
demographic['academic_year'] = demographic.academic_year.replace(
{'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011',
'2020-2011': '2010-2011', '2012-20013': '2012-2013',
'642014-2015': '2014-2015', '20114-2015': '2014-2015',
'2011-012': '2011-2012', '2014-2105': '2014-2015', '2005-2004': '2004-2005',
'2014-205': '2014-2015', '2017-2015': '2014-2015', '2014-1015': '2014-2015',
'2015-2015': '2014-2015', '2009-2011': '2009-2010',
'0000-0000': np.nan}).str.replace('*', '-')
Removed entries that don't contain dashes
# Blank out values without a dash (e.g. bare years like '2014'), then
# strip all internal whitespace from the remaining year ranges.
demographic.loc[~(demographic.academic_year.notnull() & demographic.academic_year.str.contains('-')),
'academic_year'] = np.nan
demographic.loc[demographic.academic_year.notnull(), 'academic_year'] = demographic.academic_year[demographic.academic_year.notnull()].apply(lambda x: ''.join(x.split()))
demographic.age_amp.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x11a63eeb8>
We converted the expressive vocabulary dataset to "long" format:
# Test type
# Long-format conversion for expressive vocabulary (EOWPVT / EVT).
# Masks are built before filtering; .loc aligns them by index.
expressive["test_type"] = None
EOWPVT = expressive.eowpvt_ss.notnull()
EVT = expressive.evt_ss.notnull()
expressive = expressive[EOWPVT | EVT]
expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT"
expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT"
expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT"
print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull())))
# Score: EOWPVT preferred when both tests are present.
expressive["score"] = expressive.eowpvt_ss
expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT]
There are 0 null values for test_type
expressive.test_type.value_counts()
EVT 3881 EOWPVT 2784 EOWPVT and EVT 149 Name: test_type, dtype: int64
A school
variable was added, which is the first four columns of the study_id
:
expressive["school"] = expressive.study_id.str.slice(0,4)
The age was taken to be the EOWPVT age if there are both test types:
expressive["age_test"] = expressive.age_test_eowpvt
expressive.loc[expressive.age_test.isnull(), 'age_test'] = expressive.age_test_evt[expressive.age_test.isnull()]
Finally, we dropped unwanted columns and added a domain identification column for merging:
# Keep only the merge columns and tag rows for the merged dataset.
expressive = expressive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
expressive["domain"] = "Expressive Vocabulary"
expressive.head()
study_id | redcap_event_name | score | test_type | school | age_test | domain | |
---|---|---|---|---|---|---|---|
0 | 0101-2002-0101 | initial_assessment_arm_1 | 58.0 | EOWPVT | 0101 | 54.0 | Expressive Vocabulary |
2 | 0101-2002-0101 | year_2_complete_71_arm_1 | 84.0 | EOWPVT | 0101 | 80.0 | Expressive Vocabulary |
5 | 0101-2002-0101 | year_5_complete_71_arm_1 | 90.0 | EOWPVT | 0101 | 113.0 | Expressive Vocabulary |
14 | 0101-2004-0101 | year_2_complete_71_arm_1 | 90.0 | EOWPVT | 0101 | 53.0 | Expressive Vocabulary |
15 | 0101-2004-0101 | year_3_complete_71_arm_1 | 87.0 | EOWPVT | 0101 | 66.0 | Expressive Vocabulary |
We converted the receptive vocabulary data table to "long" format:
receptive.columns
Index(['study_id', 'redcap_event_name', 'age_test_ppvt', 'ppvt_ss', 'age_test_rowpvt', 'rowpvt_ss'], dtype='object')
# Test type
# Long-format conversion for receptive vocabulary (PPVT / ROWPVT).
# Masks are built before filtering; .loc aligns them by index.
receptive["test_type"] = None
PPVT = receptive.ppvt_ss.notnull()
ROWPVT = receptive.rowpvt_ss.notnull()
receptive = receptive[PPVT | ROWPVT]
receptive.loc[PPVT & ROWPVT, "test_type"] = "PPVT and ROWPVT"
receptive.loc[PPVT & ~ROWPVT, "test_type"] = "PPVT"
receptive.loc[~PPVT & ROWPVT, "test_type"] = "ROWPVT"
print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull())))
# Score: PPVT preferred when both tests are present.
receptive["score"] = receptive.ppvt_ss
receptive.loc[~PPVT & ROWPVT, "score"] = receptive.rowpvt_ss[~PPVT & ROWPVT]
There are 0 null values for test_type
A `school` variable was added, which is the first four characters of the `study_id`:
receptive["school"] = receptive.study_id.str.slice(0,4)
The age was taken to be the PPVT age if there are both test types:
receptive["age_test"] = receptive.age_test_ppvt
receptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()]
print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull())))
There are 23 null values for age_test
Finally, we dropped unwanted columns and added a domain identification column for merging:
receptive = receptive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
receptive["domain"] = "Receptive Vocabulary"
receptive.head()
study_id | redcap_event_name | score | test_type | school | age_test | domain | |
---|---|---|---|---|---|---|---|
2 | 0101-2002-0101 | year_2_complete_71_arm_1 | 90.0 | PPVT | 0101 | 80.0 | Receptive Vocabulary |
5 | 0101-2002-0101 | year_5_complete_71_arm_1 | 101.0 | ROWPVT | 0101 | 113.0 | Receptive Vocabulary |
9 | 0101-2003-0102 | initial_assessment_arm_1 | 55.0 | PPVT | 0101 | 44.0 | Receptive Vocabulary |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | 80.0 | PPVT | 0101 | 54.0 | Receptive Vocabulary |
11 | 0101-2003-0102 | year_2_complete_71_arm_1 | 101.0 | PPVT | 0101 | 68.0 | Receptive Vocabulary |
receptive.study_id.unique().shape
(3108,)
The four datasets were merged into a single table. First, we concatenate the test scores data:
test_scores = pd.concat([articulation, expressive, receptive, language])
Then we perform a merge between the demographic data and the test scores data:
lsl_dr = pd.merge(demographic, test_scores, on=["study_id", "redcap_event_name"], how='left')
lsl_dr.tail()
redcap_event_name | academic_year | academic_year_rv | hl | male | _race | prim_lang | sib | _mother_ed | father_ed | ... | sex | known_synd | synd_or_disab | race | age_test | domain | school | score | test_name | test_type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
39154 | year_9_complete_71_arm_1 | 2011-2012 | 2011.0 | 0.0 | 1.0 | 0.0 | 0.0 | 3.0 | 6.0 | 6.0 | ... | Male | 0.0 | 0.0 | 0.0 | 162 | Receptive Vocabulary | 0102 | 84 | NaN | ROWPVT |
39155 | year_9_complete_71_arm_1 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | 0.0 | 0.0 | NaN | 203 | Expressive Vocabulary | 1147 | 95 | NaN | EVT |
39156 | year_9_complete_71_arm_1 | NaN | NaN | 0.0 | 0.0 | 2.0 | 0.0 | 1.0 | 6.0 | 6.0 | ... | Female | 0.0 | 0.0 | 2.0 | 119 | Articulation | 0624 | 102 | NaN | Goldman |
39157 | year_9_complete_71_arm_1 | NaN | NaN | 0.0 | 0.0 | 2.0 | 0.0 | 1.0 | 6.0 | 6.0 | ... | Female | 0.0 | 0.0 | 2.0 | 119 | Expressive Vocabulary | 0624 | 96 | NaN | EVT |
39158 | year_9_complete_71_arm_1 | NaN | NaN | 0.0 | 0.0 | 2.0 | 0.0 | 1.0 | 6.0 | 6.0 | ... | Female | 0.0 | 0.0 | 2.0 | 119 | Receptive Vocabulary | 0624 | 82 | NaN | PPVT |
5 rows × 74 columns
lsl_dr['academic_year_start'] = lsl_dr.academic_year.apply(lambda x: str(x).strip()[:4])
lsl_dr.academic_year_start.value_counts()
2013 6952 2012 6641 2014 6144 2011 5256 2010 4457 nan 3164 2009 2502 2015 1646 2008 827 2007 536 2006 345 2005 286 2004 172 2003 90 2002 47 2001 37 1998 16 1999 16 2000 12 1997 6 2201 5 2041 1 1995 1 Name: academic_year_start, dtype: int64
current_year_only = False
if current_year_only:
lsl_dr = lsl_dr[lsl_dr.academic_year_start=='2013']
expressive_scores = lsl_dr[(lsl_dr.domain=='Expressive Vocabulary') & (lsl_dr.score>=20)].score
expressive_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');
expressive_lang_scores = lsl_dr[(lsl_dr.domain=='Language')
& (lsl_dr.test_type=='expressive')].score
expressive_lang_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');
Export dataset
if current_year_only:
lsl_dr.to_csv('lsl_dr_current_year.csv')
else:
lsl_dr.to_csv('lsl_dr.csv')
lsl_dr.shape
(39159, 79)
lsl_dr.study_id.unique().shape
(5898,)
demographic.study_id.unique().shape
(5898,)
Convert score to floating-point number
lsl_dr.score = lsl_dr.score.astype(float)
lsl_dr['tech_class'] = 'Bimodal'
lsl_dr.loc[lsl_dr.bilateral_ci==True, 'tech_class'] = 'Bilateral CI'
lsl_dr.loc[lsl_dr.bilateral_ha==True, 'tech_class'] = 'Bilateral HA'
lsl_dr['age_year'] = np.floor(lsl_dr.age/12.)
lsl_dr.domain.dropna().unique()
array(['Expressive Vocabulary', 'Language', 'Articulation', 'Receptive Vocabulary'], dtype=object)
lsl_dr.groupby('tech_class').prim_lang.mean().round(2)
tech_class Bilateral CI 0.45 Bilateral HA 0.58 Bimodal 0.50 Name: prim_lang, dtype: float64
lsl_dr['non_profound'] = lsl_dr.degree_hl<6
lsl_dr.groupby('tech_class').non_profound.mean().round(2)
tech_class Bilateral CI 0.08 Bilateral HA 0.87 Bimodal 0.31 Name: non_profound, dtype: float64
lsl_dr['age_test_year'] = -999
lsl_dr.loc[lsl_dr.age_test.notnull(), 'age_test_year'] = (lsl_dr.age_test/12).dropna().astype(int)
lsl_dr.loc[lsl_dr.age_test_year==-999, 'age_test_year'] = np.nan
f, axes = plt.subplots(2, 2, figsize=(14,10))
for ax, dom in zip(np.ravel(axes), lsl_dr.domain.dropna().unique()):
plot_data = lsl_dr[lsl_dr.domain==dom].pivot_table(index='age_year', columns='tech_class', values='score', aggfunc='mean')
plot_data[(plot_data.index>1) & (plot_data.index<7)].plot(ax=ax)
ax.set_ylim(40, 120)
ax.set_xticks(range(2,7))
ax.set_title(dom)
ppvt_only = lsl_dr[lsl_dr.test_type=='PPVT']
ppvt_only.age_year.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x11ad41be0>
ppvt_345 = ppvt_only[ppvt_only.age_test_year.isin([3,4,5])]
ppvt_345.score.describe()
count 2576.000000 mean 92.463509 std 20.127618 min 20.000000 25% 79.000000 50% 94.000000 75% 107.000000 max 153.000000 Name: score, dtype: float64
ppvt_345.groupby('age_test_year').agg({'score':[min, max, np.median, np.count_nonzero]})
score | ||||
---|---|---|---|---|
min | max | median | count_nonzero | |
age_test_year | ||||
3.0 | 36.0 | 153.0 | 95.0 | 873.0 |
4.0 | 20.0 | 149.0 | 94.0 | 936.0 |
5.0 | 20.0 | 142.0 | 91.0 | 767.0 |
lsl_dr.test_type.value_counts()
expressive 6803 receptive 6744 Goldman 5437 PPVT 4445 EVT 3881 EOWPVT 2784 ROWPVT 2346 Arizonia 503 PPVT and ROWPVT 199 EOWPVT and EVT 149 Arizonia and Goldman 73 Name: test_type, dtype: int64
evt_only = lsl_dr[lsl_dr.test_type=='EVT']
evt_only.age_test_year.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x11ad78470>
evt_345 = evt_only[evt_only.age_test_year.isin([3,4,5])]
evt_345.groupby('age_test_year').agg({'score':[min, max, np.median, np.count_nonzero]})
score | ||||
---|---|---|---|---|
min | max | median | count_nonzero | |
age_test_year | ||||
3.0 | 19.0 | 147.0 | 100.0 | 767.0 |
4.0 | 20.0 | 146.0 | 99.0 | 813.0 |
5.0 | 20.0 | 150.0 | 97.0 | 644.0 |
pls_only = (language[(language.test_name=='PLS')]
.convert_objects(convert_numeric=True))
pls_only['age_year'] = np.floor(pls_only.age_test/12).astype(int)
pls_345 = pls_only[pls_only.age_year.isin([3,4,5])]
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: convert_objects is deprecated. Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric. from ipykernel import kernelapp as app
(pls_345.assign(normal_limits=pls_345.score>=85).groupby(['age_year', 'test_type'])
.agg({'score':[min, max, np.median, len],
'normal_limits': np.mean}))
score | normal_limits | |||||
---|---|---|---|---|---|---|
min | max | median | len | mean | ||
age_year | test_type | |||||
3 | expressive | 50.0 | 145.0 | 78.0 | 813.0 | 0.355474 |
receptive | 50.0 | 140.0 | 80.0 | 813.0 | 0.404674 | |
4 | expressive | 50.0 | 141.0 | 73.0 | 602.0 | 0.284053 |
receptive | 50.0 | 136.0 | 77.0 | 606.0 | 0.381188 | |
5 | expressive | 50.0 | 138.0 | 68.0 | 304.0 | 0.259868 |
receptive | 50.0 | 129.0 | 73.0 | 306.0 | 0.290850 |
celf_only = (language_subtest[(language_subtest.test_name=='CELF-P2')]
.convert_objects(convert_numeric=True))
celf_only['age_year'] = np.floor(celf_only.age_test/12).astype(int)
celf_46 = celf_only[celf_only.age_year.isin([4,6])]
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: convert_objects is deprecated. Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric. from ipykernel import kernelapp as app
subtests = ['celfp_ss_ss', 'celfp_ws_ss',
'celfp_ev_ss', 'celfp_fd_ss',
'celfp_rs_ss', 'celfp_bc_ss',
'celfp_wcr_ss', 'celfp_wce_ss',
'celfp_wct_ss']
(celf_46.groupby('age_year')
.agg({st:np.median for st in subtests})).T
age_year | 4 | 6 |
---|---|---|
celfp_wct_ss | 10.0 | 8.0 |
celfp_ev_ss | 8.0 | 5.0 |
celfp_wcr_ss | 10.0 | 10.0 |
celfp_wce_ss | 9.0 | 7.0 |
celfp_ss_ss | 8.0 | 5.0 |
celfp_ws_ss | 6.0 | 4.0 |
celfp_fd_ss | 8.0 | 4.0 |
celfp_rs_ss | 7.0 | 4.0 |
celfp_bc_ss | 9.0 | 4.5 |
def calc_norm_range(dataset):
    """Fraction of students whose mean score is within normal limits (>= 85)."""
    student_means = dataset.groupby('study_id').score.mean()
    return (student_means >= 85).mean()
Mean score of each domain
calc_norm_range(lsl_dr[(lsl_dr.domain=='Language')
& (lsl_dr.test_type=='expressive')
& (lsl_dr.age_test_year.isin([3,4,5]))])
0.40083217753120665
for year in range(2010, 2014):
value = calc_norm_range(lsl_dr[(lsl_dr.domain=='Language')
& (lsl_dr.test_type=='receptive') & (lsl_dr.academic_year_rv==year)
& (lsl_dr.age_test_year.isin([3,4,5]))]).round(2)
print('{}: {}'.format(year, value))
2010: 0.53 2011: 0.48 2012: 0.5 2013: 0.55
calc_norm_range(lsl_dr[(lsl_dr.domain=='Receptive Vocabulary')
& (lsl_dr.age_test_year.isin([3,4,5]))])
0.63506493506493511
calc_norm_range(lsl_dr[(lsl_dr.domain=='Expressive Vocabulary')
& (lsl_dr.age_test_year.isin([3,4,5]))])
0.64257555847568992
calc_norm_range(lsl_dr[(lsl_dr.domain=='Articulation')
& (lsl_dr.age_test_year.isin([3,4,5]))])
0.49158249158249157
Summary statistics
(lsl_dr.groupby('study_id').male.first().dropna()==0).mean()
0.46830858384643242
(lsl_dr.groupby('study_id').race.first().dropna()==0).mean()
0.54349040789718761
(lsl_dr.groupby('study_id').non_english.first().dropna()==False).sum()
4404
lsl_dr.groupby('study_id').sib.first().dropna().count()
5040
lsl_dr.groupby('study_id').onset_1.first().dropna().count()
4161
lsl_dr.groupby('study_id').age_amp.first().dropna().median()
8.0
lsl_dr.groupby('study_id').age_int.first().dropna().median()
9.0
lsl_dr.groupby('study_id').age.first().dropna().count()
5404
_unique = lsl_dr.dropna(subset=['age_disenrolled', 'age']).groupby('study_id').first()
(_unique.age_disenrolled - _unique.age).count()
1868
synd_cause = lsl_dr.groupby('study_id').synd_cause.first().dropna()
synd_cause = synd_cause[synd_cause<3]
synd_cause.value_counts()/synd_cause.value_counts().sum()
1.0 0.885766 0.0 0.091387 2.0 0.022847 Name: synd_cause, dtype: float64
etiology = lsl_dr.groupby('study_id').etiology.first().dropna()
etiology = etiology[etiology<3]
etiology.value_counts()/etiology.value_counts().sum()
1.0 0.791393 0.0 0.163977 2.0 0.044630 Name: etiology, dtype: float64
lsl_dr['concerns'] = lsl_dr.etiology_2.replace({0:'none', 4:'none', 1:'mild', 2:'moderate', 3:'severe'})
lsl_dr.groupby('study_id').concerns.last().dropna().value_counts()
none 3328 moderate 546 mild 436 severe 344 Name: concerns, dtype: int64
plot_color = "#64AAE8"

def plot_demo_data(series, labels=None, color=plot_color, rot=0, label_offset=20, xlim=None,
                   ylim=None, title=None, **kwargs):
    """Draw a labelled bar chart of the value counts of *series*.

    A new figure/axes pair is created unless an ``ax`` keyword is passed
    (it is forwarded to pandas' plotting via **kwargs). Each bar is
    annotated with its count, placed *label_offset* units above the bar.
    """
    ax = kwargs.get('ax')
    if ax is None:
        _, ax = plt.subplots()
    counts = series.value_counts().sort_index()
    counts.plot(kind='bar', grid=False, rot=rot, color=color, **kwargs)
    if xlim is None:
        # Tighten the x-axis around the plotted bars
        ax.set_xlim(-0.5, len(counts) - 0.5)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_ylabel('Count')
    if labels is not None:
        ax.set_xticklabels(labels)
    if title:
        ax.set_title(title)
    # Annotate each bar with its count
    for bar_index, bar_count in enumerate(counts):
        ax.annotate('%i' % bar_count, (bar_index, bar_count + label_offset))
unique_students = demographic.drop_duplicates('study_id')
unique_students.shape
(5898, 68)
unique_students.age.describe()
count 5381.000000 mean 29.302360 std 27.507899 min 0.000000 25% 8.000000 50% 24.000000 75% 40.000000 max 298.000000 Name: age, dtype: float64
plot_demo_data(unique_students.male,
('Female', 'Male'), label_offset=20, color=plot_color)
plot_demo_data(unique_students.prim_lang,
('English', 'Spanish', 'Chinese', 'French', 'German', 'Tagalong', 'Other'),
rot=70, color=plot_color)
unique_students.prim_lang.count()
5419
plot_demo_data(unique_students.sib, ('1', '2', '3', '4+'),
color=plot_color)
unique_students.sib.count()
5013
amp_ages = ["Birth-3 months", "4 months - 6 months", "7 months - 9 months", "10 months- 12 months",
"13 months - 18 months", "19 months - 24 months", "2 years 1 day - 3 years",
"3 years 1 day - 4 years", "4 years 1 day - 5 years", "5 years 1 day - 6 years", "6 years"]
demographic.loc[demographic.age_amp==11, 'age_amp'] = None
print("There are {0} null values for age_amp".format(sum(demographic.age_amp.isnull())))
There are 4806 null values for age_amp
age_classes = pd.Series(pd.cut(unique_students.sort('age_amp').age_amp.dropna(), [-1,3,6,9,12,18,24,36,48,60,72,1000],
labels=amp_ages))
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....) if __name__ == '__main__':
age_amp_counts = age_classes.value_counts()[amp_ages]
age_amp_counts.plot(kind='bar', grid=False, rot=90, color=plot_color)
plt.xlim(-0.5, len(age_amp_counts)-0.5)
plt.ylabel('Count')
plt.ylim(0,1000)
for i,x in enumerate(age_amp_counts):
plt.annotate('%i' % x, (i, x + 10))
age_amp_counts.sum()
3806
unique_students.age_amp.max()
173.0
(unique_students.age_amp/12.).hist(bins=16, grid=False, color=plot_color)
plt.ylabel('Count')
plt.xlabel('Age at amplification')
<matplotlib.text.Text at 0x11d496198>
plot_demo_data(unique_students.tech_left, tech_cats, rot=90, color=plot_color, ylim=(0, 3000))
plot_demo_data(unique_students.tech_right, tech_cats, rot=90, color=plot_color, ylim=(0, 2500))
f, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.tech_right, [""]*len(tech_cats), rot=90,
ax=axes[0], title='Right ear', color=plot_color)
plot_demo_data(unique_students.tech_left, tech_cats, rot=90,
ax=axes[1], title='Left ear', color=plot_color)
unique_students.tech_right.count()
4742
unique_students.tech_left.count()
4734
degree_hl_cats = 'Normal (0-14)', 'Slight (15-25)', 'Mild (26-40)', \
'Moderate (41-55)', 'Moderately Severe (56-70)', 'Severe (71-90)', 'Profound (90+)'
plot_demo_data(unique_students.degree_hl_ad, degree_hl_cats, rot=90, color=plot_color)
_, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.degree_hl_ad, [""]*7, rot=90,
color=plot_color, ax=axes[0], title='Right ear')
plot_demo_data(unique_students.degree_hl_as, degree_hl_cats, rot=90,
color=plot_color, ylim=(0,2000), ax=axes[1], title='Left ear');
unique_students.degree_hl_as.count()
4642
type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
plot_demo_data(unique_students.type_hl_ad, type_hl_cats, rot=90, color=plot_color)
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, color=plot_color)
unique_students.type_hl_ad.count()
4563
unique_students.type_hl_as.count()
4660
type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
f, axes = plt.subplots(2,1)
plot_demo_data(unique_students.type_hl_ad, [""]*len(type_hl_cats), rot=90,
title='Right ear', ax=axes[0], color=plot_color)
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90,
title='Left ear', ax=axes[1], color=plot_color)
demographic[demographic.study_id=='1147-2010-0064']
redcap_event_name | academic_year | academic_year_rv | hl | male | _race | prim_lang | sib | _mother_ed | father_ed | ... | bilateral_ci | bilateral_ha | bimodal | tech | implant_category | age_diag | sex | known_synd | synd_or_disab | race | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
14665 | initial_assessment_arm_1 | 2010-2011 | 2010.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | False | True | False | 0 | 6 | 51.0 | Female | 0.0 | 0.0 | 0.0 |
14666 | year_1_complete_71_arm_1 | 2011-2012 | 2011.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | False | True | False | 0 | 6 | 51.0 | Female | 0.0 | 0.0 | 0.0 |
14667 | year_2_complete_71_arm_1 | 2012-2013 | 2012.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | False | True | False | 0 | 6 | 51.0 | Female | 0.0 | 0.0 | 0.0 |
14668 | year_3_complete_71_arm_1 | 2013-2014 | 2013.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | False | True | False | 0 | 6 | 51.0 | Female | 0.0 | 0.0 | 0.0 |
4 rows × 68 columns
receptive[receptive.study_id=='1147-2010-0064']
study_id | redcap_event_name | score | test_type | school | age_test | domain | |
---|---|---|---|---|---|---|---|
14665 | 1147-2010-0064 | initial_assessment_arm_1 | 96.0 | PPVT | 1147 | 63.0 | Receptive Vocabulary |
14666 | 1147-2010-0064 | year_1_complete_71_arm_1 | 91.0 | PPVT | 1147 | 73.0 | Receptive Vocabulary |
14667 | 1147-2010-0064 | year_2_complete_71_arm_1 | 93.0 | PPVT | 1147 | 85.0 | Receptive Vocabulary |
lsl_dr[lsl_dr.study_id=='1147-2010-0064']
redcap_event_name | academic_year | academic_year_rv | hl | male | _race | prim_lang | sib | _mother_ed | father_ed | ... | school | score | test_name | test_type | academic_year_start | tech_class | age_year | non_profound | age_test_year | concerns | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5947 | initial_assessment_arm_1 | 2010-2011 | 2010.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 1147 | 91.0 | NaN | EVT | 2010 | Bilateral HA | 4.0 | True | 5.0 | NaN |
5948 | initial_assessment_arm_1 | 2010-2011 | 2010.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 1147 | 96.0 | NaN | PPVT | 2010 | Bilateral HA | 4.0 | True | 5.0 | NaN |
5949 | initial_assessment_arm_1 | 2010-2011 | 2010.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 1147 | 101.0 | PLS | receptive | 2010 | Bilateral HA | 4.0 | True | 4.0 | NaN |
5950 | initial_assessment_arm_1 | 2010-2011 | 2010.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 1147 | 87.0 | PLS | expressive | 2010 | Bilateral HA | 4.0 | True | 4.0 | NaN |
15880 | year_1_complete_71_arm_1 | 2011-2012 | 2011.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 1147 | 86.0 | NaN | EVT | 2011 | Bilateral HA | 4.0 | True | 6.0 | NaN |
15881 | year_1_complete_71_arm_1 | 2011-2012 | 2011.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 1147 | 91.0 | NaN | PPVT | 2011 | Bilateral HA | 4.0 | True | 6.0 | NaN |
23735 | year_2_complete_71_arm_1 | 2012-2013 | 2012.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 1147 | 95.0 | NaN | EVT | 2012 | Bilateral HA | 4.0 | True | 7.0 | NaN |
23736 | year_2_complete_71_arm_1 | 2012-2013 | 2012.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 1147 | 93.0 | NaN | PPVT | 2012 | Bilateral HA | 4.0 | True | 7.0 | NaN |
32791 | year_3_complete_71_arm_1 | 2013-2014 | 2013.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | NaN | NaN | NaN | NaN | 2013 | Bilateral HA | 4.0 | True | NaN | NaN |
9 rows × 80 columns
unique_students.type_hl_ad.count()
4563
receptive[receptive.domain=="Receptive Vocabulary"].study_id.unique().shape
(3108,)
demographic.study_id.unique().shape
(5898,)
receptive.study_id.unique().shape
(3108,)
lsl_dr[lsl_dr.domain=="Receptive Vocabulary"].study_id.unique().shape
(3108,)
receptive_ids = receptive.study_id.unique()
demographic_ids = demographic.study_id.unique()
[s for s in receptive_ids if s not in demographic_ids]
[]
def score_summary(domain, test_type=None, data=None):
    """Summarize test scores by age in years for one assessment domain.

    Parameters
    ----------
    domain : str
        Value of the ``domain`` column to summarize (e.g. "Articulation").
    test_type : str, optional
        If given, further restrict rows to this ``test_type``.
    data : DataFrame, optional
        Source table; defaults to the module-level ``lsl_dr`` dataset
        (backward-compatible generalization that also makes the function
        testable on arbitrary tables).

    Returns
    -------
    DataFrame
        Indexed by whole-year age (2-11, ages above 11 pooled into 11)
        with columns Sample Size, Mean, SD, Min, Max.
    """
    if data is None:
        data = lsl_dr
    subset = data[data.domain == domain].copy()
    if test_type is not None:
        subset = subset[subset.test_type == test_type]
    # Convert age at test from months to whole years; rows with missing
    # age_test stay NaN after the aligned assignment and are removed by
    # the > 1 filter below.
    subset['age_test'] = (subset.age_test / 12).dropna().astype(int)
    # Pool everyone older than 11 into the 11-year bin
    subset.loc[subset.age_test > 11, 'age_test'] = 11
    subset = subset[subset.age_test > 1]
    byage = subset.groupby('age_test')
    # Build the per-age summary directly; the original bound the builtins
    # `min`/`max` as locals, which shadowed them.
    summary = pd.DataFrame({
        'Sample Size': byage.study_id.count(),
        'Mean': byage.score.mean(),
        'SD': byage.score.std(),
        'Min': byage.score.min(),
        'Max': byage.score.max(),
    })
    summary.index = summary.index.values.astype(int)
    return summary[['Sample Size', 'Mean', 'SD', 'Min', 'Max']]
receptive_summary = score_summary("Receptive Vocabulary")
receptive_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 424 | 93.759434 | 17.998914 | 40.0 | 144.0 |
3 | 1444 | 92.173823 | 19.124304 | 0.0 | 153.0 |
4 | 1582 | 90.716814 | 20.243070 | 0.0 | 149.0 |
5 | 1189 | 89.994113 | 18.050597 | 0.0 | 142.0 |
6 | 678 | 85.961652 | 16.160065 | 40.0 | 154.0 |
7 | 442 | 83.244344 | 16.113797 | 40.0 | 130.0 |
8 | 313 | 80.651757 | 17.500828 | 20.0 | 132.0 |
9 | 235 | 78.629787 | 17.568035 | 25.0 | 160.0 |
10 | 194 | 76.479381 | 17.488178 | 20.0 | 123.0 |
11 | 463 | 78.539957 | 18.944497 | 20.0 | 134.0 |
receptive_summary.describe()
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
count | 10.000000 | 10.000000 | 10.000000 | 10.0000 | 10.000000 |
mean | 696.400000 | 85.015106 | 17.919228 | 20.5000 | 142.100000 |
std | 515.522863 | 6.356443 | 1.280871 | 16.4063 | 12.068784 |
min | 194.000000 | 76.479381 | 16.113797 | 0.0000 | 123.000000 |
25% | 340.750000 | 79.135280 | 17.491340 | 5.0000 | 132.500000 |
50% | 452.500000 | 84.602998 | 17.783475 | 20.0000 | 143.000000 |
75% | 1061.250000 | 90.536139 | 18.721022 | 36.2500 | 152.000000 |
max | 1582.000000 | 93.759434 | 20.243070 | 40.0000 | 160.000000 |
receptive_summary['Sample Size'].sum()
6964
receptive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
<matplotlib.text.Text at 0x129f16518>
expressive_summary = score_summary("Expressive Vocabulary")
expressive_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 403 | 92.885856 | 21.971304 | 23.0 | 145.0 |
3 | 1389 | 93.531317 | 21.386317 | 19.0 | 147.0 |
4 | 1557 | 92.419396 | 21.762937 | 0.0 | 146.0 |
5 | 1160 | 91.680172 | 19.999878 | 0.0 | 150.0 |
6 | 676 | 87.002959 | 18.252711 | 20.0 | 146.0 |
7 | 441 | 84.133787 | 15.653573 | 38.0 | 131.0 |
8 | 304 | 83.976974 | 16.415685 | 34.0 | 122.0 |
9 | 221 | 82.036199 | 16.163330 | 36.0 | 145.0 |
10 | 188 | 82.085106 | 15.380841 | 40.0 | 122.0 |
11 | 464 | 84.771552 | 17.333085 | 18.0 | 146.0 |
expressive_summary['Sample Size'].sum()
6803
expressive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.xlim(-0.5, 9.5)
if current_year_only:
plt.ylim(0, 800)
else:
plt.ylim(0, 1800)
articulation_summary = score_summary("Articulation")
articulation_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 306 | 85.254902 | 14.944281 | 50.0 | 122.0 |
3 | 1215 | 83.656790 | 18.416468 | 40.0 | 126.0 |
4 | 1407 | 83.461265 | 20.866057 | 0.0 | 123.0 |
5 | 1089 | 82.844812 | 20.790949 | 39.0 | 120.0 |
6 | 638 | 79.460815 | 21.809311 | 39.0 | 115.0 |
7 | 415 | 78.101205 | 22.341971 | 3.0 | 112.0 |
8 | 268 | 79.313433 | 21.212468 | 40.0 | 107.0 |
9 | 195 | 81.497436 | 20.757901 | 39.0 | 109.0 |
10 | 149 | 81.516779 | 20.128507 | 40.0 | 107.0 |
11 | 326 | 81.733129 | 19.477465 | 39.0 | 105.0 |
articulation_summary['Sample Size'].sum()
6008
sample_size = articulation_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(articulation_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);
Language scores
lsl_dr.domain.unique()
array(['Expressive Vocabulary', 'Language', 'Articulation', nan, 'Receptive Vocabulary'], dtype=object)
lsl_dr.test_type.unique()
array(['EOWPVT', 'receptive', 'expressive', 'Goldman', nan, 'ROWPVT', 'Arizonia', 'EVT', 'PPVT', 'Arizonia and Goldman', 'EOWPVT and EVT', 'PPVT and ROWPVT'], dtype=object)
receptive_language_summary = score_summary("Language", "receptive")
receptive_language_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 988 | 86.411943 | 22.293414 | 50.0 | 150.0 |
3 | 1408 | 84.969460 | 19.728716 | 50.0 | 144.0 |
4 | 1391 | 85.321352 | 19.453493 | 43.0 | 145.0 |
5 | 985 | 83.943147 | 18.823820 | 47.0 | 140.0 |
6 | 515 | 78.081553 | 17.745640 | 11.0 | 127.0 |
7 | 331 | 76.129909 | 18.941810 | 40.0 | 123.0 |
8 | 201 | 74.880597 | 19.700652 | 40.0 | 127.0 |
9 | 55 | 70.363636 | 21.026759 | 40.0 | 120.0 |
10 | 47 | 79.617021 | 20.802961 | 40.0 | 120.0 |
11 | 69 | 77.101449 | 21.432620 | 40.0 | 139.0 |
receptive_language_summary['Sample Size'].sum()
5990
sample_size = receptive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_language_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
expressive_language_summary = score_summary("Language", "expressive")
expressive_language_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 981 | 88.450561 | 18.587983 | 50.0 | 150.0 |
3 | 1410 | 82.344681 | 17.569380 | 20.0 | 147.0 |
4 | 1382 | 80.683792 | 19.533977 | 45.0 | 141.0 |
5 | 1006 | 78.666998 | 20.106123 | 45.0 | 144.0 |
6 | 536 | 71.820896 | 19.421195 | 6.0 | 140.0 |
7 | 354 | 67.426554 | 21.096070 | 40.0 | 124.0 |
8 | 211 | 68.312796 | 21.588506 | 40.0 | 119.0 |
9 | 55 | 65.163636 | 21.369556 | 40.0 | 108.0 |
10 | 47 | 77.574468 | 23.968952 | 40.0 | 119.0 |
11 | 68 | 73.882353 | 22.531258 | 40.0 | 132.0 |
expressive_language_summary['Sample Size'].sum()
6050
sample_size = expressive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_language_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
(unique_students.age/12.).describe()
count 5381.000000 mean 2.441863 std 2.292325 min 0.000000 25% 0.666667 50% 2.000000 75% 3.333333 max 24.833333 Name: age, dtype: float64
def calc_difference(x, col='a_fo', jitter=True):
    """Change in a functional-outcome rating between a student's earliest
    and latest assessments.

    Parameters
    ----------
    x : DataFrame
        All rows for one student (as produced by groupby on study_id).
    col : str
        Rating column to difference (default 'a_fo', the audition rating).
    jitter : bool
        Add small Gaussian noise to the difference (for scatter plotting).

    Returns
    -------
    dict with keys 'difference' and 'months', or None when the student
    has fewer than two rows or any missing ratings/ages.
    """
    if len(x) < 2 or x[col].isnull().sum() or x['funct_out_age'].isnull().sum():
        return None
    # BUG FIX: the original used Series.argmax()/argmin() as index labels.
    # In modern pandas these return *positions*, which mis-index x[col]
    # (its index keeps the parent DataFrame's labels inside groupby-apply).
    # idxmax()/idxmin() return the labels the original code relied on.
    diff = x[col][x.funct_out_age.idxmax()] - x[col][x.funct_out_age.idxmin()]
    if jitter:
        diff += np.random.normal(scale=0.05)
    span = x.funct_out_age.max() - x.funct_out_age.min()
    if span > 1000:
        # Flag implausible age spans for manual inspection
        print(x['funct_out_age'])
    return {'difference': diff, 'months': span}
audition = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference).dropna().values.tolist())
plt.figure(figsize=(10,6))
plt.scatter(audition.months, audition.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Audition')
<matplotlib.text.Text at 0x11bb3a198>
slc = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='slc_fo').dropna().values.tolist())
plt.figure(figsize=(10,6))
plt.scatter(slc.months, slc.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (comprehension)')
<matplotlib.text.Text at 0x11a9d55f8>
sle = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='sle_fo').dropna().values.tolist())
plt.figure(figsize=(10,6))
plt.scatter(sle.months, sle.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (expression)')
<matplotlib.text.Text at 0x11a95cc50>
lsl_dr.degree_hl.dropna().value_counts()
6.0 17779 4.0 4722 3.0 4595 5.0 4336 2.0 1788 0.0 1273 1.0 307 Name: degree_hl, dtype: int64
ax = lsl_dr.degree_hl.hist(bins=7)
diff = (lsl_dr['age'] - lsl_dr['age_int'])
diff[diff>0].hist(bins=50)
<matplotlib.axes._subplots.AxesSubplot at 0x12056a5f8>
(lsl_dr.age_int<6).mean()
0.20646594652570291
(lsl_dr.age<6).mean()
0.13450292397660818
Counts by year
unique_students = lsl_dr.groupby('study_id').first()
unique_students.academic_year_start.value_counts().sort_index()[:-1].plot(kind='bar')
plt.ylabel('Frequency'); plt.xlabel('Academic year');
disab_by_year = unique_students.groupby('academic_year_start')['synd_or_disab'].value_counts().unstack().fillna(0)
disab_by_year.columns = ['No', 'Yes']
disab_by_year[disab_by_year.index!='nan'].plot(kind='bar', stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x120162dd8>
The following counts of ages allow for multiple tests per year:
test_age = (lsl_dr.assign(age_test_year=(lsl_dr.age_test/12))
.dropna(subset=['age_test'])[['study_id','age_test','age_test_year']])
test_age.assign(age_year=test_age.age_test_year.astype(int)).age_year.value_counts().sort_index()
0 507 1 1007 2 3102 3 6866 4 7319 5 5429 6 3043 7 1983 8 1297 9 761 10 625 11 480 12 314 13 201 14 152 15 99 16 84 17 33 18 12 19 3 20 8 21 2 60 2 Name: age_year, dtype: int64
This summary counts children only once per year:
from itertools import chain
unique_age_vals = (test_age.assign(age_year=test_age.age_test_year.astype(int))
.groupby('study_id')
.age_year.unique().tolist())
pd.Series(np.concatenate(unique_age_vals)).value_counts().sort_index()
0 242 1 476 2 986 3 1631 4 1721 5 1302 6 766 7 510 8 371 9 263 10 215 11 172 12 114 13 81 14 63 15 47 16 39 17 18 18 7 19 1 20 3 21 1 60 1 dtype: int64
test_age.assign(age_year=test_age.age_test_year.astype(int)).groupby('study_id').age_year.last().value_counts().sort_index()
0 113 1 201 2 419 3 505 4 803 5 707 6 392 7 233 8 161 9 109 10 85 11 81 12 58 13 32 14 29 15 22 16 23 17 14 18 6 19 1 20 2 21 1 Name: age_year, dtype: int64