# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
Connect to the REDCap database to import data for the three test domains and demographic information:
from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
api_key = open("/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt").read()
lsl_dr_project = Project(api_url, api_key)
metadata = lsl_dr_project.export_metadata()
# for i,j in zip(lsl_dr_project.field_names,
# lsl_dr_project.field_labels):
# print('{0}: \t{1}'.format(i,j))
Import each database from REDCap:
articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss']
articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', df_kwargs={'index_col':None})
records = lsl_dr_project.export_records(fields=articulation_fields)
print(records[0]['study_id'])
0101-2003-0101
expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss']
expressive = lsl_dr_project.export_records(fields=expressive_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[999, 9999]})
receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss']
receptive = lsl_dr_project.export_records(fields=receptive_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[999, 9999]})
language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls',
'owls_lc_ss','owls_oe_ss','age_test_owls',
'celfp_rl_ss','celfp_el_ss','age_test_celp',
'celf_elss','celf_rlss','age_test_celf']
language_raw = lsl_dr_project.export_records(fields=language_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[999, 9999]})
demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year',
'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race',
'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as',
'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae',
'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes',
'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age',
'att_days_hr', 'att_days_sch', 'att_days_st2_417']
demographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[888, 999, 9999]})
demographic_raw[demographic_raw.study_id=='1147-2010-0064']
 | study_id | redcap_event_name | academic_year | hl | gender | race | prim_lang | sib | mother_ed | father_ed | ... | sle_fo | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
11679 | 1147-2010-0064 | initial_assessment_arm_1 | 2010-2011 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | ... | 3 | 6 | 65 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
11680 | 1147-2010-0064 | year_1_complete_71_arm_1 | 2011-2012 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 3 | 5 | 77 | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
11681 | 1147-2010-0064 | year_2_complete_71_arm_1 | 2012-2013 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 3 | 5 | 89 | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
11682 | 1147-2010-0064 | year_3_complete_71_arm_1 | 2013-2014 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 4 | 5 | 101 | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
4 rows × 46 columns
Several fields in the demographic data have missing values.
demographic_raw.head()
 | study_id | redcap_event_name | academic_year | hl | gender | race | prim_lang | sib | mother_ed | father_ed | ... | sle_fo | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0101-2003-0101 | initial_assessment_arm_1 | 2002-2003 | 0 | 0 | 0 | 0 | 1 | 6 | 6 | ... | 2 | 2 | 54 | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 0101-2003-0101 | year_1_complete_71_arm_1 | 2003-2004 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 4 | 4 | 80 | 1 | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 0101-2003-0101 | year_2_complete_71_arm_1 | 2004-2005 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 4 | 4 | 80 | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 0101-2003-0101 | year_3_complete_71_arm_1 | 2005-2006 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 5 | 5 | 96 | 3 | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 0101-2003-0101 | year_4_complete_71_arm_1 | 2006-2007 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 5 | 5 | 109 | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 46 columns
We can fill missing values forward from the previous observation (by `study_id`):
demographic = demographic_raw.sort_values('redcap_event_name').groupby('study_id').transform(
    lambda recs: recs.ffill())#.reset_index()
demographic["study_id"] = demographic_raw.sort_values('redcap_event_name').study_id
A spot check to make sure this worked:
demographic[demographic.study_id=='1147-2010-0064']
 | redcap_event_name | academic_year | hl | gender | race | prim_lang | sib | mother_ed | father_ed | premature_age | ... | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | study_id
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
11679 | initial_assessment_arm_1 | 2010-2011 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 6 | 65 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
11680 | year_1_complete_71_arm_1 | 2011-2012 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 5 | 77 | 2 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
11681 | year_2_complete_71_arm_1 | 2012-2013 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 5 | 89 | 2 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
11682 | year_3_complete_71_arm_1 | 2013-2014 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 5 | 101 | 2 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
4 rows × 46 columns
Demographic data without missing values:
demographic.head()
 | redcap_event_name | academic_year | hl | gender | race | prim_lang | sib | mother_ed | father_ed | premature_age | ... | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | study_id
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
8319 | initial_assessment_arm_1 | 2012-2013 | 0 | 0 | 6 | 0 | 0 | 6 | 6 | 3 | ... | 1 | 9 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | 0735-2012-0008 |
5035 | initial_assessment_arm_1 | 2007-2008 | 0 | 0 | 0 | 0 | 2 | 6 | 6 | 2 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0522-2008-0011 |
8314 | initial_assessment_arm_1 | 2011-2012 | 0 | 1 | 0 | 0 | 1 | 4 | 6 | 2 | ... | 1 | 7 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | 0735-2012-0006 |
8310 | initial_assessment_arm_1 | 2011-2012 | 0 | 1 | 8 | 0 | 2 | 6 | 6 | 8 | ... | 1 | 2 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | 0735-2012-0005 |
5038 | initial_assessment_arm_1 | 2008-2009 | 0 | 0 | 0 | 0 | 1 | 2 | 4 | 9 | ... | 3 | 62 | 4 | NaN | NaN | NaN | NaN | NaN | NaN | 0522-2008-0012 |
5 rows × 46 columns
Five language measures were reshaped into a long format:
# Test type
language_raw["test_name"] = None
language_raw["test_type"] = None
language_raw["score"] = None
CELP = language_raw.age_test_celp.notnull()
CELF = language_raw.age_test_celf.notnull()
PLS = language_raw.age_test_pls.notnull()
OWLS = language_raw.age_test_owls.notnull()
language_raw['age_test'] = None
language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp
language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf
language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls
language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls
language1 = language_raw[CELP | CELF | PLS | OWLS].copy()
language2 = language1.copy()
language1["test_type"] = "receptive"
language1.loc[CELP, "test_name"] = "CELF-P2"
language1.loc[CELF, "test_name"] = "CELF-4"
language1.loc[PLS, "test_name"] = "PLS"
language1.loc[OWLS, "test_name"] = "OWLS"
language1.loc[CELP, "score"] = language1.celfp_rl_ss
language1.loc[CELF, "score"] = language1.celf_rlss
language1.loc[PLS, "score"] = language1.pls_ac_ss
language1.loc[OWLS, "score"] = language1.owls_lc_ss
language2["test_type"] = "expressive"
language2.loc[CELP, "test_name"] = "CELF-P2"
language2.loc[CELF, "test_name"] = "CELF-4"
language2.loc[PLS, "test_name"] = "PLS"
language2.loc[OWLS, "test_name"] = "OWLS"
language2.loc[CELP, "score"] = language1.celfp_el_ss
language2.loc[CELF, "score"] = language1.celf_elss
language2.loc[PLS, "score"] = language1.pls_ec_ss
language2.loc[OWLS, "score"] = language1.owls_oe_ss
language = pd.concat([language1, language2])
language = language[language.score.notnull()]
print(pd.crosstab(language.test_name, language.test_type))
print("There are {0} null values for score".format(sum(language["score"].isnull())))
test_type  expressive  receptive
test_name
CELF-4            539        489
CELF-P2          1170       1176
OWLS              871        877
PLS              2887       2896
There are 0 null values for score
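For reference, an equivalent wide-to-long reshape can be sketched with pd.melt; this is an alternative to the mask-and-copy approach above, shown on a hypothetical two-column subset (PLS scores only):
# Alternative sketch: melt the two PLS score columns into long format
pls_long = pd.melt(language_raw, id_vars=['study_id', 'redcap_event_name'],
                   value_vars=['pls_ac_ss', 'pls_ec_ss'],
                   var_name='measure', value_name='pls_score').dropna(subset=['pls_score'])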
A `school` variable was added, which is the first four characters of the `study_id`:
language["school"] = language.study_id.str.slice(0,4)
language = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test"]]
language["domain"] = "Language"
language.head()
 | study_id | redcap_event_name | score | test_type | test_name | school | age_test | domain
---|---|---|---|---|---|---|---|---
0 | 0101-2003-0101 | initial_assessment_arm_1 | 51 | receptive | PLS | 0101 | 54 | Language |
5 | 0101-2003-0101 | year_5_complete_71_arm_1 | 61 | receptive | OWLS | 0101 | 113 | Language |
9 | 0101-2003-0102 | initial_assessment_arm_1 | 55 | receptive | PLS | 0101 | 44 | Language |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | 77 | receptive | PLS | 0101 | 54 | Language |
11 | 0101-2003-0102 | year_2_complete_71_arm_1 | 93 | receptive | CELF-P2 | 0101 | 68 | Language |
We converted the articulation dataset into a "long" format:
# Test type
articulation["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
articulation = articulation[ARIZ | GF]
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"
print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))
# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]
Goldman                 4254
Arizonia                 490
Arizonia and Goldman      49
dtype: int64
There are 0 null values for test_type
A `school` variable was added, which is the first four characters of the `study_id`:
articulation["school"] = articulation.study_id.str.slice(0,4)
The test age was taken to be the Arizonia age when both test types were present:
articulation["age_test"] = articulation.age_test_aaps
articulation.loc[articulation.age_test.isnull(), 'age_test'] = articulation.age_test_gf2[articulation.age_test.isnull()]
print(articulation.age_test.describe())
count    4790.000000
mean       69.175365
std        31.206700
min        23.000000
25%        47.000000
50%        60.000000
75%        81.000000
max       243.000000
Name: age_test, dtype: float64
Finally, we dropped unwanted columns and added a domain identification column for merging:
articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1)
articulation["domain"] = "Articulation"
articulation.head()
 | study_id | redcap_event_name | test_type | score | school | age_test | domain
---|---|---|---|---|---|---|---
1 | 0101-2003-0101 | year_1_complete_71_arm_1 | Goldman | 78 | 0101 | 80 | Articulation |
9 | 0101-2003-0102 | initial_assessment_arm_1 | Goldman | 72 | 0101 | 44 | Articulation |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | Goldman | 97 | 0101 | 54 | Articulation |
14 | 0101-2004-0101 | year_2_complete_71_arm_1 | Goldman | 75 | 0101 | 53 | Articulation |
15 | 0101-2004-0101 | year_3_complete_71_arm_1 | Goldman | 80 | 0101 | 66 | Articulation |
The `gender` indicator was renamed to `male` (the row exclusions for missing demographics are left commented out here):
# Retain only subset of columns
#demographic = demographic[demographic.gender.notnull()]
demographic = demographic.rename(columns={'gender':'male'})
Due to sample size considerations, we reduced the primary language variable to a binary indicator: English (False) versus non-English (True):
demographic["non_english"] = None
demographic.loc[demographic.prim_lang.notnull(), 'non_english'] = demographic.prim_lang[demographic.prim_lang.notnull()]>0
print(demographic.non_english.value_counts())
print("There are {0} null values for non_english".format(sum(demographic.non_english.isnull())))
False    9677
True     2089
dtype: int64
There are 714 null values for non_english
Mother's education (`mother_ed`) and father's education (`father_ed`) were both recoded to four levels (0–3), collapsing adjacent categories. Category 6 (unknown) was recoded as missing.
demographic = demographic.rename(columns={"mother_ed":"_mother_ed"})
demographic["mother_ed"] = demographic._mother_ed.copy()
demographic.loc[demographic._mother_ed==1, 'mother_ed'] = 0
demographic.loc[(demographic._mother_ed==2) | (demographic._mother_ed==3), 'mother_ed'] = 1
demographic.loc[demographic._mother_ed==4, 'mother_ed'] = 2
demographic.loc[demographic._mother_ed==5, 'mother_ed'] = 3
demographic.loc[demographic._mother_ed==6, 'mother_ed'] = None
print("_mother_ed:")
print(demographic._mother_ed.value_counts())
print("mother_ed:")
print(demographic.mother_ed.value_counts())
print("\nThere are {0} null values for mother_ed".format(sum(demographic.mother_ed.isnull())))
_mother_ed:
6    4293
4    2485
3    1693
5    1282
2    1115
1     421
0     156
dtype: int64
mother_ed:
1    2808
2    2485
3    1282
0     577
dtype: int64

There are 5328 null values for mother_ed
Secondary diagnosis
demographic['secondary_diagnosis'] = demographic.etiology==0
# Suspected or unknown treated as missing
demographic.loc[demographic.etiology > 1, 'secondary_diagnosis'] = None
demographic.secondary_diagnosis.value_counts()
0    9132
1    2121
dtype: int64
demographic.secondary_diagnosis.mean()
0.1884830711810184
Premature status was recoded as the number of weeks premature (0 = full term). Here, premature indicates <36 weeks.
demographic['premature_weeks'] = demographic.premature_age.copy()
demographic.loc[demographic.premature_age==9, 'premature_weeks'] = None
demographic.premature_weeks = abs(demographic.premature_weeks-8)*2
print("There are {0} null values for premature_weeks".format(sum(demographic.premature_weeks.isnull())))
There are 3173 null values for premature_weeks
demographic.premature_weeks.value_counts()
0     7889
2      486
4      324
12     180
6      159
10     125
8      104
14      38
16       2
dtype: int64
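The abs(x - 8)*2 transform above assumes premature_age is coded 0 through 8, with 8 meaning full term and each step below 8 adding two weeks of prematurity; a quick check of that assumed mapping:
# Assumed coding: 8 = full term (0 weeks premature), 7 = 2 weeks, ..., 0 = 16 weeks
for code in range(9):
    print(code, '->', abs(code - 8)*2, 'weeks')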
Recode implant technology variables for each ear to one of four categories (None, Baha, Hearing aid, Cochlear implant):
tech_cats = ["None", "Baha", "Hearing aid", "Cochlear"]
demographic["tech_right"] = demographic.tech_ad.copy()
demographic.loc[demographic.tech_right==6, 'tech_right'] = 0
demographic.loc[demographic.tech_right==4, 'tech_right'] = 1
demographic.loc[demographic.tech_right==5, 'tech_right'] = 1
demographic.loc[demographic.tech_right==3, 'tech_right'] = 2
demographic.loc[demographic.tech_right==7, 'tech_right'] = 3
demographic.tech_right = np.abs(demographic.tech_right - 3)
demographic["tech_left"] = demographic.tech_as.copy()
demographic.loc[demographic.tech_left==6, 'tech_left'] = 0
demographic.loc[demographic.tech_left==4, 'tech_left'] = 1
demographic.loc[demographic.tech_left==5, 'tech_left'] = 1
demographic.loc[demographic.tech_left==3, 'tech_left'] = 2
demographic.loc[demographic.tech_left==7, 'tech_left'] = 3
demographic.tech_left = np.abs(demographic.tech_left - 3)
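Because the manual recodes and the final np.abs(tech - 3) step compose, the end-to-end mapping from raw device codes to tech_cats indices is easy to misread; a mechanical check of the composed mapping (raw REDCap codebook semantics are not shown here):
# Compose the recode steps to see raw code -> final category
recode = {6: 0, 4: 1, 5: 1, 3: 2, 7: 3}
for raw in range(8):
    final = abs(recode.get(raw, raw) - 3)
    print(raw, '->', tech_cats[final])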
Substitute proper missing values for unknown hearing loss type:
demographic.loc[demographic.type_hl_ad==5, 'type_hl_ad'] = None
demographic.loc[demographic.type_hl_as==5, 'type_hl_as'] = None
Create `degree_hl`, which is the maximum level of hearing loss in either ear:
demographic["degree_hl"] = np.maximum(demographic.degree_hl_ad, demographic.degree_hl_as)
Create compound indicator variables for each technology (Baha, Hearing aid, Cochlear implant):
demographic["baha"] = 0
demographic.baha = demographic.baha.astype(object)
demographic.loc[(demographic.tech_right==1) | (demographic.tech_left==1), 'baha'] = 1
demographic.loc[(demographic.tech_right==1) & (demographic.tech_left==1), 'baha'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'baha'] = None
print("baha:")
print(demographic.drop_duplicates(subset='study_id').baha.value_counts())
print("There are {0} null values for baha".format(sum(demographic.baha.isnull())))
demographic["hearing_aid"] = 0
demographic.hearing_aid = demographic.hearing_aid.astype(object)
demographic.loc[(demographic.tech_right==2) | (demographic.tech_left==2), 'hearing_aid'] = 1
demographic.loc[(demographic.tech_right==2) & (demographic.tech_left==2), 'hearing_aid'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'hearing_aid'] = None
print("\nhearing_aid:")
print(demographic.drop_duplicates(subset='study_id').hearing_aid.value_counts())
print("There are {0} null values for hearing_aid".format(sum(demographic.hearing_aid.isnull())))
demographic["cochlear"] = 0
demographic.cochlear = demographic.cochlear.astype(object)
demographic.loc[(demographic.tech_right==3) | (demographic.tech_left==3), 'cochlear'] = 1
demographic.loc[(demographic.tech_right==3) & (demographic.tech_left==3), 'cochlear'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'cochlear'] = None
print("\ncochlear:")
print(demographic.drop_duplicates(subset='study_id').cochlear.value_counts())
print("There are {0} null values for cochlear".format(sum(demographic.cochlear.isnull())))
print(len(demographic))
baha:
0    3683
1     132
2      57
dtype: int64
There are 1476 null values for baha

hearing_aid:
2    1706
0    1615
1     529
dtype: int64
There are 1516 null values for hearing_aid

cochlear:
0    2493
2     805
1     574
dtype: int64
There are 1476 null values for cochlear
12480
Identify bilateral and bimodal individuals:
demographic["bilateral_ci"] = demographic.cochlear==2
demographic["bilateral_ha"] = demographic.hearing_aid==2
demographic["bimodal"] = (demographic.cochlear==1) & (demographic.hearing_aid==1)
demographic.bilateral_ci.sum(), demographic.bilateral_ha.sum(), demographic.bimodal.sum()
(2940, 4339, 1219)
demographic.drop_duplicates(subset='study_id')[['bilateral_ci',
'bilateral_ha',
'bimodal']].sum()
bilateral_ci     805
bilateral_ha    1706
bimodal          334
dtype: int64
Create a variable identifying the bimodal configuration: 0 (default), 1 (bimodal with hearing aid on the left), 2 (bimodal with hearing aid on the right):
demographic['tech'] = 0
demographic.loc[(demographic.bimodal) & (demographic.tech_left==2), 'tech'] = 1
demographic.loc[(demographic.bimodal) & (demographic.tech_right==2), 'tech'] = 2
print("There are {0} null values for tech".format(sum(demographic.tech.isnull())))
There are 0 null values for tech
demographic["implant_category"] = None
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.baha==0),
'implant_category'] = 0
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.baha==0),
'implant_category'] = 1
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.baha==1),
'implant_category'] = 2
demographic.loc[(demographic.cochlear==2) & (demographic.hearing_aid==0) & (demographic.baha==0),
'implant_category'] = 3
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==1) & (demographic.baha==0),
'implant_category'] = 4
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.baha==1),
'implant_category'] = 5
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==2) & (demographic.baha==0),
'implant_category'] = 6
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.baha==1),
'implant_category'] = 7
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.baha==2),
'implant_category'] = 8
demographic.implant_category.value_counts()
6    4339
3    2940
4    1219
0     680
1     470
2     294
8     168
7      19
5       8
dtype: int64
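For readability, the implant_category codes assigned above correspond to the following configurations (a summary of the assignments, not new data):
# Legend for implant_category, summarizing the assignments above
implant_legend = {0: 'unilateral CI only', 1: 'unilateral HA only', 2: 'unilateral Baha only',
                  3: 'bilateral CI', 4: 'CI + HA (bimodal)', 5: 'CI + Baha',
                  6: 'bilateral HA', 7: 'HA + Baha', 8: 'bilateral Baha'}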
Age when hearing loss was diagnosed. These data were previously entered inconsistently, requiring replacement of non-numeric values; the field is now numeric:
demographic.onset_1.unique()
array([ 6. , 0. , 3. , 1. , 25. , 9. , 13. , 26. , nan, 15. , 2. , 23. , 7. , 11. , 24. , 17. , 36. , 28. , 14. , 48. , 12. , 29. , 20. , 27. , 22. , 5. , 4. , 60. , 32. , 19. , 18. , 52. , 42. , 21. , 16. , 30. , 8. , 10. , 140. , 61. , 66. , 44. , 41. , 40. , 49. , 86. , 33. , 126. , 1.5, 85. , 51. , 2.5, 67. , 39. , 62. , 133. , 38. , 103. , 54. , 35. , 43. , 87. , 83. , 76. , 50. , 37. , 116. , 68. , 72. , 92. , 34. , 57. , 97. , 71. , 55. , 46. , 65. , 78. , 45. , 31. , 107. , 64. , 74. , 77. , 88. , 81. , 84. , 80. , 53. , 59. , 0.5, 56. , 98. , 47. , 58. , 75. , 70. , 119. , 63. , 154. , 89. , 152. ])
# Don't need this anymore
# demographic['age_diag'] = demographic.onset_1.replace({'birth': 0, 'R- Birth L-16mo': 0, 'birth - 3': 0, 'at birth': 0, 'NBHS': 0,
# 'at Birth': 0, '1-2': 1.5, '2-3': 2.5, '0-3': 1.5}).astype(float)
demographic['age_diag'] = demographic.onset_1
Number of null values for `age_diag`:
demographic.age_diag.isnull().sum()
3848
demographic['sex'] = demographic.male.replace({0:'Female', 1:'Male'})
import seaborn as sb
unique_students = demographic.dropna(subset=['sex']).groupby('study_id').first()
# ag = sb.factorplot("sex", data=unique_students,
# palette="PuBuGn_d", kind='count')
# ag.set_xticklabels(['Female ({})'.format((unique_students.male==0).sum()),
# 'Male ({})'.format((unique_students.male==1).sum())])
# ag.set_xlabels('')
Child has another diagnosed disability
demographic['known_synd'] = (demographic.synd_cause == 0)
# Unknown or suspected
demographic.loc[demographic.synd_cause > 1, 'known_synd'] = None
# If either known syndrome or secondary diagnosis
demographic['synd_or_disab'] = demographic.apply(lambda x: x['secondary_diagnosis'] or x['known_synd'], axis=1)
Missing sibling counts (category 4) were recoded as `None` (missing):
demographic.loc[demographic.sib==4, 'sib'] = None
We reduced the number of race categories, pooling those that were neither Caucasian, Black, Hispanic, nor Asian into "Other", due to small sample sizes for those categories. Category 7 (unknown) was recoded as missing.
races = ["Caucasian", "Black or African American", "Hispanic or Latino", "Asian", "Other"]
demographic = demographic.rename(columns={"race":"_race"})
demographic["race"] = demographic._race.copy()
demographic.loc[demographic.race==7, 'race'] = None
demographic.loc[demographic.race>3, 'race'] = 4
print("_race:")
print(demographic._race.value_counts())
print("race:")
print(demographic.race.value_counts())
print("There are {0} null values for race".format(sum(demographic.race.isnull())))
# Replace with recoded column
_race:
0    6523
2    2011
1    1156
3     861
6     587
8     463
7     219
4      58
5      25
dtype: int64
race:
0    6523
2    2011
1    1156
4    1133
3     861
dtype: int64
There are 796 null values for race
Recode implant technology variables, this time including additional device codes:
tech_cats = ["None", "Baha", "Hearing aid", "Cochlear", "Other"]
demographic["tech_right"] = demographic.tech_ad.copy()
demographic.loc[demographic.tech_right==6, 'tech_right'] = 0
demographic.loc[demographic.tech_right==4, 'tech_right'] = 1
demographic.loc[demographic.tech_right==5, 'tech_right'] = 1
demographic.loc[demographic.tech_right==3, 'tech_right'] = 2
demographic.loc[demographic.tech_right==7, 'tech_right'] = 3
demographic.loc[demographic.tech_right==8, 'tech_right'] = 3
demographic.loc[demographic.tech_right==9, 'tech_right'] = 4
demographic.tech_right = np.abs(demographic.tech_right - 3)
demographic["tech_left"] = demographic.tech_as.copy()
demographic.loc[demographic.tech_left==6, 'tech_left'] = 0
demographic.loc[demographic.tech_left==4, 'tech_left'] = 1
demographic.loc[demographic.tech_left==5, 'tech_left'] = 1
demographic.loc[demographic.tech_left==3, 'tech_left'] = 2
demographic.loc[demographic.tech_left==7, 'tech_left'] = 3
demographic.loc[demographic.tech_left==8, 'tech_left'] = 3
demographic.loc[demographic.tech_left==9, 'tech_left'] = 4
demographic.tech_left = np.abs(demographic.tech_left - 3)
# Don't need this anymore
# demographic['age_amp'] = demographic.age_amp.replace({'22 mo': 22, 'unknown': np.nan, 'none': np.nan,
# 'n/a unilateral loss': np.nan, 'not amplified yet': np.nan,
# 'not amplified': np.nan, 'n/a': np.nan, '4 months-6 months': 5,
# '6 weeks': 0.5, '13-18': 15.5, '0-3': 1.5, '24-36': 30}).astype(float)
demographic['academic_year'] = demographic.academic_year.replace(
{'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011',
'2020-2011': '2010-2011', '2012-20013': '2012-2013',
'0000-0000': np.nan})
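Since new misspellings of academic_year could appear in future exports, a more general cleanup could extract the first plausible four-digit year with a regular expression. A minimal sketch (not applied here; note it would still miss transposition typos such as '2020-2011'):
import re

def first_year(value):
    # Pull the first 19xx/20xx year out of an academic_year string, if any
    match = re.search(r'(?:19|20)\d{2}', str(value))
    return match.group(0) if match else np.nan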
demographic.age_amp.hist()
We converted the expressive vocabulary dataset to "long" format:
# Test type
expressive["test_type"] = None
EOWPVT = expressive.eowpvt_ss.notnull()
EVT = expressive.evt_ss.notnull()
expressive = expressive[EOWPVT | EVT]
expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT"
expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT"
expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT"
print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull())))
expressive["score"] = expressive.eowpvt_ss
expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT]
There are 0 null values for test_type
expressive.test_type.value_counts()
EVT               3113
EOWPVT            2305
EOWPVT and EVT     120
dtype: int64
A `school` variable was added, which is the first four characters of the `study_id`:
expressive["school"] = expressive.study_id.str.slice(0,4)
The test age was taken to be the EOWPVT age when both test types were present:
expressive["age_test"] = expressive.age_test_eowpvt
expressive.loc[expressive.age_test.isnull(), 'age_test'] = expressive.age_test_evt[expressive.age_test.isnull()]
Finally, we dropped unwanted columns and added a domain identification column for merging:
expressive = expressive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
expressive["domain"] = "Expressive Vocabulary"
expressive.head()
 | study_id | redcap_event_name | score | test_type | school | age_test | domain
---|---|---|---|---|---|---|---
0 | 0101-2003-0101 | initial_assessment_arm_1 | 58 | EOWPVT | 0101 | 54 | Expressive Vocabulary |
2 | 0101-2003-0101 | year_2_complete_71_arm_1 | 84 | EOWPVT | 0101 | 80 | Expressive Vocabulary |
5 | 0101-2003-0101 | year_5_complete_71_arm_1 | 90 | EOWPVT | 0101 | 113 | Expressive Vocabulary |
14 | 0101-2004-0101 | year_2_complete_71_arm_1 | 90 | EOWPVT | 0101 | 53 | Expressive Vocabulary |
15 | 0101-2004-0101 | year_3_complete_71_arm_1 | 87 | EOWPVT | 0101 | 66 | Expressive Vocabulary |
We converted the receptive vocabulary data table to "long" format:
receptive.columns
Index(['study_id', 'redcap_event_name', 'age_test_ppvt', 'ppvt_ss', 'age_test_rowpvt', 'rowpvt_ss'], dtype='object')
# Test type
receptive["test_type"] = None
PPVT = receptive.ppvt_ss.notnull()
ROWPVT = receptive.rowpvt_ss.notnull()
receptive = receptive[PPVT | ROWPVT]
receptive.loc[PPVT & ROWPVT, "test_type"] = "PPVT and ROWPVT"
receptive.loc[PPVT & ~ROWPVT, "test_type"] = "PPVT"
receptive.loc[~PPVT & ROWPVT, "test_type"] = "ROWPVT"
print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull())))
receptive["score"] = receptive.ppvt_ss
receptive.loc[~PPVT & ROWPVT, "score"] = receptive.rowpvt_ss[~PPVT & ROWPVT]
There are 0 null values for test_type
A `school` variable was added, which is the first four characters of the `study_id`:
receptive["school"] = receptive.study_id.str.slice(0,4)
The test age was taken to be the PPVT age when both test types were present:
receptive["age_test"] = receptive.age_test_ppvt
receptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()]
print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull())))
There are 28 null values for age_test
Finally, we dropped unwanted columns and added a domain identification column for merging:
receptive = receptive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
receptive["domain"] = "Receptive Vocabulary"
receptive.head()
 | study_id | redcap_event_name | score | test_type | school | age_test | domain
---|---|---|---|---|---|---|---
2 | 0101-2003-0101 | year_2_complete_71_arm_1 | 90 | PPVT | 0101 | 80 | Receptive Vocabulary |
5 | 0101-2003-0101 | year_5_complete_71_arm_1 | 101 | ROWPVT | 0101 | 113 | Receptive Vocabulary |
9 | 0101-2003-0102 | initial_assessment_arm_1 | 55 | PPVT | 0101 | 44 | Receptive Vocabulary |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | 80 | PPVT | 0101 | 54 | Receptive Vocabulary |
11 | 0101-2003-0102 | year_2_complete_71_arm_1 | 101 | PPVT | 0101 | 68 | Receptive Vocabulary |
receptive.study_id.unique().shape
(2619,)
The four datasets were merged into a single table. First, we concatenated the test-score data:
test_scores = pd.concat([articulation, expressive, receptive, language])
Then we merged the demographic data with the test-score data:
lsl_dr = pd.merge(demographic, test_scores, on=["study_id", "redcap_event_name"], how='left')
lsl_dr.tail()
 | redcap_event_name | academic_year | hl | male | _race | prim_lang | sib | _mother_ed | father_ed | premature_age | ... | known_synd | synd_or_disab | race | academic_year_start | age_test | domain | school | score | test_name | test_type
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
31730 | year_9_complete_71_arm_1 | 2012-2013 | 0 | 1 | 0 | 0 | 3 | 4 | 4 | 8 | ... | 0 | 1 | 0 | NaN | 104 | Articulation | 0521 | 100 | NaN | Goldman |
31731 | year_9_complete_71_arm_1 | 2012-2013 | 0 | 0 | 0 | 0 | NaN | 6 | 6 | 8 | ... | 0 | 0 | 0 | NaN | 138 | Articulation | 0310 | 92 | NaN | Goldman |
31732 | year_9_complete_71_arm_1 | 2012-2013 | 0 | 0 | 0 | 0 | NaN | 6 | 6 | 8 | ... | 0 | 0 | 0 | NaN | 137 | Receptive Vocabulary | 0310 | 65 | NaN | PPVT and ROWPVT |
31733 | year_9_complete_71_arm_1 | 2011-2012 | 0 | 1 | 0 | 0 | 3 | 6 | 6 | 8 | ... | 0 | 0 | 0 | NaN | 160 | Expressive Vocabulary | 0102 | 92 | NaN | EOWPVT |
31734 | year_9_complete_71_arm_1 | 2011-2012 | 0 | 1 | 0 | 0 | 3 | 6 | 6 | 8 | ... | 0 | 0 | 0 | NaN | 162 | Receptive Vocabulary | 0102 | 84 | NaN | ROWPVT |
5 rows × 73 columns
lsl_dr['academic_year_start'] = lsl_dr.academic_year.apply(lambda x: str(x).strip()[:4])
lsl_dr.academic_year_start.value_counts()
2013    6742
2012    6577
2011    5159
2010    4418
nan     3133
2009    2356
2014     984
2008     821
2007     531
2006     344
2005     276
2004     172
2003      90
2002      47
2001      35
1998      16
1999      15
2000      12
1997       6
1995       1
dtype: int64
current_year_only = True
if current_year_only:
    lsl_dr = lsl_dr[lsl_dr.academic_year_start=='2013']
expressive_scores = lsl_dr[(lsl_dr.domain=='Expressive Vocabulary') & (lsl_dr.score>=20)].score
expressive_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');
expressive_lang_scores = lsl_dr[(lsl_dr.domain=='Language')
& (lsl_dr.test_type=='expressive')].score
expressive_lang_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');
Export dataset
if current_year_only:
    lsl_dr.to_csv('lsl_dr_current_year.csv')
else:
    lsl_dr.to_csv('lsl_dr.csv')
lsl_dr.shape
(6742, 73)
lsl_dr.study_id.unique().shape
(2222,)
demographic.study_id.unique().shape
(4898,)
Convert score to floating-point number
lsl_dr.score = lsl_dr.score.astype(float)
plot_color = "#64AAE8"
def plot_demo_data(series, labels=None, color=plot_color, rot=0, label_offset=20, xlim=None,
                   ylim=None, title=None, **kwargs):
    ax = kwargs.get('ax')
    if ax is None:
        f, ax = plt.subplots()
    counts = series.value_counts().sort_index()
    counts.plot(kind='bar', grid=False, rot=rot, color=color, **kwargs)
    if xlim is None:
        ax.set_xlim(-0.5, len(counts)-0.5)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_ylabel('Count')
    if labels is not None:
        ax.set_xticklabels(labels)
    if title:
        ax.set_title(title)
    for i,x in enumerate(counts):
        ax.annotate('%i' % x, (i, x + label_offset))
    # plt.gca().tight_layout()
unique_students = demographic.drop_duplicates('study_id')
unique_students.shape
(4898, 67)
unique_students.age.describe()
count    4387.000000
mean       30.953271
std        28.380353
min         0.000000
25%         9.000000
50%        25.000000
75%        42.000000
max       298.000000
Name: age, dtype: float64
plot_demo_data(unique_students.male, ('Female', 'Male'), label_offset=20, ylim=(0, 2600), color=plot_color)
plot_demo_data(unique_students.prim_lang,
('English', 'Spanish', 'Chinese', 'French', 'German', 'Tagalog', 'Other'),
rot=70, color=plot_color)
unique_students.prim_lang.count()
4304
plot_demo_data(unique_students.sib, ('1', '2', '3', '4+'),
color=plot_color)
unique_students.sib.count()
3937
amp_ages = ["Birth-3 months", "4 months - 6 months", "7 months - 9 months", "10 months- 12 months",
"13 months - 18 months", "19 months - 24 months", "2 years 1 day - 3 years",
"3 years 1 day - 4 years", "4 years 1 day - 5 years", "5 years 1 day - 6 years", "6 years"]
demographic.loc[demographic.age_amp==11, 'age_amp'] = None
print("There are {0} null values for age_amp".format(sum(demographic.age_amp.isnull())))
There are 4563 null values for age_amp
age_classes = pd.Series(pd.cut(unique_students.sort_values('age_amp').age_amp.dropna(),
                               [-1,3,6,9,12,18,24,36,48,60,72,1000],
                               labels=amp_ages))
age_amp_counts = age_classes.value_counts()[amp_ages]
age_amp_counts.plot(kind='bar', grid=False, rot=90, color=plot_color)
plt.xlim(-0.5, len(age_amp_counts)-0.5)
plt.ylabel('Count')
for i,x in enumerate(age_amp_counts):
    plt.annotate('%i' % x, (i, x + 10))
age_amp_counts.sum()
2767
unique_students.age_amp.max()
666.0
(unique_students.age_amp/12.).hist(bins=16, grid=False, color=plot_color)
plt.ylabel('Count')
plt.xlabel('Age at amplification')
plot_demo_data(unique_students.tech_left, tech_cats, rot=90, color=plot_color)
plot_demo_data(unique_students.tech_right, tech_cats, rot=90, color=plot_color, ylim=(0, 2500))
f, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.tech_right, [""]*len(tech_cats), rot=90,
ax=axes[0], title='Right ear', color=plot_color, ylim=(0, 2500))
plot_demo_data(unique_students.tech_left, tech_cats, rot=90,
ax=axes[1], title='Left ear', color=plot_color)
unique_students.tech_right.count()
3850
unique_students.tech_left.count()
3836
degree_hl_cats = 'Normal (0-14)', 'Slight (15-25)', 'Mild (26-40)', \
'Moderate (41-55)', 'Moderately Severe (56-70)', 'Severe (71-90)', 'Profound (90+)'
plot_demo_data(unique_students.degree_hl_ad, degree_hl_cats, rot=90, color=plot_color)
_, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.degree_hl_ad, [""]*7, rot=90,
color=plot_color, ax=axes[0], title='Right ear')
plot_demo_data(unique_students.degree_hl_as, degree_hl_cats, rot=90,
color=plot_color, ylim=(0,2000), ax=axes[1], title='Left ear');
unique_students.degree_hl_as.count()
3755
type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
plot_demo_data(unique_students.type_hl_ad, type_hl_cats, rot=90, color=plot_color)
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, color=plot_color)
unique_students.type_hl_ad.count()
3681
unique_students.type_hl_as.count()
3757
type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
f, axes = plt.subplots(2,1)
plot_demo_data(unique_students.type_hl_ad, [""]*len(type_hl_cats), rot=90,
title='Right ear', ax=axes[0], color=plot_color)
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90,
title='Left ear', ax=axes[1], color=plot_color)
demographic[demographic.study_id=='1147-2010-0064']
redcap_event_name | academic_year | hl | male | _race | prim_lang | sib | _mother_ed | father_ed | premature_age | ... | bilateral_ha | bimodal | tech | implant_category | age_diag | sex | known_synd | synd_or_disab | race | academic_year_start | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
11679 | initial_assessment_arm_1 | 2010-2011 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | True | False | 0 | 6 | 51 | Female | 0 | 0 | 0 | NaN |
11680 | year_1_complete_71_arm_1 | 2011-2012 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | True | False | 0 | 6 | 51 | Female | 0 | 0 | 0 | NaN |
11681 | year_2_complete_71_arm_1 | 2012-2013 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | True | False | 0 | 6 | 51 | Female | 0 | 0 | 0 | NaN |
11682 | year_3_complete_71_arm_1 | 2013-2014 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | True | False | 0 | 6 | 51 | Female | 0 | 0 | 0 | NaN |
4 rows × 67 columns
receptive[receptive.study_id=='1147-2010-0064']
study_id | redcap_event_name | score | test_type | school | age_test | domain | |
---|---|---|---|---|---|---|---|
11679 | 1147-2010-0064 | initial_assessment_arm_1 | 96 | PPVT | 1147 | 63 | Receptive Vocabulary |
11680 | 1147-2010-0064 | year_1_complete_71_arm_1 | 91 | PPVT | 1147 | 73 | Receptive Vocabulary |
11681 | 1147-2010-0064 | year_2_complete_71_arm_1 | 93 | PPVT | 1147 | 85 | Receptive Vocabulary |
lsl_dr[lsl_dr.study_id=='1147-2010-0064']
redcap_event_name | academic_year | hl | male | _race | prim_lang | sib | _mother_ed | father_ed | premature_age | ... | known_synd | synd_or_disab | race | academic_year_start | age_test | domain | school | score | test_name | test_type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
23329 | year_3_complete_71_arm_1 | 2013-2014 | 0 | 0 | 0 | 0 | 1 | 3 | 3 | 8 | ... | 0 | 0 | 0 | 2013 | NaN | NaN | NaN | NaN | NaN | NaN |
1 rows × 73 columns
unique_students.type_hl_ad.count()
3681
receptive[receptive.domain=="Receptive Vocabulary"].study_id.unique().shape
(2619,)
demographic.study_id.unique().shape
(4898,)
receptive.study_id.unique().shape
(2619,)
lsl_dr[lsl_dr.domain=="Receptive Vocabulary"].study_id.unique().shape
(1178,)
receptive_ids = receptive.study_id.unique()
demographic_ids = demographic.study_id.unique()
[s for s in receptive_ids if s not in demographic_ids]
[]
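The same membership check can be written with set operations, which avoids the quadratic scan:
# An empty set difference confirms every receptive ID appears in the demographic data
set(receptive_ids) - set(demographic_ids)  # set()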
def score_summary(domain, test_type=None):
    subset = lsl_dr[lsl_dr.domain==domain].copy()
    if test_type is not None:
        subset = subset[subset.test_type==test_type]
    subset['age_test'] = (subset.age_test/12).dropna().astype(int)
    subset.loc[subset.age_test > 11, 'age_test'] = 11
    subset = subset[subset.age_test > 1]
    byage = subset.groupby('age_test')
    n = byage.study_id.count()
    mean = byage.score.mean()
    sd = byage.score.std()
    min = byage.score.min()
    max = byage.score.max()
    summary = pd.DataFrame({'Sample Size':n, 'Mean':mean,
                            'SD':sd, 'Min':min, 'Max':max})
    return summary[['Sample Size','Mean','SD','Min','Max']]
receptive_summary = score_summary("Receptive Vocabulary")
receptive_summary
 | Sample Size | Mean | SD | Min | Max
---|---|---|---|---|---
age_test | | | | |
2 | 67 | 97.641791 | 18.427336 | 44 | 144 |
3 | 247 | 96.113360 | 17.888865 | 47 | 139 |
4 | 290 | 92.762069 | 20.776166 | 0 | 140 |
5 | 201 | 89.457711 | 19.192172 | 0 | 130 |
6 | 119 | 89.058824 | 16.842387 | 51 | 137 |
7 | 70 | 84.885714 | 18.698634 | 43 | 124 |
8 | 51 | 82.078431 | 15.620299 | 46 | 114 |
9 | 40 | 83.550000 | 17.485452 | 53 | 120 |
10 | 46 | 76.847826 | 18.447484 | 20 | 109 |
11 | 138 | 81.920290 | 18.512202 | 29 | 132 |
receptive_summary.describe()
 | Sample Size | Mean | SD | Min | Max
---|---|---|---|---|---
count | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 |
mean | 126.900000 | 87.431602 | 18.189100 | 33.300000 | 128.900000 |
std | 90.298825 | 6.727914 | 1.381796 | 20.199285 | 11.789355 |
min | 40.000000 | 76.847826 | 15.620299 | 0.000000 | 109.000000 |
25% | 55.000000 | 82.446324 | 17.586305 | 22.250000 | 121.000000 |
50% | 94.500000 | 86.972269 | 18.437410 | 43.500000 | 131.000000 |
75% | 185.250000 | 91.935980 | 18.652026 | 46.750000 | 138.500000 |
max | 290.000000 | 97.641791 | 20.776166 | 53.000000 | 144.000000 |
receptive_summary['Sample Size'].sum()
1269
receptive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_summary["Sample Size"]):
    plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.xlim(-0.5, 9.5)
expressive_summary = score_summary("Expressive Vocabulary")
expressive_summary
 | Sample Size | Mean | SD | Min | Max
---|---|---|---|---|---
age_test | | | | |
2 | 61 | 99.918033 | 18.484494 | 55 | 134 |
3 | 244 | 96.434426 | 21.047463 | 42 | 145 |
4 | 282 | 95.191489 | 21.178084 | 0 | 139 |
5 | 203 | 91.073892 | 20.455686 | 0 | 133 |
6 | 117 | 90.034188 | 19.400467 | 35 | 129 |
7 | 75 | 85.346667 | 15.480147 | 52 | 117 |
8 | 50 | 85.780000 | 14.044085 | 46 | 115 |
9 | 40 | 87.575000 | 14.101987 | 55 | 110 |
10 | 45 | 84.088889 | 15.545859 | 44 | 110 |
11 | 135 | 88.422222 | 15.986686 | 52 | 137 |
expressive_summary['Sample Size'].sum()
1252
expressive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_summary["Sample Size"]):
    plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.xlim(-0.5, 9.5)
if current_year_only:
    plt.ylim(0, 400)
else:
    plt.ylim(0, 1400)
articulation_summary = score_summary("Articulation")
articulation_summary
 | Sample Size | Mean | SD | Min | Max
---|---|---|---|---|---
age_test | | | | |
2 | 42 | 88.547619 | 17.935117 | 50 | 122 |
3 | 217 | 85.244240 | 19.399319 | 40 | 125 |
4 | 281 | 84.217082 | 22.830568 | 0 | 121 |
5 | 178 | 83.174157 | 21.460462 | 40 | 116 |
6 | 121 | 80.685950 | 22.692155 | 40 | 110 |
7 | 67 | 79.716418 | 22.760150 | 3 | 108 |
8 | 44 | 77.500000 | 19.553742 | 40 | 107 |
9 | 27 | 85.185185 | 18.193484 | 40 | 108 |
10 | 31 | 82.354839 | 19.142533 | 40 | 105 |
11 | 65 | 83.969231 | 20.182735 | 39 | 105 |
articulation_summary['Sample Size'].sum()
1073
sample_size = articulation_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(articulation_summary["Sample Size"]):
    plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
Language scores
lsl_dr.domain.unique()
array([nan, 'Language', 'Articulation', 'Receptive Vocabulary', 'Expressive Vocabulary'], dtype=object)
lsl_dr.test_type.unique()
array([nan, 'receptive', 'expressive', 'Goldman', 'ROWPVT', 'EOWPVT', 'EVT', 'PPVT', 'Arizonia', 'PPVT and ROWPVT', 'Arizonia and Goldman', 'EOWPVT and EVT'], dtype=object)
receptive_language_summary = score_summary("Language", "receptive")
receptive_language_summary
 | Sample Size | Mean | SD | Min | Max
---|---|---|---|---|---
age_test | | | | |
2 | 169 | 89.597633 | 22.966673 | 50 | 136 |
3 | 234 | 88.807692 | 18.895051 | 50 | 139 |
4 | 265 | 86.524528 | 19.317139 | 50 | 145 |
5 | 170 | 85.494118 | 20.003623 | 47 | 140 |
6 | 92 | 80.108696 | 20.074836 | 40 | 121 |
7 | 48 | 77.479167 | 18.109583 | 47 | 120 |
8 | 35 | 74.371429 | 19.395399 | 40 | 115 |
9 | 10 | 67.600000 | 18.337575 | 40 | 104 |
10 | 10 | 77.800000 | 16.771669 | 41 | 99 |
11 | 27 | 77.740741 | 18.289536 | 40 | 107 |
receptive_language_summary['Sample Size'].sum()
1060
sample_size = receptive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_language_summary["Sample Size"]):
    plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
expressive_language_summary = score_summary("Language", "expressive")
expressive_language_summary
 | Sample Size | Mean | SD | Min | Max
---|---|---|---|---|---
age_test | | | | |
2 | 168 | 91.125000 | 18.812401 | 50 | 150 |
3 | 234 | 84.739316 | 16.472776 | 53 | 139 |
4 | 265 | 81.535849 | 18.828966 | 48 | 136 |
5 | 173 | 80.508671 | 19.248261 | 48 | 137 |
6 | 103 | 76.689320 | 21.347741 | 40 | 140 |
7 | 58 | 71.120690 | 20.664469 | 40 | 114 |
8 | 37 | 66.783784 | 19.894353 | 40 | 116 |
9 | 10 | 62.700000 | 22.410563 | 40 | 106 |
10 | 10 | 80.800000 | 20.595577 | 40 | 107 |
11 | 26 | 73.769231 | 20.465205 | 40 | 112 |
expressive_language_summary['Sample Size'].sum()
1084
sample_size = expressive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_language_summary["Sample Size"]):
    plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
(unique_students.age/12.).hist(grid=False, bins=int(np.sqrt(unique_students.shape[0])))
plt.ylabel('Count')
plt.xlabel('Age at enrollment')
(unique_students.age/12.).describe()
count    4387.000000
mean        2.579439
std         2.365029
min         0.000000
25%         0.750000
50%         2.083333
75%         3.500000
max        24.833333
Name: age, dtype: float64
def calc_difference(x, col='a_fo', jitter=True):
    if (len(x)<2 or x[col].isnull().sum() or x['funct_out_age'].isnull().sum()):
        return None
    diff = x[col][x.funct_out_age.idxmax()] - x[col][x.funct_out_age.idxmin()]
    if jitter:
        diff += np.random.normal(scale=0.05)
    if (x.funct_out_age.max() - x.funct_out_age.min()) > 1000:
        print(x['funct_out_age'])
    return {'difference': diff, 'months': x.funct_out_age.max() - x.funct_out_age.min()}
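A quick toy check (hypothetical values) of what calc_difference returns for a single student:
toy = pd.DataFrame({'funct_out_age': [10, 22], 'a_fo': [2, 5]})
calc_difference(toy, jitter=False)  # {'difference': 3, 'months': 12}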
audition = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference).dropna().values.tolist())
plt.figure(figsize=(10,6))
plt.scatter(audition.months, audition.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Audition')
slc = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='slc_fo').dropna().values.tolist())
plt.figure(figsize=(10,6))
plt.scatter(slc.months, slc.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (comprehension)')
sle = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='sle_fo').dropna().values.tolist())
plt.figure(figsize=(10,6))
plt.scatter(sle.months, sle.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (expression)')
lsl_dr.degree_hl.dropna().value_counts()
6    2949
3     814
4     812
5     708
2     378
0     176
1      53
dtype: int64
ax = lsl_dr.degree_hl.hist(bins=7)
diff = (lsl_dr['age'] - lsl_dr['age_int'])
diff[diff>0].hist(bins=50)
(lsl_dr.age_int<6).mean()
0.22619400771284484
(lsl_dr.age<6).mean()
0.14224265796499555