In [1]:
# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

Connect to database to import data for the three test domains and demographic information:

In [2]:
from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
api_key = open("/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt").read()

lsl_dr_project = Project(api_url, api_key)
In [3]:
metadata = lsl_dr_project.export_metadata()
In [4]:
# for i,j in zip(lsl_dr_project.field_names, 
#                lsl_dr_project.field_labels):
#     print('{0}: \t{1}'.format(i,j))

Import each database from REDCap:

In [5]:
articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss']
articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', df_kwargs={'index_col':None})
In [6]:
records = lsl_dr_project.export_records(fields=articulation_fields)
In [7]:
print(records[0]['study_id'])
0101-2003-0101
In [8]:
expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss']
expressive = lsl_dr_project.export_records(fields=expressive_fields, format='df', 
                                           df_kwargs={'index_col':None,
                                                      'na_values':[999, 9999]})
In [9]:
receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss']
receptive = lsl_dr_project.export_records(fields=receptive_fields, format='df', 
                                          df_kwargs={'index_col':None,
                                                     'na_values':[999, 9999]})
In [10]:
language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls',
                   'owls_lc_ss','owls_oe_ss','age_test_owls',
                   'celfp_rl_ss','celfp_el_ss','age_test_celp',
                   'celf_elss','celf_rlss','age_test_celf']
language_raw = lsl_dr_project.export_records(fields=language_fields, format='df', 
                                             df_kwargs={'index_col':None, 
                                                        'na_values':[999, 9999]})
In [11]:
demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year',
'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race',
'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as',
'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae',
'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes',
'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age',
'att_days_hr', 'att_days_sch', 'att_days_st2_417']
demographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df', 
                                            df_kwargs={'index_col':None, 
                                                       'na_values':[888, 999, 9999]})
In [12]:
demographic_raw[demographic_raw.study_id=='1147-2010-0064']
Out[12]:
study_id redcap_event_name academic_year hl gender race prim_lang sib mother_ed father_ed ... sle_fo a_fo fam_age family_inv att_days_sch att_days_st2_417 att_days_hr demo_ses school_lunch medicaid
11679 1147-2010-0064 initial_assessment_arm_1 2010-2011 0 0 0 0 1 3 3 ... 3 6 65 0 NaN NaN NaN NaN NaN NaN
11680 1147-2010-0064 year_1_complete_71_arm_1 2011-2012 0 NaN NaN NaN NaN NaN NaN ... 3 5 77 2 NaN NaN NaN NaN NaN NaN
11681 1147-2010-0064 year_2_complete_71_arm_1 2012-2013 0 NaN NaN NaN NaN NaN NaN ... 3 5 89 2 NaN NaN NaN NaN NaN NaN
11682 1147-2010-0064 year_3_complete_71_arm_1 2013-2014 0 NaN NaN NaN NaN NaN NaN ... 4 5 101 2 NaN NaN NaN NaN NaN NaN

4 rows × 46 columns

Attendance information

Several fields in the demographic data have missing values.

In [13]:
demographic_raw.head()
Out[13]:
study_id redcap_event_name academic_year hl gender race prim_lang sib mother_ed father_ed ... sle_fo a_fo fam_age family_inv att_days_sch att_days_st2_417 att_days_hr demo_ses school_lunch medicaid
0 0101-2003-0101 initial_assessment_arm_1 2002-2003 0 0 0 0 1 6 6 ... 2 2 54 2 NaN NaN NaN NaN NaN NaN
1 0101-2003-0101 year_1_complete_71_arm_1 2003-2004 0 NaN NaN NaN NaN NaN NaN ... 4 4 80 1 NaN NaN NaN NaN NaN NaN
2 0101-2003-0101 year_2_complete_71_arm_1 2004-2005 0 NaN NaN NaN NaN NaN NaN ... 4 4 80 2 NaN NaN NaN NaN NaN NaN
3 0101-2003-0101 year_3_complete_71_arm_1 2005-2006 0 NaN NaN NaN NaN NaN NaN ... 5 5 96 3 NaN NaN NaN NaN NaN NaN
4 0101-2003-0101 year_4_complete_71_arm_1 2006-2007 0 NaN NaN NaN NaN NaN NaN ... 5 5 109 2 NaN NaN NaN NaN NaN NaN

5 rows × 46 columns

We can fill missing values forward from previous observation (by study_id)

In [14]:
demographic = demographic_raw.sort(columns='redcap_event_name').groupby('study_id').transform(
                                    lambda recs: recs.fillna(method='ffill'))#.reset_index()
demographic["study_id"] = demographic_raw.sort(columns='redcap_event_name').study_id

Random check to make sure this worked

In [15]:
demographic[demographic.study_id=='1147-2010-0064']
Out[15]:
redcap_event_name academic_year hl gender race prim_lang sib mother_ed father_ed premature_age ... a_fo fam_age family_inv att_days_sch att_days_st2_417 att_days_hr demo_ses school_lunch medicaid study_id
11679 initial_assessment_arm_1 2010-2011 0 0 0 0 1 3 3 8 ... 6 65 0 NaN NaN NaN NaN NaN NaN 1147-2010-0064
11680 year_1_complete_71_arm_1 2011-2012 0 0 0 0 1 3 3 8 ... 5 77 2 NaN NaN NaN NaN NaN NaN 1147-2010-0064
11681 year_2_complete_71_arm_1 2012-2013 0 0 0 0 1 3 3 8 ... 5 89 2 NaN NaN NaN NaN NaN NaN 1147-2010-0064
11682 year_3_complete_71_arm_1 2013-2014 0 0 0 0 1 3 3 8 ... 5 101 2 NaN NaN NaN NaN NaN NaN 1147-2010-0064

4 rows × 46 columns

Demographic data without missing values:

In [16]:
demographic.head()
Out[16]:
redcap_event_name academic_year hl gender race prim_lang sib mother_ed father_ed premature_age ... a_fo fam_age family_inv att_days_sch att_days_st2_417 att_days_hr demo_ses school_lunch medicaid study_id
8319 initial_assessment_arm_1 2012-2013 0 0 6 0 0 6 6 3 ... 1 9 1 NaN NaN NaN NaN NaN NaN 0735-2012-0008
5035 initial_assessment_arm_1 2007-2008 0 0 0 0 2 6 6 2 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0522-2008-0011
8314 initial_assessment_arm_1 2011-2012 0 1 0 0 1 4 6 2 ... 1 7 0 NaN NaN NaN NaN NaN NaN 0735-2012-0006
8310 initial_assessment_arm_1 2011-2012 0 1 8 0 2 6 6 8 ... 1 2 1 NaN NaN NaN NaN NaN NaN 0735-2012-0005
5038 initial_assessment_arm_1 2008-2009 0 0 0 0 1 2 4 9 ... 3 62 4 NaN NaN NaN NaN NaN NaN 0522-2008-0012

5 rows × 46 columns

Cleaning languge dataset

5 language measures:

  • 3 versions of CELF
  • PLS
    • pls_ac_rs: PLS: Auditory Comprehension Raw Score
    • pls_ac_ss: PLS: Auditory Comprehension Standard Score
    • pls_ec_rs: PLS: Expressive Communication Raw Score
    • pls_ec_ss: PLS: Expressive Communication Standard Score
    • pls_tl_rs: PLS: Total Language Score Standard Score Total
    • pls_tl_ss: PLS: Total Language Score Standard Score
  • OWLS
    • age_test_owls: Age at time of testing (OWLS)
    • owls_lc_rs: OWLS: Listening Comprehension Raw Score
    • owls_lc_ss: OWLS: Listening Comprehension Standard Score
    • owls_oe_rs: OWLS: Oral Expression Raw Score
    • owls_oe_ss: OWLS: Oral Expression Standard Score
    • owls_oc_sss: OWLS: Oral Composite Sum of Listening Comprehension and Oral Expression Standard Scores
    • owls_oc_ss: OWLS: Oral Composite Standard Score
    • owls_wes_trs: OWLS: Written Expression Scale Total Raw Score
    • owls_wes_as: OWLS: Written Expression Scale Ability Score
    • owls_wes_ss: OWLS: Written Expression Scale Standard Score
    • owsl_lc: OWLS: Written Expression Scale Language Composite (Sum of written expression age-based standard score, listening comprehension standard score and oral expression standard score)
    • owls_lcss: OWLS: Language Composite Standard Score
In [17]:
# Test type
language_raw["test_name"] = None
language_raw["test_type"] = None
language_raw["score"] = None
CELP = language_raw.age_test_celp.notnull()
CELF = language_raw.age_test_celf.notnull()
PLS = language_raw.age_test_pls.notnull()
OWLS = language_raw.age_test_owls.notnull()

language_raw['age_test'] = None
language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp
language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf
language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls
language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls

language1 = language_raw[CELP | CELF | PLS | OWLS].copy()
language2 = language1.copy()

language1["test_type"] = "receptive"

language1.loc[CELP, "test_name"] = "CELF-P2"
language1.loc[CELF, "test_name"] = "CELF-4"
language1.loc[PLS, "test_name"] = "PLS"
language1.loc[OWLS, "test_name"] = "OWLS"

language1.loc[CELP, "score"] = language1.celfp_rl_ss
language1.loc[CELF, "score"] = language1.celf_rlss
language1.loc[PLS, "score"] = language1.pls_ac_ss
language1.loc[OWLS, "score"] = language1.owls_lc_ss


language2["test_type"] = "expressive"

language2.loc[CELP, "test_name"] = "CELF-P2"
language2.loc[CELF, "test_name"] = "CELF-4"
language2.loc[PLS, "test_name"] = "PLS"
language2.loc[OWLS, "test_name"] = "OWLS"

language2.loc[CELP, "score"] = language1.celfp_el_ss
language2.loc[CELF, "score"] = language1.celf_elss
language2.loc[PLS, "score"] = language1.pls_ec_ss
language2.loc[OWLS, "score"] = language1.owls_oe_ss

language = pd.concat([language1, language2])
language = language[language.score.notnull()]
print(pd.crosstab(language.test_name, language.test_type))
print("There are {0} null values for score".format(sum(language["score"].isnull())))
test_type  expressive  receptive
test_name                       
CELF-4            539        489
CELF-P2          1170       1176
OWLS              871        877
PLS              2887       2896
There are 0 null values for score

A school variable was added, which is the first four columns of the study_id:

In [18]:
language["school"] = language.study_id.str.slice(0,4)
In [19]:
language = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test"]]
language["domain"] = "Language"
language.head()
Out[19]:
study_id redcap_event_name score test_type test_name school age_test domain
0 0101-2003-0101 initial_assessment_arm_1 51 receptive PLS 0101 54 Language
5 0101-2003-0101 year_5_complete_71_arm_1 61 receptive OWLS 0101 113 Language
9 0101-2003-0102 initial_assessment_arm_1 55 receptive PLS 0101 44 Language
10 0101-2003-0102 year_1_complete_71_arm_1 77 receptive PLS 0101 54 Language
11 0101-2003-0102 year_2_complete_71_arm_1 93 receptive CELF-P2 0101 68 Language

Cleaning articulation dataset

We converted the articulation dataset into a "long" format:

In [20]:
# Test type
articulation["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
articulation = articulation[ARIZ | GF]
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"

print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))

# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]
Goldman                 4254
Arizonia                 490
Arizonia and Goldman      49
dtype: int64
There are 0 null values for test_type

A school variable was added, which is the first four columns of the study_id:

In [21]:
articulation["school"] = articulation.study_id.str.slice(0,4)

The age was taken to be the Arizonia age if there are both test types:

In [22]:
articulation["age_test"] = articulation.age_test_aaps
articulation.loc[articulation.age_test.isnull(), 'age_test'] = articulation.age_test_gf2[articulation.age_test.isnull()]
print(articulation.age_test.describe())
count    4790.000000
mean       69.175365
std        31.206700
min        23.000000
25%        47.000000
50%        60.000000
75%        81.000000
max       243.000000
Name: age_test, dtype: float64

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [23]:
articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1)
articulation["domain"] = "Articulation"
articulation.head()
Out[23]:
study_id redcap_event_name test_type score school age_test domain
1 0101-2003-0101 year_1_complete_71_arm_1 Goldman 78 0101 80 Articulation
9 0101-2003-0102 initial_assessment_arm_1 Goldman 72 0101 44 Articulation
10 0101-2003-0102 year_1_complete_71_arm_1 Goldman 97 0101 54 Articulation
14 0101-2004-0101 year_2_complete_71_arm_1 Goldman 75 0101 53 Articulation
15 0101-2004-0101 year_3_complete_71_arm_1 Goldman 80 0101 66 Articulation

Cleaning demographic dataset

We excluded unwanted columns and rows for which age, gender or race were missing:

In [24]:
# Retain only subset of columns
#demographic = demographic[demographic.gender.notnull()]
demographic = demographic.rename(columns={'gender':'male'})

Due to sample size considerations, we reduced the non-English primary language variable to English (0) and non-English (1):

In [25]:
demographic["non_english"] = None
demographic.loc[demographic.prim_lang.notnull(), 'non_english'] = demographic.prim_lang[demographic.prim_lang.notnull()]>0
print(demographic.non_english.value_counts())
print("There are {0} null values for non_english".format(sum(demographic.non_english.isnull())))
False    9677
True     2089
dtype: int64
There are 714 null values for non_english

Mother's education (mother_ed) and father's education (father_ed) were both recoded to:

  • 0=no high school diploma
  • 1=high school
  • 2=undergraduate
  • 3=graduate

Category 6 (unknown) was recoded as missing.

In [26]:
demographic = demographic.rename(columns={"mother_ed":"_mother_ed"})
demographic["mother_ed"] = demographic._mother_ed.copy()
demographic.loc[demographic._mother_ed==1, 'mother_ed'] = 0
demographic.loc[(demographic._mother_ed==2) | (demographic.mother_ed==3), 'mother_ed'] = 1
demographic.loc[demographic._mother_ed==4, 'mother_ed'] = 2
demographic.loc[demographic._mother_ed==5, 'mother_ed'] = 3
demographic.loc[demographic._mother_ed==6, 'mother_ed'] = None
print("_mother_ed:")
print(demographic._mother_ed.value_counts())
print("mother_ed:")
print(demographic.mother_ed.value_counts())
print("\nThere are {0} null values for mother_ed".format(sum(demographic.mother_ed.isnull())))
_mother_ed:
6    4293
4    2485
3    1693
5    1282
2    1115
1     421
0     156
dtype: int64
mother_ed:
1    2808
2    2485
3    1282
0     577
dtype: int64

There are 5328 null values for mother_ed

Secondary diagnosis

In [27]:
demographic['secondary_diagnosis'] = demographic.etiology==0
# Suspected or unknown treated as missing
demographic.loc[demographic.etiology > 1, 'secondary_diagnosis'] = None
In [28]:
demographic.secondary_diagnosis.value_counts()
Out[28]:
0    9132
1    2121
dtype: int64
In [29]:
demographic.secondary_diagnosis.mean()
Out[29]:
0.1884830711810184

Premature status was recoded to True (premature) and False (full-term). Here, premature indicates <36 weeks.

In [30]:
demographic['premature_weeks'] = demographic.premature_age.copy()
demographic.loc[demographic.premature_age==9, 'premature_weeks'] = None
demographic.premature_weeks = abs(demographic.premature_weeks-8)*2
print("There are {0} null values for premature_weeks".format(sum(demographic.premature_weeks.isnull())))
There are 3173 null values for premature_weeks
In [31]:
demographic.premature_weeks.value_counts()
Out[31]:
0     7889
2      486
4      324
12     180
6      159
10     125
8      104
14      38
16       2
dtype: int64

Recode impant technology variables for each ear to one of four categories (None, Baha, Hearing aid, Cochlear implant):

In [32]:
tech_cats = ["None", "Baha", "Hearing aid", "Cochlear"]

demographic["tech_right"] = demographic.tech_ad.copy()
demographic.loc[demographic.tech_right==6, 'tech_right'] = 0
demographic.loc[demographic.tech_right==4, 'tech_right'] = 1
demographic.loc[demographic.tech_right==5, 'tech_right'] = 1
demographic.loc[demographic.tech_right==3, 'tech_right'] = 2
demographic.loc[demographic.tech_right==7, 'tech_right'] = 3
demographic.tech_right = np.abs(demographic.tech_right - 3)

demographic["tech_left"] = demographic.tech_as.copy()
demographic.loc[demographic.tech_left==6, 'tech_left'] = 0
demographic.loc[demographic.tech_left==4, 'tech_left'] = 1
demographic.loc[demographic.tech_left==5, 'tech_left'] = 1
demographic.loc[demographic.tech_left==3, 'tech_left'] = 2
demographic.loc[demographic.tech_left==7, 'tech_left'] = 3
demographic.tech_left = np.abs(demographic.tech_left - 3)

Substitute valid missing values for hearing loss:

In [33]:
demographic.loc[demographic.type_hl_ad==5, 'type_hl_ad'] = None
demographic.loc[demographic.type_hl_as==5, 'type_hl_ad'] = None

Create degree_hl, which is the maximum level of hearing loss in either ear:

In [34]:
demographic["degree_hl"] = np.maximum(demographic.degree_hl_ad, demographic.degree_hl_as)

Create compound indicator variable for each technology (Baha, Hearing aid, Chochlear implant):

  • 0=none
  • 1=one ear
  • 2=both ears.
In [35]:
demographic["baha"] = 0
demographic.baha = demographic.baha.astype(object)
demographic.loc[(demographic.tech_right==1) | (demographic.tech_left==1), 'baha'] = 1
demographic.loc[(demographic.tech_right==1) & (demographic.tech_left==1), 'baha'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'baha'] = None
print("baha:")
print(demographic.drop_duplicates(subset='study_id').baha.value_counts())
print("There are {0} null values for baha".format(sum(demographic.baha.isnull())))

demographic["hearing_aid"] = 0
demographic.hearing_aid = demographic.hearing_aid.astype(object)
demographic.loc[(demographic.tech_right==2) | (demographic.tech_left==2), 'hearing_aid'] = 1
demographic.loc[(demographic.tech_right==2) & (demographic.tech_left==2), 'hearing_aid'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_right.isnull()), 'hearing_aid'] = None
print("\nhearing_aid:")
print(demographic.drop_duplicates(subset='study_id').hearing_aid.value_counts())
print("There are {0} null values for hearing_aid".format(sum(demographic.hearing_aid.isnull())))

demographic["cochlear"] = 0
demographic.cochlear = demographic.cochlear.astype(object)
demographic.loc[(demographic.tech_right==3) | (demographic.tech_left==3), 'cochlear'] = 1
demographic.loc[(demographic.tech_right==3) & (demographic.tech_left==3), 'cochlear'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'cochlear'] = None
print("\ncochlear:")
print(demographic.drop_duplicates(subset='study_id').cochlear.value_counts())
print("There are {0} null values for cochlear".format(sum(demographic.cochlear.isnull())))
print(len(demographic))
baha:
0    3683
1     132
2      57
dtype: int64
There are 1476 null values for baha

hearing_aid:
2    1706
0    1615
1     529
dtype: int64
There are 1516 null values for hearing_aid

cochlear:
0    2493
2     805
1     574
dtype: int64
There are 1476 null values for cochlear
12480

Identify bilateral and bimodal individuals:

In [36]:
demographic["bilateral_ci"] = demographic.cochlear==2
demographic["bilateral_ha"] = demographic.hearing_aid==2
demographic["bimodal"] = (demographic.cochlear==1) & (demographic.hearing_aid==1)
In [37]:
demographic.bilateral_ci.sum(), demographic.bilateral_ha.sum(), demographic.bimodal.sum()
Out[37]:
(2940, 4339, 1219)
In [175]:
demographic.drop_duplicates(subset='study_id')[['bilateral_ci', 
                                               'bilateral_ha',
                                               'bimodal']].sum()
Out[175]:
bilateral_ci     805
bilateral_ha    1706
bimodal          334
dtype: int64

Create variable that identifies bilateral (0), bilateral HA left (1), bilateral HA right (2)

In [38]:
demographic['tech'] = 0
demographic.loc[(demographic.bimodal) & (demographic.tech_left==2), 'tech'] = 1
demographic.loc[(demographic.bimodal) & (demographic.tech_right==2), 'tech'] = 2
print("There are {0} null values for tech".format(sum(demographic.tech.isnull())))
There are 0 null values for tech
In [39]:
demographic["implant_category"] = None
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.baha==0), 
                'implant_category'] = 0
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.baha==0), 
                'implant_category'] = 1
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.baha==1), 
                'implant_category'] = 2
demographic.loc[(demographic.cochlear==2) & (demographic.hearing_aid==0) & (demographic.baha==0), 
                'implant_category'] = 3
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==1) & (demographic.baha==0), 
                'implant_category'] = 4
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.baha==1), 
                'implant_category'] = 5
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==2) & (demographic.baha==0), 
                'implant_category'] = 6
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.baha==1), 
                'implant_category'] = 7
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.baha==2), 
                'implant_category'] = 8
demographic.implant_category.value_counts()
Out[39]:
6    4339
3    2940
4    1219
0     680
1     470
2     294
8     168
7      19
5       8
dtype: int64

Age when hearing loss diagnosed Data are entered inconsistently here, so we have to go in and replace non-numeric values.

In [40]:
demographic.onset_1.unique()
Out[40]:
array([   6. ,    0. ,    3. ,    1. ,   25. ,    9. ,   13. ,   26. ,
          nan,   15. ,    2. ,   23. ,    7. ,   11. ,   24. ,   17. ,
         36. ,   28. ,   14. ,   48. ,   12. ,   29. ,   20. ,   27. ,
         22. ,    5. ,    4. ,   60. ,   32. ,   19. ,   18. ,   52. ,
         42. ,   21. ,   16. ,   30. ,    8. ,   10. ,  140. ,   61. ,
         66. ,   44. ,   41. ,   40. ,   49. ,   86. ,   33. ,  126. ,
          1.5,   85. ,   51. ,    2.5,   67. ,   39. ,   62. ,  133. ,
         38. ,  103. ,   54. ,   35. ,   43. ,   87. ,   83. ,   76. ,
         50. ,   37. ,  116. ,   68. ,   72. ,   92. ,   34. ,   57. ,
         97. ,   71. ,   55. ,   46. ,   65. ,   78. ,   45. ,   31. ,
        107. ,   64. ,   74. ,   77. ,   88. ,   81. ,   84. ,   80. ,
         53. ,   59. ,    0.5,   56. ,   98. ,   47. ,   58. ,   75. ,
         70. ,  119. ,   63. ,  154. ,   89. ,  152. ])
In [41]:
# Don't need this anymore
# demographic['age_diag'] = demographic.onset_1.replace({'birth': 0, 'R- Birth L-16mo': 0, 'birth - 3': 0, 'at birth': 0, 'NBHS': 0, 
#                              'at Birth': 0, '1-2': 1.5, '2-3': 2.5, '0-3': 1.5}).astype(float)
demographic['age_diag'] = demographic.onset_1

Number of null values for age_diag

In [42]:
demographic.age_diag.isnull().sum()
Out[42]:
3848
In [43]:
demographic['sex'] = demographic.male.replace({0:'Female', 1:'Male'})
In [44]:
import seaborn as sb

unique_students = demographic.dropna(subset=['sex']).groupby('study_id').first()

# ag = sb.factorplot("sex", data=unique_students, 
#               palette="PuBuGn_d", kind='count')
# ag.set_xticklabels(['Female ({})'.format((unique_students.male==0).sum()), 
#                     'Male ({})'.format((unique_students.male==1).sum())])
# ag.set_xlabels('')

Child has another diagnosed disability

In [45]:
demographic['known_synd'] = (demographic.synd_cause == 0)
# Unknown or suspected
demographic.loc[demographic.synd_cause > 1, 'known_synd'] = None
In [46]:
# If either known syndrome or secondary diagnosis
demographic['synd_or_disab'] = demographic.apply(lambda x: x['secondary_diagnosis'] or x['known_synd'], axis=1)

Missing sibling counts were properly encoded as None (missing).

In [47]:
demographic.loc[demographic.sib==4, 'sib'] = None

We reduced the number of race categories, pooling those that were neither caucasian, black, hispanic or asian to "other", due to small sample sizes for these categories. Category 7 (unknown) was recoded as missing.

In [48]:
races = ["Caucasian", "Black or African American", "Hispanic or Latino", "Asian", "Other"]
demographic = demographic.rename(columns={"race":"_race"})
demographic["race"] = demographic._race.copy()
demographic.loc[demographic.race==7, 'race'] = None
demographic.loc[demographic.race>3, 'race'] = 4
print("_race:")
print(demographic._race.value_counts())
print("race:")
print(demographic.race.value_counts())
print("There are {0} null values for race".format(sum(demographic.race.isnull())))
# Replace with recoded column
_race:
0    6523
2    2011
1    1156
3     861
6     587
8     463
7     219
4      58
5      25
dtype: int64
race:
0    6523
2    2011
1    1156
4    1133
3     861
dtype: int64
There are 796 null values for race

Recode implant technology variables

In [49]:
tech_cats = ["None", "Baha", "Hearing aid", "Cochlear", "Other"]

demographic["tech_right"] = demographic.tech_ad.copy()
demographic.loc[demographic.tech_right==6, 'tech_right'] = 0
demographic.loc[demographic.tech_right==4, 'tech_right'] = 1
demographic.loc[demographic.tech_right==5, 'tech_right'] = 1
demographic.loc[demographic.tech_right==3, 'tech_right'] = 2
demographic.loc[demographic.tech_right==7, 'tech_right'] = 3
demographic.loc[demographic.tech_right==8, 'tech_right'] = 3
demographic.loc[demographic.tech_right==9, 'tech_right'] = 4
demographic.tech_right = np.abs(demographic.tech_right - 3)

demographic["tech_left"] = demographic.tech_as.copy()
demographic.loc[demographic.tech_left==6, 'tech_left'] = 0
demographic.loc[demographic.tech_left==4, 'tech_left'] = 1
demographic.loc[demographic.tech_left==5, 'tech_left'] = 1
demographic.loc[demographic.tech_left==3, 'tech_left'] = 2
demographic.loc[demographic.tech_left==7, 'tech_left'] = 3
demographic.loc[demographic.tech_left==8, 'tech_left'] = 3
demographic.loc[demographic.tech_left==9, 'tech_left'] = 4
demographic.tech_left = np.abs(demographic.tech_left - 3)
In [50]:
# Don't need this anymore
# demographic['age_amp'] = demographic.age_amp.replace({'22 mo': 22, 'unknown': np.nan, 'none': np.nan, 
#                              'n/a unilateral loss': np.nan, 'not amplified yet': np.nan,
#                              'not amplified': np.nan, 'n/a': np.nan, '4 months-6 months': 5,
#                              '6 weeks': 0.5, '13-18': 15.5, '0-3': 1.5, '24-36': 30}).astype(float)
In [51]:
demographic['academic_year'] = demographic.academic_year.replace(
                         {'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011', 
                          '2020-2011': '2010-2011', '2012-20013': '2012-2013',
                                   '0000-0000': np.nan})
In [53]:
demographic.age_amp.hist()
Out[53]:
<matplotlib.axes._subplots.AxesSubplot at 0x1199b3438>

Cleaning expressive vocabulary dataset

We converted the expressive vocabulary dataset to "long" format:

In [54]:
# Test type
expressive["test_type"] = None
EOWPVT = expressive.eowpvt_ss.notnull()
EVT = expressive.evt_ss.notnull()
expressive = expressive[EOWPVT | EVT]
expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT"
expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT"
expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT"
print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull())))

expressive["score"] = expressive.eowpvt_ss
expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT]
There are 0 null values for test_type
In [55]:
expressive.test_type.value_counts()
Out[55]:
EVT               3113
EOWPVT            2305
EOWPVT and EVT     120
dtype: int64

A school variable was added, which is the first four columns of the study_id:

In [56]:
expressive["school"] = expressive.study_id.str.slice(0,4)

The age was taken to be the EOWPVT age if there are both test types:

In [57]:
expressive["age_test"] = expressive.age_test_eowpvt
expressive.loc[expressive.age_test.isnull(), 'age_test'] = expressive.age_test_evt[expressive.age_test.isnull()]

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [58]:
expressive = expressive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
expressive["domain"] = "Expressive Vocabulary"
expressive.head()
Out[58]:
study_id redcap_event_name score test_type school age_test domain
0 0101-2003-0101 initial_assessment_arm_1 58 EOWPVT 0101 54 Expressive Vocabulary
2 0101-2003-0101 year_2_complete_71_arm_1 84 EOWPVT 0101 80 Expressive Vocabulary
5 0101-2003-0101 year_5_complete_71_arm_1 90 EOWPVT 0101 113 Expressive Vocabulary
14 0101-2004-0101 year_2_complete_71_arm_1 90 EOWPVT 0101 53 Expressive Vocabulary
15 0101-2004-0101 year_3_complete_71_arm_1 87 EOWPVT 0101 66 Expressive Vocabulary

Cleaning receptive vocabulary dataset

We converted the receptive vocabulary data table to "long" format:

In [59]:
receptive.columns
Out[59]:
Index(['study_id', 'redcap_event_name', 'age_test_ppvt', 'ppvt_ss', 'age_test_rowpvt', 'rowpvt_ss'], dtype='object')
In [60]:
# Test type
receptive["test_type"] = None
PPVT = receptive.ppvt_ss.notnull()
ROWPVT = receptive.rowpvt_ss.notnull()
receptive = receptive[PPVT | ROWPVT]
receptive.loc[PPVT & ROWPVT, "test_type"] = "PPVT and ROWPVT"
receptive.loc[PPVT & ~ROWPVT, "test_type"] = "PPVT"
receptive.loc[~PPVT & ROWPVT, "test_type"] = "ROWPVT"
print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull())))

receptive["score"] = receptive.ppvt_ss
receptive.loc[~PPVT & ROWPVT, "score"] = receptive.rowpvt_ss[~PPVT & ROWPVT]
There are 0 null values for test_type

A school variable was added, which is the first four columns of the study_id:

In [61]:
receptive["school"] = receptive.study_id.str.slice(0,4)

The age was taken to be the PPVT age if there are both test types:

In [62]:
receptive["age_test"] = receptive.age_test_ppvt
receptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()]
In [63]:
print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull())))
There are 28 null values for age_test

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [64]:
receptive = receptive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
receptive["domain"] = "Receptive Vocabulary"
receptive.head()
Out[64]:
study_id redcap_event_name score test_type school age_test domain
2 0101-2003-0101 year_2_complete_71_arm_1 90 PPVT 0101 80 Receptive Vocabulary
5 0101-2003-0101 year_5_complete_71_arm_1 101 ROWPVT 0101 113 Receptive Vocabulary
9 0101-2003-0102 initial_assessment_arm_1 55 PPVT 0101 44 Receptive Vocabulary
10 0101-2003-0102 year_1_complete_71_arm_1 80 PPVT 0101 54 Receptive Vocabulary
11 0101-2003-0102 year_2_complete_71_arm_1 101 PPVT 0101 68 Receptive Vocabulary
In [65]:
receptive.study_id.unique().shape
Out[65]:
(2619,)

Merge datasets

The four datasets were mereged into a single table. First, we concatenate the test scores data:

In [66]:
test_scores = pd.concat([articulation, expressive, receptive, language])

Then we perform a merge between the demographic data and the test scores data:

In [184]:
lsl_dr = pd.merge(demographic, test_scores, on=["study_id", "redcap_event_name"], how='left')
In [185]:
lsl_dr.tail()
Out[185]:
redcap_event_name academic_year hl male _race prim_lang sib _mother_ed father_ed premature_age ... known_synd synd_or_disab race academic_year_start age_test domain school score test_name test_type
31730 year_9_complete_71_arm_1 2012-2013 0 1 0 0 3 4 4 8 ... 0 1 0 NaN 104 Articulation 0521 100 NaN Goldman
31731 year_9_complete_71_arm_1 2012-2013 0 0 0 0 NaN 6 6 8 ... 0 0 0 NaN 138 Articulation 0310 92 NaN Goldman
31732 year_9_complete_71_arm_1 2012-2013 0 0 0 0 NaN 6 6 8 ... 0 0 0 NaN 137 Receptive Vocabulary 0310 65 NaN PPVT and ROWPVT
31733 year_9_complete_71_arm_1 2011-2012 0 1 0 0 3 6 6 8 ... 0 0 0 NaN 160 Expressive Vocabulary 0102 92 NaN EOWPVT
31734 year_9_complete_71_arm_1 2011-2012 0 1 0 0 3 6 6 8 ... 0 0 0 NaN 162 Receptive Vocabulary 0102 84 NaN ROWPVT

5 rows × 73 columns

In [186]:
lsl_dr['academic_year_start'] = lsl_dr.academic_year.apply(lambda x: str(x).strip()[:4])
lsl_dr.academic_year_start.value_counts()
Out[186]:
2013    6742
2012    6577
2011    5159
2010    4418
nan     3133
2009    2356
2014     984
2008     821
2007     531
2006     344
2005     276
2004     172
2003      90
2002      47
2001      35
1998      16
1999      15
2000      12
1997       6
1995       1
dtype: int64
In [190]:
current_year_only = True

if current_year_only:
    lsl_dr = lsl_dr[lsl_dr.academic_year_start=='2013']
In [192]:
expressive_scores = lsl_dr[(lsl_dr.domain=='Expressive Vocabulary') & (lsl_dr.score>=20)].score
expressive_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');
In [193]:
expressive_lang_scores = lsl_dr[(lsl_dr.domain=='Language') 
                               & (lsl_dr.test_type=='expressive')].score
expressive_lang_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');

Export dataset

In [194]:
if current_year_only:

    lsl_dr.to_csv('lsl_dr_current_year.csv')

else:
    lsl_dr.to_csv('lsl_dr.csv')
In [195]:
lsl_dr.shape
Out[195]:
(6742, 73)
In [196]:
lsl_dr.study_id.unique().shape
Out[196]:
(2222,)
In [197]:
demographic.study_id.unique().shape
Out[197]:
(4898,)

Convert score to floating-point number

In [198]:
lsl_dr.score = lsl_dr.score.astype(float)

Plots of Demographic Data

In [199]:
plot_color = "#64AAE8"
In [200]:
def plot_demo_data(series, labels=None, color=plot_color, rot=0, label_offset=20, xlim=None, 
                   ylim=None, title=None, **kwargs):
    ax = kwargs.get('ax')
    if ax is None:
        f, ax = plt.subplots()
    counts = series.value_counts().sort_index(1)
    counts.plot(kind='bar', grid=False, rot=rot, color=color, **kwargs)
    if xlim is None:
        ax.set_xlim(-0.5, len(counts)-0.5)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_ylabel('Count')
    if labels is not None:
        ax.set_xticklabels(labels)
    if title:
        ax.set_title(title)
    for i,x in enumerate(counts):
        ax.annotate('%i' % x, (i, x + label_offset))
        
#     plt.gca().tight_layout()
In [201]:
unique_students = demographic.drop_duplicates('study_id')
In [202]:
unique_students.shape
Out[202]:
(4898, 67)
In [203]:
unique_students.age.describe()
Out[203]:
count    4387.000000
mean       30.953271
std        28.380353
min         0.000000
25%         9.000000
50%        25.000000
75%        42.000000
max       298.000000
Name: age, dtype: float64
In [204]:
plot_demo_data(unique_students.male, ('Female', 'Male'), label_offset=20, ylim=(0, 2600), color=plot_color)
In [205]:
plot_demo_data(unique_students.prim_lang, 
               ('English', 'Spanish', 'Chinese', 'French', 'German', 'Tagalong', 'Other'), 
               rot=70, color=plot_color)
In [206]:
unique_students.prim_lang.count()
Out[206]:
4304
In [207]:
plot_demo_data(unique_students.sib, ('1', '2', '3', '4+'), 
               color=plot_color)
In [208]:
unique_students.sib.count()
Out[208]:
3937
In [209]:
amp_ages = ["Birth-3 months", "4 months - 6 months", "7 months - 9 months", "10 months- 12 months", 
"13 months - 18 months", "19 months - 24 months", "2 years 1 day - 3 years", 
"3 years 1 day - 4 years", "4 years 1 day - 5 years", "5 years 1 day - 6 years", "6 years"]

demographic.loc[demographic.age_amp==11, 'age_amp'] = None
print("There are {0} null values for age_amp".format(sum(demographic.age_amp.isnull())))
There are 4563 null values for age_amp
In [210]:
age_classes = pd.Series(pd.cut(unique_students.sort('age_amp').age_amp.dropna(), [-1,3,6,9,12,18,24,36,48,60,72,1000],
                               labels=amp_ages))
In [211]:
age_amp_counts = age_classes.value_counts()[amp_ages]
age_amp_counts.plot(kind='bar', grid=False, rot=90, color=plot_color)
plt.xlim(-0.5, len(age_amp_counts)-0.5)
plt.ylabel('Count')
for i,x in enumerate(age_amp_counts):
    plt.annotate('%i' % x, (i, x + 10))
In [212]:
age_amp_counts.sum()
Out[212]:
2767
In [213]:
unique_students.age_amp.max()
Out[213]:
666.0
In [214]:
(unique_students.age_amp/12.).hist(bins=16, grid=False, color=plot_color)
plt.ylabel('Count')
plt.xlabel('Age at amplification')
Out[214]:
<matplotlib.text.Text at 0x119135c50>
In [215]:
plot_demo_data(unique_students.tech_left, tech_cats, rot=90, color=plot_color)
In [216]:
plot_demo_data(unique_students.tech_right, tech_cats, rot=90, color=plot_color, ylim=(0, 2500))
In [217]:
f, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.tech_right, [""]*len(tech_cats), rot=90, 
               ax=axes[0], title='Right ear', color=plot_color, ylim=(0, 2500))
plot_demo_data(unique_students.tech_left, tech_cats, rot=90, 
               ax=axes[1], title='Left ear', color=plot_color)
In [218]:
unique_students.tech_right.count()
Out[218]:
3850
In [219]:
unique_students.tech_left.count()
Out[219]:
3836
In [220]:
degree_hl_cats = 'Normal (0-14)', 'Slight (15-25)', 'Mild (26-40)', \
'Moderate (41-55)', 'Moderately Severe (56-70)', 'Severe (71-90)', 'Profound (90+)'
plot_demo_data(unique_students.degree_hl_ad, degree_hl_cats, rot=90, color=plot_color)
In [221]:
_, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.degree_hl_ad, [""]*7, rot=90, 
               color=plot_color, ax=axes[0], title='Right ear')
plot_demo_data(unique_students.degree_hl_as, degree_hl_cats, rot=90, 
               color=plot_color, ylim=(0,2000), ax=axes[1], title='Left ear');
In [222]:
unique_students.degree_hl_as.count()
Out[222]:
3755
In [223]:
type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
plot_demo_data(unique_students.type_hl_ad, type_hl_cats, rot=90, color=plot_color)
In [224]:
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, color=plot_color)
In [225]:
unique_students.type_hl_ad.count()
Out[225]:
3681
In [226]:
unique_students.type_hl_as.count()
Out[226]:
3757
In [227]:
type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
f, axes = plt.subplots(2,1)
plot_demo_data(unique_students.type_hl_ad, [""]*len(type_hl_cats), rot=90, 
               title='Right ear', ax=axes[0], color=plot_color)
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, 
               title='Left ear', ax=axes[1], color=plot_color)
In [228]:
demographic[demographic.study_id=='1147-2010-0064']
Out[228]:
redcap_event_name academic_year hl male _race prim_lang sib _mother_ed father_ed premature_age ... bilateral_ha bimodal tech implant_category age_diag sex known_synd synd_or_disab race academic_year_start
11679 initial_assessment_arm_1 2010-2011 0 0 0 0 1 3 3 8 ... True False 0 6 51 Female 0 0 0 NaN
11680 year_1_complete_71_arm_1 2011-2012 0 0 0 0 1 3 3 8 ... True False 0 6 51 Female 0 0 0 NaN
11681 year_2_complete_71_arm_1 2012-2013 0 0 0 0 1 3 3 8 ... True False 0 6 51 Female 0 0 0 NaN
11682 year_3_complete_71_arm_1 2013-2014 0 0 0 0 1 3 3 8 ... True False 0 6 51 Female 0 0 0 NaN

4 rows × 67 columns

In [229]:
receptive[receptive.study_id=='1147-2010-0064']
Out[229]:
study_id redcap_event_name score test_type school age_test domain
11679 1147-2010-0064 initial_assessment_arm_1 96 PPVT 1147 63 Receptive Vocabulary
11680 1147-2010-0064 year_1_complete_71_arm_1 91 PPVT 1147 73 Receptive Vocabulary
11681 1147-2010-0064 year_2_complete_71_arm_1 93 PPVT 1147 85 Receptive Vocabulary
In [230]:
lsl_dr[lsl_dr.study_id=='1147-2010-0064']
Out[230]:
redcap_event_name academic_year hl male _race prim_lang sib _mother_ed father_ed premature_age ... known_synd synd_or_disab race academic_year_start age_test domain school score test_name test_type
23329 year_3_complete_71_arm_1 2013-2014 0 0 0 0 1 3 3 8 ... 0 0 0 2013 NaN NaN NaN NaN NaN NaN

1 rows × 73 columns

In [231]:
unique_students.type_hl_ad.count()
Out[231]:
3681
In [232]:
receptive[receptive.domain=="Receptive Vocabulary"].study_id.unique().shape
Out[232]:
(2619,)
In [233]:
demographic.study_id.unique().shape
Out[233]:
(4898,)
In [234]:
receptive.study_id.unique().shape
Out[234]:
(2619,)
In [235]:
lsl_dr[lsl_dr.domain=="Receptive Vocabulary"].study_id.unique().shape
Out[235]:
(1178,)
In [236]:
receptive_ids = receptive.study_id.unique()
In [237]:
demographic_ids = demographic.study_id.unique()
In [238]:
[s for s in receptive_ids if s not in demographic_ids]
Out[238]:
[]
In [239]:
def score_summary(domain, test_type=None):
    subset = lsl_dr[lsl_dr.domain==domain].copy()
    if test_type is not None:
        subset = subset[subset.test_type==test_type]
    subset['age_test'] = (subset.age_test/12).dropna().astype(int)
    subset.loc[subset.age_test > 11, 'age_test'] = 11
    subset = subset[subset.age_test>1]
    byage = subset.groupby('age_test')
    n = byage.study_id.count()
    mean = byage.score.mean()
    sd = byage.score.std()
    min = byage.score.min()
    max = byage.score.max()
    summary = pd.DataFrame({'Sample Size':n, 'Mean':mean, 
    'SD':sd, 'Min':min, 'Max':max})
    return summary[['Sample Size','Mean','SD','Min','Max']]
In [240]:
receptive_summary = score_summary("Receptive Vocabulary")
receptive_summary
Out[240]:
Sample Size Mean SD Min Max
age_test
2 67 97.641791 18.427336 44 144
3 247 96.113360 17.888865 47 139
4 290 92.762069 20.776166 0 140
5 201 89.457711 19.192172 0 130
6 119 89.058824 16.842387 51 137
7 70 84.885714 18.698634 43 124
8 51 82.078431 15.620299 46 114
9 40 83.550000 17.485452 53 120
10 46 76.847826 18.447484 20 109
11 138 81.920290 18.512202 29 132
In [241]:
receptive_summary.describe()
Out[241]:
Sample Size Mean SD Min Max
count 10.000000 10.000000 10.000000 10.000000 10.000000
mean 126.900000 87.431602 18.189100 33.300000 128.900000
std 90.298825 6.727914 1.381796 20.199285 11.789355
min 40.000000 76.847826 15.620299 0.000000 109.000000
25% 55.000000 82.446324 17.586305 22.250000 121.000000
50% 94.500000 86.972269 18.437410 43.500000 131.000000
75% 185.250000 91.935980 18.652026 46.750000 138.500000
max 290.000000 97.641791 20.776166 53.000000 144.000000
In [242]:
receptive_summary['Sample Size'].sum()
Out[242]:
1269
In [243]:
receptive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.xlim(-0.5, 9.5)
Out[243]:
(-0.5, 9.5)
In [244]:
expressive_summary = score_summary("Expressive Vocabulary")
expressive_summary
Out[244]:
Sample Size Mean SD Min Max
age_test
2 61 99.918033 18.484494 55 134
3 244 96.434426 21.047463 42 145
4 282 95.191489 21.178084 0 139
5 203 91.073892 20.455686 0 133
6 117 90.034188 19.400467 35 129
7 75 85.346667 15.480147 52 117
8 50 85.780000 14.044085 46 115
9 40 87.575000 14.101987 55 110
10 45 84.088889 15.545859 44 110
11 135 88.422222 15.986686 52 137
In [245]:
expressive_summary['Sample Size'].sum()
Out[245]:
1252
In [272]:
expressive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.xlim(-0.5, 9.5)
if current_year_only:
    plt.ylim(0, 400)
else:
    plt.ylim(0, 1400)
In [247]:
articulation_summary = score_summary("Articulation")
articulation_summary
Out[247]:
Sample Size Mean SD Min Max
age_test
2 42 88.547619 17.935117 50 122
3 217 85.244240 19.399319 40 125
4 281 84.217082 22.830568 0 121
5 178 83.174157 21.460462 40 116
6 121 80.685950 22.692155 40 110
7 67 79.716418 22.760150 3 108
8 44 77.500000 19.553742 40 107
9 27 85.185185 18.193484 40 108
10 31 82.354839 19.142533 40 105
11 65 83.969231 20.182735 39 105
In [248]:
articulation_summary['Sample Size'].sum()
Out[248]:
1073
In [249]:
sample_size = articulation_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(articulation_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);

Language scores

In [250]:
lsl_dr.domain.unique()
Out[250]:
array([nan, 'Language', 'Articulation', 'Receptive Vocabulary',
       'Expressive Vocabulary'], dtype=object)
In [251]:
lsl_dr.test_type.unique()
Out[251]:
array([nan, 'receptive', 'expressive', 'Goldman', 'ROWPVT', 'EOWPVT',
       'EVT', 'PPVT', 'Arizonia', 'PPVT and ROWPVT',
       'Arizonia and Goldman', 'EOWPVT and EVT'], dtype=object)
In [252]:
receptive_language_summary = score_summary("Language", "receptive")
receptive_language_summary
Out[252]:
Sample Size Mean SD Min Max
age_test
2 169 89.597633 22.966673 50 136
3 234 88.807692 18.895051 50 139
4 265 86.524528 19.317139 50 145
5 170 85.494118 20.003623 47 140
6 92 80.108696 20.074836 40 121
7 48 77.479167 18.109583 47 120
8 35 74.371429 19.395399 40 115
9 10 67.600000 18.337575 40 104
10 10 77.800000 16.771669 41 99
11 27 77.740741 18.289536 40 107
In [253]:
receptive_language_summary['Sample Size'].sum()
Out[253]:
1060
In [254]:
sample_size = receptive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_language_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
In [255]:
expressive_language_summary = score_summary("Language", "expressive")
expressive_language_summary
Out[255]:
Sample Size Mean SD Min Max
age_test
2 168 91.125000 18.812401 50 150
3 234 84.739316 16.472776 53 139
4 265 81.535849 18.828966 48 136
5 173 80.508671 19.248261 48 137
6 103 76.689320 21.347741 40 140
7 58 71.120690 20.664469 40 114
8 37 66.783784 19.894353 40 116
9 10 62.700000 22.410563 40 106
10 10 80.800000 20.595577 40 107
11 26 73.769231 20.465205 40 112
In [256]:
expressive_language_summary['Sample Size'].sum()
Out[256]:
1084
In [257]:
sample_size = expressive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_language_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
In [258]:
(unique_students.age/12.).hist(grid=False, bins=np.sqrt(unique_students.shape[0]))
plt.ylabel('Count')
plt.xlabel('Age at enrollment')
Out[258]:
<matplotlib.text.Text at 0x11bac7518>
In [259]:
(unique_students.age/12.).describe()
Out[259]:
count    4387.000000
mean        2.579439
std         2.365029
min         0.000000
25%         0.750000
50%         2.083333
75%         3.500000
max        24.833333
Name: age, dtype: float64
In [260]:
def calc_difference(x, col='a_fo', jitter=True):
    if (len(x)<2 or x[col].isnull().sum() or x['funct_out_age'].isnull().sum()):
        return None
    diff = x[col][x.funct_out_age.argmax()] - x[col][x.funct_out_age.argmin()]
    if jitter:
        diff += np.random.normal(scale=0.05)
    if (x.funct_out_age.max() - x.funct_out_age.min()) > 1000:
        print(x['funct_out_age'])
    return({'difference':diff, 'months': x.funct_out_age.max() - x.funct_out_age.min()})
In [261]:
audition = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference).dropna().values.tolist())
In [262]:
plt.figure(figsize=(10,6))
plt.scatter(audition.months, audition.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Audition')
Out[262]:
<matplotlib.text.Text at 0x11bb42128>
In [263]:
slc = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='slc_fo').dropna().values.tolist())
In [264]:
plt.figure(figsize=(10,6))
plt.scatter(slc.months, slc.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (comprehension)')
Out[264]:
<matplotlib.text.Text at 0x11e02e5f8>
In [265]:
sle = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='sle_fo').dropna().values.tolist())
In [266]:
plt.figure(figsize=(10,6))
plt.scatter(sle.months, sle.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (expression)')
Out[266]:
<matplotlib.text.Text at 0x11917fef0>
In [267]:
lsl_dr.degree_hl.dropna().value_counts()
Out[267]:
6    2949
3     814
4     812
5     708
2     378
0     176
1      53
dtype: int64
In [268]:
ax = lsl_dr.degree_hl.hist(bins=7)
In [269]:
diff = (lsl_dr['age'] - lsl_dr['age_int'])
diff[diff>0].hist(bins=50)
Out[269]:
<matplotlib.axes._subplots.AxesSubplot at 0x11bf02f28>
In [270]:
(lsl_dr.age_int<6).mean()
Out[270]:
0.22619400771284484
In [271]:
(lsl_dr.age<6).mean()
Out[271]:
0.14224265796499555