In [1]:
# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

Connect to database to import data for the three test domains and demographic information:

In [2]:
from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
api_key = open("/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt").read()

lsl_dr_project = Project(api_url, api_key)
In [3]:
metadata = lsl_dr_project.export_metadata()
In [4]:
# for i,j in zip(lsl_dr_project.field_names, 
#                lsl_dr_project.field_labels):
#     print('{0}: \t{1}'.format(i,j))

Import each database from REDCap:

In [5]:
articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss']
articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', df_kwargs={'index_col':None})
In [6]:
records = lsl_dr_project.export_records(fields=articulation_fields)
In [7]:
print(records[0]['study_id'])
0101-2002-0101
In [8]:
expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss']
expressive = lsl_dr_project.export_records(fields=expressive_fields, format='df', 
                                           df_kwargs={'index_col':None,
                                                      'na_values':[999, 9999]})
In [9]:
receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss']
receptive = lsl_dr_project.export_records(fields=receptive_fields, format='df', 
                                          df_kwargs={'index_col':None,
                                                     'na_values':[999, 9999]})
In [10]:
language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls',
                   'owls_lc_ss','owls_oe_ss','age_test_owls',
                   'celfp_rl_ss','celfp_el_ss','age_test_celp',
                   'celf_elss','celf_rlss','age_test_celf',
                   'celfp_ss_ss', 'celfp_ws_ss', 'celfp_ev_ss', 'celfp_fd_ss',
                   'celfp_rs_ss', 'celfp_bc_ss', 'celfp_wcr_ss', 'celfp_wce_ss',
                   'celfp_wct_ss']
language_raw = lsl_dr_project.export_records(fields=language_fields, format='df', 
                                             df_kwargs={'index_col':None, 
                                                        'na_values':[999, 9999]})
In [11]:
demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year',
'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race',
'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as',
'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae',
'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes',
'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age',
'att_days_hr', 'att_days_sch', 'att_days_st2_417']
demographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df', 
                                            df_kwargs={'index_col':None, 
                                                       'na_values':[888, 999, 9999]})
In [12]:
demographic_raw[demographic_raw.study_id=='1147-2010-0064']
Out[12]:
study_id redcap_event_name academic_year hl gender race prim_lang sib mother_ed father_ed ... sle_fo a_fo fam_age family_inv att_days_sch att_days_st2_417 att_days_hr demo_ses school_lunch medicaid
14329 1147-2010-0064 initial_assessment_arm_1 2010-2011 0.0 0.0 0.0 0.0 1.0 3.0 3.0 ... 3.0 6.0 65.0 0.0 NaN NaN NaN NaN NaN NaN
14330 1147-2010-0064 year_1_complete_71_arm_1 2011-2012 0.0 NaN NaN NaN NaN NaN NaN ... 3.0 5.0 77.0 2.0 NaN NaN NaN NaN NaN NaN
14331 1147-2010-0064 year_2_complete_71_arm_1 2012-2013 0.0 NaN NaN NaN NaN NaN NaN ... 3.0 5.0 89.0 2.0 NaN NaN NaN NaN NaN NaN
14332 1147-2010-0064 year_3_complete_71_arm_1 2013-2014 0.0 NaN NaN NaN NaN NaN NaN ... 4.0 5.0 101.0 2.0 NaN NaN NaN NaN NaN NaN

4 rows × 46 columns

Attendance information

Several fields in the demographic data have missing values.

In [13]:
demographic_raw.head()
Out[13]:
study_id redcap_event_name academic_year hl gender race prim_lang sib mother_ed father_ed ... sle_fo a_fo fam_age family_inv att_days_sch att_days_st2_417 att_days_hr demo_ses school_lunch medicaid
0 0101-2002-0101 initial_assessment_arm_1 2002-2003 0.0 0.0 0.0 0.0 1.0 6.0 6.0 ... 2.0 2.0 54.0 2.0 NaN NaN NaN NaN NaN NaN
1 0101-2002-0101 year_1_complete_71_arm_1 2003-2004 0.0 NaN NaN NaN NaN NaN NaN ... 4.0 4.0 80.0 1.0 NaN NaN NaN NaN NaN NaN
2 0101-2002-0101 year_2_complete_71_arm_1 2004-2005 0.0 NaN NaN NaN NaN NaN NaN ... 4.0 4.0 80.0 2.0 NaN NaN NaN NaN NaN NaN
3 0101-2002-0101 year_3_complete_71_arm_1 2005-2006 0.0 NaN NaN NaN NaN NaN NaN ... 5.0 5.0 96.0 3.0 NaN NaN NaN NaN NaN NaN
4 0101-2002-0101 year_4_complete_71_arm_1 2006-2007 0.0 NaN NaN NaN NaN NaN NaN ... 5.0 5.0 109.0 2.0 NaN NaN NaN NaN NaN NaN

5 rows × 46 columns

We can fill missing values forward from previous observation (by study_id)

In [14]:
demographic = demographic_raw.sort(columns='redcap_event_name').groupby('study_id').transform(
                                    lambda recs: recs.fillna(method='ffill'))#.reset_index()
demographic["study_id"] = demographic_raw.sort(columns='redcap_event_name').study_id
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  if __name__ == '__main__':
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:3: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  app.launch_new_instance()

Random check to make sure this worked

In [15]:
demographic[demographic.study_id=='1147-2010-0064']
Out[15]:
redcap_event_name academic_year hl gender race prim_lang sib mother_ed father_ed premature_age ... a_fo fam_age family_inv att_days_sch att_days_st2_417 att_days_hr demo_ses school_lunch medicaid study_id
14329 initial_assessment_arm_1 2010-2011 0.0 0.0 0.0 0.0 1.0 3.0 3.0 8.0 ... 6.0 65.0 0.0 NaN NaN NaN NaN NaN NaN 1147-2010-0064
14330 year_1_complete_71_arm_1 2011-2012 0.0 0.0 0.0 0.0 1.0 3.0 3.0 8.0 ... 5.0 77.0 2.0 NaN NaN NaN NaN NaN NaN 1147-2010-0064
14331 year_2_complete_71_arm_1 2012-2013 0.0 0.0 0.0 0.0 1.0 3.0 3.0 8.0 ... 5.0 89.0 2.0 NaN NaN NaN NaN NaN NaN 1147-2010-0064
14332 year_3_complete_71_arm_1 2013-2014 0.0 0.0 0.0 0.0 1.0 3.0 3.0 8.0 ... 5.0 101.0 2.0 NaN NaN NaN NaN NaN NaN 1147-2010-0064

4 rows × 46 columns

Demographic data without missing values:

In [16]:
demographic.head()
Out[16]:
redcap_event_name academic_year hl gender race prim_lang sib mother_ed father_ed premature_age ... a_fo fam_age family_inv att_days_sch att_days_st2_417 att_days_hr demo_ses school_lunch medicaid study_id
0 initial_assessment_arm_1 2002-2003 0.0 0.0 0.0 0.0 1.0 6.0 6.0 9.0 ... 2.0 54.0 2.0 NaN NaN NaN NaN NaN NaN 0101-2002-0101
7884 initial_assessment_arm_1 2008-2009 0.0 0.0 0.0 0.0 2.0 5.0 5.0 7.0 ... 5.0 53.0 0.0 NaN NaN NaN NaN NaN NaN 0628-2005-2156
7882 initial_assessment_arm_1 2009-2010 0.0 1.0 2.0 0.0 3.0 6.0 6.0 8.0 ... 3.0 48.0 2.0 NaN NaN NaN NaN NaN NaN 0628-2005-2081
7876 initial_assessment_arm_1 2009-2010 0.0 0.0 2.0 0.0 0.0 3.0 6.0 5.0 ... 4.0 86.0 2.0 NaN NaN NaN NaN NaN NaN 0628-2005-1986
7872 initial_assessment_arm_1 2009-2010 0.0 0.0 2.0 0.0 0.0 6.0 6.0 8.0 ... 5.0 94.0 0.0 NaN NaN NaN NaN NaN NaN 0628-2005-1978

5 rows × 46 columns

Cleaning languge dataset

5 language measures:

  • 3 versions of CELF
  • PLS
    • pls_ac_rs: PLS: Auditory Comprehension Raw Score
    • pls_ac_ss: PLS: Auditory Comprehension Standard Score
    • pls_ec_rs: PLS: Expressive Communication Raw Score
    • pls_ec_ss: PLS: Expressive Communication Standard Score
    • pls_tl_rs: PLS: Total Language Score Standard Score Total
    • pls_tl_ss: PLS: Total Language Score Standard Score
  • OWLS
    • age_test_owls: Age at time of testing (OWLS)
    • owls_lc_rs: OWLS: Listening Comprehension Raw Score
    • owls_lc_ss: OWLS: Listening Comprehension Standard Score
    • owls_oe_rs: OWLS: Oral Expression Raw Score
    • owls_oe_ss: OWLS: Oral Expression Standard Score
    • owls_oc_sss: OWLS: Oral Composite Sum of Listening Comprehension and Oral Expression Standard Scores
    • owls_oc_ss: OWLS: Oral Composite Standard Score
    • owls_wes_trs: OWLS: Written Expression Scale Total Raw Score
    • owls_wes_as: OWLS: Written Expression Scale Ability Score
    • owls_wes_ss: OWLS: Written Expression Scale Standard Score
    • owsl_lc: OWLS: Written Expression Scale Language Composite (Sum of written expression age-based standard score, listening comprehension standard score and oral expression standard score)
    • owls_lcss: OWLS: Language Composite Standard Score
In [17]:
# Test type
language_raw["test_name"] = None
language_raw["test_type"] = None
language_raw["score"] = None
CELP = language_raw.age_test_celp.notnull()
CELF = language_raw.age_test_celf.notnull()
PLS = language_raw.age_test_pls.notnull()
OWLS = language_raw.age_test_owls.notnull()

language_raw['age_test'] = None
language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp
language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf
language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls
language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls

language1 = language_raw[CELP | CELF | PLS | OWLS].copy()
language2 = language1.copy()

language1["test_type"] = "receptive"

language1.loc[CELP, "test_name"] = "CELF-P2"
language1.loc[CELF, "test_name"] = "CELF-4"
language1.loc[PLS, "test_name"] = "PLS"
language1.loc[OWLS, "test_name"] = "OWLS"

language1.loc[CELP, "score"] = language1.celfp_rl_ss
language1.loc[CELF, "score"] = language1.celf_rlss
language1.loc[PLS, "score"] = language1.pls_ac_ss
language1.loc[OWLS, "score"] = language1.owls_lc_ss


language2["test_type"] = "expressive"

language2.loc[CELP, "test_name"] = "CELF-P2"
language2.loc[CELF, "test_name"] = "CELF-4"
language2.loc[PLS, "test_name"] = "PLS"
language2.loc[OWLS, "test_name"] = "OWLS"

language2.loc[CELP, "score"] = language1.celfp_el_ss
language2.loc[CELF, "score"] = language1.celf_elss
language2.loc[PLS, "score"] = language1.pls_ec_ss
language2.loc[OWLS, "score"] = language1.owls_oe_ss

language = pd.concat([language1, language2])
language = language[language.score.notnull()]
print(pd.crosstab(language.test_name, language.test_type))
print("There are {0} null values for score".format(sum(language["score"].isnull())))
test_type  expressive  receptive
test_name                       
CELF-4            611        537
CELF-P2          1448       1453
OWLS             1075       1081
PLS              3447       3459
There are 0 null values for score

A school variable was added, which is the first four columns of the study_id:

In [18]:
language["school"] = language.study_id.str.slice(0,4)
In [19]:
language_subtest = language[["study_id", "redcap_event_name", "score", "test_type", 
                             "test_name", "school", "age_test", 
                             'celfp_ss_ss', 'celfp_ws_ss', 
                             'celfp_ev_ss', 'celfp_fd_ss',
                             'celfp_rs_ss', 'celfp_bc_ss', 
                             'celfp_wcr_ss', 'celfp_wce_ss',
                             'celfp_wct_ss']]
In [20]:
language = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test"]]
language["domain"] = "Language"
language.head()
Out[20]:
study_id redcap_event_name score test_type test_name school age_test domain
0 0101-2002-0101 initial_assessment_arm_1 51 receptive PLS 0101 54 Language
5 0101-2002-0101 year_5_complete_71_arm_1 61 receptive OWLS 0101 113 Language
9 0101-2003-0102 initial_assessment_arm_1 55 receptive PLS 0101 44 Language
10 0101-2003-0102 year_1_complete_71_arm_1 77 receptive PLS 0101 54 Language
11 0101-2003-0102 year_2_complete_71_arm_1 93 receptive CELF-P2 0101 68 Language

Cleaning articulation dataset

We converted the articulation dataset into a "long" format:

In [21]:
# Test type
articulation["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
articulation = articulation[ARIZ | GF]
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"

print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))

# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]
Goldman                 5286
Arizonia                 502
Arizonia and Goldman      73
Name: test_type, dtype: int64
There are 0 null values for test_type

A school variable was added, which is the first four columns of the study_id:

In [22]:
articulation["school"] = articulation.study_id.str.slice(0,4)

The age was taken to be the Arizonia age if there are both test types:

In [23]:
articulation["age_test"] = articulation.age_test_aaps
articulation.loc[articulation.age_test.isnull(), 'age_test'] = articulation.age_test_gf2[articulation.age_test.isnull()]
print(articulation.age_test.describe())
count    5859.000000
mean       68.853559
std        30.782839
min        23.000000
25%        47.000000
50%        60.000000
75%        81.000000
max       243.000000
Name: age_test, dtype: float64

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [24]:
articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1)
articulation["domain"] = "Articulation"
articulation.head()
Out[24]:
study_id redcap_event_name test_type score school age_test domain
1 0101-2002-0101 year_1_complete_71_arm_1 Goldman 78.0 0101 80.0 Articulation
9 0101-2003-0102 initial_assessment_arm_1 Goldman 72.0 0101 44.0 Articulation
10 0101-2003-0102 year_1_complete_71_arm_1 Goldman 97.0 0101 54.0 Articulation
14 0101-2004-0101 year_2_complete_71_arm_1 Goldman 75.0 0101 53.0 Articulation
15 0101-2004-0101 year_3_complete_71_arm_1 Goldman 80.0 0101 66.0 Articulation

Cleaning demographic dataset

We excluded unwanted columns and rows for which age, gender or race were missing:

In [25]:
# Retain only subset of columns
#demographic = demographic[demographic.gender.notnull()]
demographic = demographic.rename(columns={'gender':'male'})

Due to sample size considerations, we reduced the non-English primary language variable to English (0) and non-English (1):

In [26]:
demographic["non_english"] = None
demographic.loc[demographic.prim_lang.notnull(), 'non_english'] = demographic.prim_lang[demographic.prim_lang.notnull()]>0
print(demographic.non_english.value_counts())
print("There are {0} null values for non_english".format(sum(demographic.non_english.isnull())))
False    11660
True      2590
Name: non_english, dtype: int64
There are 710 null values for non_english

Mother's education (mother_ed) and father's education (father_ed) were both recoded to:

  • 0=no high school diploma
  • 1=high school
  • 2=undergraduate
  • 3=graduate

Category 6 (unknown) was recoded as missing.

In [27]:
demographic = demographic.rename(columns={"mother_ed":"_mother_ed"})
demographic["mother_ed"] = demographic._mother_ed.copy()
demographic.loc[demographic._mother_ed==1, 'mother_ed'] = 0
demographic.loc[(demographic._mother_ed==2) | (demographic.mother_ed==3), 'mother_ed'] = 1
demographic.loc[demographic._mother_ed==4, 'mother_ed'] = 2
demographic.loc[demographic._mother_ed==5, 'mother_ed'] = 3
demographic.loc[demographic._mother_ed==6, 'mother_ed'] = None
print("_mother_ed:")
print(demographic._mother_ed.value_counts())
print("mother_ed:")
print(demographic.mother_ed.value_counts())
print("\nThere are {0} null values for mother_ed".format(sum(demographic.mother_ed.isnull())))
_mother_ed:
6.0    5198
4.0    3039
3.0    2053
5.0    1638
2.0    1436
1.0     491
0.0     215
Name: _mother_ed, dtype: int64
mother_ed:
1.0    3489
2.0    3039
3.0    1638
0.0     706
Name: mother_ed, dtype: int64

There are 6088 null values for mother_ed

Secondary diagnosis

In [28]:
demographic.shape
Out[28]:
(14960, 48)
In [29]:
demographic['secondary_diagnosis'] = demographic.etiology==0
# Suspected or unknown treated as missing
demographic.loc[demographic.etiology > 1, 'secondary_diagnosis'] = None
In [30]:
demographic.secondary_diagnosis.value_counts()
Out[30]:
0.0    10979
1.0     2485
Name: secondary_diagnosis, dtype: int64
In [31]:
demographic.secondary_diagnosis.mean()
Out[31]:
0.18456625074272134

Premature status was recoded to True (premature) and False (full-term). Here, premature indicates <36 weeks.

In [32]:
demographic['premature_weeks'] = demographic.premature_age.copy()
demographic.loc[demographic.premature_age==9, 'premature_weeks'] = None
demographic.premature_weeks = abs(demographic.premature_weeks-8)*2
print("There are {0} null values for premature_weeks".format(sum(demographic.premature_weeks.isnull())))
There are 3492 null values for premature_weeks
In [33]:
demographic.premature_weeks.value_counts()
Out[33]:
0.0     9803
2.0      585
4.0      373
12.0     205
6.0      183
10.0     154
8.0      120
14.0      42
16.0       3
Name: premature_weeks, dtype: int64

Recode impant technology variables for each ear to one of four categories (None, Baha, Hearing aid, Cochlear implant):

In [34]:
demographic.tech_ad.value_counts()
Out[34]:
1.0     5090
0.0     4379
7.0     1554
5.0     1022
2.0      519
6.0      426
8.0       76
9.0       70
4.0       29
3.0       28
10.0       3
Name: tech_ad, dtype: int64
In [35]:
tech_cats = ["None", "OAD", "Hearing aid", "Cochlear", "Other"]

demographic["tech_right"] = 4
demographic.loc[demographic.tech_ad==7, 'tech_right'] = 0
demographic.loc[demographic.tech_ad==3, 'tech_right'] = 1
demographic.loc[demographic.tech_ad.isin([1,2,4,5,10]), 'tech_right'] = 2
demographic.loc[demographic.tech_ad.isin([0,8,6]), 'tech_right'] = 3
demographic.loc[demographic.tech_ad.isnull(), 'tech_right'] = None

demographic["tech_left"] = 4
demographic.loc[demographic.tech_as==7, 'tech_left'] = 0
demographic.loc[demographic.tech_as==3, 'tech_left'] = 1
demographic.loc[demographic.tech_as.isin([1,2,4,5,10]), 'tech_left'] = 2
demographic.loc[demographic.tech_as.isin([0,8,6]), 'tech_left'] = 3
demographic.loc[demographic.tech_as.isnull(), 'tech_left'] = None
In [36]:
demographic.tech_left.value_counts()
Out[36]:
2.0    6754
3.0    4455
0.0    1877
4.0      60
1.0      20
Name: tech_left, dtype: int64
In [37]:
demographic.tech_right.value_counts()
Out[37]:
2.0    6663
3.0    4881
0.0    1554
4.0      70
1.0      28
Name: tech_right, dtype: int64

Substitute valid missing values for hearing loss:

In [38]:
demographic.loc[demographic.type_hl_ad==5, 'type_hl_ad'] = None
demographic.loc[demographic.type_hl_as==5, 'type_hl_ad'] = None

Create degree_hl, which is the maximum level of hearing loss in either ear:

In [39]:
demographic["degree_hl"] = np.maximum(demographic.degree_hl_ad, demographic.degree_hl_as)

Create compound indicator variable for each technology (Baha, Hearing aid, Chochlear implant):

  • 0=none
  • 1=one ear
  • 2=both ears.
In [40]:
demographic.columns
Out[40]:
Index(['redcap_event_name', 'academic_year', 'hl', 'male', 'race', 'prim_lang',
       'sib', '_mother_ed', 'father_ed', 'premature_age', 'onset_1', 'age_amp',
       'age_int', 'age', 'synd_cause', 'etiology', 'etiology_2',
       'hearing_changes', 'ae', 'ad_250', 'ad_500', 'degree_hl_ad',
       'type_hl_ad', 'tech_ad', 'age_ci', 'as_250', 'as_500', 'degree_hl_as',
       'type_hl_as', 'tech_as', 'age_ci_2', 'time', 'age_disenrolled',
       'funct_out_age', 'slc_fo', 'sle_fo', 'a_fo', 'fam_age', 'family_inv',
       'att_days_sch', 'att_days_st2_417', 'att_days_hr', 'demo_ses',
       'school_lunch', 'medicaid', 'study_id', 'non_english', 'mother_ed',
       'secondary_diagnosis', 'premature_weeks', 'tech_right', 'tech_left',
       'degree_hl'],
      dtype='object')
In [41]:
demographic["oad"] = 0
demographic.oad = demographic.oad.astype(object)
demographic.loc[(demographic.tech_right==1) | (demographic.tech_left==1), 'oad'] = 1
demographic.loc[(demographic.tech_right==1) & (demographic.tech_left==1), 'oad'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'oad'] = None
print("oad:")
print(demographic.drop_duplicates(subset='study_id').oad.value_counts())
print("There are {0} null values for OAD".format(sum(demographic.oad.isnull())))

demographic["hearing_aid"] = 0
demographic.hearing_aid = demographic.hearing_aid.astype(object)
demographic.loc[(demographic.tech_right==2) | (demographic.tech_left==2), 'hearing_aid'] = 1
demographic.loc[(demographic.tech_right==2) & (demographic.tech_left==2), 'hearing_aid'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_right.isnull()), 'hearing_aid'] = None
print("\nhearing_aid:")
print(demographic.drop_duplicates(subset='study_id').hearing_aid.value_counts())
print("There are {0} null values for hearing_aid".format(sum(demographic.hearing_aid.isnull())))

demographic["cochlear"] = 0
demographic.cochlear = demographic.cochlear.astype(object)
demographic.loc[(demographic.tech_right==3) | (demographic.tech_left==3), 'cochlear'] = 1
demographic.loc[(demographic.tech_right==3) & (demographic.tech_left==3), 'cochlear'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'cochlear'] = None
print("\ncochlear:")
print(demographic.drop_duplicates(subset='study_id').cochlear.value_counts())
print("There are {0} null values for cochlear".format(sum(demographic.cochlear.isnull())))
print(len(demographic))
oad:
0    4676
1       4
2       2
Name: oad, dtype: int64
There are 1711 null values for OAD

hearing_aid:
2    2190
0    1648
1     813
Name: hearing_aid, dtype: int64
There are 1764 null values for hearing_aid

cochlear:
0    3120
2     924
1     638
Name: cochlear, dtype: int64
There are 1711 null values for cochlear
14960

Identify bilateral and bimodal individuals:

In [42]:
demographic["unilateral_ci"] = demographic.cochlear==1
demographic["bilateral_ci"] = demographic.cochlear==2
demographic["bilateral_ha"] = demographic.hearing_aid==2
demographic["bimodal"] = (demographic.cochlear==1) & (demographic.hearing_aid==1)
In [43]:
demographic.bilateral_ci.sum(), demographic.bilateral_ha.sum(), demographic.bimodal.sum(), demographic.unilateral_ci.sum()
Out[43]:
(3603, 5485, 1423, 2130)
In [44]:
demographic.drop_duplicates(subset='study_id')[['unilateral_ci','bilateral_ci', 
                                               'bilateral_ha',
                                               'bimodal']].sum()
Out[44]:
unilateral_ci     638
bilateral_ci      924
bilateral_ha     2190
bimodal           385
dtype: int64

Create variable that identifies bilateral (0), bilateral HA left (1), bilateral HA right (2)

In [45]:
demographic['tech'] = 0
demographic.loc[(demographic.bimodal) & (demographic.tech_left==2), 'tech'] = 1
demographic.loc[(demographic.bimodal) & (demographic.tech_right==2), 'tech'] = 2
print("There are {0} null values for tech".format(sum(demographic.tech.isnull())))
There are 0 null values for tech
In [46]:
demographic["implant_category"] = None
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==0), 
                'implant_category'] = 0
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==0), 
                'implant_category'] = 1
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==1), 
                'implant_category'] = 2
demographic.loc[(demographic.cochlear==2) & (demographic.hearing_aid==0) & (demographic.oad==0), 
                'implant_category'] = 3
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==1) & (demographic.oad==0), 
                'implant_category'] = 4
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==1), 
                'implant_category'] = 5
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==2) & (demographic.oad==0), 
                'implant_category'] = 6
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==1), 
                'implant_category'] = 7
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==2), 
                'implant_category'] = 8
demographic.implant_category.value_counts()
Out[46]:
6    5485
3    3603
4    1423
1     999
0     687
8      15
2      12
7       5
5       1
Name: implant_category, dtype: int64

Age when hearing loss diagnosed Data are entered inconsistently here, so we have to go in and replace non-numeric values.

In [47]:
demographic.onset_1.unique()
Out[47]:
array([  15. ,    4. ,    0. ,   26. ,   36. ,   24. ,   80. ,   14. ,
         62. ,    2. ,   49. ,   19. ,   23. ,   18. ,    9. ,    nan,
         10. ,   12. ,    1. ,    5. ,   30. ,    7. ,   51. ,    8. ,
          3. ,   17. ,   50. ,   31. ,   34. ,   28. ,   35. ,   38. ,
         95. ,   42. ,   13. ,   16. ,   61. ,   46. ,   22. ,   53. ,
         59. ,   88. ,    6. ,   37. ,   96. ,   52. ,   64. ,   65. ,
         48. ,   97. ,   25. ,   47. ,   79. ,  107. ,   74. ,   77. ,
         84. ,   60. ,   41. ,   33. ,   39. ,   27. ,   11. ,   20. ,
         21. ,   45. ,   29. ,   32. ,   81. ,    1.5,   55. ,   70. ,
         58. ,  154. ,   54. ,   78. ,   43. ,   57. ,   83. ,   44. ,
         72. ,  116. ,   40. ,  119. ,   63. ,   66. ,   56. ,   87. ,
         76. ,   68. ,   92. ,  140. ,   86. ,  126. ,   85. ,  133. ,
        103. ,   67. ,   71. ,    2.5,   98. ,   75. ,    0.5,  152. ,
         89. ])
In [48]:
# Don't need this anymore
# demographic['age_diag'] = demographic.onset_1.replace({'birth': 0, 'R- Birth L-16mo': 0, 'birth - 3': 0, 'at birth': 0, 'NBHS': 0, 
#                              'at Birth': 0, '1-2': 1.5, '2-3': 2.5, '0-3': 1.5}).astype(float)
demographic['age_diag'] = demographic.onset_1

Number of null values for age_diag

In [49]:
demographic.age_diag.isnull().sum()
Out[49]:
3993
In [50]:
demographic['sex'] = demographic.male.replace({0:'Female', 1:'Male'})
In [51]:
import seaborn as sb

unique_students = demographic.dropna(subset=['sex']).groupby('study_id').first()

# ag = sb.factorplot("sex", data=unique_students, 
#               palette="PuBuGn_d", kind='count')
# ag.set_xticklabels(['Female ({})'.format((unique_students.male==0).sum()), 
#                     'Male ({})'.format((unique_students.male==1).sum())])
# ag.set_xlabels('')

Child has another diagnosed disability

In [52]:
demographic['known_synd'] = (demographic.synd_cause == 0)
# Unknown or suspected
demographic.loc[demographic.synd_cause > 1, 'known_synd'] = None
In [53]:
# If either known syndrome or secondary diagnosis
demographic['synd_or_disab'] = demographic.apply(lambda x: x['secondary_diagnosis'] or x['known_synd'], axis=1)

Missing sibling counts were properly encoded as None (missing).

In [54]:
demographic.loc[demographic.sib==4, 'sib'] = None

We reduced the number of race categories, pooling those that were neither caucasian, black, hispanic or asian to "other", due to small sample sizes for these categories. Category 7 (unknown) was recoded as missing.

In [55]:
races = ["Caucasian", "Black or African American", "Hispanic or Latino", "Asian", "Other"]
demographic = demographic.rename(columns={"race":"_race"})
demographic["race"] = demographic._race.copy()
demographic.loc[demographic.race==7, 'race'] = None
demographic.loc[demographic.race>3, 'race'] = 4
print("_race:")
print(demographic._race.value_counts())
print("race:")
print(demographic.race.value_counts())
print("There are {0} null values for race".format(sum(demographic.race.isnull())))
# Replace with recoded column
_race:
0.0    7801
2.0    2554
1.0    1367
3.0    1044
6.0     725
8.0     531
7.0     239
4.0      65
5.0      33
Name: _race, dtype: int64
race:
0.0    7801
2.0    2554
1.0    1367
4.0    1354
3.0    1044
Name: race, dtype: int64
There are 840 null values for race

Recode implant technology variables

In [56]:
tech_cats = ["None", "Baha", "Hearing aid", "Cochlear", "Other"]

demographic["tech_right"] = demographic.tech_ad.copy()
demographic.loc[demographic.tech_right==6, 'tech_right'] = 0
demographic.loc[demographic.tech_right==4, 'tech_right'] = 1
demographic.loc[demographic.tech_right==5, 'tech_right'] = 1
demographic.loc[demographic.tech_right==3, 'tech_right'] = 2
demographic.loc[demographic.tech_right==7, 'tech_right'] = 3
demographic.loc[demographic.tech_right==8, 'tech_right'] = 3
demographic.loc[demographic.tech_right==9, 'tech_right'] = 4
demographic.tech_right = np.abs(demographic.tech_right - 3)

demographic["tech_left"] = demographic.tech_as.copy()
demographic.loc[demographic.tech_left==6, 'tech_left'] = 0
demographic.loc[demographic.tech_left==4, 'tech_left'] = 1
demographic.loc[demographic.tech_left==5, 'tech_left'] = 1
demographic.loc[demographic.tech_left==3, 'tech_left'] = 2
demographic.loc[demographic.tech_left==7, 'tech_left'] = 3
demographic.loc[demographic.tech_left==8, 'tech_left'] = 3
demographic.loc[demographic.tech_left==9, 'tech_left'] = 4
demographic.tech_left = np.abs(demographic.tech_left - 3)
In [57]:
# Don't need this anymore
# demographic['age_amp'] = demographic.age_amp.replace({'22 mo': 22, 'unknown': np.nan, 'none': np.nan, 
#                              'n/a unilateral loss': np.nan, 'not amplified yet': np.nan,
#                              'not amplified': np.nan, 'n/a': np.nan, '4 months-6 months': 5,
#                              '6 weeks': 0.5, '13-18': 15.5, '0-3': 1.5, '24-36': 30}).astype(float)
In [58]:
demographic.academic_year.replace(
                         {'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011', 
                          '2020-2011': '2010-2011', '2012-20013': '2012-2013', 
                          '642014-2015': '2014-2015', '20114-2015': '2014-2015',
                          '2011-012': '2011-2012',
                                   '0000-0000': np.nan}).str.replace('*', '-').unique()
Out[58]:
array(['2002-2003', '2008-2009', '2009-2010', nan, '2009-2011',
       '2006-2007', '2007-2008', '2011-2012', '2015-2016', '2014-2015',
       '2013-2014', '2012-2013', '2010-2011', '2005-2006', '2014', '2012-',
       '2006-2007 ', '2003-2004', '2015-206', '2004-2005',
       '              2010-2011                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   2010-2011',
       '2012', '2011', '2010', '2009', '2013', '1995-1996', '1998-1999',
       '2001-2002', '1999-2000', '2000-2001', '1997-1998', '2014-15',
       '2015', '2015-2015', '2014-2015 ', '2041-2015', '2015-2106',
       '22014-2015', '2014-1015', '2012-2013 '], dtype=object)
In [59]:
demographic['academic_year'] = demographic.academic_year.replace(
                         {'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011', 
                          '2020-2011': '2010-2011', '2012-20013': '2012-2013', 
                          '642014-2015': '2014-2015', '20114-2015': '2014-2015',
                          '2011-012': '2011-2012', '2014-2105': '2014-2015', '2005-2004': '2004-2005',
                          '2014-205': '2014-2015', '2017-2015': '2014-2015', '2014-1015': '2014-2015',
                          '2015-2015': '2014-2015', '2009-2011': '2009-2010',
                                   '0000-0000': np.nan}).str.replace('*', '-')

Removed entries that don't contain dashes

In [60]:
demographic.loc[~(demographic.academic_year.notnull() & demographic.academic_year.str.contains('-')), 
                'academic_year'] = np.nan
In [61]:
demographic.loc[demographic.academic_year.notnull(), 'academic_year'] = demographic.academic_year[demographic.academic_year.notnull()].apply(lambda x: ''.join(x.split()))
In [62]:
demographic.age_amp.hist()
Out[62]:
<matplotlib.axes._subplots.AxesSubplot at 0x10fd8e2b0>

Cleaning expressive vocabulary dataset

We converted the expressive vocabulary dataset to "long" format:

In [63]:
# Test type
expressive["test_type"] = None
EOWPVT = expressive.eowpvt_ss.notnull()
EVT = expressive.evt_ss.notnull()
expressive = expressive[EOWPVT | EVT]
expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT"
expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT"
expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT"
print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull())))

expressive["score"] = expressive.eowpvt_ss
expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT]
There are 0 null values for test_type
In [64]:
expressive.test_type.value_counts()
Out[64]:
EVT               3812
EOWPVT            2707
EOWPVT and EVT     148
Name: test_type, dtype: int64

A school variable was added, which is the first four columns of the study_id:

In [65]:
expressive["school"] = expressive.study_id.str.slice(0,4)

The age was taken to be the EOWPVT age if there are both test types:

In [66]:
expressive["age_test"] = expressive.age_test_eowpvt
expressive.loc[expressive.age_test.isnull(), 'age_test'] = expressive.age_test_evt[expressive.age_test.isnull()]

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [67]:
expressive = expressive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
expressive["domain"] = "Expressive Vocabulary"
expressive.head()
Out[67]:
study_id redcap_event_name score test_type school age_test domain
0 0101-2002-0101 initial_assessment_arm_1 58.0 EOWPVT 0101 54.0 Expressive Vocabulary
2 0101-2002-0101 year_2_complete_71_arm_1 84.0 EOWPVT 0101 80.0 Expressive Vocabulary
5 0101-2002-0101 year_5_complete_71_arm_1 90.0 EOWPVT 0101 113.0 Expressive Vocabulary
14 0101-2004-0101 year_2_complete_71_arm_1 90.0 EOWPVT 0101 53.0 Expressive Vocabulary
15 0101-2004-0101 year_3_complete_71_arm_1 87.0 EOWPVT 0101 66.0 Expressive Vocabulary

Cleaning receptive vocabulary dataset

We converted the receptive vocabulary data table to "long" format:

In [68]:
receptive.columns
Out[68]:
Index(['study_id', 'redcap_event_name', 'age_test_ppvt', 'ppvt_ss',
       'age_test_rowpvt', 'rowpvt_ss'],
      dtype='object')
In [69]:
# Test type
receptive["test_type"] = None
PPVT = receptive.ppvt_ss.notnull()
ROWPVT = receptive.rowpvt_ss.notnull()
receptive = receptive[PPVT | ROWPVT]
receptive.loc[PPVT & ROWPVT, "test_type"] = "PPVT and ROWPVT"
receptive.loc[PPVT & ~ROWPVT, "test_type"] = "PPVT"
receptive.loc[~PPVT & ROWPVT, "test_type"] = "ROWPVT"
print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull())))

receptive["score"] = receptive.ppvt_ss
receptive.loc[~PPVT & ROWPVT, "score"] = receptive.rowpvt_ss[~PPVT & ROWPVT]
There are 0 null values for test_type

A school variable was added, which is the first four columns of the study_id:

In [70]:
receptive["school"] = receptive.study_id.str.slice(0,4)

The age was taken to be the PPVT age if there are both test types:

In [71]:
receptive["age_test"] = receptive.age_test_ppvt
receptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()]
In [72]:
print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull())))
There are 27 null values for age_test

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [73]:
receptive = receptive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
receptive["domain"] = "Receptive Vocabulary"
receptive.head()
Out[73]:
study_id redcap_event_name score test_type school age_test domain
2 0101-2002-0101 year_2_complete_71_arm_1 90.0 PPVT 0101 80.0 Receptive Vocabulary
5 0101-2002-0101 year_5_complete_71_arm_1 101.0 ROWPVT 0101 113.0 Receptive Vocabulary
9 0101-2003-0102 initial_assessment_arm_1 55.0 PPVT 0101 44.0 Receptive Vocabulary
10 0101-2003-0102 year_1_complete_71_arm_1 80.0 PPVT 0101 54.0 Receptive Vocabulary
11 0101-2003-0102 year_2_complete_71_arm_1 101.0 PPVT 0101 68.0 Receptive Vocabulary
In [74]:
receptive.study_id.unique().shape
Out[74]:
(3076,)

Merge datasets

The four datasets were mereged into a single table. First, we concatenate the test scores data:

In [75]:
test_scores = pd.concat([articulation, expressive, receptive, language])

Then we perform a merge between the demographic data and the test scores data:

In [76]:
lsl_dr = pd.merge(demographic, test_scores, on=["study_id", "redcap_event_name"], how='left')
In [77]:
lsl_dr.tail()
Out[77]:
redcap_event_name academic_year hl male _race prim_lang sib _mother_ed father_ed premature_age ... sex known_synd synd_or_disab race age_test domain school score test_name test_type
38196 year_9_complete_71_arm_1 2010-2011 0.0 1.0 3.0 2.0 1.0 4.0 4.0 9.0 ... Male 1.0 1.0 3.0 NaN NaN NaN NaN NaN NaN
38197 year_9_complete_71_arm_1 2008-2009 0.0 1.0 0.0 0.0 1.0 3.0 2.0 8.0 ... Male 0.0 0.0 0.0 NaN NaN NaN NaN NaN NaN
38198 year_9_complete_71_arm_1 2013-2014 0.0 1.0 2.0 0.0 NaN 6.0 6.0 9.0 ... Male 0.0 0.0 2.0 138 Expressive Vocabulary 0310 89 NaN EOWPVT
38199 year_9_complete_71_arm_1 2013-2014 0.0 1.0 2.0 0.0 NaN 6.0 6.0 9.0 ... Male 0.0 0.0 2.0 138 Receptive Vocabulary 0310 82 NaN PPVT
38200 year_9_complete_71_arm_1 2011-2012 0.0 1.0 0.0 0.0 0.0 3.0 6.0 9.0 ... Male 0.0 1.0 0.0 NaN NaN NaN NaN NaN NaN

5 rows × 73 columns

In [78]:
lsl_dr['academic_year_start'] = lsl_dr.academic_year.apply(lambda x: str(x).strip()[:4])
lsl_dr.academic_year_start.value_counts()
Out[78]:
2013    6940
2012    6650
2014    5821
2011    5245
2010    4445
nan     3077
2009    2455
2015    1167
2008     835
2007     533
2006     344
2005     286
2004     172
2003      90
2002      47
2001      37
1999      16
1998      16
2000      12
1997       6
2201       5
1995       1
2041       1
Name: academic_year_start, dtype: int64
In [79]:
current_year_only = False

if current_year_only:
    lsl_dr = lsl_dr[lsl_dr.academic_year_start=='2013']
In [80]:
expressive_scores = lsl_dr[(lsl_dr.domain=='Expressive Vocabulary') & (lsl_dr.score>=20)].score
expressive_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');
In [81]:
expressive_lang_scores = lsl_dr[(lsl_dr.domain=='Language') 
                               & (lsl_dr.test_type=='expressive')].score
expressive_lang_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');

Export dataset

In [184]:
if current_year_only:

    lsl_dr.to_csv('lsl_dr_current_year.csv')

else:
    lsl_dr.to_csv('lsl_dr.csv')
In [83]:
lsl_dr.shape
Out[83]:
(38201, 74)
In [84]:
lsl_dr.study_id.unique().shape
Out[84]:
(5807,)
In [85]:
demographic.study_id.unique().shape
Out[85]:
(5807,)

Convert score to floating-point number

In [86]:
lsl_dr.score = lsl_dr.score.astype(float)
In [87]:
lsl_dr['tech_class'] = 'Bimodal'
lsl_dr.loc[lsl_dr.bilateral_ci==True, 'tech_class'] = 'Bilateral CI'
lsl_dr.loc[lsl_dr.bilateral_ha==True, 'tech_class'] = 'Bilateral HA'
In [88]:
lsl_dr['age_year'] = np.floor(lsl_dr.age/12.)
In [89]:
lsl_dr.domain.dropna().unique()
Out[89]:
array(['Expressive Vocabulary', 'Language', 'Articulation',
       'Receptive Vocabulary'], dtype=object)
In [90]:
lsl_dr.groupby('tech_class').prim_lang.mean().round(2)
Out[90]:
tech_class
Bilateral CI    0.43
Bilateral HA    0.58
Bimodal         0.50
Name: prim_lang, dtype: float64
In [91]:
lsl_dr['non_profound'] = lsl_dr.degree_hl<6
In [92]:
lsl_dr.groupby('tech_class').non_profound.mean().round(2)
Out[92]:
tech_class
Bilateral CI    0.08
Bilateral HA    0.87
Bimodal         0.31
Name: non_profound, dtype: float64
In [93]:
f, axes = plt.subplots(2, 2, figsize=(14,10))
for ax, dom in zip(np.ravel(axes), lsl_dr.domain.dropna().unique()):
    plot_data = lsl_dr[lsl_dr.domain==dom].pivot_table(index='age_year', columns='tech_class', values='score', aggfunc='mean')
    plot_data[(plot_data.index>1) & (plot_data.index<7)].plot(ax=ax)
    ax.set_ylim(40, 120)
    ax.set_xticks(range(2,7))
    ax.set_title(dom)

PPVT

In [94]:
ppvt_only = lsl_dr[lsl_dr.test_type=='PPVT']
ppvt_only.age_year.hist()
Out[94]:
<matplotlib.axes._subplots.AxesSubplot at 0x10d99beb8>
In [95]:
ppvt_345 = ppvt_only[ppvt_only.age_year.isin([3,4,5])]
In [96]:
ppvt_345.score.describe()
Out[96]:
count    1978.000000
mean       89.923660
std        21.045408
min        20.000000
25%        77.000000
50%        90.000000
75%       105.000000
max       154.000000
Name: score, dtype: float64
In [97]:
ppvt_345.groupby('age_year').agg({'score':[min, max, np.median, np.count_nonzero]})
Out[97]:
score
min max median count_nonzero
age_year
3.0 20.0 150.0 95.0 1196.0
4.0 20.0 154.0 87.0 481.0
5.0 20.0 130.0 81.0 301.0

EVT

In [98]:
lsl_dr.test_type.value_counts()
Out[98]:
expressive              6581
receptive               6530
Goldman                 5286
PPVT                    4366
EVT                     3812
EOWPVT                  2707
ROWPVT                  2272
Arizonia                 502
PPVT and ROWPVT          197
EOWPVT and EVT           148
Arizonia and Goldman      73
Name: test_type, dtype: int64
In [99]:
evt_only = lsl_dr[lsl_dr.test_type=='EVT']
evt_only.age_year.hist()
Out[99]:
<matplotlib.axes._subplots.AxesSubplot at 0x10c6e1550>
In [100]:
evt_345 = evt_only[evt_only.age_year.isin([3,4,5])]
In [101]:
evt_345.groupby('age_year').agg({'score':[min, max, np.median, np.count_nonzero]})
Out[101]:
score
min max median count_nonzero
age_year
3.0 20.0 146.0 99.0 1095.0
4.0 20.0 146.0 90.0 415.0
5.0 20.0 130.0 85.0 273.0

PLS

In [102]:
pls_only = (language[(language.test_name=='PLS')]
           .convert_objects(convert_numeric=True))
pls_only['age_year'] = np.floor(pls_only.age_test/12).astype(int)
pls_345 = pls_only[pls_only.age_year.isin([3,4,5])]
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  from ipykernel import kernelapp as app
In [103]:
(pls_345.assign(normal_limits=pls_345.score>=85).groupby(['age_year', 'test_type'])
             .agg({'score':[min, max, np.median, len], 
                    'normal_limits': np.mean}))
Out[103]:
score normal_limits
min max median len mean
age_year test_type
3 expressive 50.0 145.0 78.0 795.0 0.355975
receptive 50.0 140.0 80.0 795.0 0.406289
4 expressive 50.0 141.0 73.0 587.0 0.287905
receptive 50.0 136.0 77.0 591.0 0.382403
5 expressive 50.0 138.0 68.0 298.0 0.265101
receptive 50.0 129.0 73.0 300.0 0.293333

CELF

In [104]:
celf_only = (language_subtest[(language_subtest.test_name=='CELF-P2')]
           .convert_objects(convert_numeric=True))
celf_only['age_year'] = np.floor(celf_only.age_test/12).astype(int)
celf_46 = celf_only[celf_only.age_year.isin([4,6])]
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  from ipykernel import kernelapp as app
In [105]:
subtests = ['celfp_ss_ss', 'celfp_ws_ss', 
                             'celfp_ev_ss', 'celfp_fd_ss',
                             'celfp_rs_ss', 'celfp_bc_ss', 
                             'celfp_wcr_ss', 'celfp_wce_ss',
                             'celfp_wct_ss']
In [106]:
(celf_46.groupby('age_year')
             .agg({st:np.median for st in subtests})).T
Out[106]:
age_year 4 6
celfp_bc_ss 9.0 4.5
celfp_wce_ss 9.0 7.0
celfp_ev_ss 8.0 5.0
celfp_wcr_ss 10.0 10.0
celfp_ss_ss 8.0 5.0
celfp_ws_ss 6.0 4.0
celfp_wct_ss 9.0 8.0
celfp_rs_ss 7.0 4.0
celfp_fd_ss 8.0 4.0

Plots of Demographic Data

In [107]:
plot_color = "#64AAE8"
In [108]:
def plot_demo_data(series, labels=None, color=plot_color, rot=0, label_offset=20, xlim=None, 
                   ylim=None, title=None, **kwargs):
    ax = kwargs.get('ax')
    if ax is None:
        f, ax = plt.subplots()
    counts = series.value_counts().sort_index()
    counts.plot(kind='bar', grid=False, rot=rot, color=color, **kwargs)
    if xlim is None:
        ax.set_xlim(-0.5, len(counts)-0.5)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_ylabel('Count')
    if labels is not None:
        ax.set_xticklabels(labels)
    if title:
        ax.set_title(title)
    for i,x in enumerate(counts):
        ax.annotate('%i' % x, (i, x + label_offset))
        
#     plt.gca().tight_layout()
In [109]:
unique_students = demographic.drop_duplicates('study_id')
In [110]:
unique_students.shape
Out[110]:
(5807, 67)
In [111]:
unique_students.age.describe()
Out[111]:
count    5290.00000
mean       29.50000
std        27.68008
min         0.00000
25%         8.00000
50%        24.00000
75%        40.00000
max       298.00000
Name: age, dtype: float64
In [112]:
plot_demo_data(unique_students.male, 
               ('Female', 'Male'), label_offset=20, color=plot_color)
In [113]:
plot_demo_data(unique_students.prim_lang, 
               ('English', 'Spanish', 'Chinese', 'French', 'German', 'Tagalong', 'Other'), 
               rot=70, color=plot_color)
In [114]:
unique_students.prim_lang.count()
Out[114]:
5242
In [115]:
plot_demo_data(unique_students.sib, ('1', '2', '3', '4+'), 
               color=plot_color)
In [116]:
unique_students.sib.count()
Out[116]:
4846
In [117]:
amp_ages = ["Birth-3 months", "4 months - 6 months", "7 months - 9 months", "10 months- 12 months", 
"13 months - 18 months", "19 months - 24 months", "2 years 1 day - 3 years", 
"3 years 1 day - 4 years", "4 years 1 day - 5 years", "5 years 1 day - 6 years", "6 years"]

demographic.loc[demographic.age_amp==11, 'age_amp'] = None
print("There are {0} null values for age_amp".format(sum(demographic.age_amp.isnull())))
There are 4911 null values for age_amp
In [118]:
age_classes = pd.Series(pd.cut(unique_students.sort('age_amp').age_amp.dropna(), [-1,3,6,9,12,18,24,36,48,60,72,1000],
                               labels=amp_ages))
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  if __name__ == '__main__':
In [119]:
age_amp_counts = age_classes.value_counts()[amp_ages]
age_amp_counts.plot(kind='bar', grid=False, rot=90, color=plot_color)
plt.xlim(-0.5, len(age_amp_counts)-0.5)
plt.ylabel('Count')
for i,x in enumerate(age_amp_counts):
    plt.annotate('%i' % x, (i, x + 10))
In [120]:
age_amp_counts.sum()
Out[120]:
3627
In [121]:
unique_students.age_amp.max()
Out[121]:
173.0
In [122]:
(unique_students.age_amp/12.).hist(bins=16, grid=False, color=plot_color)
plt.ylabel('Count')
plt.xlabel('Age at amplification')
Out[122]:
<matplotlib.text.Text at 0x10df86828>
In [123]:
plot_demo_data(unique_students.tech_left, tech_cats, rot=90, color=plot_color)
In [124]:
plot_demo_data(unique_students.tech_right, tech_cats, rot=90, color=plot_color, ylim=(0, 2500))
In [125]:
f, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.tech_right, [""]*len(tech_cats), rot=90, 
               ax=axes[0], title='Right ear', color=plot_color, ylim=(0, 2500))
plot_demo_data(unique_students.tech_left, tech_cats, rot=90, 
               ax=axes[1], title='Left ear', color=plot_color)
In [126]:
unique_students.tech_right.count()
Out[126]:
4651
In [127]:
unique_students.tech_left.count()
Out[127]:
4643
In [128]:
degree_hl_cats = 'Normal (0-14)', 'Slight (15-25)', 'Mild (26-40)', \
'Moderate (41-55)', 'Moderately Severe (56-70)', 'Severe (71-90)', 'Profound (90+)'
plot_demo_data(unique_students.degree_hl_ad, degree_hl_cats, rot=90, color=plot_color)
In [129]:
_, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.degree_hl_ad, [""]*7, rot=90, 
               color=plot_color, ax=axes[0], title='Right ear')
plot_demo_data(unique_students.degree_hl_as, degree_hl_cats, rot=90, 
               color=plot_color, ylim=(0,2000), ax=axes[1], title='Left ear');
In [130]:
unique_students.degree_hl_as.count()
Out[130]:
4550
In [131]:
type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
plot_demo_data(unique_students.type_hl_ad, type_hl_cats, rot=90, color=plot_color)
In [132]:
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, color=plot_color)
In [133]:
unique_students.type_hl_ad.count()
Out[133]:
4482
In [134]:
unique_students.type_hl_as.count()
Out[134]:
4575
In [135]:
type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
f, axes = plt.subplots(2,1)
plot_demo_data(unique_students.type_hl_ad, [""]*len(type_hl_cats), rot=90, 
               title='Right ear', ax=axes[0], color=plot_color)
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, 
               title='Left ear', ax=axes[1], color=plot_color)
In [136]:
demographic[demographic.study_id=='1147-2010-0064']
Out[136]:
redcap_event_name academic_year hl male _race prim_lang sib _mother_ed father_ed premature_age ... bilateral_ci bilateral_ha bimodal tech implant_category age_diag sex known_synd synd_or_disab race
14329 initial_assessment_arm_1 2010-2011 0.0 0.0 0.0 0.0 1.0 3.0 3.0 8.0 ... False True False 0 6 51.0 Female 0.0 0.0 0.0
14330 year_1_complete_71_arm_1 2011-2012 0.0 0.0 0.0 0.0 1.0 3.0 3.0 8.0 ... False True False 0 6 51.0 Female 0.0 0.0 0.0
14331 year_2_complete_71_arm_1 2012-2013 0.0 0.0 0.0 0.0 1.0 3.0 3.0 8.0 ... False True False 0 6 51.0 Female 0.0 0.0 0.0
14332 year_3_complete_71_arm_1 2013-2014 0.0 0.0 0.0 0.0 1.0 3.0 3.0 8.0 ... False True False 0 6 51.0 Female 0.0 0.0 0.0

4 rows × 67 columns

In [137]:
receptive[receptive.study_id=='1147-2010-0064']
Out[137]:
study_id redcap_event_name score test_type school age_test domain
14329 1147-2010-0064 initial_assessment_arm_1 96.0 PPVT 1147 63.0 Receptive Vocabulary
14330 1147-2010-0064 year_1_complete_71_arm_1 91.0 PPVT 1147 73.0 Receptive Vocabulary
14331 1147-2010-0064 year_2_complete_71_arm_1 93.0 PPVT 1147 85.0 Receptive Vocabulary
In [138]:
lsl_dr[lsl_dr.study_id=='1147-2010-0064']
Out[138]:
redcap_event_name academic_year hl male _race prim_lang sib _mother_ed father_ed premature_age ... age_test domain school score test_name test_type academic_year_start tech_class age_year non_profound
5902 initial_assessment_arm_1 2010-2011 0.0 0.0 0.0 0.0 1.0 3.0 3.0 8.0 ... 63 Expressive Vocabulary 1147 91.0 NaN EVT 2010 Bilateral HA 4.0 True
5903 initial_assessment_arm_1 2010-2011 0.0 0.0 0.0 0.0 1.0 3.0 3.0 8.0 ... 63 Receptive Vocabulary 1147 96.0 NaN PPVT 2010 Bilateral HA 4.0 True
5904 initial_assessment_arm_1 2010-2011 0.0 0.0 0.0 0.0 1.0 3.0 3.0 8.0 ... 59 Language 1147 101.0 PLS receptive 2010 Bilateral HA 4.0 True
5905 initial_assessment_arm_1 2010-2011 0.0 0.0 0.0 0.0 1.0 3.0 3.0 8.0 ... 59 Language 1147 87.0 PLS expressive 2010 Bilateral HA 4.0 True
14321 year_1_complete_71_arm_1 2011-2012 0.0 0.0 0.0 0.0 1.0 3.0 3.0 8.0 ... 72 Expressive Vocabulary 1147 86.0 NaN EVT 2011 Bilateral HA 4.0 True
14322 year_1_complete_71_arm_1 2011-2012 0.0 0.0 0.0 0.0 1.0 3.0 3.0 8.0 ... 73 Receptive Vocabulary 1147 91.0 NaN PPVT 2011 Bilateral HA 4.0 True
24001 year_2_complete_71_arm_1 2012-2013 0.0 0.0 0.0 0.0 1.0 3.0 3.0 8.0 ... 88 Expressive Vocabulary 1147 95.0 NaN EVT 2012 Bilateral HA 4.0 True
24002 year_2_complete_71_arm_1 2012-2013 0.0 0.0 0.0 0.0 1.0 3.0 3.0 8.0 ... 85 Receptive Vocabulary 1147 93.0 NaN PPVT 2012 Bilateral HA 4.0 True
30342 year_3_complete_71_arm_1 2013-2014 0.0 0.0 0.0 0.0 1.0 3.0 3.0 8.0 ... NaN NaN NaN NaN NaN NaN 2013 Bilateral HA 4.0 True

9 rows × 77 columns

In [139]:
unique_students.type_hl_ad.count()
Out[139]:
4482
In [140]:
receptive[receptive.domain=="Receptive Vocabulary"].study_id.unique().shape
Out[140]:
(3076,)
In [141]:
demographic.study_id.unique().shape
Out[141]:
(5807,)
In [142]:
receptive.study_id.unique().shape
Out[142]:
(3076,)
In [143]:
lsl_dr[lsl_dr.domain=="Receptive Vocabulary"].study_id.unique().shape
Out[143]:
(3076,)
In [144]:
receptive_ids = receptive.study_id.unique()
In [145]:
demographic_ids = demographic.study_id.unique()
In [146]:
[s for s in receptive_ids if s not in demographic_ids]
Out[146]:
[]
In [147]:
def score_summary(domain, test_type=None):
    subset = lsl_dr[lsl_dr.domain==domain].copy()
    if test_type is not None:
        subset = subset[subset.test_type==test_type]
    subset['age_test'] = (subset.age_test/12).dropna().astype(int)
    subset.loc[subset.age_test > 11, 'age_test'] = 11
    subset = subset[subset.age_test>1]
    byage = subset.groupby('age_test')
    n = byage.study_id.count()
    mean = byage.score.mean()
    sd = byage.score.std()
    min = byage.score.min()
    max = byage.score.max()
    summary = pd.DataFrame({'Sample Size':n, 'Mean':mean, 
    'SD':sd, 'Min':min, 'Max':max})
    summary.index = summary.index.values.astype(int)
    return summary[['Sample Size','Mean','SD','Min','Max']]
In [148]:
receptive_summary = score_summary("Receptive Vocabulary")
receptive_summary
Out[148]:
Sample Size Mean SD Min Max
2 412 93.546117 18.140445 40.0 144.0
3 1428 92.067227 19.347476 0.0 150.0
4 1547 90.796380 20.277519 0.0 149.0
5 1161 89.919897 18.110998 0.0 142.0
6 652 85.914110 16.302309 40.0 154.0
7 424 83.169811 16.066041 40.0 130.0
8 304 80.700658 17.624780 20.0 132.0
9 227 78.193833 17.638889 25.0 160.0
10 191 76.324607 17.481099 20.0 123.0
11 459 78.588235 18.949552 20.0 134.0
In [149]:
receptive_summary.describe()
Out[149]:
Sample Size Mean SD Min Max
count 10.000000 10.000000 10.000000 10.0000 10.000000
mean 680.500000 84.922087 17.993911 20.5000 141.800000
std 507.197365 6.377836 1.295244 16.4063 11.802071
min 191.000000 76.324607 16.066041 0.0000 123.000000
25% 331.000000 79.116341 17.517019 5.0000 132.500000
50% 441.500000 84.541961 17.874943 20.0000 143.000000
75% 1033.750000 90.577259 18.747275 36.2500 149.750000
max 1547.000000 93.546117 20.277519 40.0000 160.000000
In [150]:
receptive_summary['Sample Size'].sum()
Out[150]:
6805
In [151]:
receptive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
Out[151]:
<matplotlib.text.Text at 0x1105ff320>
In [152]:
expressive_summary = score_summary("Expressive Vocabulary")
expressive_summary
Out[152]:
Sample Size Mean SD Min Max
2 390 92.753846 22.081898 23.0 145.0
3 1376 93.390262 21.591975 0.0 145.0
4 1525 92.449180 21.817895 0.0 146.0
5 1136 91.602113 20.054994 0.0 145.0
6 650 87.018462 18.442505 20.0 146.0
7 425 84.037647 15.699522 38.0 131.0
8 295 84.037288 16.455319 34.0 122.0
9 213 81.793427 16.060750 36.0 145.0
10 185 81.816216 15.279596 40.0 122.0
11 460 84.821739 17.366822 18.0 146.0
In [153]:
expressive_summary['Sample Size'].sum()
Out[153]:
6655
In [154]:
expressive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.xlim(-0.5, 9.5)
if current_year_only:
    plt.ylim(0, 800)
else:
    plt.ylim(0, 1800)
In [155]:
articulation_summary = score_summary("Articulation")
articulation_summary
Out[155]:
Sample Size Mean SD Min Max
2 297 85.225589 14.870600 50.0 122.0
3 1194 83.618090 18.348582 40.0 126.0
4 1368 83.526316 20.745277 0.0 123.0
5 1065 83.881690 34.906576 39.0 999.0
6 614 79.534202 21.707111 39.0 115.0
7 402 80.402985 51.064887 3.0 999.0
8 259 78.876448 21.283801 40.0 107.0
9 188 81.617021 20.547639 40.0 109.0
10 145 81.317241 20.068184 40.0 105.0
11 324 84.632716 54.558552 39.0 999.0
In [156]:
articulation_summary['Sample Size'].sum()
Out[156]:
5856
In [157]:
sample_size = articulation_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(articulation_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);

Language scores

In [158]:
lsl_dr.domain.unique()
Out[158]:
array(['Expressive Vocabulary', 'Language', 'Articulation', nan,
       'Receptive Vocabulary'], dtype=object)
In [159]:
lsl_dr.test_type.unique()
Out[159]:
array(['EOWPVT', 'receptive', 'expressive', 'Goldman', nan, 'EVT', 'PPVT',
       'Arizonia', 'ROWPVT', 'Arizonia and Goldman', 'EOWPVT and EVT',
       'PPVT and ROWPVT'], dtype=object)
In [160]:
receptive_language_summary = score_summary("Language", "receptive")
receptive_language_summary
Out[160]:
Sample Size Mean SD Min Max
2 957 86.323929 22.295176 50.0 150.0
3 1374 84.938137 19.634047 50.0 144.0
4 1349 85.316531 19.507433 43.0 145.0
5 962 83.939709 18.839663 47.0 140.0
6 495 78.078788 17.673256 11.0 127.0
7 321 75.981308 18.835628 40.0 123.0
8 199 74.989950 19.793885 40.0 123.0
9 54 70.425926 21.219075 40.0 120.0
10 46 79.413043 20.985261 40.0 120.0
11 67 76.522388 21.469046 40.0 139.0
In [161]:
receptive_language_summary['Sample Size'].sum()
Out[161]:
5824
In [162]:
sample_size = receptive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_language_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
In [163]:
expressive_language_summary = score_summary("Language", "expressive")
expressive_language_summary
Out[163]:
Sample Size Mean SD Min Max
2 950 88.427368 18.557020 50.0 150.0
3 1375 82.408000 17.458500 20.0 147.0
4 1341 80.609247 19.553739 45.0 141.0
5 983 78.691760 20.189772 45.0 144.0
6 513 71.773879 19.234357 6.0 140.0
7 343 67.128280 20.948304 40.0 124.0
8 205 68.014634 21.506834 40.0 118.0
9 54 65.629630 21.286275 40.0 108.0
10 46 77.217391 24.107088 40.0 119.0
11 66 73.939394 22.574239 40.0 132.0
In [164]:
expressive_language_summary['Sample Size'].sum()
Out[164]:
5876
In [165]:
sample_size = expressive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_language_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
In [166]:
(unique_students.age/12.).describe()
Out[166]:
count    5290.000000
mean        2.458333
std         2.306673
min         0.000000
25%         0.666667
50%         2.000000
75%         3.333333
max        24.833333
Name: age, dtype: float64
In [167]:
def calc_difference(x, col='a_fo', jitter=True):
    if (len(x)<2 or x[col].isnull().sum() or x['funct_out_age'].isnull().sum()):
        return None
    diff = x[col][x.funct_out_age.argmax()] - x[col][x.funct_out_age.argmin()]
    if jitter:
        diff += np.random.normal(scale=0.05)
    if (x.funct_out_age.max() - x.funct_out_age.min()) > 1000:
        print(x['funct_out_age'])
    return({'difference':diff, 'months': x.funct_out_age.max() - x.funct_out_age.min()})
In [168]:
audition = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference).dropna().values.tolist())
In [169]:
plt.figure(figsize=(10,6))
plt.scatter(audition.months, audition.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Audition')
Out[169]:
<matplotlib.text.Text at 0x10c88fdd8>
In [170]:
slc = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='slc_fo').dropna().values.tolist())
In [171]:
plt.figure(figsize=(10,6))
plt.scatter(slc.months, slc.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (comprehension)')
Out[171]:
<matplotlib.text.Text at 0x10c89c2e8>
In [172]:
sle = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='sle_fo').dropna().values.tolist())
In [173]:
plt.figure(figsize=(10,6))
plt.scatter(sle.months, sle.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (expression)')
Out[173]:
<matplotlib.text.Text at 0x11011b6d8>
In [174]:
lsl_dr.degree_hl.dropna().value_counts()
Out[174]:
6.0    17270
4.0     4580
3.0     4446
5.0     4246
2.0     1740
0.0     1271
1.0      301
Name: degree_hl, dtype: int64
In [175]:
ax = lsl_dr.degree_hl.hist(bins=7)
In [176]:
diff = (lsl_dr['age'] - lsl_dr['age_int'])
diff[diff>0].hist(bins=50)
Out[176]:
<matplotlib.axes._subplots.AxesSubplot at 0x11014fda0>
In [177]:
(lsl_dr.age_int<6).mean()
Out[177]:
0.20227219182743908
In [178]:
(lsl_dr.age<6).mean()
Out[178]:
0.13415879165466874

Counts by year

In [179]:
unique_students = lsl_dr.groupby('study_id').first()
unique_students.academic_year_start.value_counts().sort_index()[:-1].plot(kind='bar')
plt.ylabel('Frequency'); plt.xlabel('Academic year');
In [180]:
disab_by_year = unique_students.groupby('academic_year_start')['synd_or_disab'].value_counts().unstack().fillna(0)
disab_by_year.columns = ['No', 'Yes']
disab_by_year[disab_by_year.index!='nan'].plot(kind='bar', stacked=True)
Out[180]:
<matplotlib.axes._subplots.AxesSubplot at 0x10c8b7630>